From 3ca68df6ee61e1a2034f3307b9edb9b3d87e5ca1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 20:11:25 -0400
Subject: [GFS2] split gfs2_dinode into on-disk and host variants

The latter is used as part of gfs2-private part of struct inode.
It actually stores a lot of fields differently; for now the
declaration is just cloned, inode field is swtiched and changes
propagated.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d470e52..191a3df 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -48,7 +48,7 @@
 void gfs2_inode_attr_in(struct gfs2_inode *ip)
 {
 	struct inode *inode = &ip->i_inode;
-	struct gfs2_dinode *di = &ip->i_di;
+	struct gfs2_dinode_host *di = &ip->i_di;
 
 	inode->i_ino = ip->i_num.no_addr;
 
@@ -98,7 +98,7 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 void gfs2_inode_attr_out(struct gfs2_inode *ip)
 {
 	struct inode *inode = &ip->i_inode;
-	struct gfs2_dinode *di = &ip->i_di;
+	struct gfs2_dinode_host *di = &ip->i_di;
 	gfs2_assert_withdraw(GFS2_SB(inode),
 		(di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
 	di->di_mode = inode->i_mode;
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 1025960..52cb9a2 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -153,7 +153,7 @@ void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
 	qu->qu_value = be64_to_cpu(str->qu_value);
 }
 
-void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
+void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf)
 {
 	const struct gfs2_dinode *str = buf;
 
@@ -187,7 +187,7 @@ void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
 
 }
 
-void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
+void gfs2_dinode_out(const struct gfs2_dinode_host *di, void *buf)
 {
 	struct gfs2_dinode *str = buf;
 
@@ -221,7 +221,7 @@ void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
 
 }
 
-void gfs2_dinode_print(const struct gfs2_dinode *di)
+void gfs2_dinode_print(const struct gfs2_dinode_host *di)
 {
 	gfs2_meta_header_print(&di->di_header);
 	gfs2_inum_print(&di->di_num);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index a7ae7c1..f334b4b 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -270,6 +270,48 @@ struct gfs2_dinode {
 	__u8 di_reserved[56];
 };
 
+struct gfs2_dinode_host {
+	struct gfs2_meta_header di_header;
+
+	struct gfs2_inum di_num;
+
+	__be32 di_mode;	/* mode of file */
+	__be32 di_uid;	/* owner's user id */
+	__be32 di_gid;	/* owner's group id */
+	__be32 di_nlink;	/* number of links to this file */
+	__be64 di_size;	/* number of bytes in file */
+	__be64 di_blocks;	/* number of blocks in file */
+	__be64 di_atime;	/* time last accessed */
+	__be64 di_mtime;	/* time last modified */
+	__be64 di_ctime;	/* time last changed */
+	__be32 di_major;	/* device major number */
+	__be32 di_minor;	/* device minor number */
+
+	/* This section varies from gfs1. Padding added to align with
+         * remainder of dinode
+	 */
+	__be64 di_goal_meta;	/* rgrp to alloc from next */
+	__be64 di_goal_data;	/* data block goal */
+	__be64 di_generation;	/* generation number for NFS */
+
+	__be32 di_flags;	/* GFS2_DIF_... */
+	__be32 di_payload_format;  /* GFS2_FORMAT_... */
+	__u16 __pad1;	/* Was ditype in gfs1 */
+	__be16 di_height;	/* height of metadata */
+	__u32 __pad2;	/* Unused incarnation number from gfs1 */
+
+	/* These only apply to directories  */
+	__u16 __pad3;	/* Padding */
+	__be16 di_depth;	/* Number of bits in the table */
+	__be32 di_entries;	/* The number of entries in the directory */
+
+	struct gfs2_inum __pad4; /* Unused even in current gfs1 */
+
+	__be64 di_eattr;	/* extended attribute block number */
+
+	__u8 di_reserved[56];
+};
+
 /*
  * directory structure - many of these per directory file
  */
@@ -422,8 +464,8 @@ extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf);
 extern void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf);
 extern void gfs2_quota_in(struct gfs2_quota *qu, const void *buf);
 extern void gfs2_quota_out(const struct gfs2_quota *qu, void *buf);
-extern void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf);
-extern void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf);
+extern void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf);
+extern void gfs2_dinode_out(const struct gfs2_dinode_host *di, void *buf);
 extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
 extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
 extern void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf);
@@ -436,7 +478,7 @@ extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf);
 /* Printing functions */
 
 extern void gfs2_rindex_print(const struct gfs2_rindex *ri);
-extern void gfs2_dinode_print(const struct gfs2_dinode *di);
+extern void gfs2_dinode_print(const struct gfs2_dinode_host *di);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v0.10.2


From 5c6edb576f3800723bb65dbfaff82517089e32d0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 20:33:01 -0400
Subject: [GFS2] gfs2_dinode_host fields are host-endian

Annotated scalar fields, dropped unused ones.  Note that
it's not at all obvious that we want to convert all of them
to host-endian...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 118dc69..1c876e0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -229,7 +229,7 @@ struct gfs2_inode {
 	unsigned long i_flags;		/* GIF_... */
 
 	u64 i_vn;
-	struct gfs2_dinode i_di; /* To be replaced by ref to block */
+	struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
 
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index f334b4b..0e67a89 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -275,41 +275,34 @@ struct gfs2_dinode_host {
 
 	struct gfs2_inum di_num;
 
-	__be32 di_mode;	/* mode of file */
-	__be32 di_uid;	/* owner's user id */
-	__be32 di_gid;	/* owner's group id */
-	__be32 di_nlink;	/* number of links to this file */
-	__be64 di_size;	/* number of bytes in file */
-	__be64 di_blocks;	/* number of blocks in file */
-	__be64 di_atime;	/* time last accessed */
-	__be64 di_mtime;	/* time last modified */
-	__be64 di_ctime;	/* time last changed */
-	__be32 di_major;	/* device major number */
-	__be32 di_minor;	/* device minor number */
+	__u32 di_mode;	/* mode of file */
+	__u32 di_uid;	/* owner's user id */
+	__u32 di_gid;	/* owner's group id */
+	__u32 di_nlink;	/* number of links to this file */
+	__u64 di_size;	/* number of bytes in file */
+	__u64 di_blocks;	/* number of blocks in file */
+	__u64 di_atime;	/* time last accessed */
+	__u64 di_mtime;	/* time last modified */
+	__u64 di_ctime;	/* time last changed */
+	__u32 di_major;	/* device major number */
+	__u32 di_minor;	/* device minor number */
 
 	/* This section varies from gfs1. Padding added to align with
          * remainder of dinode
 	 */
-	__be64 di_goal_meta;	/* rgrp to alloc from next */
-	__be64 di_goal_data;	/* data block goal */
-	__be64 di_generation;	/* generation number for NFS */
+	__u64 di_goal_meta;	/* rgrp to alloc from next */
+	__u64 di_goal_data;	/* data block goal */
+	__u64 di_generation;	/* generation number for NFS */
 
-	__be32 di_flags;	/* GFS2_DIF_... */
-	__be32 di_payload_format;  /* GFS2_FORMAT_... */
-	__u16 __pad1;	/* Was ditype in gfs1 */
-	__be16 di_height;	/* height of metadata */
-	__u32 __pad2;	/* Unused incarnation number from gfs1 */
+	__u32 di_flags;	/* GFS2_DIF_... */
+	__u32 di_payload_format;  /* GFS2_FORMAT_... */
+	__u16 di_height;	/* height of metadata */
 
 	/* These only apply to directories  */
-	__u16 __pad3;	/* Padding */
-	__be16 di_depth;	/* Number of bits in the table */
-	__be32 di_entries;	/* The number of entries in the directory */
-
-	struct gfs2_inum __pad4; /* Unused even in current gfs1 */
+	__u16 di_depth;	/* Number of bits in the table */
+	__u32 di_entries;	/* The number of entries in the directory */
 
-	__be64 di_eattr;	/* extended attribute block number */
-
-	__u8 di_reserved[56];
+	__u64 di_eattr;	/* extended attribute block number */
 };
 
 /*
-- 
cgit v0.10.2


From f50dfaf78c01df3cc2d8819f07d6661915567bae Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 20:45:02 -0400
Subject: [GFS2] split gfs2_sb

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1c876e0..bd596ba 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -450,7 +450,7 @@ struct gfs2_sbd {
 	struct super_block *sd_vfs_meta;
 	struct kobject sd_kobj;
 	unsigned long sd_flags;	/* SDF_... */
-	struct gfs2_sb sd_sb;
+	struct gfs2_sb_host sd_sb;
 
 	/* Constants computed on mount */
 
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 52cb9a2..5b32f1a 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -79,7 +79,7 @@ static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
 	pv(mh, mh_format, "%u");
 }
 
-void gfs2_sb_in(struct gfs2_sb *sb, const void *buf)
+void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
 {
 	const struct gfs2_sb *str = buf;
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 6a78b1b..52aa322 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -97,7 +97,7 @@ void gfs2_tune_init(struct gfs2_tune *gt)
  * changed.
  */
 
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 {
 	unsigned int x;
 
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 5bb443a..ac95064 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -14,7 +14,7 @@
 
 void gfs2_tune_init(struct gfs2_tune *gt);
 
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
 int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
 struct page *gfs2_read_super(struct super_block *sb, sector_t sector);
 
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 0e67a89..b7bdfef 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -128,6 +128,26 @@ struct gfs2_sb {
 	/* In gfs1, quota and license dinodes followed */
 };
 
+struct gfs2_sb_host {
+	struct gfs2_meta_header sb_header;
+
+	__be32 sb_fs_format;
+	__be32 sb_multihost_format;
+	__u32  __pad0;	/* Was superblock flags in gfs1 */
+
+	__be32 sb_bsize;
+	__be32 sb_bsize_shift;
+	__u32 __pad1;	/* Was journal segment size in gfs1 */
+
+	struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
+	struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
+	struct gfs2_inum sb_root_dir;
+
+	char sb_lockproto[GFS2_LOCKNAME_LEN];
+	char sb_locktable[GFS2_LOCKNAME_LEN];
+	/* In gfs1, quota and license dinodes followed */
+};
+
 /*
  * resource index structure
  */
@@ -450,7 +470,7 @@ struct gfs2_quota_change {
 
 extern void gfs2_inum_in(struct gfs2_inum *no, const void *buf);
 extern void gfs2_inum_out(const struct gfs2_inum *no, void *buf);
-extern void gfs2_sb_in(struct gfs2_sb *sb, const void *buf);
+extern void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf);
 extern void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf);
 extern void gfs2_rindex_out(const struct gfs2_rindex *ri, void *buf);
 extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf);
-- 
cgit v0.10.2


From bc558c87bb7e50c4f728d32684a9f4f4c73ebde3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 21:02:41 -0400
Subject: [GFS2] fields of gfs2_sb_host are host-endian

... and several could be killed, but that's another story.
Annotate scalar ones, kill completely unused.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index b7bdfef..a5d36cd 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -131,16 +131,13 @@ struct gfs2_sb {
 struct gfs2_sb_host {
 	struct gfs2_meta_header sb_header;
 
-	__be32 sb_fs_format;
-	__be32 sb_multihost_format;
-	__u32  __pad0;	/* Was superblock flags in gfs1 */
+	__u32 sb_fs_format;
+	__u32 sb_multihost_format;
 
-	__be32 sb_bsize;
-	__be32 sb_bsize_shift;
-	__u32 __pad1;	/* Was journal segment size in gfs1 */
+	__u32 sb_bsize;
+	__u32 sb_bsize_shift;
 
 	struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
-	struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
 	struct gfs2_inum sb_root_dir;
 
 	char sb_lockproto[GFS2_LOCKNAME_LEN];
-- 
cgit v0.10.2


From 68826664d12827d7a732192e2f00ba46fb899414 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 21:07:22 -0400
Subject: [GFS2] split and annotate gfs2_rgrp

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index bd596ba..8ca7a7f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,7 +68,7 @@ struct gfs2_rgrpd {
 	struct list_head rd_recent;	/* Recently used rgrps */
 	struct gfs2_glock *rd_gl;	/* Glock for this rgrp */
 	struct gfs2_rindex rd_ri;
-	struct gfs2_rgrp rd_rg;
+	struct gfs2_rgrp_host rd_rg;
 	u64 rd_rg_vn;
 	struct gfs2_bitmap *rd_bits;
 	unsigned int rd_bh_count;
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 5b32f1a..3b156a1 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -120,7 +120,7 @@ void gfs2_rindex_print(const struct gfs2_rindex *ri)
 	pv(ri, ri_bitbytes, "%u");
 }
 
-void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
+void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf)
 {
 	const struct gfs2_rgrp *str = buf;
 
@@ -131,7 +131,7 @@ void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
 	rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
 }
 
-void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf)
+void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
 {
 	struct gfs2_rgrp *str = buf;
 
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index a5d36cd..e4ca6e4 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -193,6 +193,15 @@ struct gfs2_rgrp {
 	__u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
 };
 
+struct gfs2_rgrp_host {
+	struct gfs2_meta_header rg_header;
+
+	__u32 rg_flags;
+	__u32 rg_free;
+	__u32 rg_dinodes;
+	__u64 rg_igeneration;
+};
+
 /*
  * quota structure
  */
@@ -470,8 +479,8 @@ extern void gfs2_inum_out(const struct gfs2_inum *no, void *buf);
 extern void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf);
 extern void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf);
 extern void gfs2_rindex_out(const struct gfs2_rindex *ri, void *buf);
-extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf);
-extern void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf);
+extern void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf);
+extern void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf);
 extern void gfs2_quota_in(struct gfs2_quota *qu, const void *buf);
 extern void gfs2_quota_out(const struct gfs2_quota *qu, void *buf);
 extern void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf);
-- 
cgit v0.10.2


From e697264709c86040271cdd7abee781d7adbb7f91 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 21:29:46 -0400
Subject: [GFS2] split and annotate gfs2_inum_range

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 191a3df..7eb6b44 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -436,7 +436,7 @@ static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
 	struct buffer_head *bh;
-	struct gfs2_inum_range ir;
+	struct gfs2_inum_range_host ir;
 	int error;
 
 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
@@ -479,7 +479,7 @@ static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
 	struct gfs2_holder gh;
 	struct buffer_head *bh;
-	struct gfs2_inum_range ir;
+	struct gfs2_inum_range_host ir;
 	int error;
 
 	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 3b156a1..64f5f0c6 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -263,7 +263,7 @@ void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
 	lh->lh_hash = be32_to_cpu(str->lh_hash);
 }
 
-void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
+void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
 {
 	const struct gfs2_inum_range *str = buf;
 
@@ -271,7 +271,7 @@ void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
 	ir->ir_length = be64_to_cpu(str->ir_length);
 }
 
-void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf)
+void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
 {
 	struct gfs2_inum_range *str = buf;
 
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index e4ca6e4..c035587 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -445,6 +445,11 @@ struct gfs2_inum_range {
 	__be64 ir_length;
 };
 
+struct gfs2_inum_range_host {
+	__u64 ir_start;
+	__u64 ir_length;
+};
+
 /*
  * Statfs change
  * Describes an change to the pool of free and allocated
@@ -488,8 +493,8 @@ extern void gfs2_dinode_out(const struct gfs2_dinode_host *di, void *buf);
 extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
 extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
 extern void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf);
-extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf);
-extern void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf);
+extern void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf);
+extern void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf);
 extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf);
 extern void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf);
 extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf);
-- 
cgit v0.10.2


From 551676226163379c217e8ec54bd287eab9b8521e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 21:47:13 -0400
Subject: [GFS2] split and annotate gfs2_log_head

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 41a6b68..5406b19 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -491,7 +491,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
 	struct gfs2_glock *j_gl = ip->i_gl;
-	struct gfs2_log_header head;
+	struct gfs2_log_header_host head;
 	int error;
 
 	if (gl->gl_state != LM_ST_UNLOCKED &&
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8ca7a7f..e69f339 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -41,7 +41,7 @@ struct gfs2_log_operations {
 	void (*lo_before_commit) (struct gfs2_sbd *sdp);
 	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
 	void (*lo_before_scan) (struct gfs2_jdesc *jd,
-				struct gfs2_log_header *head, int pass);
+				struct gfs2_log_header_host *head, int pass);
 	int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
 				 struct gfs2_log_descriptor *ld, __be64 *ptr,
 				 int pass);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ab6d111..8a654cd 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -182,7 +182,7 @@ static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 }
 
 static void buf_lo_before_scan(struct gfs2_jdesc *jd,
-			       struct gfs2_log_header *head, int pass)
+			       struct gfs2_log_header_host *head, int pass)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 
@@ -328,7 +328,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 }
 
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
-				  struct gfs2_log_header *head, int pass)
+				  struct gfs2_log_header_host *head, int pass)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 5839c05..965bc65 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -60,7 +60,7 @@ static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 }
 
 static inline void lops_before_scan(struct gfs2_jdesc *jd,
-				    struct gfs2_log_header *head,
+				    struct gfs2_log_header_host *head,
 				    unsigned int pass)
 {
 	int x;
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 64f5f0c6..84b1ebc 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -251,7 +251,7 @@ void gfs2_dinode_print(const struct gfs2_dinode_host *di)
 	printk(KERN_INFO "  di_eattr = %llu\n", (unsigned long long)di->di_eattr);
 }
 
-void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
+void gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf)
 {
 	const struct gfs2_log_header *str = buf;
 
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 62cd223..4478162 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -132,10 +132,10 @@ void gfs2_revoke_clean(struct gfs2_sbd *sdp)
  */
 
 static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
-			  struct gfs2_log_header *head)
+			  struct gfs2_log_header_host *head)
 {
 	struct buffer_head *bh;
-	struct gfs2_log_header lh;
+	struct gfs2_log_header_host lh;
 	u32 hash;
 	int error;
 
@@ -143,7 +143,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 	if (error)
 		return error;
 
-	memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
+	memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));	/* XXX */
 	lh.lh_hash = 0;
 	hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
 	gfs2_log_header_in(&lh, bh->b_data);
@@ -174,7 +174,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
  */
 
 static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
-			struct gfs2_log_header *head)
+			struct gfs2_log_header_host *head)
 {
 	unsigned int orig_blk = *blk;
 	int error;
@@ -205,10 +205,10 @@ static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
  * Returns: errno
  */
 
-static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
 {
 	unsigned int blk = head->lh_blkno;
-	struct gfs2_log_header lh;
+	struct gfs2_log_header_host lh;
 	int error;
 
 	for (;;) {
@@ -245,9 +245,9 @@ static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
  * Returns: errno
  */
 
-int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
 {
-	struct gfs2_log_header lh_1, lh_m;
+	struct gfs2_log_header_host lh_1, lh_m;
 	u32 blk_1, blk_2, blk_m;
 	int error;
 
@@ -320,7 +320,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
 		length = be32_to_cpu(ld->ld_length);
 
 		if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
-			struct gfs2_log_header lh;
+			struct gfs2_log_header_host lh;
 			error = get_log_header(jd, start, &lh);
 			if (!error) {
 				gfs2_replay_incr_blk(sdp, &start);
@@ -363,7 +363,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
  * Returns: errno
  */
 
-static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
@@ -425,7 +425,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-	struct gfs2_log_header head;
+	struct gfs2_log_header_host head;
 	struct gfs2_holder j_gh, ji_gh, t_gh;
 	unsigned long t;
 	int ro = 0;
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 961feed..f7235e6 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -26,7 +26,7 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
 void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 
 int gfs2_find_jhead(struct gfs2_jdesc *jd,
-		    struct gfs2_log_header *head);
+		    struct gfs2_log_header_host *head);
 int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
 void gfs2_check_journals(struct gfs2_sbd *sdp);
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 52aa322..0faf563 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -508,7 +508,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
 	struct gfs2_glock *j_gl = ip->i_gl;
 	struct gfs2_holder t_gh;
-	struct gfs2_log_header head;
+	struct gfs2_log_header_host head;
 	int error;
 
 	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
@@ -873,7 +873,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
 	struct gfs2_jdesc *jd;
 	struct lfcc *lfcc;
 	LIST_HEAD(list);
-	struct gfs2_log_header lh;
+	struct gfs2_log_header_host lh;
 	int error;
 
 	error = gfs2_jindex_hold(sdp, &ji_gh);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index c035587..fb69a64 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -405,6 +405,16 @@ struct gfs2_log_header {
 	__be32 lh_hash;
 };
 
+struct gfs2_log_header_host {
+	struct gfs2_meta_header lh_header;
+
+	__u64 lh_sequence;	/* Sequence number of this transaction */
+	__u32 lh_flags;	/* GFS2_LOG_HEAD_... */
+	__u32 lh_tail;		/* Block number of log tail */
+	__u32 lh_blkno;
+	__u32 lh_hash;
+};
+
 /*
  * Log type descriptor
  */
@@ -492,7 +502,7 @@ extern void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf);
 extern void gfs2_dinode_out(const struct gfs2_dinode_host *di, void *buf);
 extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
 extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
-extern void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf);
+extern void gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf);
 extern void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf);
 extern void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf);
 extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf);
-- 
cgit v0.10.2


From 2a2c98247b822db8df037a56c27201f9d716ac66 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 31 Oct 2006 14:44:50 -0500
Subject: [GFS2] Fix crc32 calculation in recovery.c

Commit "[GFS2] split and annotate gfs2_log_head" resulted in an incorrect
checksum calculation for log headers. This patch corrects the
problem without resorting to copying the whole log header as
the previous code used to.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4478162..4acf238 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -136,6 +136,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 {
 	struct buffer_head *bh;
 	struct gfs2_log_header_host lh;
+static const u32 nothing = 0;
 	u32 hash;
 	int error;
 
@@ -143,11 +144,11 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 	if (error)
 		return error;
 
-	memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));	/* XXX */
-	lh.lh_hash = 0;
-	hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
+	hash = crc32_le((u32)~0, bh->b_data, sizeof(struct gfs2_log_header) -
+					     sizeof(u32));
+	hash = crc32_le(hash, (unsigned char const *)&nothing, sizeof(nothing));
+	hash ^= (u32)~0;
 	gfs2_log_header_in(&lh, bh->b_data);
-
 	brelse(bh);
 
 	if (lh.lh_header.mh_magic != GFS2_MAGIC ||
-- 
cgit v0.10.2


From e928a76f959e89884f6186bb6f846c533847d5df Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 21:57:23 -0400
Subject: [GFS2] split and annotate gfs2_meta_header

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 84b1ebc..b5aa7ab 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -54,7 +54,7 @@ static void gfs2_inum_print(const struct gfs2_inum *no)
 	printk(KERN_INFO "  no_addr = %llu\n", (unsigned long long)no->no_addr);
 }
 
-static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
+static void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf)
 {
 	const struct gfs2_meta_header *str = buf;
 
@@ -63,7 +63,7 @@ static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
 	mh->mh_format = be32_to_cpu(str->mh_format);
 }
 
-static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
+static void gfs2_meta_header_out(const struct gfs2_meta_header_host *mh, void *buf)
 {
 	struct gfs2_meta_header *str = buf;
 
@@ -72,7 +72,7 @@ static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
 	str->mh_format = cpu_to_be32(mh->mh_format);
 }
 
-static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
+static void gfs2_meta_header_print(const struct gfs2_meta_header_host *mh)
 {
 	pv(mh, mh_magic, "0x%.8X");
 	pv(mh, mh_type, "%u");
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index fb69a64..76eb9e1 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -89,6 +89,12 @@ struct gfs2_meta_header {
 	__be32 __pad1;		/* Was incarnation number in gfs1 */
 };
 
+struct gfs2_meta_header_host {
+	__u32 mh_magic;
+	__u32 mh_type;
+	__u32 mh_format;
+};
+
 /*
  * super-block structure
  *
@@ -129,7 +135,7 @@ struct gfs2_sb {
 };
 
 struct gfs2_sb_host {
-	struct gfs2_meta_header sb_header;
+	struct gfs2_meta_header_host sb_header;
 
 	__u32 sb_fs_format;
 	__u32 sb_multihost_format;
@@ -194,7 +200,7 @@ struct gfs2_rgrp {
 };
 
 struct gfs2_rgrp_host {
-	struct gfs2_meta_header rg_header;
+	struct gfs2_meta_header_host rg_header;
 
 	__u32 rg_flags;
 	__u32 rg_free;
@@ -297,7 +303,7 @@ struct gfs2_dinode {
 };
 
 struct gfs2_dinode_host {
-	struct gfs2_meta_header di_header;
+	struct gfs2_meta_header_host di_header;
 
 	struct gfs2_inum di_num;
 
@@ -406,7 +412,7 @@ struct gfs2_log_header {
 };
 
 struct gfs2_log_header_host {
-	struct gfs2_meta_header lh_header;
+	struct gfs2_meta_header_host lh_header;
 
 	__u64 lh_sequence;	/* Sequence number of this transaction */
 	__u32 lh_flags;	/* GFS2_LOG_HEAD_... */
-- 
cgit v0.10.2


From 1e81c4c3e0f55c95b6278a827262b80debd0dc7e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 22:51:24 -0400
Subject: [GFS2] split and annotate gfs_rindex

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e69f339..e4afc2c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -67,7 +67,7 @@ struct gfs2_rgrpd {
 	struct list_head rd_list_mru;
 	struct list_head rd_recent;	/* Recently used rgrps */
 	struct gfs2_glock *rd_gl;	/* Glock for this rgrp */
-	struct gfs2_rindex rd_ri;
+	struct gfs2_rindex_host rd_ri;
 	struct gfs2_rgrp_host rd_rg;
 	u64 rd_rg_vn;
 	struct gfs2_bitmap *rd_bits;
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index b5aa7ab..c4e099d 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -97,7 +97,7 @@ void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
 	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
 }
 
-void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
+void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf)
 {
 	const struct gfs2_rindex *str = buf;
 
@@ -109,7 +109,7 @@ void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
 
 }
 
-void gfs2_rindex_print(const struct gfs2_rindex *ri)
+void gfs2_rindex_print(const struct gfs2_rindex_host *ri)
 {
 	printk(KERN_INFO "  ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
 	pv(ri, ri_length, "%u");
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index b261385..07dfd63 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -253,7 +253,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 
 }
 
-static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block)
+static inline int rgrp_contains_block(struct gfs2_rindex_host *ri, u64 block)
 {
 	u64 first = ri->ri_data0;
 	u64 last = first + ri->ri_data;
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 76eb9e1..7dd5e4c 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -168,6 +168,14 @@ struct gfs2_rindex {
 	__u8 ri_reserved[64];
 };
 
+struct gfs2_rindex_host {
+	__u64 ri_addr;	/* grp block disk address */
+	__u64 ri_data0;	/* first data location */
+	__u32 ri_length;	/* length of rgrp header in fs blocks */
+	__u32 ri_data;	/* num of data blocks in rgrp */
+	__u32 ri_bitbytes;	/* number of bytes in data bitmaps */
+};
+
 /*
  * resource group header structure
  */
@@ -498,8 +506,8 @@ struct gfs2_quota_change {
 extern void gfs2_inum_in(struct gfs2_inum *no, const void *buf);
 extern void gfs2_inum_out(const struct gfs2_inum *no, void *buf);
 extern void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf);
-extern void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf);
-extern void gfs2_rindex_out(const struct gfs2_rindex *ri, void *buf);
+extern void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf);
+extern void gfs2_rindex_out(const struct gfs2_rindex_host *ri, void *buf);
 extern void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf);
 extern void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf);
 extern void gfs2_quota_in(struct gfs2_quota *qu, const void *buf);
@@ -517,7 +525,7 @@ extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf);
 
 /* Printing functions */
 
-extern void gfs2_rindex_print(const struct gfs2_rindex *ri);
+extern void gfs2_rindex_print(const struct gfs2_rindex_host *ri);
 extern void gfs2_dinode_print(const struct gfs2_dinode_host *di);
 
 #endif /* __KERNEL__ */
-- 
cgit v0.10.2


From 629a21e7ecedf779c68dcaa9a186069f57a7c652 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 22:51:24 -0400
Subject: [GFS2] split and annotate gfs2_inum

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index e24af28b1..d67a376 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1194,7 +1194,7 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
 			   int *copied)
 {
 	const struct gfs2_dirent *dent, *dent_next;
-	struct gfs2_inum inum;
+	struct gfs2_inum_host inum;
 	u64 off, off_next;
 	unsigned int x, y;
 	int run = 0;
@@ -1456,7 +1456,7 @@ out:
  */
 
 int gfs2_dir_search(struct inode *dir, const struct qstr *name,
-		    struct gfs2_inum *inum, unsigned int *type)
+		    struct gfs2_inum_host *inum, unsigned int *type)
 {
 	struct buffer_head *bh;
 	struct gfs2_dirent *dent;
@@ -1531,7 +1531,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
  */
 
 int gfs2_dir_add(struct inode *inode, const struct qstr *name,
-		 const struct gfs2_inum *inum, unsigned type)
+		 const struct gfs2_inum_host *inum, unsigned type)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct buffer_head *bh;
@@ -1666,7 +1666,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
  */
 
 int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
-		   struct gfs2_inum *inum, unsigned int new_type)
+		   struct gfs2_inum_host *inum, unsigned int new_type)
 {
 	struct buffer_head *bh;
 	struct gfs2_dirent *dent;
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 3712334..b21b336 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -31,17 +31,17 @@ struct gfs2_inum;
 typedef int (*gfs2_filldir_t) (void *opaque,
 			      const char *name, unsigned int length,
 			      u64 offset,
-			      struct gfs2_inum *inum, unsigned int type);
+			      struct gfs2_inum_host *inum, unsigned int type);
 
 int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
-		    struct gfs2_inum *inum, unsigned int *type);
+		    struct gfs2_inum_host *inum, unsigned int *type);
 int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-		 const struct gfs2_inum *inum, unsigned int type);
+		 const struct gfs2_inum_host *inum, unsigned int type);
 int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
 int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
 		  gfs2_filldir_t filldir);
 int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
-		   struct gfs2_inum *new_inum, unsigned int new_type);
+		   struct gfs2_inum_host *new_inum, unsigned int new_type);
 
 int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e4afc2c..20c9b4f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -224,7 +224,7 @@ enum {
 
 struct gfs2_inode {
 	struct inode i_inode;
-	struct gfs2_inum i_num;
+	struct gfs2_inum_host i_num;
 
 	unsigned long i_flags;		/* GIF_... */
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7eb6b44..dadd1f3 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -112,7 +112,7 @@ void gfs2_inode_attr_out(struct gfs2_inode *ip)
 static int iget_test(struct inode *inode, void *opaque)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_inum *inum = opaque;
+	struct gfs2_inum_host *inum = opaque;
 
 	if (ip && ip->i_num.no_addr == inum->no_addr)
 		return 1;
@@ -123,19 +123,19 @@ static int iget_test(struct inode *inode, void *opaque)
 static int iget_set(struct inode *inode, void *opaque)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_inum *inum = opaque;
+	struct gfs2_inum_host *inum = opaque;
 
 	ip->i_num = *inum;
 	return 0;
 }
 
-struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum)
 {
 	return ilookup5(sb, (unsigned long)inum->no_formal_ino,
 			iget_test, inum);
 }
 
-static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
+static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum_host *inum)
 {
 	return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
 		     iget_test, iget_set, inum);
@@ -150,7 +150,7 @@ static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
  * Returns: A VFS inode, or an error
  */
 
-struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned int type)
 {
 	struct inode *inode = gfs2_iget(sb, inum);
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -394,7 +394,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 	struct super_block *sb = dir->i_sb;
 	struct gfs2_inode *dip = GFS2_I(dir);
 	struct gfs2_holder d_gh;
-	struct gfs2_inum inum;
+	struct gfs2_inum_host inum;
 	unsigned int type;
 	int error = 0;
 	struct inode *inode = NULL;
@@ -610,7 +610,7 @@ static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
 		*gid = current->fsgid;
 }
 
-static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
+static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum_host *inum,
 			u64 *generation)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
@@ -650,7 +650,7 @@ out:
  */
 
 static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
-			const struct gfs2_inum *inum, unsigned int mode,
+			const struct gfs2_inum_host *inum, unsigned int mode,
 			unsigned int uid, unsigned int gid,
 			const u64 *generation)
 {
@@ -707,7 +707,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 }
 
 static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
-		       unsigned int mode, const struct gfs2_inum *inum,
+		       unsigned int mode, const struct gfs2_inum_host *inum,
 		       const u64 *generation)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
@@ -866,7 +866,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	struct gfs2_inode *dip = ghs->gh_gl->gl_object;
 	struct inode *dir = &dip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	struct gfs2_inum inum;
+	struct gfs2_inum_host inum;
 	int error;
 	u64 generation;
 
@@ -1018,7 +1018,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 		   struct gfs2_inode *ip)
 {
-	struct gfs2_inum inum;
+	struct gfs2_inum_host inum;
 	unsigned int type;
 	int error;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index f5d8617..d699b92 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -27,8 +27,8 @@ static inline int gfs2_is_dir(struct gfs2_inode *ip)
 
 void gfs2_inode_attr_in(struct gfs2_inode *ip);
 void gfs2_inode_attr_out(struct gfs2_inode *ip);
-struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
-struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned type);
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum);
 
 int gfs2_inode_refresh(struct gfs2_inode *ip);
 
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index c4e099d..32f592f 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -32,7 +32,7 @@
  * first arg: the cpu-order structure
  */
 
-void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
+void gfs2_inum_in(struct gfs2_inum_host *no, const void *buf)
 {
 	const struct gfs2_inum *str = buf;
 
@@ -40,7 +40,7 @@ void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
 	no->no_addr = be64_to_cpu(str->no_addr);
 }
 
-void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
+void gfs2_inum_out(const struct gfs2_inum_host *no, void *buf)
 {
 	struct gfs2_inum *str = buf;
 
@@ -48,7 +48,7 @@ void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
 	str->no_addr = cpu_to_be64(no->no_addr);
 }
 
-static void gfs2_inum_print(const struct gfs2_inum *no)
+static void gfs2_inum_print(const struct gfs2_inum_host *no)
 {
 	printk(KERN_INFO "  no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n", (unsigned long long)no->no_addr);
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 00041b1..c36f9e3 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -43,7 +43,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 	struct inode *inode = dentry->d_inode;
 	struct gfs2_holder d_gh;
 	struct gfs2_inode *ip;
-	struct gfs2_inum inum;
+	struct gfs2_inum_host inum;
 	unsigned int type;
 	int error;
 
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 86127d9..33423a5 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -35,7 +35,7 @@ static struct dentry *gfs2_decode_fh(struct super_block *sb,
 				     void *context)
 {
 	struct gfs2_fh_obj fh_obj;
-	struct gfs2_inum *this, parent;
+	struct gfs2_inum_host *this, parent;
 
 	if (fh_type != fh_len)
 		return NULL;
@@ -114,12 +114,12 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 }
 
 struct get_name_filldir {
-	struct gfs2_inum inum;
+	struct gfs2_inum_host inum;
 	char *name;
 };
 
 static int get_name_filldir(void *opaque, const char *name, unsigned int length,
-			    u64 offset, struct gfs2_inum *inum,
+			    u64 offset, struct gfs2_inum_host *inum,
 			    unsigned int type)
 {
 	struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
@@ -202,7 +202,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
-	struct gfs2_inum *inum = &fh_obj->this;
+	struct gfs2_inum_host *inum = &fh_obj->this;
 	struct gfs2_holder i_gh, ri_gh, rgd_gh;
 	struct gfs2_rgrpd *rgd;
 	struct inode *inode;
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
index 09aca50..f925a95 100644
--- a/fs/gfs2/ops_export.h
+++ b/fs/gfs2/ops_export.h
@@ -15,7 +15,7 @@
 
 extern struct export_operations gfs2_export_ops;
 struct gfs2_fh_obj {
-	struct gfs2_inum this;
+	struct gfs2_inum_host this;
 	__u32            imode;
 };
 
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3064f13..359965c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -139,7 +139,7 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
  */
 
 static int filldir_func(void *opaque, const char *name, unsigned int length,
-			u64 offset, struct gfs2_inum *inum,
+			u64 offset, struct gfs2_inum_host *inum,
 			unsigned int type)
 {
 	struct filldir_reg *fdr = (struct filldir_reg *)opaque;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 882873a..d14e139 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -237,7 +237,7 @@ fail:
 }
 
 static struct inode *gfs2_lookup_root(struct super_block *sb,
-				      struct gfs2_inum *inum)
+				      struct gfs2_inum_host *inum)
 {
 	return gfs2_inode_lookup(sb, inum, DT_DIR);
 }
@@ -246,7 +246,7 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
 {
 	struct super_block *sb = sdp->sd_vfs;
 	struct gfs2_holder sb_gh;
-	struct gfs2_inum *inum;
+	struct gfs2_inum_host *inum;
 	struct inode *inode;
 	int error = 0;
 
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 7dd5e4c..b16df6e 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -54,8 +54,13 @@ struct gfs2_inum {
 	__be64 no_addr;
 };
 
-static inline int gfs2_inum_equal(const struct gfs2_inum *ino1,
-				  const struct gfs2_inum *ino2)
+struct gfs2_inum_host {
+	__u64 no_formal_ino;
+	__u64 no_addr;
+};
+
+static inline int gfs2_inum_equal(const struct gfs2_inum_host *ino1,
+				  const struct gfs2_inum_host *ino2)
 {
 	return ino1->no_formal_ino == ino2->no_formal_ino &&
 	       ino1->no_addr == ino2->no_addr;
@@ -143,8 +148,8 @@ struct gfs2_sb_host {
 	__u32 sb_bsize;
 	__u32 sb_bsize_shift;
 
-	struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
-	struct gfs2_inum sb_root_dir;
+	struct gfs2_inum_host sb_master_dir; /* Was jindex dinode in gfs1 */
+	struct gfs2_inum_host sb_root_dir;
 
 	char sb_lockproto[GFS2_LOCKNAME_LEN];
 	char sb_locktable[GFS2_LOCKNAME_LEN];
@@ -313,7 +318,7 @@ struct gfs2_dinode {
 struct gfs2_dinode_host {
 	struct gfs2_meta_header_host di_header;
 
-	struct gfs2_inum di_num;
+	struct gfs2_inum_host di_num;
 
 	__u32 di_mode;	/* mode of file */
 	__u32 di_uid;	/* owner's user id */
@@ -503,8 +508,8 @@ struct gfs2_quota_change {
 #ifdef __KERNEL__
 /* Translation functions */
 
-extern void gfs2_inum_in(struct gfs2_inum *no, const void *buf);
-extern void gfs2_inum_out(const struct gfs2_inum *no, void *buf);
+extern void gfs2_inum_in(struct gfs2_inum_host *no, const void *buf);
+extern void gfs2_inum_out(const struct gfs2_inum_host *no, void *buf);
 extern void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf);
 extern void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf);
 extern void gfs2_rindex_out(const struct gfs2_rindex_host *ri, void *buf);
-- 
cgit v0.10.2


From b5bc9e8b065dbcd4c675e8c158d6e524f221b8e1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 23:31:55 -0400
Subject: [GFS2] split and annotate gfs2_quota

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 32f592f..5db0737 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -144,7 +144,7 @@ void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
 }
 
-void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
+void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
 {
 	const struct gfs2_quota *str = buf;
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a3deae7..e3f5b8d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -743,7 +743,7 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
 	struct gfs2_holder i_gh;
-	struct gfs2_quota q;
+	struct gfs2_quota_host q;
 	char buf[sizeof(struct gfs2_quota)];
 	struct file_ra_state ra_state;
 	int error;
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index b16df6e..431e03b 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -232,6 +232,12 @@ struct gfs2_quota {
 	__u8 qu_reserved[64];
 };
 
+struct gfs2_quota_host {
+	__u64 qu_limit;
+	__u64 qu_warn;
+	__u64 qu_value;
+};
+
 /*
  * dinode structure
  */
@@ -515,8 +521,7 @@ extern void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf);
 extern void gfs2_rindex_out(const struct gfs2_rindex_host *ri, void *buf);
 extern void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf);
 extern void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf);
-extern void gfs2_quota_in(struct gfs2_quota *qu, const void *buf);
-extern void gfs2_quota_out(const struct gfs2_quota *qu, void *buf);
+extern void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf);
 extern void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf);
 extern void gfs2_dinode_out(const struct gfs2_dinode_host *di, void *buf);
 extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
-- 
cgit v0.10.2


From bd209cc017f231e8536550bdab1bf5da93c32798 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 23:43:19 -0400
Subject: [GFS2] split and annotate gfs2_statfs_change

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 20c9b4f..c0a8c3b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -503,8 +503,8 @@ struct gfs2_sbd {
 
 	spinlock_t sd_statfs_spin;
 	struct mutex sd_statfs_mutex;
-	struct gfs2_statfs_change sd_statfs_master;
-	struct gfs2_statfs_change sd_statfs_local;
+	struct gfs2_statfs_change_host sd_statfs_master;
+	struct gfs2_statfs_change_host sd_statfs_local;
 	unsigned long sd_statfs_sync_time;
 
 	/* Resource group stuff */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 5db0737..84c59b1 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -279,7 +279,7 @@ void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
 	str->ir_length = cpu_to_be64(ir->ir_length);
 }
 
-void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
 	const struct gfs2_statfs_change *str = buf;
 
@@ -288,7 +288,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
 	sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
 }
 
-void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf)
+void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
 {
 	struct gfs2_statfs_change *str = buf;
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index b47d959..9c786d1 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -215,7 +215,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_inode->i_sb;
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_statfs_change sc;
+	struct gfs2_statfs_change_host sc;
 	int error;
 
 	if (gfs2_tune_get(sdp, gt_statfs_slow))
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0faf563..0ef8317 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -587,9 +587,9 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 int gfs2_statfs_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-	struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
-	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 	struct buffer_head *m_bh, *l_bh;
 	struct gfs2_holder gh;
 	int error;
@@ -634,7 +634,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 			s64 dinodes)
 {
 	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
-	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 	struct buffer_head *l_bh;
 	int error;
 
@@ -660,8 +660,8 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
-	struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 	struct gfs2_holder gh;
 	struct buffer_head *m_bh, *l_bh;
 	int error;
@@ -727,10 +727,10 @@ out:
  * Returns: errno
  */
 
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
 {
-	struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 
 	spin_lock(&sdp->sd_statfs_spin);
 
@@ -760,7 +760,7 @@ int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
  */
 
 static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
-			    struct gfs2_statfs_change *sc)
+			    struct gfs2_statfs_change_host *sc)
 {
 	gfs2_rgrp_verify(rgd);
 	sc->sc_total += rgd->rd_ri.ri_data;
@@ -782,7 +782,7 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
  * Returns: errno
  */
 
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
 {
 	struct gfs2_holder ri_gh;
 	struct gfs2_rgrpd *rgd_next;
@@ -792,7 +792,7 @@ int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
 	int done;
 	int error = 0, err;
 
-	memset(sc, 0, sizeof(struct gfs2_statfs_change));
+	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
 	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
 	if (!gha)
 		return -ENOMEM;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index ac95064..e590b2d 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -45,8 +45,8 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
 			s64 total, s64 free, s64 dinodes);
 int gfs2_statfs_sync(struct gfs2_sbd *sdp);
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
 
 int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 431e03b..3ce3a47 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -497,6 +497,12 @@ struct gfs2_statfs_change {
 	__be64 sc_dinodes;
 };
 
+struct gfs2_statfs_change_host {
+	__u64 sc_total;
+	__u64 sc_free;
+	__u64 sc_dinodes;
+};
+
 /*
  * Quota change
  * Describes an allocation change for a particular
@@ -529,8 +535,8 @@ extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
 extern void gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf);
 extern void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf);
 extern void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf);
-extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf);
-extern void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf);
+extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf);
 extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf);
 
 /* Printing functions */
-- 
cgit v0.10.2


From b62f963e1fdf838fed91faec21228d421a834f2d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 23:46:46 -0400
Subject: [GFS2] split and annotate gfs2_quota_change

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 84c59b1..2d1682d 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -297,7 +297,7 @@ void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
 	str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
 }
 
-void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf)
+void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
 {
 	const struct gfs2_quota_change *str = buf;
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e3f5b8d..009d86c 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1103,7 +1103,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 
 		for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
 		     y++, slot++) {
-			struct gfs2_quota_change qc;
+			struct gfs2_quota_change_host qc;
 			struct gfs2_quota_data *qd;
 
 			gfs2_quota_change_in(&qc, bh->b_data +
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 3ce3a47..10a507d 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -517,6 +517,12 @@ struct gfs2_quota_change {
 	__be32 qc_id;
 };
 
+struct gfs2_quota_change_host {
+	__u64 qc_change;
+	__u32 qc_flags;	/* GFS2_QCF_... */
+	__u32 qc_id;
+};
+
 #ifdef __KERNEL__
 /* Translation functions */
 
@@ -537,7 +543,7 @@ extern void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
 extern void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf);
 extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf);
 extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf);
-extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf);
+extern void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf);
 
 /* Printing functions */
 
-- 
cgit v0.10.2


From b44b84d765b02f813a67b96bf79e3b5d4d621631 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 14 Oct 2006 10:46:30 -0400
Subject: [GFS2] gfs2 misc endianness annotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 06e9a8c..51f6356 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -38,8 +38,8 @@ struct metapath {
 };
 
 typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
-			     struct buffer_head *bh, u64 *top,
-			     u64 *bottom, unsigned int height,
+			     struct buffer_head *bh, __be64 *top,
+			     __be64 *bottom, unsigned int height,
 			     void *data);
 
 struct strip_mine {
@@ -230,7 +230,7 @@ static int build_height(struct inode *inode, unsigned height)
 	struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
 	struct gfs2_dinode *di;
 	int error;
-	u64 *bp;
+	__be64 *bp;
 	u64 bn;
 	unsigned n;
 
@@ -255,7 +255,7 @@ static int build_height(struct inode *inode, unsigned height)
 					  GFS2_FORMAT_IN);
 			gfs2_buffer_clear_tail(blocks[n],
 					       sizeof(struct gfs2_meta_header));
-			bp = (u64 *)(blocks[n]->b_data +
+			bp = (__be64 *)(blocks[n]->b_data +
 				     sizeof(struct gfs2_meta_header));
 			*bp = cpu_to_be64(blocks[n+1]->b_blocknr);
 			brelse(blocks[n]);
@@ -360,15 +360,15 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
  * metadata tree.
  */
 
-static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
+static inline __be64 *metapointer(struct buffer_head *bh, int *boundary,
 			       unsigned int height, const struct metapath *mp)
 {
 	unsigned int head_size = (height > 0) ?
 		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
-	u64 *ptr;
+	__be64 *ptr;
 	*boundary = 0;
-	ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
-	if (ptr + 1 == (u64 *)(bh->b_data + bh->b_size))
+	ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+	if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size))
 		*boundary = 1;
 	return ptr;
 }
@@ -394,7 +394,7 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
 			int *new, u64 *block)
 {
 	int boundary;
-	u64 *ptr = metapointer(bh, &boundary, height, mp);
+	__be64 *ptr = metapointer(bh, &boundary, height, mp);
 
 	if (*ptr) {
 		*block = be64_to_cpu(*ptr);
@@ -600,7 +600,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *bh = NULL;
-	u64 *top, *bottom;
+	__be64 *top, *bottom;
 	u64 bn;
 	int error;
 	int mh_size = sizeof(struct gfs2_meta_header);
@@ -611,17 +611,17 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
 			return error;
 		dibh = bh;
 
-		top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
-		bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
 	} else {
 		error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
 		if (error)
 			return error;
 
-		top = (u64 *)(bh->b_data + mh_size) +
+		top = (__be64 *)(bh->b_data + mh_size) +
 				  (first ? mp->mp_list[height] : 0);
 
-		bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
 	}
 
 	error = bc(ip, dibh, bh, top, bottom, height, data);
@@ -660,7 +660,7 @@ out:
  */
 
 static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
-		    struct buffer_head *bh, u64 *top, u64 *bottom,
+		    struct buffer_head *bh, __be64 *top, __be64 *bottom,
 		    unsigned int height, void *data)
 {
 	struct strip_mine *sm = data;
@@ -668,7 +668,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	struct gfs2_rgrp_list rlist;
 	u64 bn, bstart;
 	u32 blen;
-	u64 *p;
+	__be64 *p;
 	unsigned int rg_blocks = 0;
 	int metadata;
 	unsigned int revokes = 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index d67a376..59dc823 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -713,12 +713,12 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
 static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
 		       u64 *leaf_out)
 {
-	u64 leaf_no;
+	__be64 leaf_no;
 	int error;
 
 	error = gfs2_dir_read_data(dip, (char *)&leaf_no,
-				    index * sizeof(u64),
-				    sizeof(u64), 0);
+				    index * sizeof(__be64),
+				    sizeof(__be64), 0);
 	if (error != sizeof(u64))
 		return (error < 0) ? error : -EIO;
 
@@ -837,7 +837,8 @@ static int dir_make_exhash(struct inode *inode)
 	struct gfs2_leaf *leaf;
 	int y;
 	u32 x;
-	u64 *lp, bn;
+	__be64 *lp;
+	u64 bn;
 	int error;
 
 	error = gfs2_meta_inode_buffer(dip, &dibh);
@@ -893,7 +894,7 @@ static int dir_make_exhash(struct inode *inode)
 	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
-	lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
+	lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
 
 	for (x = sdp->sd_hash_ptrs; x--; lp++)
 		*lp = cpu_to_be64(bn);
@@ -929,7 +930,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	struct gfs2_leaf *nleaf, *oleaf;
 	struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
 	u32 start, len, half_len, divider;
-	u64 bn, *lp, leaf_no;
+	u64 bn, leaf_no;
+	__be64 *lp;
 	u32 index;
 	int x, moved = 0;
 	int error;
@@ -974,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	/* Change the pointers.
 	   Don't bother distinguishing stuffed from non-stuffed.
 	   This code is complicated enough already. */
-	lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
+	lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL);
 	/*  Change the pointers  */
 	for (x = 0; x < half_len; x++)
 		lp[x] = cpu_to_be64(bn);
@@ -1341,7 +1343,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	u32 hsize, len = 0;
 	u32 ht_offset, lp_offset, ht_offset_cur = -1;
 	u32 hash, index;
-	u64 *lp;
+	__be64 *lp;
 	int copied = 0;
 	int error = 0;
 	unsigned depth = 0;
@@ -1365,7 +1367,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 
 		if (ht_offset_cur != ht_offset) {
 			error = gfs2_dir_read_data(dip, (char *)lp,
-						ht_offset * sizeof(u64),
+						ht_offset * sizeof(__be64),
 						sdp->sd_hash_bsize, 1);
 			if (error != sdp->sd_hash_bsize) {
 				if (error >= 0)
@@ -1715,7 +1717,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 	u32 hsize, len;
 	u32 ht_offset, lp_offset, ht_offset_cur = -1;
 	u32 index = 0;
-	u64 *lp;
+	__be64 *lp;
 	u64 leaf_no;
 	int error = 0;
 
@@ -1735,7 +1737,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 
 		if (ht_offset_cur != ht_offset) {
 			error = gfs2_dir_read_data(dip, (char *)lp,
-						ht_offset * sizeof(u64),
+						ht_offset * sizeof(__be64),
 						sdp->sd_hash_bsize, 1);
 			if (error != sdp->sd_hash_bsize) {
 				if (error >= 0)
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index a65a4cc..518f0c0 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -112,7 +112,7 @@ fail:
 static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 {
 	struct buffer_head *bh, *eabh;
-	u64 *eablk, *end;
+	__be64 *eablk, *end;
 	int error;
 
 	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
@@ -129,7 +129,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 		goto out;
 	}
 
-	eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+	eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
 	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
 
 	for (; eablk < end; eablk++) {
@@ -224,7 +224,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 	struct gfs2_rgrpd *rgd;
 	struct gfs2_holder rg_gh;
 	struct buffer_head *dibh;
-	u64 *dataptrs, bn = 0;
+	__be64 *dataptrs;
+	u64 bn = 0;
 	u64 bstart = 0;
 	unsigned int blen = 0;
 	unsigned int blks = 0;
@@ -444,7 +445,7 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 	struct buffer_head **bh;
 	unsigned int amount = GFS2_EA_DATA_LEN(ea);
 	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
 	unsigned int x;
 	int error = 0;
 
@@ -629,7 +630,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 		ea->ea_num_ptrs = 0;
 		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
 	} else {
-		u64 *dataptr = GFS2_EA2DATAPTRS(ea);
+		__be64 *dataptr = GFS2_EA2DATAPTRS(ea);
 		const char *data = er->er_data;
 		unsigned int data_len = er->er_data_len;
 		unsigned int copy;
@@ -931,12 +932,12 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *indbh, *newbh;
-	u64 *eablk;
+	__be64 *eablk;
 	int error;
 	int mh_size = sizeof(struct gfs2_meta_header);
 
 	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
-		u64 *end;
+		__be64 *end;
 
 		error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
 				       &indbh);
@@ -948,7 +949,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 			goto out;
 		}
 
-		eablk = (u64 *)(indbh->b_data + mh_size);
+		eablk = (__be64 *)(indbh->b_data + mh_size);
 		end = eablk + sdp->sd_inptrs;
 
 		for (; eablk < end; eablk++)
@@ -971,7 +972,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 		gfs2_buffer_clear_tail(indbh, mh_size);
 
-		eablk = (u64 *)(indbh->b_data + mh_size);
+		eablk = (__be64 *)(indbh->b_data + mh_size);
 		*eablk = cpu_to_be64(ip->i_di.di_eattr);
 		ip->i_di.di_eattr = blk;
 		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
@@ -1202,7 +1203,7 @@ static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
 	struct buffer_head **bh;
 	unsigned int amount = GFS2_EA_DATA_LEN(ea);
 	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
 	unsigned int x;
 	int error;
 
@@ -1300,7 +1301,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrp_list rlist;
 	struct buffer_head *indbh, *dibh;
-	u64 *eablk, *end;
+	__be64 *eablk, *end;
 	unsigned int rg_blocks = 0;
 	u64 bstart = 0;
 	unsigned int blen = 0;
@@ -1319,7 +1320,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 		goto out;
 	}
 
-	eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
 	end = eablk + sdp->sd_inptrs;
 
 	for (; eablk < end; eablk++) {
@@ -1363,7 +1364,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	gfs2_trans_add_bh(ip->i_gl, indbh, 1);
 
-	eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
 	bstart = 0;
 	blen = 0;
 
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
index ffa6594..c82dbe0 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/eattr.h
@@ -19,7 +19,7 @@ struct iattr;
 #define GFS2_EA_SIZE(ea) \
 ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
       ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
-                                  (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
+                                  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
 
 #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
 #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -29,13 +29,13 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
 
 #define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
 ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
-      sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
+      sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
 
 #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
 #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
 
 #define GFS2_EA2DATAPTRS(ea) \
-((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
 
 #define GFS2_EA2NEXT(ea) \
 ((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index dadd1f3..fb96930 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -500,21 +500,22 @@ static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
 	if (!ir.ir_length) {
 		struct buffer_head *m_bh;
 		u64 x, y;
+		__be64 z;
 
 		error = gfs2_meta_inode_buffer(m_ip, &m_bh);
 		if (error)
 			goto out_brelse;
 
-		x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
-		x = y = be64_to_cpu(x);
+		z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
+		x = y = be64_to_cpu(z);
 		ir.ir_start = x;
 		ir.ir_length = GFS2_INUM_QUANTUM;
 		x += GFS2_INUM_QUANTUM;
 		if (x < y)
 			gfs2_consist_inode(m_ip);
-		x = cpu_to_be64(x);
+		z = cpu_to_be64(x);
 		gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
-		*(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
+		*(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
 
 		brelse(m_bh);
 	}
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 33423a5..b4e7b87 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -27,13 +27,14 @@
 #include "util.h"
 
 static struct dentry *gfs2_decode_fh(struct super_block *sb,
-				     __u32 *fh,
+				     __u32 *p,
 				     int fh_len,
 				     int fh_type,
 				     int (*acceptable)(void *context,
 						       struct dentry *dentry),
 				     void *context)
 {
+	__be32 *fh = (__force __be32 *)p;
 	struct gfs2_fh_obj fh_obj;
 	struct gfs2_inum_host *this, parent;
 
@@ -65,9 +66,10 @@ static struct dentry *gfs2_decode_fh(struct super_block *sb,
 						    acceptable, context);
 }
 
-static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
+static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
 			  int connectable)
 {
+	__be32 *fh = (__force __be32 *)p;
 	struct inode *inode = dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -76,14 +78,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 	    (connectable && *len < GFS2_LARGE_FH_SIZE))
 		return 255;
 
-	fh[0] = ip->i_num.no_formal_ino >> 32;
-	fh[0] = cpu_to_be32(fh[0]);
-	fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
-	fh[1] = cpu_to_be32(fh[1]);
-	fh[2] = ip->i_num.no_addr >> 32;
-	fh[2] = cpu_to_be32(fh[2]);
-	fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
-	fh[3] = cpu_to_be32(fh[3]);
+	fh[0] = cpu_to_be32(ip->i_num.no_formal_ino >> 32);
+	fh[1] = cpu_to_be32(ip->i_num.no_formal_ino & 0xFFFFFFFF);
+	fh[2] = cpu_to_be32(ip->i_num.no_addr >> 32);
+	fh[3] = cpu_to_be32(ip->i_num.no_addr & 0xFFFFFFFF);
 	*len = GFS2_SMALL_FH_SIZE;
 
 	if (!connectable || inode == sb->s_root->d_inode)
@@ -95,14 +93,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 	igrab(inode);
 	spin_unlock(&dentry->d_lock);
 
-	fh[4] = ip->i_num.no_formal_ino >> 32;
-	fh[4] = cpu_to_be32(fh[4]);
-	fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
-	fh[5] = cpu_to_be32(fh[5]);
-	fh[6] = ip->i_num.no_addr >> 32;
-	fh[6] = cpu_to_be32(fh[6]);
-	fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
-	fh[7] = cpu_to_be32(fh[7]);
+	fh[4] = cpu_to_be32(ip->i_num.no_formal_ino >> 32);
+	fh[5] = cpu_to_be32(ip->i_num.no_formal_ino & 0xFFFFFFFF);
+	fh[6] = cpu_to_be32(ip->i_num.no_addr >> 32);
+	fh[7] = cpu_to_be32(ip->i_num.no_addr & 0xFFFFFFFF);
 
 	fh[8]  = cpu_to_be32(inode->i_mode);
 	fh[9]  = 0;	/* pad to double word */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 009d86c..5d00e9b 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -539,8 +539,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 		qc->qc_id = cpu_to_be32(qd->qd_id);
 	}
 
-	x = qc->qc_change;
-	x = be64_to_cpu(x) + change;
+	x = be64_to_cpu(qc->qc_change) + change;
 	qc->qc_change = cpu_to_be64(x);
 
 	spin_lock(&sdp->sd_quota_spin);
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 76a5089..ca8667c 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -83,8 +83,7 @@ static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
 				    char *file, unsigned int line)
 {
 	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
-	u32 magic = mh->mh_magic;
-	magic = be32_to_cpu(magic);
+	u32 magic = be32_to_cpu(mh->mh_magic);
 	if (unlikely(magic != GFS2_MAGIC))
 		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
 					  file, line);
@@ -107,9 +106,8 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
 					char *file, unsigned int line)
 {
 	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
-	u32 magic = mh->mh_magic;
+	u32 magic = be32_to_cpu(mh->mh_magic);
 	u16 t = be32_to_cpu(mh->mh_type);
-	magic = be32_to_cpu(magic);
 	if (unlikely(magic != GFS2_MAGIC))
 		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
 					  file, line);
-- 
cgit v0.10.2


From 9c9ab3d5414653bfe5e5b9f4dfdaab0c6ab17196 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Oct 2006 23:49:23 -0400
Subject: [GFS2] gfs2 __user misannotation fix

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 359965c..2fc8868 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -71,7 +71,7 @@ static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
 		size = count;
 
 	kaddr = kmap(page);
-	memcpy(desc->arg.buf, kaddr + offset, size);
+	memcpy(desc->arg.data, kaddr + offset, size);
 	kunmap(page);
 
 	desc->count = count - size;
@@ -86,7 +86,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
 	struct inode *inode = &ip->i_inode;
 	read_descriptor_t desc;
 	desc.written = 0;
-	desc.arg.buf = buf;
+	desc.arg.data = buf;
 	desc.count = size;
 	desc.error = 0;
 	do_generic_mapping_read(inode->i_mapping, ra_state,
-- 
cgit v0.10.2


From 539e5d6b7ae8612c0393fe940d2da5b591318d3d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 31 Oct 2006 15:07:05 -0500
Subject: [GFS2] Change argument of gfs2_dinode_out

Everywhere this was called, a struct gfs2_inode was available,
but despite that, it was always called with a struct gfs2_dinode
as an argument. By making this change it paves the way to start
eliminating fields duplicated between the kernel's struct inode
and the struct gfs2_dinode.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 5f959b8..906e403 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -201,7 +201,7 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
 				(ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
 		ip->i_di.di_mode = mode;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 51f6356..8c092ab 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -498,7 +498,7 @@ static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
 			error = gfs2_meta_inode_buffer(ip, &dibh);
 			if (!error) {
 				gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-				gfs2_dinode_out(&ip->i_di, dibh->b_data);
+				gfs2_dinode_out(ip, dibh->b_data);
 				brelse(dibh);
 			}
 			set_buffer_new(bh_map);
@@ -780,7 +780,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
 
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 
 	up_write(&ip->i_rw_mutex);
 
@@ -860,7 +860,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 		goto out_end_trans;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
 out_end_trans:
@@ -970,7 +970,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 		ip->i_di.di_size = size;
 		ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
 		error = 1;
 
@@ -983,7 +983,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 			ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
 			ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
 			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-			gfs2_dinode_out(&ip->i_di, dibh->b_data);
+			gfs2_dinode_out(ip, dibh->b_data);
 		}
 	}
 
@@ -1057,7 +1057,7 @@ static int trunc_end(struct gfs2_inode *ip)
 	ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
 out:
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 59dc823..0742761 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -132,7 +132,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 	if (ip->i_di.di_size < offset + size)
 		ip->i_di.di_size = offset + size;
 	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 
 	brelse(dibh);
 
@@ -232,7 +232,7 @@ out:
 	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
 	return copied;
@@ -907,7 +907,7 @@ static int dir_make_exhash(struct inode *inode)
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
 	dip->i_di.di_depth = y;
 
-	gfs2_dinode_out(&dip->i_di, dibh->b_data);
+	gfs2_dinode_out(dip, dibh->b_data);
 
 	brelse(dibh);
 
@@ -1039,7 +1039,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
 		dip->i_di.di_blocks++;
-		gfs2_dinode_out(&dip->i_di, dibh->b_data);
+		gfs2_dinode_out(dip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -1119,7 +1119,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(sdp, !error)) {
 		dip->i_di.di_depth++;
-		gfs2_dinode_out(&dip->i_di, dibh->b_data);
+		gfs2_dinode_out(dip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -1517,7 +1517,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 		return error;
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
 	ip->i_di.di_blocks++;
-	gfs2_dinode_out(&ip->i_di, bh->b_data);
+	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	return 0;
 }
@@ -1561,7 +1561,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
 			ip->i_di.di_entries++;
 			ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
-			gfs2_dinode_out(&ip->i_di, bh->b_data);
+			gfs2_dinode_out(ip, bh->b_data);
 			brelse(bh);
 			error = 0;
 			break;
@@ -1647,7 +1647,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
 	dip->i_di.di_entries--;
 	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
-	gfs2_dinode_out(&dip->i_di, bh->b_data);
+	gfs2_dinode_out(dip, bh->b_data);
 	brelse(bh);
 	mark_inode_dirty(&dip->i_inode);
 
@@ -1695,7 +1695,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 	}
 
 	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
-	gfs2_dinode_out(&dip->i_di, bh->b_data);
+	gfs2_dinode_out(dip, bh->b_data);
 	brelse(bh);
 	return 0;
 }
@@ -1875,7 +1875,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		goto out_end_trans;
 
 	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
-	gfs2_dinode_out(&dip->i_di, dibh->b_data);
+	gfs2_dinode_out(dip, dibh->b_data);
 	brelse(dibh);
 
 out_end_trans:
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 518f0c0..9b7bb56 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -302,7 +302,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 	if (!error) {
 		ip->i_di.di_ctime = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -717,7 +717,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		}
 		ip->i_di.di_ctime = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -852,7 +852,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 	}
 	ip->i_di.di_ctime = get_seconds();
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 out:
 	gfs2_trans_end(GFS2_SB(&ip->i_inode));
@@ -1132,7 +1132,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 	if (!error) {
 		ip->i_di.di_ctime = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -1287,7 +1287,7 @@ int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
 		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
 		gfs2_inode_attr_out(ip);
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -1397,7 +1397,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -1446,7 +1446,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fb96930..b861ddb 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -338,7 +338,7 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 	ip->i_inode.i_nlink = nlink;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 	mark_inode_dirty(&ip->i_inode);
 
@@ -792,7 +792,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 		goto fail_end_trans;
 	ip->i_di.di_nlink = 1;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 	return 0;
 
@@ -1349,7 +1349,7 @@ __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 		gfs2_inode_attr_out(ip);
 
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 	return error;
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 2d1682d..2c50fa0 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -15,6 +15,8 @@
 
 #include "gfs2.h"
 #include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "incore.h"
 
 #define pv(struct, member, fmt) printk(KERN_INFO "  "#member" = "fmt"\n", \
 				       struct->member);
@@ -187,8 +189,9 @@ void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf)
 
 }
 
-void gfs2_dinode_out(const struct gfs2_dinode_host *di, void *buf)
+void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 {
+	const struct gfs2_dinode_host *di = &ip->i_di;
 	struct gfs2_dinode *str = buf;
 
 	gfs2_meta_header_out(&di->di_header, buf);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 2fc8868..7ea4175 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -336,7 +336,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 		goto out_trans_end;
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
 	ip->i_di.di_flags = new_flags;
-	gfs2_dinode_out(&ip->i_di, bh->b_data);
+	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 out_trans_end:
 	gfs2_trans_end(sdp);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index ef6e5ed..bd26885 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -339,7 +339,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
 	if (!gfs2_assert_withdraw(sdp, !error)) {
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
 		       size);
 		brelse(dibh);
@@ -414,7 +414,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		gfs2_inum_out(&dip->i_num, &dent->de_inum);
 		dent->de_type = cpu_to_be16(DT_DIR);
 
-		gfs2_dinode_out(&ip->i_di, di);
+		gfs2_dinode_out(ip, di);
 
 		brelse(dibh);
 	}
@@ -541,7 +541,7 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
 	if (!gfs2_assert_withdraw(sdp, !error)) {
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -762,7 +762,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			goto out_end_trans;
 		ip->i_di.di_ctime = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
 
@@ -949,7 +949,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	gfs2_inode_attr_out(ip);
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
 	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 10a507d..550effa 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -535,7 +535,8 @@ extern void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf);
 extern void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf);
 extern void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf);
 extern void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf);
-extern void gfs2_dinode_out(const struct gfs2_dinode_host *di, void *buf);
+struct gfs2_inode;
+extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
 extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
 extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
 extern void gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf);
-- 
cgit v0.10.2


From 891ea14712da68e282de8583e5fa14f0d3f3731e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 31 Oct 2006 15:22:10 -0500
Subject: [GFS2] Change argument to gfs2_dinode_in

This is a preliminary patch to enable the removal of fields
in gfs2_dinode_host which are duplicated in struct inode.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b861ddb..9875e93 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -229,7 +229,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 		return -EIO;
 	}
 
-	gfs2_dinode_in(&ip->i_di, dibh->b_data);
+	gfs2_dinode_in(ip, dibh->b_data);
 
 	brelse(dibh);
 
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 2c50fa0..edf8756 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -155,8 +155,9 @@ void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
 	qu->qu_value = be64_to_cpu(str->qu_value);
 }
 
-void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf)
+void gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
+	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
 
 	gfs2_meta_header_in(&di->di_header, buf);
@@ -186,7 +187,6 @@ void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf)
 	di->di_entries = be32_to_cpu(str->di_entries);
 
 	di->di_eattr = be64_to_cpu(str->di_eattr);
-
 }
 
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 550effa..08d8240 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -534,8 +534,8 @@ extern void gfs2_rindex_out(const struct gfs2_rindex_host *ri, void *buf);
 extern void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf);
 extern void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf);
 extern void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf);
-extern void gfs2_dinode_in(struct gfs2_dinode_host *di, const void *buf);
 struct gfs2_inode;
+extern void gfs2_dinode_in(struct gfs2_inode *ip, const void *buf);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
 extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
 extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
-- 
cgit v0.10.2


From ea744d01c6a5acf1f6171b4c6e1658a742063613 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 31 Oct 2006 15:28:00 -0500
Subject: [GFS2] Move gfs2_dinode_in to inode.c

gfs2_dinode_in() is only ever called from one place, so move it
to that place (in inode.c) and make it static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9875e93..b39cfcf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -208,6 +208,40 @@ fail:
 	return ERR_PTR(error);
 }
 
+static void gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
+{
+	struct gfs2_dinode_host *di = &ip->i_di;
+	const struct gfs2_dinode *str = buf;
+
+	gfs2_meta_header_in(&di->di_header, buf);
+	gfs2_inum_in(&di->di_num, &str->di_num);
+
+	di->di_mode = be32_to_cpu(str->di_mode);
+	di->di_uid = be32_to_cpu(str->di_uid);
+	di->di_gid = be32_to_cpu(str->di_gid);
+	di->di_nlink = be32_to_cpu(str->di_nlink);
+	di->di_size = be64_to_cpu(str->di_size);
+	di->di_blocks = be64_to_cpu(str->di_blocks);
+	di->di_atime = be64_to_cpu(str->di_atime);
+	di->di_mtime = be64_to_cpu(str->di_mtime);
+	di->di_ctime = be64_to_cpu(str->di_ctime);
+	di->di_major = be32_to_cpu(str->di_major);
+	di->di_minor = be32_to_cpu(str->di_minor);
+
+	di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
+	di->di_goal_data = be64_to_cpu(str->di_goal_data);
+	di->di_generation = be64_to_cpu(str->di_generation);
+
+	di->di_flags = be32_to_cpu(str->di_flags);
+	di->di_payload_format = be32_to_cpu(str->di_payload_format);
+	di->di_height = be16_to_cpu(str->di_height);
+
+	di->di_depth = be16_to_cpu(str->di_depth);
+	di->di_entries = be32_to_cpu(str->di_entries);
+
+	di->di_eattr = be64_to_cpu(str->di_eattr);
+}
+
 /**
  * gfs2_inode_refresh - Refresh the incore copy of the dinode
  * @ip: The GFS2 inode
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index edf8756..062a44f 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -56,7 +56,7 @@ static void gfs2_inum_print(const struct gfs2_inum_host *no)
 	printk(KERN_INFO "  no_addr = %llu\n", (unsigned long long)no->no_addr);
 }
 
-static void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf)
+void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf)
 {
 	const struct gfs2_meta_header *str = buf;
 
@@ -155,40 +155,6 @@ void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
 	qu->qu_value = be64_to_cpu(str->qu_value);
 }
 
-void gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
-{
-	struct gfs2_dinode_host *di = &ip->i_di;
-	const struct gfs2_dinode *str = buf;
-
-	gfs2_meta_header_in(&di->di_header, buf);
-	gfs2_inum_in(&di->di_num, &str->di_num);
-
-	di->di_mode = be32_to_cpu(str->di_mode);
-	di->di_uid = be32_to_cpu(str->di_uid);
-	di->di_gid = be32_to_cpu(str->di_gid);
-	di->di_nlink = be32_to_cpu(str->di_nlink);
-	di->di_size = be64_to_cpu(str->di_size);
-	di->di_blocks = be64_to_cpu(str->di_blocks);
-	di->di_atime = be64_to_cpu(str->di_atime);
-	di->di_mtime = be64_to_cpu(str->di_mtime);
-	di->di_ctime = be64_to_cpu(str->di_ctime);
-	di->di_major = be32_to_cpu(str->di_major);
-	di->di_minor = be32_to_cpu(str->di_minor);
-
-	di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
-	di->di_goal_data = be64_to_cpu(str->di_goal_data);
-	di->di_generation = be64_to_cpu(str->di_generation);
-
-	di->di_flags = be32_to_cpu(str->di_flags);
-	di->di_payload_format = be32_to_cpu(str->di_payload_format);
-	di->di_height = be16_to_cpu(str->di_height);
-
-	di->di_depth = be16_to_cpu(str->di_depth);
-	di->di_entries = be32_to_cpu(str->di_entries);
-
-	di->di_eattr = be64_to_cpu(str->di_eattr);
-}
-
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 {
 	const struct gfs2_dinode_host *di = &ip->i_di;
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 08d8240..4fc297a 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -528,6 +528,7 @@ struct gfs2_quota_change_host {
 
 extern void gfs2_inum_in(struct gfs2_inum_host *no, const void *buf);
 extern void gfs2_inum_out(const struct gfs2_inum_host *no, void *buf);
+extern void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf);
 extern void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf);
 extern void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf);
 extern void gfs2_rindex_out(const struct gfs2_rindex_host *ri, void *buf);
@@ -535,7 +536,6 @@ extern void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf);
 extern void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf);
 extern void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf);
 struct gfs2_inode;
-extern void gfs2_dinode_in(struct gfs2_inode *ip, const void *buf);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
 extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
 extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
-- 
cgit v0.10.2


From 4cc14f0b88bf3e0b508143e091eb5a8dff3e3b9c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 31 Oct 2006 19:00:24 -0500
Subject: [GFS2] Change argument to gfs2_dinode_print

Change argument for gfs2_dinode_print in order to prepare
for removal of duplicate fields between struct inode and
struct gfs2_dinode_host.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b39cfcf..4c5d286 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -269,7 +269,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 
 	if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
 		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(&ip->i_di);
+			gfs2_dinode_print(ip);
 		return -EIO;
 	}
 	if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
@@ -289,7 +289,7 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 
 	if (ip->i_di.di_blocks != 1) {
 		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(&ip->i_di);
+			gfs2_dinode_print(ip);
 		return -EIO;
 	}
 
@@ -359,7 +359,7 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 	   bigger than the old one, we must have underflowed. */
 	if (diff < 0 && nlink > ip->i_di.di_nlink) {
 		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(&ip->i_di);
+			gfs2_dinode_print(ip);
 		return -EIO;
 	}
 
@@ -1010,7 +1010,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 
 	if (ip->i_di.di_entries != 2) {
 		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(&ip->i_di);
+			gfs2_dinode_print(ip);
 		return -EIO;
 	}
 
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 062a44f..77bed44 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -190,8 +190,10 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 }
 
-void gfs2_dinode_print(const struct gfs2_dinode_host *di)
+void gfs2_dinode_print(const struct gfs2_inode *ip)
 {
+	const struct gfs2_dinode_host *di = &ip->i_di;
+
 	gfs2_meta_header_print(&di->di_header);
 	gfs2_inum_print(&di->di_num);
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index bd26885..b2c2fe6 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -467,7 +467,7 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 
 	if (ip->i_di.di_entries < 2) {
 		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(&ip->i_di);
+			gfs2_dinode_print(ip);
 		error = -EIO;
 		goto out_gunlock;
 	}
@@ -640,7 +640,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (S_ISDIR(nip->i_di.di_mode)) {
 			if (nip->i_di.di_entries < 2) {
 				if (gfs2_consist_inode(nip))
-					gfs2_dinode_print(&nip->i_di);
+					gfs2_dinode_print(nip);
 				error = -EIO;
 				goto out_gunlock;
 			}
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 4fc297a..cf4c655 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -549,7 +549,7 @@ extern void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 /* Printing functions */
 
 extern void gfs2_rindex_print(const struct gfs2_rindex_host *ri);
-extern void gfs2_dinode_print(const struct gfs2_dinode_host *di);
+extern void gfs2_dinode_print(const struct gfs2_inode *ip);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v0.10.2


From af339c0241d0dd3b35f9097b4f4999bb22ffe502 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Nov 2006 10:34:15 -0500
Subject: [GFS2] Shrink gfs2_inode (1) - di_header/di_num

The metadata header doesn't need to be stored in the incore
struct gfs2_inode since its constant, and this patch removes it.
Also, there is already a field for the inode's number in the
struct gfs2_inode, so we don't need one in struct gfs2_dinode_host
as well.

This saves 28 bytes of space in the struct gfs2_inode.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 4c5d286..7ba05fc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -208,13 +208,18 @@ fail:
 	return ERR_PTR(error);
 }
 
-static void gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
+static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
 	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
 
-	gfs2_meta_header_in(&di->di_header, buf);
-	gfs2_inum_in(&di->di_num, &str->di_num);
+	if (ip->i_num.no_addr != be64_to_cpu(str->di_num.no_addr)) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(ip);
+		return -EIO;
+	}
+	if (ip->i_num.no_formal_ino != be64_to_cpu(str->di_num.no_formal_ino))
+		return -ESTALE;
 
 	di->di_mode = be32_to_cpu(str->di_mode);
 	di->di_uid = be32_to_cpu(str->di_uid);
@@ -240,6 +245,7 @@ static void gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	di->di_entries = be32_to_cpu(str->di_entries);
 
 	di->di_eattr = be64_to_cpu(str->di_eattr);
+	return 0;
 }
 
 /**
@@ -263,21 +269,12 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 		return -EIO;
 	}
 
-	gfs2_dinode_in(ip, dibh->b_data);
+	error = gfs2_dinode_in(ip, dibh->b_data);
 
 	brelse(dibh);
-
-	if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		return -EIO;
-	}
-	if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
-		return -ESTALE;
-
 	ip->i_vn = ip->i_gl->gl_vn;
 
-	return 0;
+	return error;
 }
 
 int gfs2_dinode_dealloc(struct gfs2_inode *ip)
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 77bed44..4bc590e 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -56,7 +56,7 @@ static void gfs2_inum_print(const struct gfs2_inum_host *no)
 	printk(KERN_INFO "  no_addr = %llu\n", (unsigned long long)no->no_addr);
 }
 
-void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf)
+static void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf)
 {
 	const struct gfs2_meta_header *str = buf;
 
@@ -74,13 +74,6 @@ static void gfs2_meta_header_out(const struct gfs2_meta_header_host *mh, void *b
 	str->mh_format = cpu_to_be32(mh->mh_format);
 }
 
-static void gfs2_meta_header_print(const struct gfs2_meta_header_host *mh)
-{
-	pv(mh, mh_magic, "0x%.8X");
-	pv(mh, mh_type, "%u");
-	pv(mh, mh_format, "%u");
-}
-
 void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
 {
 	const struct gfs2_sb *str = buf;
@@ -160,8 +153,13 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	const struct gfs2_dinode_host *di = &ip->i_di;
 	struct gfs2_dinode *str = buf;
 
-	gfs2_meta_header_out(&di->di_header, buf);
-	gfs2_inum_out(&di->di_num, (char *)&str->di_num);
+	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
+	str->di_header.__pad0 = 0;
+	str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
+	str->di_header.__pad1 = 0;
+
+	gfs2_inum_out(&ip->i_num, &str->di_num);
 
 	str->di_mode = cpu_to_be32(di->di_mode);
 	str->di_uid = cpu_to_be32(di->di_uid);
@@ -187,15 +185,13 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_entries = cpu_to_be32(di->di_entries);
 
 	str->di_eattr = cpu_to_be64(di->di_eattr);
-
 }
 
 void gfs2_dinode_print(const struct gfs2_inode *ip)
 {
 	const struct gfs2_dinode_host *di = &ip->i_di;
 
-	gfs2_meta_header_print(&di->di_header);
-	gfs2_inum_print(&di->di_num);
+	gfs2_inum_print(&ip->i_num);
 
 	pv(di, di_mode, "0%o");
 	pv(di, di_uid, "%u");
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index cf4c655..c0e76fc 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -322,10 +322,6 @@ struct gfs2_dinode {
 };
 
 struct gfs2_dinode_host {
-	struct gfs2_meta_header_host di_header;
-
-	struct gfs2_inum_host di_num;
-
 	__u32 di_mode;	/* mode of file */
 	__u32 di_uid;	/* owner's user id */
 	__u32 di_gid;	/* owner's group id */
@@ -528,7 +524,6 @@ struct gfs2_quota_change_host {
 
 extern void gfs2_inum_in(struct gfs2_inum_host *no, const void *buf);
 extern void gfs2_inum_out(const struct gfs2_inum_host *no, void *buf);
-extern void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf);
 extern void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf);
 extern void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf);
 extern void gfs2_rindex_out(const struct gfs2_rindex_host *ri, void *buf);
-- 
cgit v0.10.2


From e7f14f4d094ea1a9ce1953375f5bc1500c760c79 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 31 Oct 2006 21:45:08 -0500
Subject: [GFS2] Shrink gfs2_inode (2) - di_major/di_minor

This removes the device numbers from this structure by using
inode->i_rdev instead. It also cleans up the code in gfs2_mknod.
It results in shrinking the gfs2_inode by 8 bytes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7ba05fc..a995919 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -51,17 +51,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 	struct gfs2_dinode_host *di = &ip->i_di;
 
 	inode->i_ino = ip->i_num.no_addr;
-
-	switch (di->di_mode & S_IFMT) {
-	case S_IFBLK:
-	case S_IFCHR:
-		inode->i_rdev = MKDEV(di->di_major, di->di_minor);
-		break;
-	default:
-		inode->i_rdev = 0;
-		break;
-	};
-
 	inode->i_mode = di->di_mode;
 	inode->i_nlink = di->di_nlink;
 	inode->i_uid = di->di_uid;
@@ -222,6 +211,15 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 		return -ESTALE;
 
 	di->di_mode = be32_to_cpu(str->di_mode);
+	ip->i_inode.i_rdev = 0;
+	switch (di->di_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
+					   be32_to_cpu(str->di_minor));
+		break;
+	};
+
 	di->di_uid = be32_to_cpu(str->di_uid);
 	di->di_gid = be32_to_cpu(str->di_gid);
 	di->di_nlink = be32_to_cpu(str->di_nlink);
@@ -230,8 +228,6 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	di->di_atime = be64_to_cpu(str->di_atime);
 	di->di_mtime = be64_to_cpu(str->di_mtime);
 	di->di_ctime = be64_to_cpu(str->di_ctime);
-	di->di_major = be32_to_cpu(str->di_major);
-	di->di_minor = be32_to_cpu(str->di_minor);
 
 	di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
 	di->di_goal_data = be64_to_cpu(str->di_goal_data);
@@ -270,7 +266,6 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 	}
 
 	error = gfs2_dinode_in(ip, dibh->b_data);
-
 	brelse(dibh);
 	ip->i_vn = ip->i_gl->gl_vn;
 
@@ -684,7 +679,7 @@ out:
 static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 			const struct gfs2_inum_host *inum, unsigned int mode,
 			unsigned int uid, unsigned int gid,
-			const u64 *generation)
+			const u64 *generation, dev_t dev)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_dinode *di;
@@ -705,7 +700,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_size = cpu_to_be64(0);
 	di->di_blocks = cpu_to_be64(1);
 	di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
-	di->di_major = di->di_minor = cpu_to_be32(0);
+	di->di_major = cpu_to_be32(MAJOR(dev));
+	di->di_minor = cpu_to_be32(MINOR(dev));
 	di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
 	di->di_generation = cpu_to_be64(*generation);
 	di->di_flags = cpu_to_be32(0);
@@ -740,7 +736,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 
 static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 		       unsigned int mode, const struct gfs2_inum_host *inum,
-		       const u64 *generation)
+		       const u64 *generation, dev_t dev)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	unsigned int uid, gid;
@@ -761,7 +757,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	if (error)
 		goto out_quota;
 
-	init_dinode(dip, gl, inum, mode, uid, gid, generation);
+	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev);
 	gfs2_quota_change(dip, +1, uid, gid);
 	gfs2_trans_end(sdp);
 
@@ -892,7 +888,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
  */
 
 struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
-			   unsigned int mode)
+			   unsigned int mode, dev_t dev)
 {
 	struct inode *inode;
 	struct gfs2_inode *dip = ghs->gh_gl->gl_object;
@@ -950,7 +946,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 			goto fail_gunlock;
 	}
 
-	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
+	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev);
 	if (error)
 		goto fail_gunlock2;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d699b92..33c9ea6 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -37,7 +37,7 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
 struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 			   int is_root, struct nameidata *nd);
 struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
-			   unsigned int mode);
+			   unsigned int mode, dev_t dev);
 int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 		struct gfs2_inode *ip);
 int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 4bc590e..60dd943 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -170,8 +170,6 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_atime = cpu_to_be64(di->di_atime);
 	str->di_mtime = cpu_to_be64(di->di_mtime);
 	str->di_ctime = cpu_to_be64(di->di_ctime);
-	str->di_major = cpu_to_be32(di->di_major);
-	str->di_minor = cpu_to_be32(di->di_minor);
 
 	str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
 	str->di_goal_data = cpu_to_be64(di->di_goal_data);
@@ -202,8 +200,6 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  di_atime = %lld\n", (long long)di->di_atime);
 	printk(KERN_INFO "  di_mtime = %lld\n", (long long)di->di_mtime);
 	printk(KERN_INFO "  di_ctime = %lld\n", (long long)di->di_ctime);
-	pv(di, di_major, "%u");
-	pv(di, di_minor, "%u");
 
 	printk(KERN_INFO "  di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
 	printk(KERN_INFO "  di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index b2c2fe6..c10b914 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -59,7 +59,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
 	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
 
 	for (;;) {
-		inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
+		inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
 		if (!IS_ERR(inode)) {
 			gfs2_trans_end(sdp);
 			if (dip->i_alloc.al_rgd)
@@ -326,7 +326,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
 
-	inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
+	inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO, 0);
 	if (IS_ERR(inode)) {
 		gfs2_holder_uninit(ghs);
 		return PTR_ERR(inode);
@@ -379,7 +379,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
 
-	inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
+	inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode, 0);
 	if (IS_ERR(inode)) {
 		gfs2_holder_uninit(ghs);
 		return PTR_ERR(inode);
@@ -504,47 +504,19 @@ out:
 static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		      dev_t dev)
 {
-	struct gfs2_inode *dip = GFS2_I(dir), *ip;
+	struct gfs2_inode *dip = GFS2_I(dir);
 	struct gfs2_sbd *sdp = GFS2_SB(dir);
 	struct gfs2_holder ghs[2];
 	struct inode *inode;
-	struct buffer_head *dibh;
-	u32 major = 0, minor = 0;
-	int error;
-
-	switch (mode & S_IFMT) {
-	case S_IFBLK:
-	case S_IFCHR:
-		major = MAJOR(dev);
-		minor = MINOR(dev);
-		break;
-	case S_IFIFO:
-	case S_IFSOCK:
-		break;
-	default:
-		return -EOPNOTSUPP;
-	};
 
 	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
 
-	inode = gfs2_createi(ghs, &dentry->d_name, mode);
+	inode = gfs2_createi(ghs, &dentry->d_name, mode, dev);
 	if (IS_ERR(inode)) {
 		gfs2_holder_uninit(ghs);
 		return PTR_ERR(inode);
 	}
 
-	ip = ghs[1].gh_gl->gl_object;
-
-	ip->i_di.di_major = major;
-	ip->i_di.di_minor = minor;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-
-	if (!gfs2_assert_withdraw(sdp, !error)) {
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
 	gfs2_trans_end(sdp);
 	if (dip->i_alloc.al_rgd)
 		gfs2_inplace_release(dip);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index c0e76fc..5bcf895 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -331,8 +331,6 @@ struct gfs2_dinode_host {
 	__u64 di_atime;	/* time last accessed */
 	__u64 di_mtime;	/* time last modified */
 	__u64 di_ctime;	/* time last changed */
-	__u32 di_major;	/* device major number */
-	__u32 di_minor;	/* device minor number */
 
 	/* This section varies from gfs1. Padding added to align with
          * remainder of dinode
-- 
cgit v0.10.2


From b60623c238b6a819bd04090139704e2cb57a751f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Nov 2006 12:22:46 -0500
Subject: [GFS2] Shrink gfs2_inode (3) - di_mode

This removes the duplicate di_mode field in favour of using the
inode->i_mode field. This saves 4 bytes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 906e403..87f6304f 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -76,9 +76,9 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
 		return -EOPNOTSUPP;
 	if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
 		return -EPERM;
-	if (S_ISLNK(ip->i_di.di_mode))
+	if (S_ISLNK(ip->i_inode.i_mode))
 		return -EOPNOTSUPP;
-	if (!access && !S_ISDIR(ip->i_di.di_mode))
+	if (!access && !S_ISDIR(ip->i_inode.i_mode))
 		return -EACCES;
 
 	return 0;
@@ -198,8 +198,8 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
 		gfs2_assert_withdraw(sdp,
-				(ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
-		ip->i_di.di_mode = mode;
+				(ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT));
+		ip->i_inode.i_mode = mode;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
@@ -215,12 +215,12 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct posix_acl *acl = NULL, *clone;
 	struct gfs2_ea_request er;
-	mode_t mode = ip->i_di.di_mode;
+	mode_t mode = ip->i_inode.i_mode;
 	int error;
 
 	if (!sdp->sd_args.ar_posix_acl)
 		return 0;
-	if (S_ISLNK(ip->i_di.di_mode))
+	if (S_ISLNK(ip->i_inode.i_mode))
 		return 0;
 
 	memset(&er, 0, sizeof(struct gfs2_ea_request));
@@ -232,7 +232,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 		return error;
 	if (!acl) {
 		mode &= ~current->fs->umask;
-		if (mode != ip->i_di.di_mode)
+		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
 	}
@@ -244,7 +244,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	posix_acl_release(acl);
 	acl = clone;
 
-	if (S_ISDIR(ip->i_di.di_mode)) {
+	if (S_ISDIR(ip->i_inode.i_mode)) {
 		er.er_name = GFS2_POSIX_ACL_DEFAULT;
 		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
 		error = gfs2_system_eaops.eo_set(ip, &er);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 8c092ab..481a068 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1109,7 +1109,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
 {
 	int error;
 
-	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
+	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
 		return -EINVAL;
 
 	if (size > ip->i_di.di_size)
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index 92c54e9..cd747c0 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -120,7 +120,7 @@ static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 
 	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
 		if (!(er->er_flags & GFS2_ERF_MODE)) {
-			er->er_mode = ip->i_di.di_mode;
+			er->er_mode = ip->i_inode.i_mode;
 			er->er_flags |= GFS2_ERF_MODE;
 		}
 		error = gfs2_acl_validate_set(ip, 1, er,
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 9b7bb56..5208fa9 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -711,9 +711,9 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (!error) {
 		if (er->er_flags & GFS2_ERF_MODE) {
 			gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-					    (ip->i_di.di_mode & S_IFMT) ==
+					    (ip->i_inode.i_mode & S_IFMT) ==
 					    (er->er_mode & S_IFMT));
-			ip->i_di.di_mode = er->er_mode;
+			ip->i_inode.i_mode = er->er_mode;
 		}
 		ip->i_di.di_ctime = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -847,8 +847,8 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 
 	if (er->er_flags & GFS2_ERF_MODE) {
 		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-			(ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
-		ip->i_di.di_mode = er->er_mode;
+			(ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
+		ip->i_inode.i_mode = er->er_mode;
 	}
 	ip->i_di.di_ctime = get_seconds();
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 78fe0fa..44633c4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2078,7 +2078,7 @@ static int dump_inode(struct gfs2_inode *ip)
 	printk(KERN_INFO "    num = %llu %llu\n",
 		    (unsigned long long)ip->i_num.no_formal_ino,
 		    (unsigned long long)ip->i_num.no_addr);
-	printk(KERN_INFO "    type = %u\n", IF2DT(ip->i_di.di_mode));
+	printk(KERN_INFO "    type = %u\n", IF2DT(ip->i_inode.i_mode));
 	printk(KERN_INFO "    i_flags =");
 	for (x = 0; x < 32; x++)
 		if (test_bit(x, &ip->i_flags))
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 5406b19..aad45b7 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -92,7 +92,7 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
 
 	ip = gl->gl_object;
 	inode = &ip->i_inode;
-	if (!ip || !S_ISREG(ip->i_di.di_mode))
+	if (!ip || !S_ISREG(inode->i_mode))
 		return;
 
 	if (!test_bit(GIF_PAGED, &ip->i_flags))
@@ -119,7 +119,7 @@ static void gfs2_page_inval(struct gfs2_glock *gl)
 
 	ip = gl->gl_object;
 	inode = &ip->i_inode;
-	if (!ip || !S_ISREG(ip->i_di.di_mode))
+	if (!ip || !S_ISREG(inode->i_mode))
 		return;
 
 	truncate_inode_pages(inode->i_mapping, 0);
@@ -142,7 +142,7 @@ static void gfs2_page_wait(struct gfs2_glock *gl)
 	struct address_space *mapping = inode->i_mapping;
 	int error;
 
-	if (!S_ISREG(ip->i_di.di_mode))
+	if (!S_ISREG(inode->i_mode))
 		return;
 
 	error = filemap_fdatawait(mapping);
@@ -164,7 +164,7 @@ static void gfs2_page_writeback(struct gfs2_glock *gl)
 	struct inode *inode = &ip->i_inode;
 	struct address_space *mapping = inode->i_mapping;
 
-	if (!S_ISREG(ip->i_di.di_mode))
+	if (!S_ISREG(inode->i_mode))
 		return;
 
 	filemap_fdatawrite(mapping);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index a995919..de46604 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -51,7 +51,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 	struct gfs2_dinode_host *di = &ip->i_di;
 
 	inode->i_ino = ip->i_num.no_addr;
-	inode->i_mode = di->di_mode;
 	inode->i_nlink = di->di_nlink;
 	inode->i_uid = di->di_uid;
 	inode->i_gid = di->di_gid;
@@ -88,9 +87,6 @@ void gfs2_inode_attr_out(struct gfs2_inode *ip)
 {
 	struct inode *inode = &ip->i_inode;
 	struct gfs2_dinode_host *di = &ip->i_di;
-	gfs2_assert_withdraw(GFS2_SB(inode),
-		(di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
-	di->di_mode = inode->i_mode;
 	di->di_uid = inode->i_uid;
 	di->di_gid = inode->i_gid;
 	di->di_atime = inode->i_atime.tv_sec;
@@ -210,9 +206,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	if (ip->i_num.no_formal_ino != be64_to_cpu(str->di_num.no_formal_ino))
 		return -ESTALE;
 
-	di->di_mode = be32_to_cpu(str->di_mode);
+	ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
 	ip->i_inode.i_rdev = 0;
-	switch (di->di_mode & S_IFMT) {
+	switch (ip->i_inode.i_mode & S_IFMT) {
 	case S_IFBLK:
 	case S_IFCHR:
 		ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
@@ -620,7 +616,7 @@ static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
 			       unsigned int *uid, unsigned int *gid)
 {
 	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
-	    (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) {
+	    (dip->i_inode.i_mode & S_ISUID) && dip->i_di.di_uid) {
 		if (S_ISDIR(*mode))
 			*mode |= S_ISUID;
 		else if (dip->i_di.di_uid != current->fsuid)
@@ -629,7 +625,7 @@ static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
 	} else
 		*uid = current->fsuid;
 
-	if (dip->i_di.di_mode & S_ISGID) {
+	if (dip->i_inode.i_mode & S_ISGID) {
 		if (S_ISDIR(*mode))
 			*mode |= S_ISGID;
 		*gid = dip->i_di.di_gid;
@@ -810,7 +806,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 			goto fail_quota_locks;
 	}
 
-	error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
+	error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_inode.i_mode));
 	if (error)
 		goto fail_end_trans;
 
@@ -1053,7 +1049,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
 		return -EPERM;
 
-	if ((dip->i_di.di_mode & S_ISVTX) &&
+	if ((dip->i_inode.i_mode & S_ISVTX) &&
 	    dip->i_di.di_uid != current->fsuid &&
 	    ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
 		return -EPERM;
@@ -1072,7 +1068,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (!gfs2_inum_equal(&inum, &ip->i_num))
 		return -ENOENT;
 
-	if (IF2DT(ip->i_di.di_mode) != type) {
+	if (IF2DT(ip->i_inode.i_mode) != type) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 33c9ea6..69cbf98 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -22,7 +22,7 @@ static inline int gfs2_is_jdata(struct gfs2_inode *ip)
 
 static inline int gfs2_is_dir(struct gfs2_inode *ip)
 {
-	return S_ISDIR(ip->i_di.di_mode);
+	return S_ISDIR(ip->i_inode.i_mode);
 }
 
 void gfs2_inode_attr_in(struct gfs2_inode *ip);
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 60dd943..6b50a57 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -161,7 +161,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 	gfs2_inum_out(&ip->i_num, &str->di_num);
 
-	str->di_mode = cpu_to_be32(di->di_mode);
+	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
 	str->di_uid = cpu_to_be32(di->di_uid);
 	str->di_gid = cpu_to_be32(di->di_gid);
 	str->di_nlink = cpu_to_be32(di->di_nlink);
@@ -191,7 +191,6 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 
 	gfs2_inum_print(&ip->i_num);
 
-	pv(di, di_mode, "0%o");
 	pv(di, di_uid, "%u");
 	pv(di, di_gid, "%u");
 	pv(di, di_nlink, "%u");
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 015640b..45a3d85 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -498,7 +498,6 @@ static int gfs2_commit_write(struct file *file, struct page *page,
 		di->di_size = cpu_to_be64(inode->i_size);
 	}
 
-	di->di_mode = cpu_to_be32(inode->i_mode);
 	di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
 	di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
 	di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index c36f9e3..d355899 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -76,7 +76,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 	if (!gfs2_inum_equal(&ip->i_num, &inum))
 		goto invalid_gunlock;
 
-	if (IF2DT(ip->i_di.di_mode) != type) {
+	if (IF2DT(ip->i_inode.i_mode) != type) {
 		gfs2_consist_inode(dip);
 		goto fail_gunlock;
 	}
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 7ea4175..b52b9db 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -425,7 +425,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
 	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
 	file->private_data = fp;
 
-	if (S_ISREG(ip->i_di.di_mode)) {
+	if (S_ISREG(ip->i_inode.i_mode)) {
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
 					   &i_gh);
 		if (error)
@@ -515,7 +515,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
 		return -ENOLCK;
 
 	if (sdp->sd_args.ar_localflocks) {
@@ -617,7 +617,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
-	if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
 		return -ENOLCK;
 
 	if (sdp->sd_args.ar_localflocks)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index c10b914..cf7a5ba 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -144,7 +144,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	int alloc_required;
 	int error;
 
-	if (S_ISDIR(ip->i_di.di_mode))
+	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
 
 	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
@@ -220,7 +220,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
-			     IF2DT(ip->i_di.di_mode));
+			     IF2DT(inode->i_mode));
 	if (error)
 		goto out_end_trans;
 
@@ -564,11 +564,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 	/* Make sure we aren't trying to move a dirctory into it's subdir */
 
-	if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
+	if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
 		dir_rename = 1;
 
-		error = gfs2_glock_nq_init(sdp->sd_rename_gl,
-					   LM_ST_EXCLUSIVE, 0,
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0,
 					   &r_gh);
 		if (error)
 			goto out;
@@ -609,7 +608,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_gunlock;
 
-		if (S_ISDIR(nip->i_di.di_mode)) {
+		if (S_ISDIR(nip->i_inode.i_mode)) {
 			if (nip->i_di.di_entries < 2) {
 				if (gfs2_consist_inode(nip))
 					gfs2_dinode_print(nip);
@@ -646,7 +645,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 				error = -EFBIG;
 				goto out_gunlock;
 			}
-			if (S_ISDIR(ip->i_di.di_mode) &&
+			if (S_ISDIR(ip->i_inode.i_mode) &&
 			    ndip->i_di.di_nlink == (u32)-1) {
 				error = -EMLINK;
 				goto out_gunlock;
@@ -701,7 +700,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	/* Remove the target file, if it exists */
 
 	if (nip) {
-		if (S_ISDIR(nip->i_di.di_mode))
+		if (S_ISDIR(nip->i_inode.i_mode))
 			error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
 		else {
 			error = gfs2_dir_del(ndip, &ndentry->d_name);
@@ -743,7 +742,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		goto out_end_trans;
 
 	error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
-			     IF2DT(ip->i_di.di_mode));
+			     IF2DT(ip->i_inode.i_mode));
 	if (error)
 		goto out_end_trans;
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 9c786d1..8635175 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -407,7 +407,7 @@ static void gfs2_delete_inode(struct inode *inode)
 	if (error)
 		goto out_uninit;
 
-	if (S_ISDIR(ip->i_di.di_mode) &&
+	if (S_ISDIR(inode->i_mode) &&
 	    (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
 		error = gfs2_dir_exhash_dealloc(ip);
 		if (error)
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 5bcf895..f1ea0b4 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -322,7 +322,6 @@ struct gfs2_dinode {
 };
 
 struct gfs2_dinode_host {
-	__u32 di_mode;	/* mode of file */
 	__u32 di_uid;	/* owner's user id */
 	__u32 di_gid;	/* owner's group id */
 	__u32 di_nlink;	/* number of links to this file */
-- 
cgit v0.10.2


From 2933f9254a6af33db25270778c998a42029da668 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Nov 2006 13:23:29 -0500
Subject: [GFS2] Shrink gfs2_inode (4) - di_uid/di_gid

Remove duplicate di_uid/di_gid fields in favour of using
inode->i_uid/inode->i_gid instead. This saves 8 bytes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87f6304f..3908992 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -74,7 +74,7 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
 {
 	if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
 		return -EOPNOTSUPP;
-	if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
+	if (current->fsuid != ip->i_inode.i_uid && !capable(CAP_FOWNER))
 		return -EPERM;
 	if (S_ISLNK(ip->i_inode.i_mode))
 		return -EOPNOTSUPP;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 481a068..0c913ee 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -819,7 +819,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 	if (error)
 		goto out;
 
-	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+	error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
 	if (error)
 		goto out_gunlock_q;
 
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 5208fa9..935cc9a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -687,7 +687,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (error)
 		goto out;
 
-	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+	error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
 	if (error)
 		goto out_gunlock_q;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index de46604..0de9b22 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -52,8 +52,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 
 	inode->i_ino = ip->i_num.no_addr;
 	inode->i_nlink = di->di_nlink;
-	inode->i_uid = di->di_uid;
-	inode->i_gid = di->di_gid;
 	i_size_write(inode, di->di_size);
 	inode->i_atime.tv_sec = di->di_atime;
 	inode->i_mtime.tv_sec = di->di_mtime;
@@ -87,8 +85,6 @@ void gfs2_inode_attr_out(struct gfs2_inode *ip)
 {
 	struct inode *inode = &ip->i_inode;
 	struct gfs2_dinode_host *di = &ip->i_di;
-	di->di_uid = inode->i_uid;
-	di->di_gid = inode->i_gid;
 	di->di_atime = inode->i_atime.tv_sec;
 	di->di_mtime = inode->i_mtime.tv_sec;
 	di->di_ctime = inode->i_ctime.tv_sec;
@@ -216,8 +212,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 		break;
 	};
 
-	di->di_uid = be32_to_cpu(str->di_uid);
-	di->di_gid = be32_to_cpu(str->di_gid);
+	ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
+	ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
 	di->di_nlink = be32_to_cpu(str->di_nlink);
 	di->di_size = be64_to_cpu(str->di_size);
 	di->di_blocks = be64_to_cpu(str->di_blocks);
@@ -616,19 +612,19 @@ static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
 			       unsigned int *uid, unsigned int *gid)
 {
 	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
-	    (dip->i_inode.i_mode & S_ISUID) && dip->i_di.di_uid) {
+	    (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
 		if (S_ISDIR(*mode))
 			*mode |= S_ISUID;
-		else if (dip->i_di.di_uid != current->fsuid)
+		else if (dip->i_inode.i_uid != current->fsuid)
 			*mode &= ~07111;
-		*uid = dip->i_di.di_uid;
+		*uid = dip->i_inode.i_uid;
 	} else
 		*uid = current->fsuid;
 
 	if (dip->i_inode.i_mode & S_ISGID) {
 		if (S_ISDIR(*mode))
 			*mode |= S_ISGID;
-		*gid = dip->i_di.di_gid;
+		*gid = dip->i_inode.i_gid;
 	} else
 		*gid = current->fsgid;
 }
@@ -783,8 +779,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	if (alloc_required < 0)
 		goto fail;
 	if (alloc_required) {
-		error = gfs2_quota_check(dip, dip->i_di.di_uid,
-					 dip->i_di.di_gid);
+		error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
 		if (error)
 			goto fail_quota_locks;
 
@@ -1050,8 +1045,8 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 		return -EPERM;
 
 	if ((dip->i_inode.i_mode & S_ISVTX) &&
-	    dip->i_di.di_uid != current->fsuid &&
-	    ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
+	    dip->i_inode.i_uid != current->fsuid &&
+	    ip->i_inode.i_uid != current->fsuid && !capable(CAP_FOWNER))
 		return -EPERM;
 
 	if (IS_APPEND(&dip->i_inode))
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 6b50a57..e224f6a 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -162,8 +162,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	gfs2_inum_out(&ip->i_num, &str->di_num);
 
 	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
-	str->di_uid = cpu_to_be32(di->di_uid);
-	str->di_gid = cpu_to_be32(di->di_gid);
+	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
+	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(di->di_nlink);
 	str->di_size = cpu_to_be64(di->di_size);
 	str->di_blocks = cpu_to_be64(di->di_blocks);
@@ -191,8 +191,6 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 
 	gfs2_inum_print(&ip->i_num);
 
-	pv(di, di_uid, "%u");
-	pv(di, di_gid, "%u");
 	pv(di, di_nlink, "%u");
 	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
 	printk(KERN_INFO "  di_blocks = %llu\n", (unsigned long long)di->di_blocks);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 45a3d85..38b702a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -386,7 +386,7 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
 		if (error)
 			goto out_alloc_put;
 
-		error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+		error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
 		if (error)
 			goto out_qunlock;
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index cf7a5ba..efbcec3 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -196,8 +196,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 		if (error)
 			goto out_alloc;
 
-		error = gfs2_quota_check(dip, dip->i_di.di_uid,
-					 dip->i_di.di_gid);
+		error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
 		if (error)
 			goto out_gunlock_q;
 
@@ -673,8 +672,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_alloc;
 
-		error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
-					 ndip->i_di.di_gid);
+		error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid);
 		if (error)
 			goto out_gunlock_q;
 
@@ -885,8 +883,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	u32 ouid, ogid, nuid, ngid;
 	int error;
 
-	ouid = ip->i_di.di_uid;
-	ogid = ip->i_di.di_gid;
+	ouid = inode->i_uid;
+	ogid = inode->i_gid;
 	nuid = attr->ia_uid;
 	ngid = attr->ia_gid;
 
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index 5453d29..45a5f11 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -76,7 +76,7 @@ static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
 	if (error)
 		goto out;
 
-	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+	error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
 	if (error)
 		goto out_gunlock_q;
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 5d00e9b..d0db881 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -452,19 +452,19 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
 		return 0;
 
-	error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
+	error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd);
 	if (error)
 		goto out;
 	al->al_qd_num++;
 	qd++;
 
-	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
+	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd);
 	if (error)
 		goto out;
 	al->al_qd_num++;
 	qd++;
 
-	if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
+	if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
 		error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
 		if (error)
 			goto out;
@@ -472,7 +472,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 		qd++;
 	}
 
-	if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
+	if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
 		error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
 		if (error)
 			goto out;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 07dfd63..ff08465 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1217,7 +1217,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 	al->al_alloced++;
 
 	gfs2_statfs_change(sdp, 0, -1, 0);
-	gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
 
 	spin_lock(&sdp->sd_rindex_spin);
 	rgd->rd_free_clone--;
@@ -1261,7 +1261,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 	al->al_alloced++;
 
 	gfs2_statfs_change(sdp, 0, -1, 0);
-	gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
 	gfs2_trans_add_unrevoke(sdp, block);
 
 	spin_lock(&sdp->sd_rindex_spin);
@@ -1337,8 +1337,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	gfs2_trans_add_rg(rgd);
 
 	gfs2_statfs_change(sdp, 0, +blen, 0);
-	gfs2_quota_change(ip, -(s64)blen,
-			 ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
 }
 
 /**
@@ -1366,7 +1365,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	gfs2_trans_add_rg(rgd);
 
 	gfs2_statfs_change(sdp, 0, +blen, 0);
-	gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
 	gfs2_meta_wipe(ip, bstart, blen);
 }
 
@@ -1411,7 +1410,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
 	gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
-	gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
 	gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
 }
 
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index f1ea0b4..896c7f8 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -322,8 +322,6 @@ struct gfs2_dinode {
 };
 
 struct gfs2_dinode_host {
-	__u32 di_uid;	/* owner's user id */
-	__u32 di_gid;	/* owner's group id */
 	__u32 di_nlink;	/* number of links to this file */
 	__u64 di_size;	/* number of bytes in file */
 	__u64 di_blocks;	/* number of blocks in file */
-- 
cgit v0.10.2


From 4f56110a00af5fb2e22fbccfcaf944d62cae8fcf Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Nov 2006 14:04:17 -0500
Subject: [GFS2] Shrink gfs2_inode (5) - di_nlink

Remove the di_nlink field in favour of inode->i_nlink and
update the nlink handling to use the proper macros. This
saves 4 bytes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0de9b22..7112039 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -51,7 +51,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 	struct gfs2_dinode_host *di = &ip->i_di;
 
 	inode->i_ino = ip->i_num.no_addr;
-	inode->i_nlink = di->di_nlink;
 	i_size_write(inode, di->di_size);
 	inode->i_atime.tv_sec = di->di_atime;
 	inode->i_mtime.tv_sec = di->di_mtime;
@@ -214,7 +213,12 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 
 	ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
 	ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
-	di->di_nlink = be32_to_cpu(str->di_nlink);
+	/*
+	 * We will need to review setting the nlink count here in the
+	 * light of the forthcoming ro bind mount work. This is a reminder
+	 * to do that.
+	 */
+	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
 	di->di_size = be64_to_cpu(str->di_size);
 	di->di_blocks = be64_to_cpu(str->di_blocks);
 	di->di_atime = be64_to_cpu(str->di_atime);
@@ -336,12 +340,12 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 	u32 nlink;
 	int error;
 
-	BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
-	nlink = ip->i_di.di_nlink + diff;
+	BUG_ON(diff != 1 && diff != -1);
+	nlink = ip->i_inode.i_nlink + diff;
 
 	/* If we are reducing the nlink count, but the new value ends up being
 	   bigger than the old one, we must have underflowed. */
-	if (diff < 0 && nlink > ip->i_di.di_nlink) {
+	if (diff < 0 && nlink > ip->i_inode.i_nlink) {
 		if (gfs2_consist_inode(ip))
 			gfs2_dinode_print(ip);
 		return -EIO;
@@ -351,16 +355,19 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 	if (error)
 		return error;
 
-	ip->i_di.di_nlink = nlink;
+	if (diff > 0)
+		inc_nlink(&ip->i_inode);
+	else
+		drop_nlink(&ip->i_inode);
+
 	ip->i_di.di_ctime = get_seconds();
-	ip->i_inode.i_nlink = nlink;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 	mark_inode_dirty(&ip->i_inode);
 
-	if (ip->i_di.di_nlink == 0) {
+	if (ip->i_inode.i_nlink == 0) {
 		struct gfs2_rgrpd *rgd;
 		struct gfs2_holder ri_gh, rg_gh;
 
@@ -375,7 +382,6 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 		if (error)
 			goto out_norgrp;
 
-		clear_nlink(&ip->i_inode);
 		gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
 		gfs2_glock_dq_uninit(&rg_gh);
 out_norgrp:
@@ -586,7 +592,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 		return error;
 
 	/*  Don't create entries in an unlinked directory  */
-	if (!dip->i_di.di_nlink)
+	if (!dip->i_inode.i_nlink)
 		return -EPERM;
 
 	error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
@@ -602,7 +608,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 
 	if (dip->i_di.di_entries == (u32)-1)
 		return -EFBIG;
-	if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1)
+	if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
 		return -EMLINK;
 
 	return 0;
@@ -808,7 +814,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
 		goto fail_end_trans;
-	ip->i_di.di_nlink = 1;
+	ip->i_inode.i_nlink = 1;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
@@ -1016,7 +1022,12 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		return error;
 
-	error = gfs2_change_nlink(ip, -2);
+	/* It looks odd, but it really should be done twice */
+	error = gfs2_change_nlink(ip, -1);
+	if (error)
+		return error;
+
+	error = gfs2_change_nlink(ip, -1);
 	if (error)
 		return error;
 
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index e224f6a..b4e354b 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -164,7 +164,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
 	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
-	str->di_nlink = cpu_to_be32(di->di_nlink);
+	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
 	str->di_size = cpu_to_be64(di->di_size);
 	str->di_blocks = cpu_to_be64(di->di_blocks);
 	str->di_atime = cpu_to_be64(di->di_atime);
@@ -191,7 +191,6 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 
 	gfs2_inum_print(&ip->i_num);
 
-	pv(di, di_nlink, "%u");
 	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
 	printk(KERN_INFO "  di_blocks = %llu\n", (unsigned long long)di->di_blocks);
 	printk(KERN_INFO "  di_atime = %lld\n", (long long)di->di_atime);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index efbcec3..06176de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -169,7 +169,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	error = -EINVAL;
-	if (!dip->i_di.di_nlink)
+	if (!dip->i_inode.i_nlink)
 		goto out_gunlock;
 	error = -EFBIG;
 	if (dip->i_di.di_entries == (u32)-1)
@@ -178,10 +178,10 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		goto out_gunlock;
 	error = -EINVAL;
-	if (!ip->i_di.di_nlink)
+	if (!ip->i_inode.i_nlink)
 		goto out_gunlock;
 	error = -EMLINK;
-	if (ip->i_di.di_nlink == (u32)-1)
+	if (ip->i_inode.i_nlink == (u32)-1)
 		goto out_gunlock;
 
 	alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
@@ -386,7 +386,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	ip = ghs[1].gh_gl->gl_object;
 
-	ip->i_di.di_nlink = 2;
+	ip->i_inode.i_nlink = 2;
 	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
 	ip->i_di.di_flags |= GFS2_DIF_JDATA;
 	ip->i_di.di_payload_format = GFS2_FORMAT_DE;
@@ -636,7 +636,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		};
 
 		if (odip != ndip) {
-			if (!ndip->i_di.di_nlink) {
+			if (!ndip->i_inode.i_nlink) {
 				error = -EINVAL;
 				goto out_gunlock;
 			}
@@ -645,7 +645,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 				goto out_gunlock;
 			}
 			if (S_ISDIR(ip->i_inode.i_mode) &&
-			    ndip->i_di.di_nlink == (u32)-1) {
+			    ndip->i_inode.i_nlink == (u32)-1) {
 				error = -EMLINK;
 				goto out_gunlock;
 			}
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 896c7f8..c61517b 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -322,7 +322,6 @@ struct gfs2_dinode {
 };
 
 struct gfs2_dinode_host {
-	__u32 di_nlink;	/* number of links to this file */
 	__u64 di_size;	/* number of bytes in file */
 	__u64 di_blocks;	/* number of blocks in file */
 	__u64 di_atime;	/* time last accessed */
-- 
cgit v0.10.2


From 1a7b1eed5802502fd649e04784becd58557fdcf1 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Nov 2006 14:35:17 -0500
Subject: [GFS2] Shrink gfs2_inode (6) - di_atime/di_mtime/di_ctime

Remove the di_[amc]time fields and use inode->i_[amc]time
fields instead. This saves 24 bytes from the gfs2_inode.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0c913ee..692d4a3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -778,7 +778,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 			gfs2_free_data(ip, bstart, blen);
 	}
 
-	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 
 	gfs2_dinode_out(ip, dibh->b_data);
 
@@ -853,7 +853,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 	}
 
 	ip->i_di.di_size = size;
-	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
@@ -968,7 +968,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 
 	if (gfs2_is_stuffed(ip)) {
 		ip->i_di.di_size = size;
-		ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+		ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
@@ -980,7 +980,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 
 		if (!error) {
 			ip->i_di.di_size = size;
-			ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+			ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 			ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
 			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 			gfs2_dinode_out(ip, dibh->b_data);
@@ -1053,7 +1053,7 @@ static int trunc_end(struct gfs2_inode *ip)
 			ip->i_num.no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 	}
-	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 	ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 0742761..ca23c8b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -131,7 +131,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
 	if (ip->i_di.di_size < offset + size)
 		ip->i_di.di_size = offset + size;
-	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 	gfs2_dinode_out(ip, dibh->b_data);
 
 	brelse(dibh);
@@ -229,7 +229,7 @@ out:
 
 	if (ip->i_di.di_size < offset + copied)
 		ip->i_di.di_size = offset + copied;
-	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1560,7 +1560,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 				break;
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
 			ip->i_di.di_entries++;
-			ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+			ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
 			gfs2_dinode_out(ip, bh->b_data);
 			brelse(bh);
 			error = 0;
@@ -1646,7 +1646,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
 		gfs2_consist_inode(dip);
 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
 	dip->i_di.di_entries--;
-	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+	dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds();
 	gfs2_dinode_out(dip, bh->b_data);
 	brelse(bh);
 	mark_inode_dirty(&dip->i_inode);
@@ -1694,7 +1694,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 		gfs2_trans_add_bh(dip->i_gl, bh, 1);
 	}
 
-	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+	dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds();
 	gfs2_dinode_out(dip, bh->b_data);
 	brelse(bh);
 	return 0;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 935cc9a..7dde847 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -300,7 +300,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		ip->i_di.di_ctime = get_seconds();
+		ip->i_inode.i_ctime.tv_sec = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
@@ -715,7 +715,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 					    (er->er_mode & S_IFMT));
 			ip->i_inode.i_mode = er->er_mode;
 		}
-		ip->i_di.di_ctime = get_seconds();
+		ip->i_inode.i_ctime.tv_sec = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
@@ -850,7 +850,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 			(ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
 		ip->i_inode.i_mode = er->er_mode;
 	}
-	ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_ctime.tv_sec = get_seconds();
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
@@ -1130,7 +1130,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		ip->i_di.di_ctime = get_seconds();
+		ip->i_inode.i_ctime.tv_sec = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
@@ -1285,7 +1285,6 @@ int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
 	if (!error) {
 		error = inode_setattr(&ip->i_inode, attr);
 		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_inode_attr_out(ip);
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7112039..c22ae3c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -52,12 +52,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 
 	inode->i_ino = ip->i_num.no_addr;
 	i_size_write(inode, di->di_size);
-	inode->i_atime.tv_sec = di->di_atime;
-	inode->i_mtime.tv_sec = di->di_mtime;
-	inode->i_ctime.tv_sec = di->di_ctime;
-	inode->i_atime.tv_nsec = 0;
-	inode->i_mtime.tv_nsec = 0;
-	inode->i_ctime.tv_nsec = 0;
 	inode->i_blocks = di->di_blocks <<
 		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
 
@@ -72,23 +66,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 		inode->i_flags &= ~S_APPEND;
 }
 
-/**
- * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
- * @ip: The GFS2 inode
- *
- * Only copy out the attributes that we want the VFS layer
- * to be able to modify.
- */
-
-void gfs2_inode_attr_out(struct gfs2_inode *ip)
-{
-	struct inode *inode = &ip->i_inode;
-	struct gfs2_dinode_host *di = &ip->i_di;
-	di->di_atime = inode->i_atime.tv_sec;
-	di->di_mtime = inode->i_mtime.tv_sec;
-	di->di_ctime = inode->i_ctime.tv_sec;
-}
-
 static int iget_test(struct inode *inode, void *opaque)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -221,9 +198,12 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
 	di->di_size = be64_to_cpu(str->di_size);
 	di->di_blocks = be64_to_cpu(str->di_blocks);
-	di->di_atime = be64_to_cpu(str->di_atime);
-	di->di_mtime = be64_to_cpu(str->di_mtime);
-	di->di_ctime = be64_to_cpu(str->di_ctime);
+	ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
+	ip->i_inode.i_atime.tv_nsec = 0;
+	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
+	ip->i_inode.i_mtime.tv_nsec = 0;
+	ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
+	ip->i_inode.i_ctime.tv_nsec = 0;
 
 	di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
 	di->di_goal_data = be64_to_cpu(str->di_goal_data);
@@ -360,7 +340,7 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 	else
 		drop_nlink(&ip->i_inode);
 
-	ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_ctime.tv_sec = get_seconds();
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1224,7 +1204,7 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh)
 		return 0;
 
 	curtime = get_seconds();
-	if (curtime - ip->i_di.di_atime >= quantum) {
+	if (curtime - ip->i_inode.i_atime.tv_sec >= quantum) {
 		gfs2_glock_dq(gh);
 		gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
 				   gh);
@@ -1236,7 +1216,7 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh)
 		   trying to get exclusive lock. */
 
 		curtime = get_seconds();
-		if (curtime - ip->i_di.di_atime >= quantum) {
+		if (curtime - ip->i_inode.i_atime.tv_sec >= quantum) {
 			struct buffer_head *dibh;
 			struct gfs2_dinode *di;
 
@@ -1250,11 +1230,11 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh)
 			if (error)
 				goto fail_end_trans;
 
-			ip->i_di.di_atime = curtime;
+			ip->i_inode.i_atime.tv_sec = curtime;
 
 			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 			di = (struct gfs2_dinode *)dibh->b_data;
-			di->di_atime = cpu_to_be64(ip->i_di.di_atime);
+			di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 			brelse(dibh);
 
 			gfs2_trans_end(sdp);
@@ -1375,8 +1355,6 @@ __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 	if (!error) {
 		error = inode_setattr(&ip->i_inode, attr);
 		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_inode_attr_out(ip);
-
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 69cbf98..54d584e 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -26,7 +26,6 @@ static inline int gfs2_is_dir(struct gfs2_inode *ip)
 }
 
 void gfs2_inode_attr_in(struct gfs2_inode *ip);
-void gfs2_inode_attr_out(struct gfs2_inode *ip);
 struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned type);
 struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum);
 
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index b4e354b..82003e8 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -167,9 +167,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
 	str->di_size = cpu_to_be64(di->di_size);
 	str->di_blocks = cpu_to_be64(di->di_blocks);
-	str->di_atime = cpu_to_be64(di->di_atime);
-	str->di_mtime = cpu_to_be64(di->di_mtime);
-	str->di_ctime = cpu_to_be64(di->di_ctime);
+	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
+	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
+	str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
 
 	str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
 	str->di_goal_data = cpu_to_be64(di->di_goal_data);
@@ -193,10 +193,6 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 
 	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
 	printk(KERN_INFO "  di_blocks = %llu\n", (unsigned long long)di->di_blocks);
-	printk(KERN_INFO "  di_atime = %lld\n", (long long)di->di_atime);
-	printk(KERN_INFO "  di_mtime = %lld\n", (long long)di->di_mtime);
-	printk(KERN_INFO "  di_ctime = %lld\n", (long long)di->di_ctime);
-
 	printk(KERN_INFO "  di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
 	printk(KERN_INFO "  di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 38b702a..5c3962c 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -498,10 +498,6 @@ static int gfs2_commit_write(struct file *file, struct page *page,
 		di->di_size = cpu_to_be64(inode->i_size);
 	}
 
-	di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
-	di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
-	di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
-
 	brelse(dibh);
 	gfs2_trans_end(sdp);
 	if (al->al_requested) {
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 06176de..585b43a 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -729,7 +729,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		error = gfs2_meta_inode_buffer(ip, &dibh);
 		if (error)
 			goto out_end_trans;
-		ip->i_di.di_ctime = get_seconds();
+		ip->i_inode.i_ctime.tv_sec = get_seconds();
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
@@ -915,7 +915,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 
 	error = inode_setattr(inode, attr);
 	gfs2_assert_warn(sdp, !error);
-	gfs2_inode_attr_out(ip);
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index c61517b..7f5a4a1 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -324,9 +324,6 @@ struct gfs2_dinode {
 struct gfs2_dinode_host {
 	__u64 di_size;	/* number of bytes in file */
 	__u64 di_blocks;	/* number of blocks in file */
-	__u64 di_atime;	/* time last accessed */
-	__u64 di_mtime;	/* time last modified */
-	__u64 di_ctime;	/* time last changed */
 
 	/* This section varies from gfs1. Padding added to align with
          * remainder of dinode
-- 
cgit v0.10.2


From a9583c7983cbba9726bfe64ee46613d654fc9e26 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Nov 2006 20:09:14 -0500
Subject: [GFS2] Shrink gfs2_inode (7) - di_payload_format

This is almost never used. Its there for backward
compatibility with GFS1. It doesn't need its own
field since it can always be calculated from the
inode mode & flags. This saves a bit more space
in the gfs2_inode.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index ca23c8b..c82d7cb 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -902,7 +902,6 @@ static int dir_make_exhash(struct inode *inode)
 	dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
 	dip->i_di.di_blocks++;
 	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
-	dip->i_di.di_payload_format = 0;
 
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
 	dip->i_di.di_depth = y;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c22ae3c..f6177fc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -210,7 +210,6 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	di->di_generation = be64_to_cpu(str->di_generation);
 
 	di->di_flags = be32_to_cpu(str->di_flags);
-	di->di_payload_format = be32_to_cpu(str->di_payload_format);
 	di->di_height = be16_to_cpu(str->di_height);
 
 	di->di_depth = be16_to_cpu(str->di_depth);
@@ -699,7 +698,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	}
 
 	di->__pad1 = 0;
-	di->di_payload_format = cpu_to_be32(0);
+	di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0);
 	di->di_height = cpu_to_be32(0);
 	di->__pad2 = 0;
 	di->__pad3 = 0;
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index 82003e8..b2baba5 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -176,9 +176,10 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_generation = cpu_to_be64(di->di_generation);
 
 	str->di_flags = cpu_to_be32(di->di_flags);
-	str->di_payload_format = cpu_to_be32(di->di_payload_format);
 	str->di_height = cpu_to_be16(di->di_height);
-
+	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
+					     !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
+					     GFS2_FORMAT_DE : 0);
 	str->di_depth = cpu_to_be16(di->di_depth);
 	str->di_entries = cpu_to_be32(di->di_entries);
 
@@ -197,7 +198,6 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
 
 	pv(di, di_flags, "0x%.8X");
-	pv(di, di_payload_format, "%u");
 	pv(di, di_height, "%u");
 
 	pv(di, di_depth, "%u");
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 585b43a..0e4eade 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -389,7 +389,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip->i_inode.i_nlink = 2;
 	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
 	ip->i_di.di_flags |= GFS2_DIF_JDATA;
-	ip->i_di.di_payload_format = GFS2_FORMAT_DE;
 	ip->i_di.di_entries = 2;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 7f5a4a1..536575e 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -333,7 +333,6 @@ struct gfs2_dinode_host {
 	__u64 di_generation;	/* generation number for NFS */
 
 	__u32 di_flags;	/* GFS2_DIF_... */
-	__u32 di_payload_format;  /* GFS2_FORMAT_... */
 	__u16 di_height;	/* height of metadata */
 
 	/* These only apply to directories  */
-- 
cgit v0.10.2


From bfded27ba010d1c3b0aa3843f97dc9b80de751be Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Nov 2006 16:05:38 -0500
Subject: [GFS2] Shrink gfs2_inode (8) - i_vn

This shrinks the size of the gfs2_inode by 8 bytes by
replacing the version counter with a one bit valid/invalid
flag.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index aad45b7..9c20337 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -305,8 +305,9 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 	int data = (flags & DIO_DATA);
 
 	if (meta) {
+		struct gfs2_inode *ip = gl->gl_object;
 		gfs2_meta_inval(gl);
-		gl->gl_vn++;
+		set_bit(GIF_INVALID, &ip->i_flags);
 	}
 	if (data)
 		gfs2_page_inval(gl);
@@ -351,7 +352,7 @@ static int inode_go_lock(struct gfs2_holder *gh)
 	if (!ip)
 		return 0;
 
-	if (ip->i_vn != gl->gl_vn) {
+	if (test_bit(GIF_INVALID, &ip->i_flags)) {
 		error = gfs2_inode_refresh(ip);
 		if (error)
 			return error;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c0a8c3b..227a74d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -217,6 +217,7 @@ struct gfs2_alloc {
 };
 
 enum {
+	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
 	GIF_PAGED		= 2,
 	GIF_SW_PAGED		= 3,
@@ -228,7 +229,6 @@ struct gfs2_inode {
 
 	unsigned long i_flags;		/* GIF_... */
 
-	u64 i_vn;
 	struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
 
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f6177fc..e467780 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -145,7 +145,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *i
 		if (unlikely(error))
 			goto fail_put;
 
-		ip->i_vn = ip->i_gl->gl_vn - 1;
+		set_bit(GIF_INVALID, &ip->i_flags);
 		error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
 		if (unlikely(error))
 			goto fail_iopen;
@@ -242,7 +242,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 
 	error = gfs2_dinode_in(ip, dibh->b_data);
 	brelse(dibh);
-	ip->i_vn = ip->i_gl->gl_vn;
+	clear_bit(GIF_INVALID, &ip->i_flags);
 
 	return error;
 }
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 0e4eade..b247f25 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -844,7 +844,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 	struct gfs2_holder i_gh;
 	int error;
 
-	if (ip->i_vn == ip->i_gl->gl_vn)
+	if (!test_bit(GIF_INVALID, &ip->i_flags))
 		return generic_permission(inode, mask, gfs2_check_acl);
 
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-- 
cgit v0.10.2


From 294caaa3b8304c0a14c5039691caf23363bd9369 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 2 Nov 2006 11:59:28 -0500
Subject: [GFS2] Tidy up 0 initialisations in inode.c

We don't need to use endian conversions for 0 initialisations
when creating a new on-disk inode.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e467780..faf9b9e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -673,15 +673,15 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_mode = cpu_to_be32(mode);
 	di->di_uid = cpu_to_be32(uid);
 	di->di_gid = cpu_to_be32(gid);
-	di->di_nlink = cpu_to_be32(0);
-	di->di_size = cpu_to_be64(0);
+	di->di_nlink = 0;
+	di->di_size = 0;
 	di->di_blocks = cpu_to_be64(1);
 	di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
 	di->di_major = cpu_to_be32(MAJOR(dev));
 	di->di_minor = cpu_to_be32(MINOR(dev));
 	di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
 	di->di_generation = cpu_to_be64(*generation);
-	di->di_flags = cpu_to_be32(0);
+	di->di_flags = 0;
 
 	if (S_ISREG(mode)) {
 		if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
@@ -699,13 +699,13 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 
 	di->__pad1 = 0;
 	di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0);
-	di->di_height = cpu_to_be32(0);
+	di->di_height = 0;
 	di->__pad2 = 0;
 	di->__pad3 = 0;
-	di->di_depth = cpu_to_be16(0);
-	di->di_entries = cpu_to_be32(0);
+	di->di_depth = 0;
+	di->di_entries = 0;
 	memset(&di->__pad4, 0, sizeof(di->__pad4));
-	di->di_eattr = cpu_to_be64(0);
+	di->di_eattr = 0;
 	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
 
 	brelse(dibh);
-- 
cgit v0.10.2


From f6e58f01e8dc869803b9f73b2aa9d5bc3f32ca05 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 7 Nov 2006 15:14:58 -0500
Subject: [GFS2] Don't copy meta_header for rgrp in and out

The meta_header for an ondisk rgrp never changes, so there is no point
copying it in and back out to disk. Also there is no reason to keep
a copy for each rgrp in memory.

The code already checks to ensure that the header is correct before
it calls the routine to copy the data in, so that we don't even need
to check whether its correct on disk in the functions in ondisk.c

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
index b2baba5..f2495f1 100644
--- a/fs/gfs2/ondisk.c
+++ b/fs/gfs2/ondisk.c
@@ -65,15 +65,6 @@ static void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *bu
 	mh->mh_format = be32_to_cpu(str->mh_format);
 }
 
-static void gfs2_meta_header_out(const struct gfs2_meta_header_host *mh, void *buf)
-{
-	struct gfs2_meta_header *str = buf;
-
-	str->mh_magic = cpu_to_be32(mh->mh_magic);
-	str->mh_type = cpu_to_be32(mh->mh_type);
-	str->mh_format = cpu_to_be32(mh->mh_format);
-}
-
 void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
 {
 	const struct gfs2_sb *str = buf;
@@ -119,7 +110,6 @@ void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf)
 {
 	const struct gfs2_rgrp *str = buf;
 
-	gfs2_meta_header_in(&rg->rg_header, buf);
 	rg->rg_flags = be32_to_cpu(str->rg_flags);
 	rg->rg_free = be32_to_cpu(str->rg_free);
 	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
@@ -130,7 +120,6 @@ void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
 {
 	struct gfs2_rgrp *str = buf;
 
-	gfs2_meta_header_out(&rg->rg_header, buf);
 	str->rg_flags = cpu_to_be32(rg->rg_flags);
 	str->rg_free = cpu_to_be32(rg->rg_free);
 	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index 536575e..8b7e4c1 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -213,8 +213,6 @@ struct gfs2_rgrp {
 };
 
 struct gfs2_rgrp_host {
-	struct gfs2_meta_header_host rg_header;
-
 	__u32 rg_flags;
 	__u32 rg_free;
 	__u32 rg_dinodes;
-- 
cgit v0.10.2


From c594d8866460a2710c436839d79f334a0714a2a7 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 8 Nov 2006 09:01:13 -0500
Subject: [GFS2] Remove unused GL_DUMP flag

There is no way to set the GL_DUMP flag, and in any case the
same thing can be done with systemtap if required for debugging,
so this removes it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 44633c4..746347a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1244,9 +1244,6 @@ restart:
 
 	clear_bit(GLF_PREFETCH, &gl->gl_flags);
 
-	if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
-		dump_glock(gl);
-
 	return error;
 }
 
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2b2a889..b985627 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -28,7 +28,6 @@
 #define GL_NOCACHE		0x00000400
 #define GL_NOCANCEL		0x00001000
 #define GL_AOP			0x00004000
-#define GL_DUMP			0x00008000
 
 #define GLR_TRYFAILED		13
 #define GLR_CANCELED		14
-- 
cgit v0.10.2


From 2ca99501fa5422e84f18333918a503433449e2b5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 8 Nov 2006 10:26:54 -0500
Subject: [GFS2] Fix page lock/glock deadlock

This fixes a race between the glock and the page lock encountered
during truncate in gfs2_readpage and gfs2_prepare_write. The gfs2_readpages
function doesn't need the same fix since it only uses a try lock anyway, so
it will fail back to gfs2_readpage in the case of a potential deadlock.

This bug was spotted by Russell Cattelan.

Cc: Russell Cattelan <cattelan@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index b985627..a331bf8 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -27,7 +27,6 @@
 #define GL_ATIME		0x00000200
 #define GL_NOCACHE		0x00000400
 #define GL_NOCANCEL		0x00001000
-#define GL_AOP			0x00004000
 
 #define GLR_TRYFAILED		13
 #define GLR_CANCELED		14
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 5c3962c..3822189 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -230,7 +230,7 @@ static int gfs2_readpage(struct file *file, struct page *page)
 				/* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
 				goto skip_lock;
 		}
-		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
+		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
 		do_unlock = 1;
 		error = gfs2_glock_nq_m_atime(1, &gh);
 		if (unlikely(error))
@@ -254,6 +254,8 @@ skip_lock:
 out:
 	return error;
 out_unlock:
+	if (error == GLR_TRYFAILED)
+		error = AOP_TRUNCATED_PAGE;
 	unlock_page(page);
 	if (do_unlock)
 		gfs2_holder_uninit(&gh);
@@ -293,7 +295,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
 				goto skip_lock;
 		}
 		gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
-				 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
+				 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
 		do_unlock = 1;
 		ret = gfs2_glock_nq_m_atime(1, &gh);
 		if (ret == GLR_TRYFAILED)
@@ -366,10 +368,13 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
 	unsigned int write_len = to - from;
 
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh);
 	error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
-	if (error)
+	if (unlikely(error)) {
+		if (error == GLR_TRYFAILED)
+			error = AOP_TRUNCATED_PAGE;
 		goto out_uninit;
+	}
 
 	gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks);
 
-- 
cgit v0.10.2


From 6b124d8dba1f46c5f2caf3b3159bbe627f75b9b6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 8 Nov 2006 12:51:06 -0500
Subject: [GFS2] Only set inode flags when required

We were setting the inode flags from GFS2's flags far too often, even when they
couldn't possibly have changed. This patch reduces the amount of flag
setting going on so that we do it only when the inode is read in or
when the flags have changed. The create case is covered by the "when
the inode is read in" case.

This also fixes a bug where we didn't set S_SYNC correctly.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index faf9b9e..56b39be 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -54,16 +54,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 	i_size_write(inode, di->di_size);
 	inode->i_blocks = di->di_blocks <<
 		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
-
-	if (di->di_flags & GFS2_DIF_IMMUTABLE)
-		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-
-	if (di->di_flags & GFS2_DIF_APPENDONLY)
-		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
 }
 
 static int iget_test(struct inode *inode, void *opaque)
@@ -210,6 +200,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	di->di_generation = be64_to_cpu(str->di_generation);
 
 	di->di_flags = be32_to_cpu(str->di_flags);
+	gfs2_set_inode_flags(&ip->i_inode);
 	di->di_height = be16_to_cpu(str->di_height);
 
 	di->di_depth = be16_to_cpu(str->di_depth);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index b52b9db..eabf6c6 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -266,6 +266,24 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 	return error;
 }
 
+void gfs2_set_inode_flags(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_dinode_host *di = &ip->i_di;
+	unsigned int flags = inode->i_flags;
+
+	flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	if (di->di_flags & GFS2_DIF_IMMUTABLE)
+		flags |= S_IMMUTABLE;
+	if (di->di_flags & GFS2_DIF_APPENDONLY)
+		flags |= S_APPEND;
+	if (di->di_flags & GFS2_DIF_NOATIME)
+		flags |= S_NOATIME;
+	if (di->di_flags & GFS2_DIF_SYNC)
+		flags |= S_SYNC;
+	inode->i_flags = flags;
+}
+
 /* Flags that can be set by user space */
 #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|			\
 			     GFS2_DIF_DIRECTIO|			\
@@ -338,6 +356,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	ip->i_di.di_flags = new_flags;
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
+	gfs2_set_inode_flags(inode);
 out_trans_end:
 	gfs2_trans_end(sdp);
 out:
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
index ce319f8..7e5d8ec 100644
--- a/fs/gfs2/ops_file.h
+++ b/fs/gfs2/ops_file.h
@@ -17,7 +17,7 @@ extern struct file gfs2_internal_file_sentinel;
 extern int gfs2_internal_read(struct gfs2_inode *ip,
 			      struct file_ra_state *ra_state,
 			      char *buf, loff_t *pos, unsigned size);
-
+extern void gfs2_set_inode_flags(struct inode *inode);
 extern const struct file_operations gfs2_file_fops;
 extern const struct file_operations gfs2_dir_fops;
 
-- 
cgit v0.10.2


From e7c698d74fc9e0e76b3086062b0519df3601ff52 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 8 Nov 2006 13:52:05 -0500
Subject: [GFS2] Inode number is constant

Since the inode number is constant, we don't need to keep updating
it everytime we refresh the other inode fields.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 56b39be..19b2736 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -50,7 +50,6 @@ void gfs2_inode_attr_in(struct gfs2_inode *ip)
 	struct inode *inode = &ip->i_inode;
 	struct gfs2_dinode_host *di = &ip->i_di;
 
-	inode->i_ino = ip->i_num.no_addr;
 	i_size_write(inode, di->di_size);
 	inode->i_blocks = di->di_blocks <<
 		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
@@ -73,6 +72,7 @@ static int iget_set(struct inode *inode, void *opaque)
 	struct gfs2_inum_host *inum = opaque;
 
 	ip->i_num = *inum;
+	inode->i_ino = inum->no_addr;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 9e2dbdac3df300516ffdd9a8631f23164d068a50 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 8 Nov 2006 15:45:46 -0500
Subject: [GFS2] Remove gfs2_inode_attr_in

This function wasn't really doing the right thing. There was no need
to update the inode size at this point and the updating of the
i_blocks field has now been moved to the places where di_blocks is
updated. A result of this patch and some those preceeding it is that
unlocking a glock is now a much more efficient process, since there
is no longer any requirement to copy data from the gfs2 inode into
the vfs inode at this point.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 692d4a3..06e3447 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -163,6 +163,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	if (ip->i_di.di_size) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
 		ip->i_di.di_blocks++;
+		gfs2_set_inode_blocks(&ip->i_inode);
 		di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
 	}
 
@@ -272,6 +273,7 @@ static int build_height(struct inode *inode, unsigned height)
 	*(__be64 *)(di + 1) = cpu_to_be64(bn);
 	ip->i_di.di_height += new_height;
 	ip->i_di.di_blocks += new_height;
+	gfs2_set_inode_blocks(&ip->i_inode);
 	di->di_height = cpu_to_be16(ip->i_di.di_height);
 	di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
 	brelse(dibh);
@@ -415,6 +417,7 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
 
 	*ptr = cpu_to_be64(*block);
 	ip->i_di.di_blocks++;
+	gfs2_set_inode_blocks(&ip->i_inode);
 
 	*new = 1;
 	return 0;
@@ -770,6 +773,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		if (!ip->i_di.di_blocks)
 			gfs2_consist_inode(ip);
 		ip->i_di.di_blocks--;
+		gfs2_set_inode_blocks(&ip->i_inode);
 	}
 	if (bstart) {
 		if (metadata)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c82d7cb..a2923fb 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -901,6 +901,7 @@ static int dir_make_exhash(struct inode *inode)
 
 	dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
 	dip->i_di.di_blocks++;
+	gfs2_set_inode_blocks(&dip->i_inode);
 	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
 
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
@@ -1038,6 +1039,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
 		dip->i_di.di_blocks++;
+		gfs2_set_inode_blocks(&dip->i_inode);
 		gfs2_dinode_out(dip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -1516,6 +1518,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 		return error;
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
 	ip->i_di.di_blocks++;
+	gfs2_set_inode_blocks(&ip->i_inode);
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	return 0;
@@ -1860,6 +1863,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		if (!dip->i_di.di_blocks)
 			gfs2_consist_inode(dip);
 		dip->i_di.di_blocks--;
+		gfs2_set_inode_blocks(&dip->i_inode);
 	}
 
 	error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 7dde847..ebebbdc 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -281,6 +281,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 		if (!ip->i_di.di_blocks)
 			gfs2_consist_inode(ip);
 		ip->i_di.di_blocks--;
+		gfs2_set_inode_blocks(&ip->i_inode);
 	}
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
@@ -598,6 +599,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 	ea->ea_num_ptrs = 0;
 
 	ip->i_di.di_blocks++;
+	gfs2_set_inode_blocks(&ip->i_inode);
 
 	return 0;
 }
@@ -649,6 +651,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
 
 			ip->i_di.di_blocks++;
+			gfs2_set_inode_blocks(&ip->i_inode);
 
 			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
 							   data_len;
@@ -977,6 +980,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		ip->i_di.di_eattr = blk;
 		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
 		ip->i_di.di_blocks++;
+		gfs2_set_inode_blocks(&ip->i_inode);
 
 		eablk++;
 	}
@@ -1387,6 +1391,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 		if (!ip->i_di.di_blocks)
 			gfs2_consist_inode(ip);
 		ip->i_di.di_blocks--;
+		gfs2_set_inode_blocks(&ip->i_inode);
 	}
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
@@ -1441,6 +1446,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	if (!ip->i_di.di_blocks)
 		gfs2_consist_inode(ip);
 	ip->i_di.di_blocks--;
+	gfs2_set_inode_blocks(&ip->i_inode);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 9c20337..b92de0a 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -356,7 +356,6 @@ static int inode_go_lock(struct gfs2_holder *gh)
 		error = gfs2_inode_refresh(ip);
 		if (error)
 			return error;
-		gfs2_inode_attr_in(ip);
 	}
 
 	if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
@@ -380,11 +379,8 @@ static void inode_go_unlock(struct gfs2_holder *gh)
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_inode *ip = gl->gl_object;
 
-	if (ip == NULL)
-		return;
-	if (test_bit(GLF_DIRTY, &gl->gl_flags))
-		gfs2_inode_attr_in(ip);
-	gfs2_meta_cache_flush(ip);
+	if (ip)
+		gfs2_meta_cache_flush(ip);
 }
 
 /**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 19b2736..ea9ca23 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -38,29 +38,12 @@
 #include "trans.h"
 #include "util.h"
 
-/**
- * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
- * @ip: The GFS2 inode (with embedded disk inode data)
- * @inode:  The Linux VFS inode
- *
- */
-
-void gfs2_inode_attr_in(struct gfs2_inode *ip)
-{
-	struct inode *inode = &ip->i_inode;
-	struct gfs2_dinode_host *di = &ip->i_di;
-
-	i_size_write(inode, di->di_size);
-	inode->i_blocks = di->di_blocks <<
-		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
-}
-
 static int iget_test(struct inode *inode, void *opaque)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_inum_host *inum = opaque;
 
-	if (ip && ip->i_num.no_addr == inum->no_addr)
+	if (ip->i_num.no_addr == inum->no_addr)
 		return 1;
 
 	return 0;
@@ -187,7 +170,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
 	di->di_size = be64_to_cpu(str->di_size);
+	i_size_write(&ip->i_inode, di->di_size);
 	di->di_blocks = be64_to_cpu(str->di_blocks);
+	gfs2_set_inode_blocks(&ip->i_inode);
 	ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
 	ip->i_inode.i_atime.tv_nsec = 0;
 	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 54d584e..46917ed 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -25,6 +25,13 @@ static inline int gfs2_is_dir(struct gfs2_inode *ip)
 	return S_ISDIR(ip->i_inode.i_mode);
 }
 
+static inline void gfs2_set_inode_blocks(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	inode->i_blocks = ip->i_di.di_blocks <<
+		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+}
+
 void gfs2_inode_attr_in(struct gfs2_inode *ip);
 struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned type);
 struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum);
-- 
cgit v0.10.2


From 7020933156ac2a8a7386314933e49948bf0438f7 Mon Sep 17 00:00:00 2001
From: Russell Cattelan <cattelan@redhat.com>
Date: Thu, 9 Nov 2006 11:28:08 -0500
Subject: [GFS2] Fix race in logging code

The log lock is dropped prior to io submittion, but
this exposes a hole in which the log data structures
may be going away due to a truncate.
Store the buffer head in a local pointer prior to
dropping the lock and relay on the buffer_head lock
for consitency on the buffer head.

Signed-Off-By: Russell Cattelan <cattelan@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8a654cd..4d7f94d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -509,7 +509,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 {
 	LIST_HEAD(started);
 	struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh = NULL,*bh1 = NULL;
 	unsigned int offset = sizeof(struct gfs2_log_descriptor);
 	struct gfs2_log_descriptor *ld;
 	unsigned int limit;
@@ -537,8 +537,13 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 		list_for_each_entry_safe_continue(bd1, bdt,
 						  &sdp->sd_log_le_databuf,
 						  bd_le.le_list) {
+			/* store off the buffer head in a local ptr since
+			 * gfs2_bufdata might change when we drop the log lock
+			 */
+			bh1 = bd1->bd_bh;
+
 			/* An ordered write buffer */
-			if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
+			if (bh1 && !buffer_pinned(bh1)) {
 				list_move(&bd1->bd_le.le_list, &started);
 				if (bd1 == bd2) {
 					bd2 = NULL;
@@ -547,20 +552,21 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 							bd_le.le_list);
 				}
 				total_dbuf--;
-				if (bd1->bd_bh) {
-					get_bh(bd1->bd_bh);
-					if (buffer_dirty(bd1->bd_bh)) {
+				if (bh1) {
+					if (buffer_dirty(bh1)) {
+						get_bh(bh1);
+
 						gfs2_log_unlock(sdp);
-						wait_on_buffer(bd1->bd_bh);
-						ll_rw_block(WRITE, 1,
-							    &bd1->bd_bh);
+
+						ll_rw_block(SWRITE, 1, &bh1);
+						brelse(bh1);
+
 						gfs2_log_lock(sdp);
 					}
-					brelse(bd1->bd_bh);
 					continue;
 				}
 				continue;
-			} else if (bd1->bd_bh) { /* A journaled buffer */
+			} else if (bh1) { /* A journaled buffer */
 				int magic;
 				gfs2_log_unlock(sdp);
 				if (!bh) {
@@ -582,16 +588,16 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 					ld->ld_data2 = cpu_to_be32(0);
 					memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
 				}
-				magic = gfs2_check_magic(bd1->bd_bh);
-				*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+				magic = gfs2_check_magic(bh1);
+				*ptr++ = cpu_to_be64(bh1->b_blocknr);
 				*ptr++ = cpu_to_be64((__u64)magic);
-				clear_buffer_escaped(bd1->bd_bh);
+				clear_buffer_escaped(bh1);
 				if (unlikely(magic != 0))
-					set_buffer_escaped(bd1->bd_bh);
+					set_buffer_escaped(bh1);
 				gfs2_log_lock(sdp);
 				if (n++ > num)
 					break;
-			} else if (!bd1->bd_bh) {
+			} else if (!bh1) {
 				total_dbuf--;
 				sdp->sd_log_num_databuf--;
 				list_del_init(&bd1->bd_le.le_list);
-- 
cgit v0.10.2


From 61057c6bb3a3d14cf2bea6ca20dc6d367e1d852e Mon Sep 17 00:00:00 2001
From: Russell Cattelan <cattelan@redhat.com>
Date: Thu, 9 Nov 2006 11:42:33 -0500
Subject: [GFS2] Remove unused zero_readpage from stuffed_readpage

Stuffed files only consist of a maximum of
(gfs2 block size - sizeof(struct gfs2_dinode)) bytes. Since the
gfs2 block size is always less than page size, we will never see
a call to stuffed_readpage for anything other than the first page
in the file.

Signed-off-by: Russell Cattelan <cattelan@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 3822189..2f7ef98 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -156,19 +156,6 @@ out_ignore:
 	return 0;
 }
 
-static int zero_readpage(struct page *page)
-{
-	void *kaddr;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
-	kunmap_atomic(kaddr, KM_USER0);
-
-	SetPageUptodate(page);
-
-	return 0;
-}
-
 /**
  * stuffed_readpage - Fill in a Linux page with stuffed file data
  * @ip: the inode
@@ -183,9 +170,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	void *kaddr;
 	int error;
 
-	/* Only the first page of a stuffed file might contain data */
-	if (unlikely(page->index))
-		return zero_readpage(page);
+	BUG_ON(page->index);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
@@ -737,6 +722,9 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 			if (!atomic_read(&aspace->i_writecount))
 				return 0;
 
+			if (!(gfp_mask & __GFP_WAIT))
+				return 0;
+
 			if (time_after_eq(jiffies, t)) {
 				stuck_releasepage(bh);
 				/* should we withdraw here? */
-- 
cgit v0.10.2


From fdda387f73947e6ae511ec601f5b3c6fbb582aac Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Thu, 2 Nov 2006 11:19:21 -0500
Subject: [DLM] Add support for tcp communications

The following patch adds a TCP based communications layer
to the DLM which is compile time selectable. The existing SCTP
layer gives the advantage of allowing multihoming, whereas
the TCP layer has been heavily tested in previous versions of
the DLM and is known to be robust and therefore can be used as
a baseline for performance testing.

Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 81b2c64..c5985b8 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -9,6 +9,23 @@ config DLM
 	A general purpose distributed lock manager for kernel or userspace
 	applications.
 
+choice
+	prompt "Select DLM communications protocol"
+	depends on DLM
+	default DLM_TCP
+	help
+	The DLM Can use TCP or SCTP for it's network communications.
+	SCTP supports multi-homed operations whereas TCP doesn't.
+	However, SCTP seems to have stability problems at the moment.
+
+config DLM_TCP
+	bool "TCP/IP"
+
+config DLM_SCTP
+	bool "SCTP"
+
+endchoice
+
 config DLM_DEBUG
 	bool "DLM debugging"
 	depends on DLM
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 1832e02..6538894 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -4,7 +4,6 @@ dlm-y :=			ast.o \
 				dir.o \
 				lock.o \
 				lockspace.o \
-				lowcomms.o \
 				main.o \
 				member.o \
 				memory.o \
@@ -17,3 +16,6 @@ dlm-y :=			ast.o \
 				util.o
 dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
 
+dlm-$(CONFIG_DLM_TCP)   += lowcomms-tcp.o
+
+dlm-$(CONFIG_DLM_SCTP)  += lowcomms-sctp.o
\ No newline at end of file
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
new file mode 100644
index 0000000..6da6b14
--- /dev/null
+++ b/fs/dlm/lowcomms-sctp.c
@@ -0,0 +1,1239 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is it's
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * I don't see any problem with the recv thread executing the locking
+ * code on behalf of remote processes as the locking code is
+ * short, efficient and never (well, hardly ever) waits.
+ *
+ */
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/sctp/user.h>
+#include <linux/pagemap.h>
+#include <linux/socket.h>
+#include <linux/idr.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "midcomms.h"
+
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int			dlm_local_count;
+static int			dlm_local_nodeid;
+
+/* One of these per connected node */
+
+#define NI_INIT_PENDING 1
+#define NI_WRITE_PENDING 2
+
+struct nodeinfo {
+	spinlock_t		lock;
+	sctp_assoc_t		assoc_id;
+	unsigned long		flags;
+	struct list_head	write_list; /* nodes with pending writes */
+	struct list_head	writequeue; /* outgoing writequeue_entries */
+	spinlock_t		writequeue_lock;
+	int			nodeid;
+};
+
+static DEFINE_IDR(nodeinfo_idr);
+static struct rw_semaphore	nodeinfo_lock;
+static int			max_nodeid;
+
+struct cbuf {
+	unsigned		base;
+	unsigned		len;
+	unsigned		mask;
+};
+
+/* Just the one of these, now. But this struct keeps
+   the connection-specific variables together */
+
+#define CF_READ_PENDING 1
+
+struct connection {
+	struct socket          *sock;
+	unsigned long		flags;
+	struct page            *rx_page;
+	atomic_t		waiting_requests;
+	struct cbuf		cb;
+	int                     eagain_flag;
+};
+
+/* An entry waiting to be sent */
+
+struct writequeue_entry {
+	struct list_head	list;
+	struct page            *page;
+	int			offset;
+	int			len;
+	int			end;
+	int			users;
+	struct nodeinfo        *ni;
+};
+
+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
+#define CBUF_EMPTY(cb) ((cb)->len == 0)
+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+
+#define CBUF_INIT(cb, size) \
+do { \
+	(cb)->base = (cb)->len = 0; \
+	(cb)->mask = ((size)-1); \
+} while(0)
+
+#define CBUF_EAT(cb, n) \
+do { \
+	(cb)->len  -= (n); \
+	(cb)->base += (n); \
+	(cb)->base &= (cb)->mask; \
+} while(0)
+
+
+/* List of nodes which have writes pending */
+static struct list_head write_nodes;
+static spinlock_t write_nodes_lock;
+
+/* Maximum number of incoming messages to process before
+ * doing a schedule()
+ */
+#define MAX_RX_MSG_COUNT 25
+
+/* Manage daemons */
+static struct task_struct *recv_task;
+static struct task_struct *send_task;
+static wait_queue_head_t lowcomms_recv_wait;
+static atomic_t accepting;
+
+/* The SCTP connection */
+static struct connection sctp_con;
+
+
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+	struct sockaddr_storage addr;
+	int error;
+
+	if (!dlm_local_count)
+		return -1;
+
+	error = dlm_nodeid_to_addr(nodeid, &addr);
+	if (error)
+		return error;
+
+	if (dlm_local_addr[0]->ss_family == AF_INET) {
+	        struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
+		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
+		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+	} else {
+	        struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
+		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
+		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
+		       sizeof(in6->sin6_addr));
+	}
+
+	return 0;
+}
+
+static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
+{
+	struct nodeinfo *ni;
+	int r;
+	int n;
+
+	down_read(&nodeinfo_lock);
+	ni = idr_find(&nodeinfo_idr, nodeid);
+	up_read(&nodeinfo_lock);
+
+	if (!ni && alloc) {
+		down_write(&nodeinfo_lock);
+
+		ni = idr_find(&nodeinfo_idr, nodeid);
+		if (ni)
+			goto out_up;
+
+		r = idr_pre_get(&nodeinfo_idr, alloc);
+		if (!r)
+			goto out_up;
+
+		ni = kmalloc(sizeof(struct nodeinfo), alloc);
+		if (!ni)
+			goto out_up;
+
+		r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
+		if (r) {
+			kfree(ni);
+			ni = NULL;
+			goto out_up;
+		}
+		if (n != nodeid) {
+			idr_remove(&nodeinfo_idr, n);
+			kfree(ni);
+			ni = NULL;
+			goto out_up;
+		}
+		memset(ni, 0, sizeof(struct nodeinfo));
+		spin_lock_init(&ni->lock);
+		INIT_LIST_HEAD(&ni->writequeue);
+		spin_lock_init(&ni->writequeue_lock);
+		ni->nodeid = nodeid;
+
+		if (nodeid > max_nodeid)
+			max_nodeid = nodeid;
+	out_up:
+		up_write(&nodeinfo_lock);
+	}
+
+	return ni;
+}
+
+/* Don't call this too often... */
+static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
+{
+	int i;
+	struct nodeinfo *ni;
+
+	for (i=1; i<=max_nodeid; i++) {
+		ni = nodeid2nodeinfo(i, 0);
+		if (ni && ni->assoc_id == assoc)
+			return ni;
+	}
+	return NULL;
+}
+
+/* Data or notification available on socket */
+static void lowcomms_data_ready(struct sock *sk, int count_unused)
+{
+	atomic_inc(&sctp_con.waiting_requests);
+	if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
+		return;
+
+	wake_up_interruptible(&lowcomms_recv_wait);
+}
+
+
+/* Add the port number to an IP6 or 4 sockaddr and return the address length.
+   Also padd out the struct with zeros to make comparisons meaningful */
+
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+			  int *addr_len)
+{
+	struct sockaddr_in *local4_addr;
+	struct sockaddr_in6 *local6_addr;
+
+	if (!dlm_local_count)
+		return;
+
+	if (!port) {
+		if (dlm_local_addr[0]->ss_family == AF_INET) {
+			local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
+			port = be16_to_cpu(local4_addr->sin_port);
+		} else {
+			local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
+			port = be16_to_cpu(local6_addr->sin6_port);
+		}
+	}
+
+	saddr->ss_family = dlm_local_addr[0]->ss_family;
+	if (dlm_local_addr[0]->ss_family == AF_INET) {
+		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
+		in4_addr->sin_port = cpu_to_be16(port);
+		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
+		memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
+				      sizeof(struct sockaddr_in));
+		*addr_len = sizeof(struct sockaddr_in);
+	} else {
+		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
+		in6_addr->sin6_port = cpu_to_be16(port);
+		memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
+				      sizeof(struct sockaddr_in6));
+		*addr_len = sizeof(struct sockaddr_in6);
+	}
+}
+
+/* Close the connection and tidy up */
+static void close_connection(void)
+{
+	if (sctp_con.sock) {
+		sock_release(sctp_con.sock);
+		sctp_con.sock = NULL;
+	}
+
+	if (sctp_con.rx_page) {
+		__free_page(sctp_con.rx_page);
+		sctp_con.rx_page = NULL;
+	}
+}
+
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void send_shutdown(sctp_assoc_t associd)
+{
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+
+	outmessage.msg_name = NULL;
+	outmessage.msg_namelen = 0;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+	sinfo->sinfo_flags |= MSG_EOF;
+	sinfo->sinfo_assoc_id = associd;
+
+	ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
+
+	if (ret != 0)
+		log_print("send EOF to node failed: %d", ret);
+}
+
+
+/* INIT failed but we don't know which node...
+   restart INIT on all pending nodes */
+static void init_failed(void)
+{
+	int i;
+	struct nodeinfo *ni;
+
+	for (i=1; i<=max_nodeid; i++) {
+		ni = nodeid2nodeinfo(i, 0);
+		if (!ni)
+			continue;
+
+		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+			ni->assoc_id = 0;
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+		}
+	}
+	wake_up_process(send_task);
+}
+
+/* Something happened to an association */
+static void process_sctp_notification(struct msghdr *msg, char *buf)
+{
+	union sctp_notification *sn = (union sctp_notification *)buf;
+
+	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+		switch (sn->sn_assoc_change.sac_state) {
+
+		case SCTP_COMM_UP:
+		case SCTP_RESTART:
+		{
+			/* Check that the new node is in the lockspace */
+			struct sctp_prim prim;
+			mm_segment_t fs;
+			int nodeid;
+			int prim_len, ret;
+			int addr_len;
+			struct nodeinfo *ni;
+
+			/* This seems to happen when we received a connection
+			 * too early... or something...  anyway, it happens but
+			 * we always seem to get a real message too, see
+			 * receive_from_sock */
+
+			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+				log_print("COMM_UP for invalid assoc ID %d",
+					 (int)sn->sn_assoc_change.sac_assoc_id);
+				init_failed();
+				return;
+			}
+			memset(&prim, 0, sizeof(struct sctp_prim));
+			prim_len = sizeof(struct sctp_prim);
+			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+			fs = get_fs();
+			set_fs(get_ds());
+			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
+						IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
+						(char*)&prim, &prim_len);
+			set_fs(fs);
+			if (ret < 0) {
+				struct nodeinfo *ni;
+
+				log_print("getsockopt/sctp_primary_addr on "
+					  "new assoc %d failed : %d",
+				    (int)sn->sn_assoc_change.sac_assoc_id, ret);
+
+				/* Retry INIT later */
+				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+				if (ni)
+					clear_bit(NI_INIT_PENDING, &ni->flags);
+				return;
+			}
+			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+				log_print("reject connect from unknown addr");
+				send_shutdown(prim.ssp_assoc_id);
+				return;
+			}
+
+			ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+			if (!ni)
+				return;
+
+			/* Save the assoc ID */
+			spin_lock(&ni->lock);
+			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
+			spin_unlock(&ni->lock);
+
+			log_print("got new/restarted association %d nodeid %d",
+			       (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+
+			/* Send any pending writes */
+			clear_bit(NI_INIT_PENDING, &ni->flags);
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+			wake_up_process(send_task);
+		}
+		break;
+
+		case SCTP_COMM_LOST:
+		case SCTP_SHUTDOWN_COMP:
+		{
+			struct nodeinfo *ni;
+
+			ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+			if (ni) {
+				spin_lock(&ni->lock);
+				ni->assoc_id = 0;
+				spin_unlock(&ni->lock);
+			}
+		}
+		break;
+
+		/* We don't know which INIT failed, so clear the PENDING flags
+		 * on them all.  if assoc_id is zero then it will then try
+		 * again */
+
+		case SCTP_CANT_STR_ASSOC:
+		{
+			log_print("Can't start SCTP association - retrying");
+			init_failed();
+		}
+		break;
+
+		default:
+			log_print("unexpected SCTP assoc change id=%d state=%d",
+				  (int)sn->sn_assoc_change.sac_assoc_id,
+				  sn->sn_assoc_change.sac_state);
+		}
+	}
+}
+
+/* Data received from remote end */
+static int receive_from_sock(void)
+{
+	int ret = 0;
+	struct msghdr msg;
+	struct kvec iov[2];
+	unsigned len;
+	int r;
+	struct sctp_sndrcvinfo *sinfo;
+	struct cmsghdr *cmsg;
+	struct nodeinfo *ni;
+
+	/* These two are marginally too big for stack allocation, but this
+	 * function is (currently) only called by dlm_recvd so static should be
+	 * OK.
+	 */
+	static struct sockaddr_storage msgname;
+	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+
+	if (sctp_con.sock == NULL)
+		goto out;
+
+	if (sctp_con.rx_page == NULL) {
+		/*
+		 * This doesn't need to be atomic, but I think it should
+		 * improve performance if it is.
+		 */
+		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
+		if (sctp_con.rx_page == NULL)
+			goto out_resched;
+		CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
+	}
+
+	memset(&incmsg, 0, sizeof(incmsg));
+	memset(&msgname, 0, sizeof(msgname));
+
+	memset(incmsg, 0, sizeof(incmsg));
+	msg.msg_name = &msgname;
+	msg.msg_namelen = sizeof(msgname);
+	msg.msg_flags = 0;
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+	msg.msg_iovlen = 1;
+
+	/* I don't see why this circular buffer stuff is necessary for SCTP
+	 * which is a packet-based protocol, but the whole thing breaks under
+	 * load without it! The overhead is minimal (and is in the TCP lowcomms
+	 * anyway, of course) so I'll leave it in until I can figure out what's
+	 * really happening.
+	 */
+
+	/*
+	 * iov[0] is the bit of the circular buffer between the current end
+	 * point (cb.base + cb.len) and the end of the buffer.
+	 */
+	iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
+	iov[0].iov_base = page_address(sctp_con.rx_page) +
+			  CBUF_DATA(&sctp_con.cb);
+	iov[1].iov_len = 0;
+
+	/*
+	 * iov[1] is the bit of the circular buffer between the start of the
+	 * buffer and the start of the currently used section (cb.base)
+	 */
+	if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
+		iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
+		iov[1].iov_len = sctp_con.cb.base;
+		iov[1].iov_base = page_address(sctp_con.rx_page);
+		msg.msg_iovlen = 2;
+	}
+	len = iov[0].iov_len + iov[1].iov_len;
+
+	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
+				 MSG_NOSIGNAL | MSG_DONTWAIT);
+	if (ret <= 0)
+		goto out_close;
+
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+
+	if (msg.msg_flags & MSG_NOTIFICATION) {
+		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
+		return 0;
+	}
+
+	/* Is this a new association ? */
+	ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
+	if (ni) {
+		ni->assoc_id = sinfo->sinfo_assoc_id;
+		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+			wake_up_process(send_task);
+		}
+	}
+
+	/* INIT sends a message with length of 1 - ignore it */
+	if (r == 1)
+		return 0;
+
+	CBUF_ADD(&sctp_con.cb, ret);
+	ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
+					  page_address(sctp_con.rx_page),
+					  sctp_con.cb.base, sctp_con.cb.len,
+					  PAGE_CACHE_SIZE);
+	if (ret < 0)
+		goto out_close;
+	CBUF_EAT(&sctp_con.cb, ret);
+
+      out:
+	ret = 0;
+	goto out_ret;
+
+      out_resched:
+	lowcomms_data_ready(sctp_con.sock->sk, 0);
+	ret = 0;
+	schedule();
+	goto out_ret;
+
+      out_close:
+	if (ret != -EAGAIN)
+		log_print("error reading from sctp socket: %d", ret);
+      out_ret:
+	return ret;
+}
+
+/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
+static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
+{
+	mm_segment_t fs;
+	int result = 0;
+
+	fs = get_fs();
+	set_fs(get_ds());
+	if (num == 1)
+		result = sctp_con.sock->ops->bind(sctp_con.sock,
+					(struct sockaddr *) addr, addr_len);
+	else
+		result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
+				SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
+	set_fs(fs);
+
+	if (result < 0)
+		log_print("Can't bind to port %d addr number %d",
+			  dlm_config.tcp_port, num);
+
+	return result;
+}
+
+static void init_local(void)
+{
+	struct sockaddr_storage sas, *addr;
+	int i;
+
+	dlm_local_nodeid = dlm_our_nodeid();
+
+	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+		if (dlm_our_addr(&sas, i))
+			break;
+
+		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+		if (!addr)
+			break;
+		memcpy(addr, &sas, sizeof(*addr));
+		dlm_local_addr[dlm_local_count++] = addr;
+	}
+}
+
+/* Initialise SCTP socket and bind to all interfaces */
+static int init_sock(void)
+{
+	mm_segment_t fs;
+	struct socket *sock = NULL;
+	struct sockaddr_storage localaddr;
+	struct sctp_event_subscribe subscribe;
+	int result = -EINVAL, num = 1, i, addr_len;
+
+	if (!dlm_local_count) {
+		init_local();
+		if (!dlm_local_count) {
+			log_print("no local IP address has been set");
+			goto out;
+		}
+	}
+
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+				  IPPROTO_SCTP, &sock);
+	if (result < 0) {
+		log_print("Can't create comms socket, check SCTP is loaded");
+		goto out;
+	}
+
+	/* Listen for events */
+	memset(&subscribe, 0, sizeof(subscribe));
+	subscribe.sctp_data_io_event = 1;
+	subscribe.sctp_association_event = 1;
+	subscribe.sctp_send_failure_event = 1;
+	subscribe.sctp_shutdown_event = 1;
+	subscribe.sctp_partial_delivery_event = 1;
+
+	fs = get_fs();
+	set_fs(get_ds());
+	result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+				       (char *)&subscribe, sizeof(subscribe));
+	set_fs(fs);
+
+	if (result < 0) {
+		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+			  result);
+		goto create_delsock;
+	}
+
+	/* Init con struct */
+	sock->sk->sk_user_data = &sctp_con;
+	sctp_con.sock = sock;
+	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
+
+	/* Bind to all interfaces. */
+	for (i = 0; i < dlm_local_count; i++) {
+		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+		make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
+
+		result = add_bind_addr(&localaddr, addr_len, num);
+		if (result)
+			goto create_delsock;
+		++num;
+	}
+
+	result = sock->ops->listen(sock, 5);
+	if (result < 0) {
+		log_print("Can't set socket listening");
+		goto create_delsock;
+	}
+
+	return 0;
+
+ create_delsock:
+	sock_release(sock);
+	sctp_con.sock = NULL;
+ out:
+	return result;
+}
+
+
+static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
+{
+	struct writequeue_entry *entry;
+
+	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
+	if (!entry)
+		return NULL;
+
+	entry->page = alloc_page(allocation);
+	if (!entry->page) {
+		kfree(entry);
+		return NULL;
+	}
+
+	entry->offset = 0;
+	entry->len = 0;
+	entry->end = 0;
+	entry->users = 0;
+
+	return entry;
+}
+
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+{
+	struct writequeue_entry *e;
+	int offset = 0;
+	int users = 0;
+	struct nodeinfo *ni;
+
+	if (!atomic_read(&accepting))
+		return NULL;
+
+	ni = nodeid2nodeinfo(nodeid, allocation);
+	if (!ni)
+		return NULL;
+
+	spin_lock(&ni->writequeue_lock);
+	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
+	if (((struct list_head *) e == &ni->writequeue) ||
+	    (PAGE_CACHE_SIZE - e->end < len)) {
+		e = NULL;
+	} else {
+		offset = e->end;
+		e->end += len;
+		users = e->users++;
+	}
+	spin_unlock(&ni->writequeue_lock);
+
+	if (e) {
+	      got_one:
+		if (users == 0)
+			kmap(e->page);
+		*ppc = page_address(e->page) + offset;
+		return e;
+	}
+
+	e = new_writequeue_entry(allocation);
+	if (e) {
+		spin_lock(&ni->writequeue_lock);
+		offset = e->end;
+		e->end += len;
+		e->ni = ni;
+		users = e->users++;
+		list_add_tail(&e->list, &ni->writequeue);
+		spin_unlock(&ni->writequeue_lock);
+		goto got_one;
+	}
+	return NULL;
+}
+
+void dlm_lowcomms_commit_buffer(void *arg)
+{
+	struct writequeue_entry *e = (struct writequeue_entry *) arg;
+	int users;
+	struct nodeinfo *ni = e->ni;
+
+	if (!atomic_read(&accepting))
+		return;
+
+	spin_lock(&ni->writequeue_lock);
+	users = --e->users;
+	if (users)
+		goto out;
+	e->len = e->end - e->offset;
+	kunmap(e->page);
+	spin_unlock(&ni->writequeue_lock);
+
+	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+		spin_lock_bh(&write_nodes_lock);
+		list_add_tail(&ni->write_list, &write_nodes);
+		spin_unlock_bh(&write_nodes_lock);
+		wake_up_process(send_task);
+	}
+	return;
+
+      out:
+	spin_unlock(&ni->writequeue_lock);
+	return;
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+	__free_page(e->page);
+	kfree(e);
+}
+
+/* Initiate an SCTP association. In theory we could just use sendmsg() on
+   the first IP address and it should work, but this allows us to set up the
+   association before sending any valuable data that we can't afford to lose.
+   It also keeps the send path clean as it can now always use the association ID */
+static void initiate_association(int nodeid)
+{
+	struct sockaddr_storage rem_addr;
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+	int addrlen;
+	char buf[1];
+	struct kvec iov[1];
+	struct nodeinfo *ni;
+
+	log_print("Initiating association with node %d", nodeid);
+
+	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+	if (!ni)
+		return;
+
+	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
+		log_print("no address for nodeid %d", nodeid);
+		return;
+	}
+
+	make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+
+	outmessage.msg_name = &rem_addr;
+	outmessage.msg_namelen = addrlen;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = 1;
+
+	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
+	   we can afford to lose */
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
+	if (ret < 0) {
+		log_print("send INIT to node failed: %d", ret);
+		/* Try again later */
+		clear_bit(NI_INIT_PENDING, &ni->flags);
+	}
+}
+
+/* Send a message */
+static int send_to_sock(struct nodeinfo *ni)
+{
+	int ret = 0;
+	struct writequeue_entry *e;
+	int len, offset;
+	struct msghdr outmsg;
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	struct kvec iov;
+
+        /* See if we need to init an association before we start
+	   sending precious messages */
+	spin_lock(&ni->lock);
+	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+		spin_unlock(&ni->lock);
+		initiate_association(ni->nodeid);
+		return 0;
+	}
+	spin_unlock(&ni->lock);
+
+	outmsg.msg_name = NULL; /* We use assoc_id */
+	outmsg.msg_namelen = 0;
+	outmsg.msg_control = outcmsg;
+	outmsg.msg_controllen = sizeof(outcmsg);
+	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmsg);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+	sinfo->sinfo_assoc_id = ni->assoc_id;
+	outmsg.msg_controllen = cmsg->cmsg_len;
+
+	spin_lock(&ni->writequeue_lock);
+	for (;;) {
+		if (list_empty(&ni->writequeue))
+			break;
+		e = list_entry(ni->writequeue.next, struct writequeue_entry,
+			       list);
+		len = e->len;
+		offset = e->offset;
+		BUG_ON(len == 0 && e->users == 0);
+		spin_unlock(&ni->writequeue_lock);
+		kmap(e->page);
+
+		ret = 0;
+		if (len) {
+			iov.iov_base = page_address(e->page)+offset;
+			iov.iov_len = len;
+
+			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
+					     len);
+			if (ret == -EAGAIN) {
+				sctp_con.eagain_flag = 1;
+				goto out;
+			} else if (ret < 0)
+				goto send_error;
+		} else {
+			/* Don't starve people filling buffers */
+			schedule();
+		}
+
+		spin_lock(&ni->writequeue_lock);
+		e->offset += ret;
+		e->len -= ret;
+
+		if (e->len == 0 && e->users == 0) {
+			list_del(&e->list);
+			free_entry(e);
+			continue;
+		}
+	}
+	spin_unlock(&ni->writequeue_lock);
+ out:
+	return ret;
+
+ send_error:
+	log_print("Error sending to node %d %d", ni->nodeid, ret);
+	spin_lock(&ni->lock);
+	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+		ni->assoc_id = 0;
+		spin_unlock(&ni->lock);
+		initiate_association(ni->nodeid);
+	} else
+		spin_unlock(&ni->lock);
+
+	return ret;
+}
+
+/* Try to send any messages that are pending */
+static void process_output_queue(void)
+{
+	struct list_head *list;
+	struct list_head *temp;
+
+	spin_lock_bh(&write_nodes_lock);
+	list_for_each_safe(list, temp, &write_nodes) {
+		struct nodeinfo *ni =
+		    list_entry(list, struct nodeinfo, write_list);
+		clear_bit(NI_WRITE_PENDING, &ni->flags);
+		list_del(&ni->write_list);
+
+		spin_unlock_bh(&write_nodes_lock);
+
+		send_to_sock(ni);
+		spin_lock_bh(&write_nodes_lock);
+	}
+	spin_unlock_bh(&write_nodes_lock);
+}
+
+/* Called after we've had -EAGAIN and been woken up */
+static void refill_write_queue(void)
+{
+	int i;
+
+	for (i=1; i<=max_nodeid; i++) {
+		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+
+		if (ni) {
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+		}
+	}
+}
+
+static void clean_one_writequeue(struct nodeinfo *ni)
+{
+	struct list_head *list;
+	struct list_head *temp;
+
+	spin_lock(&ni->writequeue_lock);
+	list_for_each_safe(list, temp, &ni->writequeue) {
+		struct writequeue_entry *e =
+			list_entry(list, struct writequeue_entry, list);
+		list_del(&e->list);
+		free_entry(e);
+	}
+	spin_unlock(&ni->writequeue_lock);
+}
+
+static void clean_writequeues(void)
+{
+	int i;
+
+	for (i=1; i<=max_nodeid; i++) {
+		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+		if (ni)
+			clean_one_writequeue(ni);
+	}
+}
+
+
+static void dealloc_nodeinfo(void)
+{
+	int i;
+
+	for (i=1; i<=max_nodeid; i++) {
+		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+		if (ni) {
+			idr_remove(&nodeinfo_idr, i);
+			kfree(ni);
+		}
+	}
+}
+
+int dlm_lowcomms_close(int nodeid)
+{
+	struct nodeinfo *ni;
+
+	ni = nodeid2nodeinfo(nodeid, 0);
+	if (!ni)
+		return -1;
+
+	spin_lock(&ni->lock);
+	if (ni->assoc_id) {
+		ni->assoc_id = 0;
+		/* Don't send shutdown here, sctp will just queue it
+		   till the node comes back up! */
+	}
+	spin_unlock(&ni->lock);
+
+	clean_one_writequeue(ni);
+	clear_bit(NI_INIT_PENDING, &ni->flags);
+	return 0;
+}
+
+static int write_list_empty(void)
+{
+	int status;
+
+	spin_lock_bh(&write_nodes_lock);
+	status = list_empty(&write_nodes);
+	spin_unlock_bh(&write_nodes_lock);
+
+	return status;
+}
+
+static int dlm_recvd(void *data)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	while (!kthread_should_stop()) {
+		int count = 0;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&lowcomms_recv_wait, &wait);
+		if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
+			schedule();
+		remove_wait_queue(&lowcomms_recv_wait, &wait);
+		set_current_state(TASK_RUNNING);
+
+		if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
+			int ret;
+
+			do {
+				ret = receive_from_sock();
+
+				/* Don't starve out everyone else */
+				if (++count >= MAX_RX_MSG_COUNT) {
+					schedule();
+					count = 0;
+				}
+			} while (!kthread_should_stop() && ret >=0);
+		}
+		schedule();
+	}
+
+	return 0;
+}
+
+static int dlm_sendd(void *data)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (write_list_empty())
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		if (sctp_con.eagain_flag) {
+			sctp_con.eagain_flag = 0;
+			refill_write_queue();
+		}
+		process_output_queue();
+	}
+
+	remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+	return 0;
+}
+
+static void daemons_stop(void)
+{
+	kthread_stop(recv_task);
+	kthread_stop(send_task);
+}
+
+static int daemons_start(void)
+{
+	struct task_struct *p;
+	int error;
+
+	p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+	error = IS_ERR(p);
+       	if (error) {
+		log_print("can't start dlm_recvd %d", error);
+		return error;
+	}
+	recv_task = p;
+
+	p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
+	error = IS_ERR(p);
+       	if (error) {
+		log_print("can't start dlm_sendd %d", error);
+		kthread_stop(recv_task);
+		return error;
+	}
+	send_task = p;
+
+	return 0;
+}
+
+/*
+ * This is quite likely to sleep...
+ */
+int dlm_lowcomms_start(void)
+{
+	int error;
+
+	error = init_sock();
+	if (error)
+		goto fail_sock;
+	error = daemons_start();
+	if (error)
+		goto fail_sock;
+	atomic_set(&accepting, 1);
+	return 0;
+
+ fail_sock:
+	close_connection();
+	return error;
+}
+
+/* Set all the activity flags to prevent any socket activity. */
+
+void dlm_lowcomms_stop(void)
+{
+	atomic_set(&accepting, 0);
+	sctp_con.flags = 0x7;
+	daemons_stop();
+	clean_writequeues();
+	close_connection();
+	dealloc_nodeinfo();
+	max_nodeid = 0;
+}
+
+int dlm_lowcomms_init(void)
+{
+	init_waitqueue_head(&lowcomms_recv_wait);
+	spin_lock_init(&write_nodes_lock);
+	INIT_LIST_HEAD(&write_nodes);
+	init_rwsem(&nodeinfo_lock);
+	return 0;
+}
+
+void dlm_lowcomms_exit(void)
+{
+	int i;
+
+	for (i = 0; i < dlm_local_count; i++)
+		kfree(dlm_local_addr[i]);
+	dlm_local_count = 0;
+	dlm_local_nodeid = 0;
+}
+
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c
new file mode 100644
index 0000000..7289e59
--- /dev/null
+++ b/fs/dlm/lowcomms-tcp.c
@@ -0,0 +1,1263 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is it's
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * I don't see any problem with the recv thread executing the locking
+ * code on behalf of remote processes as the locking code is
+ * short, efficient and never waits.
+ *
+ */
+
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/pagemap.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "config.h"
+
+struct cbuf {
+	unsigned base;
+	unsigned len;
+	unsigned mask;
+};
+
+#ifndef FALSE
+#define FALSE 0
+#define TRUE 1
+#endif
+#define NODE_INCREMENT 32
+
+#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
+#define CBUF_EMPTY(cb) ((cb)->len == 0)
+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
+#define CBUF_EAT(cb, n) do { (cb)->len  -= (n); \
+                             (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+
+/* Maximum number of incoming messages to process before
+   doing a schedule()
+*/
+#define MAX_RX_MSG_COUNT 25
+
+struct connection {
+	struct socket *sock;	/* NULL if not connected */
+	uint32_t nodeid;	/* So we know who we are in the list */
+	struct rw_semaphore sock_sem;	/* Stop connect races */
+	struct list_head read_list;	/* On this list when ready for reading */
+	struct list_head write_list;	/* On this list when ready for writing */
+	struct list_head state_list;	/* On this list when ready to connect */
+	unsigned long flags;	/* bit 1,2 = We are on the read/write lists */
+#define CF_READ_PENDING 1
+#define CF_WRITE_PENDING 2
+#define CF_CONNECT_PENDING 3
+#define CF_IS_OTHERCON 4
+	struct list_head writequeue;	/* List of outgoing writequeue_entries */
+	struct list_head listenlist;    /* List of allocated listening sockets */
+	spinlock_t writequeue_lock;
+	int (*rx_action) (struct connection *);	/* What to do when active */
+	struct page *rx_page;
+	struct cbuf cb;
+	int retries;
+	atomic_t waiting_requests;
+#define MAX_CONNECT_RETRIES 3
+	struct connection *othercon;
+};
+#define sock2con(x) ((struct connection *)(x)->sk_user_data)
+
+/* An entry waiting to be sent */
+struct writequeue_entry {
+	struct list_head list;
+	struct page *page;
+	int offset;
+	int len;
+	int end;
+	int users;
+	struct connection *con;
+};
+
+static struct sockaddr_storage dlm_local_addr;
+
+/* Manage daemons */
+static struct task_struct *recv_task;
+static struct task_struct *send_task;
+
+static wait_queue_t lowcomms_send_waitq_head;
+static wait_queue_head_t lowcomms_send_waitq;
+static wait_queue_t lowcomms_recv_waitq_head;
+static wait_queue_head_t lowcomms_recv_waitq;
+
+/* An array of pointers to connections, indexed by NODEID */
+static struct connection **connections;
+static struct semaphore connections_lock;
+static kmem_cache_t *con_cache;
+static int conn_array_size;
+static atomic_t accepting;
+
+/* List of sockets that have reads pending */
+static struct list_head read_sockets;
+static spinlock_t read_sockets_lock;
+
+/* List of sockets which have writes pending */
+static struct list_head write_sockets;
+static spinlock_t write_sockets_lock;
+
+/* List of sockets which have connects pending */
+static struct list_head state_sockets;
+static spinlock_t state_sockets_lock;
+
+static struct connection *nodeid2con(int nodeid, gfp_t allocation)
+{
+	struct connection *con = NULL;
+
+	down(&connections_lock);
+	if (nodeid >= conn_array_size) {
+		int new_size = nodeid + NODE_INCREMENT;
+		struct connection **new_conns;
+
+		new_conns = kmalloc(sizeof(struct connection *) *
+				    new_size, allocation);
+		if (!new_conns)
+			goto finish;
+
+		memset(new_conns, 0, sizeof(struct connection *) * new_size);
+		memcpy(new_conns, connections,  sizeof(struct connection *) * conn_array_size);
+		conn_array_size = new_size;
+		kfree(connections);
+		connections = new_conns;
+
+	}
+
+	con = connections[nodeid];
+	if (con == NULL && allocation) {
+		con = kmem_cache_alloc(con_cache, allocation);
+		if (!con)
+			goto finish;
+
+		memset(con, 0, sizeof(*con));
+		con->nodeid = nodeid;
+		init_rwsem(&con->sock_sem);
+		INIT_LIST_HEAD(&con->writequeue);
+		spin_lock_init(&con->writequeue_lock);
+
+		connections[nodeid] = con;
+	}
+
+ finish:
+	up(&connections_lock);
+	return con;
+}
+
+/* Data available on socket or listen socket received a connect */
+static void lowcomms_data_ready(struct sock *sk, int count_unused)
+{
+	struct connection *con = sock2con(sk);
+
+	atomic_inc(&con->waiting_requests);
+	if (test_and_set_bit(CF_READ_PENDING, &con->flags))
+		return;
+
+	spin_lock_bh(&read_sockets_lock);
+	list_add_tail(&con->read_list, &read_sockets);
+	spin_unlock_bh(&read_sockets_lock);
+
+	wake_up_interruptible(&lowcomms_recv_waitq);
+}
+
+static void lowcomms_write_space(struct sock *sk)
+{
+	struct connection *con = sock2con(sk);
+
+	if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+		return;
+
+	spin_lock_bh(&write_sockets_lock);
+	list_add_tail(&con->write_list, &write_sockets);
+	spin_unlock_bh(&write_sockets_lock);
+
+	wake_up_interruptible(&lowcomms_send_waitq);
+}
+
+static inline void lowcomms_connect_sock(struct connection *con)
+{
+	if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
+		return;
+	if (!atomic_read(&accepting))
+		return;
+
+	spin_lock_bh(&state_sockets_lock);
+	list_add_tail(&con->state_list, &state_sockets);
+	spin_unlock_bh(&state_sockets_lock);
+
+	wake_up_interruptible(&lowcomms_send_waitq);
+}
+
+static void lowcomms_state_change(struct sock *sk)
+{
+/*	struct connection *con = sock2con(sk); */
+
+	switch (sk->sk_state) {
+	case TCP_ESTABLISHED:
+		lowcomms_write_space(sk);
+		break;
+
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+	case TCP_TIME_WAIT:
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+	case TCP_LAST_ACK:
+	case TCP_CLOSING:
+		/* FIXME: I think this causes more trouble than it solves.
+		   lowcomms wil reconnect anyway when there is something to
+		   send. This just attempts reconnection if a node goes down!
+		*/
+		/* lowcomms_connect_sock(con); */
+		break;
+
+	default:
+		printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
+		break;
+	}
+}
+
+/* Make a socket active */
+static int add_sock(struct socket *sock, struct connection *con)
+{
+	con->sock = sock;
+
+	/* Install a data_ready callback */
+	con->sock->sk->sk_data_ready = lowcomms_data_ready;
+	con->sock->sk->sk_write_space = lowcomms_write_space;
+	con->sock->sk->sk_state_change = lowcomms_state_change;
+
+	return 0;
+}
+
+/* Add the port number to an IP6 or 4 sockaddr and return the address
+   length */
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+			  int *addr_len)
+{
+        saddr->ss_family =  dlm_local_addr.ss_family;
+        if (saddr->ss_family == AF_INET) {
+		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
+		in4_addr->sin_port = cpu_to_be16(port);
+		*addr_len = sizeof(struct sockaddr_in);
+	}
+	else {
+		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
+		in6_addr->sin6_port = cpu_to_be16(port);
+		*addr_len = sizeof(struct sockaddr_in6);
+	}
+}
+
+/* Close a remote connection and tidy up */
+static void close_connection(struct connection *con, int and_other)
+{
+	down_write(&con->sock_sem);
+
+	if (con->sock) {
+		sock_release(con->sock);
+		con->sock = NULL;
+	}
+	if (con->othercon && and_other) {
+		/* Argh! recursion in kernel code!
+		   Actually, this isn't a list so it
+		   will only re-enter once.
+		*/
+		close_connection(con->othercon, FALSE);
+	}
+	if (con->rx_page) {
+		__free_page(con->rx_page);
+		con->rx_page = NULL;
+	}
+	con->retries = 0;
+	up_write(&con->sock_sem);
+}
+
+/* Data received from remote end */
+static int receive_from_sock(struct connection *con)
+{
+	int ret = 0;
+	struct msghdr msg;
+	struct iovec iov[2];
+	mm_segment_t fs;
+	unsigned len;
+	int r;
+	int call_again_soon = 0;
+
+	down_read(&con->sock_sem);
+
+	if (con->sock == NULL)
+		goto out;
+	if (con->rx_page == NULL) {
+		/*
+		 * This doesn't need to be atomic, but I think it should
+		 * improve performance if it is.
+		 */
+		con->rx_page = alloc_page(GFP_ATOMIC);
+		if (con->rx_page == NULL)
+			goto out_resched;
+		CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
+	}
+
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_iov = iov;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_flags = 0;
+
+	/*
+	 * iov[0] is the bit of the circular buffer between the current end
+	 * point (cb.base + cb.len) and the end of the buffer.
+	 */
+	iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
+	iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
+	iov[1].iov_len = 0;
+
+	/*
+	 * iov[1] is the bit of the circular buffer between the start of the
+	 * buffer and the start of the currently used section (cb.base)
+	 */
+	if (CBUF_DATA(&con->cb) >= con->cb.base) {
+		iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
+		iov[1].iov_len = con->cb.base;
+		iov[1].iov_base = page_address(con->rx_page);
+		msg.msg_iovlen = 2;
+	}
+	len = iov[0].iov_len + iov[1].iov_len;
+
+	fs = get_fs();
+	set_fs(get_ds());
+	r = ret = sock_recvmsg(con->sock, &msg, len,
+			       MSG_DONTWAIT | MSG_NOSIGNAL);
+	set_fs(fs);
+
+	if (ret <= 0)
+		goto out_close;
+	if (ret == len)
+		call_again_soon = 1;
+	CBUF_ADD(&con->cb, ret);
+	ret = dlm_process_incoming_buffer(con->nodeid,
+					  page_address(con->rx_page),
+					  con->cb.base, con->cb.len,
+					  PAGE_CACHE_SIZE);
+	if (ret == -EBADMSG) {
+		printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
+		       "iov_len=%u, iov_base[0]=%p, read=%d\n",
+		       page_address(con->rx_page), con->cb.base, con->cb.len,
+		       len, iov[0].iov_base, r);
+	}
+	if (ret < 0)
+		goto out_close;
+	CBUF_EAT(&con->cb, ret);
+
+	if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
+		__free_page(con->rx_page);
+		con->rx_page = NULL;
+	}
+
+      out:
+	if (call_again_soon)
+		goto out_resched;
+	up_read(&con->sock_sem);
+	ret = 0;
+	goto out_ret;
+
+      out_resched:
+	lowcomms_data_ready(con->sock->sk, 0);
+	up_read(&con->sock_sem);
+	ret = 0;
+	schedule();
+	goto out_ret;
+
+      out_close:
+	up_read(&con->sock_sem);
+	if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
+		close_connection(con, FALSE);
+		/* Reconnect when there is something to send */
+	}
+
+      out_ret:
+	return ret;
+}
+
+/* Listening socket is busy, accept a connection */
+static int accept_from_sock(struct connection *con)
+{
+	int result;
+	struct sockaddr_storage peeraddr;
+	struct socket *newsock;
+	int len;
+	int nodeid;
+	struct connection *newcon;
+
+	memset(&peeraddr, 0, sizeof(peeraddr));
+	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock);
+	if (result < 0)
+		return -ENOMEM;
+
+	down_read(&con->sock_sem);
+
+	result = -ENOTCONN;
+	if (con->sock == NULL)
+		goto accept_err;
+
+	newsock->type = con->sock->type;
+	newsock->ops = con->sock->ops;
+
+	result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
+	if (result < 0)
+		goto accept_err;
+
+	/* Get the connected socket's peer */
+	memset(&peeraddr, 0, sizeof(peeraddr));
+	if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
+				  &len, 2)) {
+		result = -ECONNABORTED;
+		goto accept_err;
+	}
+
+	/* Get the new node's NODEID */
+	make_sockaddr(&peeraddr, 0, &len);
+	if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
+	    	printk("dlm: connect from non cluster node\n");
+		sock_release(newsock);
+		up_read(&con->sock_sem);
+		return -1;
+	}
+
+	log_print("got connection from %d", nodeid);
+
+	/*  Check to see if we already have a connection to this node. This
+	 *  could happen if the two nodes initiate a connection at roughly
+	 *  the same time and the connections cross on the wire.
+	 * TEMPORARY FIX:
+	 *  In this case we store the incoming one in "othercon"
+	 */
+	newcon = nodeid2con(nodeid, GFP_KERNEL);
+	if (!newcon) {
+		result = -ENOMEM;
+		goto accept_err;
+	}
+	down_write(&newcon->sock_sem);
+	if (newcon->sock) {
+	        struct connection *othercon = newcon->othercon;
+
+		if (!othercon) {
+			othercon = kmem_cache_alloc(con_cache, GFP_KERNEL);
+			if (!othercon) {
+				printk("dlm: failed to allocate incoming socket\n");
+				up_write(&newcon->sock_sem);
+				result = -ENOMEM;
+				goto accept_err;
+			}
+			memset(othercon, 0, sizeof(*othercon));
+			othercon->nodeid = nodeid;
+			othercon->rx_action = receive_from_sock;
+			init_rwsem(&othercon->sock_sem);
+			set_bit(CF_IS_OTHERCON, &othercon->flags);
+			newcon->othercon = othercon;
+		}
+		othercon->sock = newsock;
+		newsock->sk->sk_user_data = othercon;
+		add_sock(newsock, othercon);
+	}
+	else {
+		newsock->sk->sk_user_data = newcon;
+		newcon->rx_action = receive_from_sock;
+		add_sock(newsock, newcon);
+
+	}
+
+	up_write(&newcon->sock_sem);
+
+	/*
+	 * Add it to the active queue in case we got data
+	 * beween processing the accept adding the socket
+	 * to the read_sockets list
+	 */
+	lowcomms_data_ready(newsock->sk, 0);
+	up_read(&con->sock_sem);
+
+	return 0;
+
+      accept_err:
+	up_read(&con->sock_sem);
+	sock_release(newsock);
+
+	if (result != -EAGAIN)
+		printk("dlm: error accepting connection from node: %d\n", result);
+	return result;
+}
+
+/* Connect a new socket to its peer */
+static int connect_to_sock(struct connection *con)
+{
+	int result = -EHOSTUNREACH;
+	struct sockaddr_storage saddr;
+	int addr_len;
+	struct socket *sock;
+
+	if (con->nodeid == 0) {
+		log_print("attempt to connect sock 0 foiled");
+		return 0;
+	}
+
+	down_write(&con->sock_sem);
+	if (con->retries++ > MAX_CONNECT_RETRIES)
+		goto out;
+
+	/* Some odd races can cause double-connects, ignore them */
+	if (con->sock) {
+		result = 0;
+		goto out;
+	}
+
+	/* Create a socket to communicate with */
+	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (result < 0)
+		goto out_err;
+
+	memset(&saddr, 0, sizeof(saddr));
+	if (dlm_nodeid_to_addr(con->nodeid, &saddr))
+	        goto out_err;
+
+	sock->sk->sk_user_data = con;
+	con->rx_action = receive_from_sock;
+
+	make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
+
+	add_sock(sock, con);
+
+	log_print("connecting to %d", con->nodeid);
+	result =
+		sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
+			       O_NONBLOCK);
+	if (result == -EINPROGRESS)
+		result = 0;
+	if (result != 0)
+		goto out_err;
+
+      out:
+	up_write(&con->sock_sem);
+	/*
+	 * Returning an error here means we've given up trying to connect to
+	 * a remote node, otherwise we return 0 and reschedule the connetion
+	 * attempt
+	 */
+	return result;
+
+      out_err:
+	if (con->sock) {
+		sock_release(con->sock);
+		con->sock = NULL;
+	}
+	/*
+	 * Some errors are fatal and this list might need adjusting. For other
+	 * errors we try again until the max number of retries is reached.
+	 */
+	if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
+	    result != -ENETDOWN && result != EINVAL
+	    && result != -EPROTONOSUPPORT) {
+		lowcomms_connect_sock(con);
+		result = 0;
+	}
+	goto out;
+}
+
+static struct socket *create_listen_sock(struct connection *con, struct sockaddr_storage *saddr)
+{
+        struct socket *sock = NULL;
+	mm_segment_t fs;
+	int result = 0;
+	int one = 1;
+	int addr_len;
+
+	if (dlm_local_addr.ss_family == AF_INET)
+		addr_len = sizeof(struct sockaddr_in);
+	else
+		addr_len = sizeof(struct sockaddr_in6);
+
+	/* Create a socket to communicate with */
+	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (result < 0) {
+		printk("dlm: Can't create listening comms socket\n");
+		goto create_out;
+	}
+
+	fs = get_fs();
+	set_fs(get_ds());
+	result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
+	set_fs(fs);
+	if (result < 0) {
+		printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
+	}
+	sock->sk->sk_user_data = con;
+	con->rx_action = accept_from_sock;
+	con->sock = sock;
+
+	/* Bind to our port */
+	make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
+	result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
+	if (result < 0) {
+		printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
+		sock_release(sock);
+		sock = NULL;
+		con->sock = NULL;
+		goto create_out;
+	}
+
+	fs = get_fs();
+	set_fs(get_ds());
+
+	result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
+	set_fs(fs);
+	if (result < 0) {
+		printk("dlm: Set keepalive failed: %d\n", result);
+	}
+
+	result = sock->ops->listen(sock, 5);
+	if (result < 0) {
+		printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
+		sock_release(sock);
+		sock = NULL;
+		goto create_out;
+	}
+
+      create_out:
+	return sock;
+}
+
+
+/* Listen on all interfaces */
+static int listen_for_all(void)
+{
+	struct socket *sock = NULL;
+	struct connection *con = nodeid2con(0, GFP_KERNEL);
+	int result = -EINVAL;
+
+	/* We don't support multi-homed hosts */
+	memset(con, 0, sizeof(*con));
+	init_rwsem(&con->sock_sem);
+	spin_lock_init(&con->writequeue_lock);
+	INIT_LIST_HEAD(&con->writequeue);
+	set_bit(CF_IS_OTHERCON, &con->flags);
+
+	sock = create_listen_sock(con, &dlm_local_addr);
+	if (sock) {
+		add_sock(sock, con);
+		result = 0;
+	}
+	else {
+		result = -EADDRINUSE;
+	}
+
+	return result;
+}
+
+
+
+static struct writequeue_entry *new_writequeue_entry(struct connection *con,
+						     gfp_t allocation)
+{
+	struct writequeue_entry *entry;
+
+	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
+	if (!entry)
+		return NULL;
+
+	entry->page = alloc_page(allocation);
+	if (!entry->page) {
+		kfree(entry);
+		return NULL;
+	}
+
+	entry->offset = 0;
+	entry->len = 0;
+	entry->end = 0;
+	entry->users = 0;
+	entry->con = con;
+
+	return entry;
+}
+
+void *dlm_lowcomms_get_buffer(int nodeid, int len,
+			      gfp_t allocation, char **ppc)
+{
+	struct connection *con;
+	struct writequeue_entry *e;
+	int offset = 0;
+	int users = 0;
+
+	if (!atomic_read(&accepting))
+		return NULL;
+
+	con = nodeid2con(nodeid, allocation);
+	if (!con)
+		return NULL;
+
+	spin_lock(&con->writequeue_lock);
+	e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
+	if (((struct list_head *) e == &con->writequeue) ||
+	    (PAGE_CACHE_SIZE - e->end < len)) {
+		e = NULL;
+	} else {
+		offset = e->end;
+		e->end += len;
+		users = e->users++;
+	}
+	spin_unlock(&con->writequeue_lock);
+
+	if (e) {
+	      got_one:
+		if (users == 0)
+			kmap(e->page);
+		*ppc = page_address(e->page) + offset;
+		return e;
+	}
+
+	e = new_writequeue_entry(con, allocation);
+	if (e) {
+		spin_lock(&con->writequeue_lock);
+		offset = e->end;
+		e->end += len;
+		users = e->users++;
+		list_add_tail(&e->list, &con->writequeue);
+		spin_unlock(&con->writequeue_lock);
+		goto got_one;
+	}
+	return NULL;
+}
+
+void dlm_lowcomms_commit_buffer(void *mh)
+{
+	struct writequeue_entry *e = (struct writequeue_entry *)mh;
+	struct connection *con = e->con;
+	int users;
+
+	if (!atomic_read(&accepting))
+		return;
+
+	spin_lock(&con->writequeue_lock);
+	users = --e->users;
+	if (users)
+		goto out;
+	e->len = e->end - e->offset;
+	kunmap(e->page);
+	spin_unlock(&con->writequeue_lock);
+
+	if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
+		spin_lock_bh(&write_sockets_lock);
+		list_add_tail(&con->write_list, &write_sockets);
+		spin_unlock_bh(&write_sockets_lock);
+
+		wake_up_interruptible(&lowcomms_send_waitq);
+	}
+	return;
+
+      out:
+	spin_unlock(&con->writequeue_lock);
+	return;
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+	__free_page(e->page);
+	kfree(e);
+}
+
+/* Send a message */
+static int send_to_sock(struct connection *con)
+{
+	int ret = 0;
+	ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
+	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+	struct writequeue_entry *e;
+	int len, offset;
+
+	down_read(&con->sock_sem);
+	if (con->sock == NULL)
+		goto out_connect;
+
+	sendpage = con->sock->ops->sendpage;
+
+	spin_lock(&con->writequeue_lock);
+	for (;;) {
+		e = list_entry(con->writequeue.next, struct writequeue_entry,
+			       list);
+		if ((struct list_head *) e == &con->writequeue)
+			break;
+
+		len = e->len;
+		offset = e->offset;
+		BUG_ON(len == 0 && e->users == 0);
+		spin_unlock(&con->writequeue_lock);
+
+		ret = 0;
+		if (len) {
+			ret = sendpage(con->sock, e->page, offset, len,
+				       msg_flags);
+			if (ret == -EAGAIN || ret == 0)
+				goto out;
+			if (ret <= 0)
+				goto send_error;
+		}
+		else {
+			/* Don't starve people filling buffers */
+			schedule();
+		}
+
+		spin_lock(&con->writequeue_lock);
+		e->offset += ret;
+		e->len -= ret;
+
+		if (e->len == 0 && e->users == 0) {
+			list_del(&e->list);
+			free_entry(e);
+			continue;
+		}
+	}
+	spin_unlock(&con->writequeue_lock);
+      out:
+	up_read(&con->sock_sem);
+	return ret;
+
+      send_error:
+	up_read(&con->sock_sem);
+	close_connection(con, FALSE);
+	lowcomms_connect_sock(con);
+	return ret;
+
+      out_connect:
+	up_read(&con->sock_sem);
+	lowcomms_connect_sock(con);
+	return 0;
+}
+
+static void clean_one_writequeue(struct connection *con)
+{
+	struct list_head *list;
+	struct list_head *temp;
+
+	spin_lock(&con->writequeue_lock);
+	list_for_each_safe(list, temp, &con->writequeue) {
+		struct writequeue_entry *e =
+			list_entry(list, struct writequeue_entry, list);
+		list_del(&e->list);
+		free_entry(e);
+	}
+	spin_unlock(&con->writequeue_lock);
+}
+
+/* Called from recovery when it knows that a node has
+   left the cluster */
+int dlm_lowcomms_close(int nodeid)
+{
+	struct connection *con;
+
+	if (!connections)
+		goto out;
+
+	log_print("closing connection to node %d", nodeid);
+	con = nodeid2con(nodeid, 0);
+	if (con) {
+		clean_one_writequeue(con);
+		close_connection(con, TRUE);
+		atomic_set(&con->waiting_requests, 0);
+	}
+	return 0;
+
+      out:
+	return -1;
+}
+
+/* API send message call, may queue the request */
+/* N.B. This is the old interface - use the new one for new calls */
+int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation)
+{
+	struct writequeue_entry *e;
+	char *b;
+
+	e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b);
+	if (e) {
+		memcpy(b, buf, len);
+		dlm_lowcomms_commit_buffer(e);
+		return 0;
+	}
+	return -ENOBUFS;
+}
+
+/* Look for activity on active sockets */
+static void process_sockets(void)
+{
+	struct list_head *list;
+	struct list_head *temp;
+	int count = 0;
+
+	spin_lock_bh(&read_sockets_lock);
+	list_for_each_safe(list, temp, &read_sockets) {
+
+		struct connection *con =
+		    list_entry(list, struct connection, read_list);
+		list_del(&con->read_list);
+		clear_bit(CF_READ_PENDING, &con->flags);
+
+		spin_unlock_bh(&read_sockets_lock);
+
+		/* This can reach zero if we are processing requests
+		 * as they come in.
+		 */
+		if (atomic_read(&con->waiting_requests) == 0) {
+			spin_lock_bh(&read_sockets_lock);
+			continue;
+		}
+
+		do {
+			con->rx_action(con);
+
+			/* Don't starve out everyone else */
+			if (++count >= MAX_RX_MSG_COUNT) {
+				schedule();
+				count = 0;
+			}
+
+		} while (!atomic_dec_and_test(&con->waiting_requests) &&
+			 !kthread_should_stop());
+
+		spin_lock_bh(&read_sockets_lock);
+	}
+	spin_unlock_bh(&read_sockets_lock);
+}
+
+/* Try to send any messages that are pending
+ */
+static void process_output_queue(void)
+{
+	struct list_head *list;
+	struct list_head *temp;
+	int ret;
+
+	spin_lock_bh(&write_sockets_lock);
+	list_for_each_safe(list, temp, &write_sockets) {
+		struct connection *con =
+		    list_entry(list, struct connection, write_list);
+		clear_bit(CF_WRITE_PENDING, &con->flags);
+		list_del(&con->write_list);
+
+		spin_unlock_bh(&write_sockets_lock);
+
+		ret = send_to_sock(con);
+		if (ret < 0) {
+		}
+		spin_lock_bh(&write_sockets_lock);
+	}
+	spin_unlock_bh(&write_sockets_lock);
+}
+
+static void process_state_queue(void)
+{
+	struct list_head *list;
+	struct list_head *temp;
+	int ret;
+
+	spin_lock_bh(&state_sockets_lock);
+	list_for_each_safe(list, temp, &state_sockets) {
+		struct connection *con =
+		    list_entry(list, struct connection, state_list);
+		list_del(&con->state_list);
+		clear_bit(CF_CONNECT_PENDING, &con->flags);
+		spin_unlock_bh(&state_sockets_lock);
+
+		ret = connect_to_sock(con);
+		if (ret < 0) {
+		}
+		spin_lock_bh(&state_sockets_lock);
+	}
+	spin_unlock_bh(&state_sockets_lock);
+}
+
+
+/* Discard all entries on the write queues */
+static void clean_writequeues(void)
+{
+	int nodeid;
+
+	for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
+		struct connection *con = nodeid2con(nodeid, 0);
+
+		if (con)
+			clean_one_writequeue(con);
+	}
+}
+
+static int read_list_empty(void)
+{
+	int status;
+
+	spin_lock_bh(&read_sockets_lock);
+	status = list_empty(&read_sockets);
+	spin_unlock_bh(&read_sockets_lock);
+
+	return status;
+}
+
+/* DLM Transport comms receive daemon */
+static int dlm_recvd(void *data)
+{
+	init_waitqueue_head(&lowcomms_recv_waitq);
+	init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
+	add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (read_list_empty())
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		process_sockets();
+	}
+
+	return 0;
+}
+
+static int write_and_state_lists_empty(void)
+{
+	int status;
+
+	spin_lock_bh(&write_sockets_lock);
+	status = list_empty(&write_sockets);
+	spin_unlock_bh(&write_sockets_lock);
+
+	spin_lock_bh(&state_sockets_lock);
+	if (list_empty(&state_sockets) == 0)
+		status = 0;
+	spin_unlock_bh(&state_sockets_lock);
+
+	return status;
+}
+
+/* DLM Transport send daemon */
+static int dlm_sendd(void *data)
+{
+	init_waitqueue_head(&lowcomms_send_waitq);
+	init_waitqueue_entry(&lowcomms_send_waitq_head, current);
+	add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (write_and_state_lists_empty())
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		process_state_queue();
+		process_output_queue();
+	}
+
+	return 0;
+}
+
+static void daemons_stop(void)
+{
+	kthread_stop(recv_task);
+	kthread_stop(send_task);
+}
+
+static int daemons_start(void)
+{
+	struct task_struct *p;
+	int error;
+
+	p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+	error = IS_ERR(p);
+       	if (error) {
+		log_print("can't start dlm_recvd %d", error);
+		return error;
+	}
+	recv_task = p;
+
+	p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
+	error = IS_ERR(p);
+       	if (error) {
+		log_print("can't start dlm_sendd %d", error);
+		kthread_stop(recv_task);
+		return error;
+	}
+	send_task = p;
+
+	return 0;
+}
+
+/*
+ * Return the largest buffer size we can cope with.
+ */
+int lowcomms_max_buffer_size(void)
+{
+	return PAGE_CACHE_SIZE;
+}
+
+void dlm_lowcomms_stop(void)
+{
+	int i;
+
+	atomic_set(&accepting, 0);
+
+	/* Set all the activity flags to prevent any
+	   socket activity.
+	*/
+	for (i = 0; i < conn_array_size; i++) {
+		if (connections[i])
+			connections[i]->flags |= 0x7;
+	}
+	daemons_stop();
+	clean_writequeues();
+
+	for (i = 0; i < conn_array_size; i++) {
+		if (connections[i]) {
+			close_connection(connections[i], TRUE);
+			if (connections[i]->othercon)
+				kmem_cache_free(con_cache, connections[i]->othercon);
+			kmem_cache_free(con_cache, connections[i]);
+		}
+	}
+
+	kfree(connections);
+	connections = NULL;
+
+	kmem_cache_destroy(con_cache);
+}
+
+/* This is quite likely to sleep... */
+int dlm_lowcomms_start(void)
+{
+	int error = 0;
+
+	error = -ENOTCONN;
+
+	/*
+	 * Temporarily initialise the waitq head so that lowcomms_send_message
+	 * doesn't crash if it gets called before the thread is fully
+	 * initialised
+	 */
+	init_waitqueue_head(&lowcomms_send_waitq);
+
+	error = -ENOMEM;
+	connections = kmalloc(sizeof(struct connection *) *
+			      NODE_INCREMENT, GFP_KERNEL);
+	if (!connections)
+		goto out;
+
+	memset(connections, 0,
+	       sizeof(struct connection *) * NODE_INCREMENT);
+
+	conn_array_size = NODE_INCREMENT;
+
+	if (dlm_our_addr(&dlm_local_addr, 0)) {
+		log_print("no local IP address has been set");
+		goto fail_free_conn;
+	}
+	if (!dlm_our_addr(&dlm_local_addr, 1)) {
+		log_print("This dlm comms module does not support multi-homed clustering");
+		goto fail_free_conn;
+	}
+
+	con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
+				      __alignof__(struct connection), 0, NULL, NULL);
+	if (!con_cache)
+		goto fail_free_conn;
+
+
+	/* Start listening */
+	error = listen_for_all();
+	if (error)
+		goto fail_unlisten;
+
+	error = daemons_start();
+	if (error)
+		goto fail_unlisten;
+
+	atomic_set(&accepting, 1);
+
+	return 0;
+
+      fail_unlisten:
+	close_connection(connections[0], 0);
+	kmem_cache_free(con_cache, connections[0]);
+	kmem_cache_destroy(con_cache);
+
+      fail_free_conn:
+	kfree(connections);
+
+      out:
+	return error;
+}
+
+int dlm_lowcomms_init(void)
+{
+	INIT_LIST_HEAD(&read_sockets);
+	INIT_LIST_HEAD(&write_sockets);
+	INIT_LIST_HEAD(&state_sockets);
+
+	spin_lock_init(&read_sockets_lock);
+	spin_lock_init(&write_sockets_lock);
+	spin_lock_init(&state_sockets_lock);
+	init_MUTEX(&connections_lock);
+
+	return 0;
+}
+
+void dlm_lowcomms_exit(void)
+{
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
deleted file mode 100644
index 6da6b14..0000000
--- a/fs/dlm/lowcomms.c
+++ /dev/null
@@ -1,1239 +0,0 @@
-/******************************************************************************
-*******************************************************************************
-**
-**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
-**
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
-**
-*******************************************************************************
-******************************************************************************/
-
-/*
- * lowcomms.c
- *
- * This is the "low-level" comms layer.
- *
- * It is responsible for sending/receiving messages
- * from other nodes in the cluster.
- *
- * Cluster nodes are referred to by their nodeids. nodeids are
- * simply 32 bit numbers to the locking module - if they need to
- * be expanded for the cluster infrastructure then that is it's
- * responsibility. It is this layer's
- * responsibility to resolve these into IP address or
- * whatever it needs for inter-node communication.
- *
- * The comms level is two kernel threads that deal mainly with
- * the receiving of messages from other nodes and passing them
- * up to the mid-level comms layer (which understands the
- * message format) for execution by the locking core, and
- * a send thread which does all the setting up of connections
- * to remote nodes and the sending of data. Threads are not allowed
- * to send their own data because it may cause them to wait in times
- * of high load. Also, this way, the sending thread can collect together
- * messages bound for one node and send them in one block.
- *
- * I don't see any problem with the recv thread executing the locking
- * code on behalf of remote processes as the locking code is
- * short, efficient and never (well, hardly ever) waits.
- *
- */
-
-#include <asm/ioctls.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-#include <net/sctp/user.h>
-#include <linux/pagemap.h>
-#include <linux/socket.h>
-#include <linux/idr.h>
-
-#include "dlm_internal.h"
-#include "lowcomms.h"
-#include "config.h"
-#include "midcomms.h"
-
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
-static int			dlm_local_count;
-static int			dlm_local_nodeid;
-
-/* One of these per connected node */
-
-#define NI_INIT_PENDING 1
-#define NI_WRITE_PENDING 2
-
-struct nodeinfo {
-	spinlock_t		lock;
-	sctp_assoc_t		assoc_id;
-	unsigned long		flags;
-	struct list_head	write_list; /* nodes with pending writes */
-	struct list_head	writequeue; /* outgoing writequeue_entries */
-	spinlock_t		writequeue_lock;
-	int			nodeid;
-};
-
-static DEFINE_IDR(nodeinfo_idr);
-static struct rw_semaphore	nodeinfo_lock;
-static int			max_nodeid;
-
-struct cbuf {
-	unsigned		base;
-	unsigned		len;
-	unsigned		mask;
-};
-
-/* Just the one of these, now. But this struct keeps
-   the connection-specific variables together */
-
-#define CF_READ_PENDING 1
-
-struct connection {
-	struct socket          *sock;
-	unsigned long		flags;
-	struct page            *rx_page;
-	atomic_t		waiting_requests;
-	struct cbuf		cb;
-	int                     eagain_flag;
-};
-
-/* An entry waiting to be sent */
-
-struct writequeue_entry {
-	struct list_head	list;
-	struct page            *page;
-	int			offset;
-	int			len;
-	int			end;
-	int			users;
-	struct nodeinfo        *ni;
-};
-
-#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
-#define CBUF_EMPTY(cb) ((cb)->len == 0)
-#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
-#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
-
-#define CBUF_INIT(cb, size) \
-do { \
-	(cb)->base = (cb)->len = 0; \
-	(cb)->mask = ((size)-1); \
-} while(0)
-
-#define CBUF_EAT(cb, n) \
-do { \
-	(cb)->len  -= (n); \
-	(cb)->base += (n); \
-	(cb)->base &= (cb)->mask; \
-} while(0)
-
-
-/* List of nodes which have writes pending */
-static struct list_head write_nodes;
-static spinlock_t write_nodes_lock;
-
-/* Maximum number of incoming messages to process before
- * doing a schedule()
- */
-#define MAX_RX_MSG_COUNT 25
-
-/* Manage daemons */
-static struct task_struct *recv_task;
-static struct task_struct *send_task;
-static wait_queue_head_t lowcomms_recv_wait;
-static atomic_t accepting;
-
-/* The SCTP connection */
-static struct connection sctp_con;
-
-
-static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
-{
-	struct sockaddr_storage addr;
-	int error;
-
-	if (!dlm_local_count)
-		return -1;
-
-	error = dlm_nodeid_to_addr(nodeid, &addr);
-	if (error)
-		return error;
-
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-	        struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
-		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
-		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
-	} else {
-	        struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
-		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
-		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
-		       sizeof(in6->sin6_addr));
-	}
-
-	return 0;
-}
-
-static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
-{
-	struct nodeinfo *ni;
-	int r;
-	int n;
-
-	down_read(&nodeinfo_lock);
-	ni = idr_find(&nodeinfo_idr, nodeid);
-	up_read(&nodeinfo_lock);
-
-	if (!ni && alloc) {
-		down_write(&nodeinfo_lock);
-
-		ni = idr_find(&nodeinfo_idr, nodeid);
-		if (ni)
-			goto out_up;
-
-		r = idr_pre_get(&nodeinfo_idr, alloc);
-		if (!r)
-			goto out_up;
-
-		ni = kmalloc(sizeof(struct nodeinfo), alloc);
-		if (!ni)
-			goto out_up;
-
-		r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
-		if (r) {
-			kfree(ni);
-			ni = NULL;
-			goto out_up;
-		}
-		if (n != nodeid) {
-			idr_remove(&nodeinfo_idr, n);
-			kfree(ni);
-			ni = NULL;
-			goto out_up;
-		}
-		memset(ni, 0, sizeof(struct nodeinfo));
-		spin_lock_init(&ni->lock);
-		INIT_LIST_HEAD(&ni->writequeue);
-		spin_lock_init(&ni->writequeue_lock);
-		ni->nodeid = nodeid;
-
-		if (nodeid > max_nodeid)
-			max_nodeid = nodeid;
-	out_up:
-		up_write(&nodeinfo_lock);
-	}
-
-	return ni;
-}
-
-/* Don't call this too often... */
-static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (ni && ni->assoc_id == assoc)
-			return ni;
-	}
-	return NULL;
-}
-
-/* Data or notification available on socket */
-static void lowcomms_data_ready(struct sock *sk, int count_unused)
-{
-	atomic_inc(&sctp_con.waiting_requests);
-	if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
-		return;
-
-	wake_up_interruptible(&lowcomms_recv_wait);
-}
-
-
-/* Add the port number to an IP6 or 4 sockaddr and return the address length.
-   Also padd out the struct with zeros to make comparisons meaningful */
-
-static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
-			  int *addr_len)
-{
-	struct sockaddr_in *local4_addr;
-	struct sockaddr_in6 *local6_addr;
-
-	if (!dlm_local_count)
-		return;
-
-	if (!port) {
-		if (dlm_local_addr[0]->ss_family == AF_INET) {
-			local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
-			port = be16_to_cpu(local4_addr->sin_port);
-		} else {
-			local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
-			port = be16_to_cpu(local6_addr->sin6_port);
-		}
-	}
-
-	saddr->ss_family = dlm_local_addr[0]->ss_family;
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
-		in4_addr->sin_port = cpu_to_be16(port);
-		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
-		memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
-				      sizeof(struct sockaddr_in));
-		*addr_len = sizeof(struct sockaddr_in);
-	} else {
-		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
-		in6_addr->sin6_port = cpu_to_be16(port);
-		memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
-				      sizeof(struct sockaddr_in6));
-		*addr_len = sizeof(struct sockaddr_in6);
-	}
-}
-
-/* Close the connection and tidy up */
-static void close_connection(void)
-{
-	if (sctp_con.sock) {
-		sock_release(sctp_con.sock);
-		sctp_con.sock = NULL;
-	}
-
-	if (sctp_con.rx_page) {
-		__free_page(sctp_con.rx_page);
-		sctp_con.rx_page = NULL;
-	}
-}
-
-/* We only send shutdown messages to nodes that are not part of the cluster */
-static void send_shutdown(sctp_assoc_t associd)
-{
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-
-	outmessage.msg_name = NULL;
-	outmessage.msg_namelen = 0;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-
-	sinfo->sinfo_flags |= MSG_EOF;
-	sinfo->sinfo_assoc_id = associd;
-
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
-
-	if (ret != 0)
-		log_print("send EOF to node failed: %d", ret);
-}
-
-
-/* INIT failed but we don't know which node...
-   restart INIT on all pending nodes */
-static void init_failed(void)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (!ni)
-			continue;
-
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-			ni->assoc_id = 0;
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-			}
-		}
-	}
-	wake_up_process(send_task);
-}
-
-/* Something happened to an association */
-static void process_sctp_notification(struct msghdr *msg, char *buf)
-{
-	union sctp_notification *sn = (union sctp_notification *)buf;
-
-	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
-		switch (sn->sn_assoc_change.sac_state) {
-
-		case SCTP_COMM_UP:
-		case SCTP_RESTART:
-		{
-			/* Check that the new node is in the lockspace */
-			struct sctp_prim prim;
-			mm_segment_t fs;
-			int nodeid;
-			int prim_len, ret;
-			int addr_len;
-			struct nodeinfo *ni;
-
-			/* This seems to happen when we received a connection
-			 * too early... or something...  anyway, it happens but
-			 * we always seem to get a real message too, see
-			 * receive_from_sock */
-
-			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
-				log_print("COMM_UP for invalid assoc ID %d",
-					 (int)sn->sn_assoc_change.sac_assoc_id);
-				init_failed();
-				return;
-			}
-			memset(&prim, 0, sizeof(struct sctp_prim));
-			prim_len = sizeof(struct sctp_prim);
-			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
-			fs = get_fs();
-			set_fs(get_ds());
-			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
-						IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
-						(char*)&prim, &prim_len);
-			set_fs(fs);
-			if (ret < 0) {
-				struct nodeinfo *ni;
-
-				log_print("getsockopt/sctp_primary_addr on "
-					  "new assoc %d failed : %d",
-				    (int)sn->sn_assoc_change.sac_assoc_id, ret);
-
-				/* Retry INIT later */
-				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-				if (ni)
-					clear_bit(NI_INIT_PENDING, &ni->flags);
-				return;
-			}
-			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
-			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
-				log_print("reject connect from unknown addr");
-				send_shutdown(prim.ssp_assoc_id);
-				return;
-			}
-
-			ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-			if (!ni)
-				return;
-
-			/* Save the assoc ID */
-			spin_lock(&ni->lock);
-			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
-			spin_unlock(&ni->lock);
-
-			log_print("got new/restarted association %d nodeid %d",
-			       (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
-
-			/* Send any pending writes */
-			clear_bit(NI_INIT_PENDING, &ni->flags);
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-			}
-			wake_up_process(send_task);
-		}
-		break;
-
-		case SCTP_COMM_LOST:
-		case SCTP_SHUTDOWN_COMP:
-		{
-			struct nodeinfo *ni;
-
-			ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-			if (ni) {
-				spin_lock(&ni->lock);
-				ni->assoc_id = 0;
-				spin_unlock(&ni->lock);
-			}
-		}
-		break;
-
-		/* We don't know which INIT failed, so clear the PENDING flags
-		 * on them all.  if assoc_id is zero then it will then try
-		 * again */
-
-		case SCTP_CANT_STR_ASSOC:
-		{
-			log_print("Can't start SCTP association - retrying");
-			init_failed();
-		}
-		break;
-
-		default:
-			log_print("unexpected SCTP assoc change id=%d state=%d",
-				  (int)sn->sn_assoc_change.sac_assoc_id,
-				  sn->sn_assoc_change.sac_state);
-		}
-	}
-}
-
-/* Data received from remote end */
-static int receive_from_sock(void)
-{
-	int ret = 0;
-	struct msghdr msg;
-	struct kvec iov[2];
-	unsigned len;
-	int r;
-	struct sctp_sndrcvinfo *sinfo;
-	struct cmsghdr *cmsg;
-	struct nodeinfo *ni;
-
-	/* These two are marginally too big for stack allocation, but this
-	 * function is (currently) only called by dlm_recvd so static should be
-	 * OK.
-	 */
-	static struct sockaddr_storage msgname;
-	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-
-	if (sctp_con.sock == NULL)
-		goto out;
-
-	if (sctp_con.rx_page == NULL) {
-		/*
-		 * This doesn't need to be atomic, but I think it should
-		 * improve performance if it is.
-		 */
-		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
-		if (sctp_con.rx_page == NULL)
-			goto out_resched;
-		CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
-	}
-
-	memset(&incmsg, 0, sizeof(incmsg));
-	memset(&msgname, 0, sizeof(msgname));
-
-	memset(incmsg, 0, sizeof(incmsg));
-	msg.msg_name = &msgname;
-	msg.msg_namelen = sizeof(msgname);
-	msg.msg_flags = 0;
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	msg.msg_iovlen = 1;
-
-	/* I don't see why this circular buffer stuff is necessary for SCTP
-	 * which is a packet-based protocol, but the whole thing breaks under
-	 * load without it! The overhead is minimal (and is in the TCP lowcomms
-	 * anyway, of course) so I'll leave it in until I can figure out what's
-	 * really happening.
-	 */
-
-	/*
-	 * iov[0] is the bit of the circular buffer between the current end
-	 * point (cb.base + cb.len) and the end of the buffer.
-	 */
-	iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
-	iov[0].iov_base = page_address(sctp_con.rx_page) +
-			  CBUF_DATA(&sctp_con.cb);
-	iov[1].iov_len = 0;
-
-	/*
-	 * iov[1] is the bit of the circular buffer between the start of the
-	 * buffer and the start of the currently used section (cb.base)
-	 */
-	if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
-		iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
-		iov[1].iov_len = sctp_con.cb.base;
-		iov[1].iov_base = page_address(sctp_con.rx_page);
-		msg.msg_iovlen = 2;
-	}
-	len = iov[0].iov_len + iov[1].iov_len;
-
-	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
-				 MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (ret <= 0)
-		goto out_close;
-
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
-
-	if (msg.msg_flags & MSG_NOTIFICATION) {
-		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
-		return 0;
-	}
-
-	/* Is this a new association ? */
-	ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
-	if (ni) {
-		ni->assoc_id = sinfo->sinfo_assoc_id;
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-			}
-			wake_up_process(send_task);
-		}
-	}
-
-	/* INIT sends a message with length of 1 - ignore it */
-	if (r == 1)
-		return 0;
-
-	CBUF_ADD(&sctp_con.cb, ret);
-	ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
-					  page_address(sctp_con.rx_page),
-					  sctp_con.cb.base, sctp_con.cb.len,
-					  PAGE_CACHE_SIZE);
-	if (ret < 0)
-		goto out_close;
-	CBUF_EAT(&sctp_con.cb, ret);
-
-      out:
-	ret = 0;
-	goto out_ret;
-
-      out_resched:
-	lowcomms_data_ready(sctp_con.sock->sk, 0);
-	ret = 0;
-	schedule();
-	goto out_ret;
-
-      out_close:
-	if (ret != -EAGAIN)
-		log_print("error reading from sctp socket: %d", ret);
-      out_ret:
-	return ret;
-}
-
-/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
-static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
-{
-	mm_segment_t fs;
-	int result = 0;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	if (num == 1)
-		result = sctp_con.sock->ops->bind(sctp_con.sock,
-					(struct sockaddr *) addr, addr_len);
-	else
-		result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
-				SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
-	set_fs(fs);
-
-	if (result < 0)
-		log_print("Can't bind to port %d addr number %d",
-			  dlm_config.tcp_port, num);
-
-	return result;
-}
-
-static void init_local(void)
-{
-	struct sockaddr_storage sas, *addr;
-	int i;
-
-	dlm_local_nodeid = dlm_our_nodeid();
-
-	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
-		if (dlm_our_addr(&sas, i))
-			break;
-
-		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
-		if (!addr)
-			break;
-		memcpy(addr, &sas, sizeof(*addr));
-		dlm_local_addr[dlm_local_count++] = addr;
-	}
-}
-
-/* Initialise SCTP socket and bind to all interfaces */
-static int init_sock(void)
-{
-	mm_segment_t fs;
-	struct socket *sock = NULL;
-	struct sockaddr_storage localaddr;
-	struct sctp_event_subscribe subscribe;
-	int result = -EINVAL, num = 1, i, addr_len;
-
-	if (!dlm_local_count) {
-		init_local();
-		if (!dlm_local_count) {
-			log_print("no local IP address has been set");
-			goto out;
-		}
-	}
-
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
-				  IPPROTO_SCTP, &sock);
-	if (result < 0) {
-		log_print("Can't create comms socket, check SCTP is loaded");
-		goto out;
-	}
-
-	/* Listen for events */
-	memset(&subscribe, 0, sizeof(subscribe));
-	subscribe.sctp_data_io_event = 1;
-	subscribe.sctp_association_event = 1;
-	subscribe.sctp_send_failure_event = 1;
-	subscribe.sctp_shutdown_event = 1;
-	subscribe.sctp_partial_delivery_event = 1;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
-				       (char *)&subscribe, sizeof(subscribe));
-	set_fs(fs);
-
-	if (result < 0) {
-		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
-			  result);
-		goto create_delsock;
-	}
-
-	/* Init con struct */
-	sock->sk->sk_user_data = &sctp_con;
-	sctp_con.sock = sock;
-	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
-
-	/* Bind to all interfaces. */
-	for (i = 0; i < dlm_local_count; i++) {
-		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
-		make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
-
-		result = add_bind_addr(&localaddr, addr_len, num);
-		if (result)
-			goto create_delsock;
-		++num;
-	}
-
-	result = sock->ops->listen(sock, 5);
-	if (result < 0) {
-		log_print("Can't set socket listening");
-		goto create_delsock;
-	}
-
-	return 0;
-
- create_delsock:
-	sock_release(sock);
-	sctp_con.sock = NULL;
- out:
-	return result;
-}
-
-
-static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
-{
-	struct writequeue_entry *entry;
-
-	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
-	if (!entry)
-		return NULL;
-
-	entry->page = alloc_page(allocation);
-	if (!entry->page) {
-		kfree(entry);
-		return NULL;
-	}
-
-	entry->offset = 0;
-	entry->len = 0;
-	entry->end = 0;
-	entry->users = 0;
-
-	return entry;
-}
-
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
-{
-	struct writequeue_entry *e;
-	int offset = 0;
-	int users = 0;
-	struct nodeinfo *ni;
-
-	if (!atomic_read(&accepting))
-		return NULL;
-
-	ni = nodeid2nodeinfo(nodeid, allocation);
-	if (!ni)
-		return NULL;
-
-	spin_lock(&ni->writequeue_lock);
-	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
-	if (((struct list_head *) e == &ni->writequeue) ||
-	    (PAGE_CACHE_SIZE - e->end < len)) {
-		e = NULL;
-	} else {
-		offset = e->end;
-		e->end += len;
-		users = e->users++;
-	}
-	spin_unlock(&ni->writequeue_lock);
-
-	if (e) {
-	      got_one:
-		if (users == 0)
-			kmap(e->page);
-		*ppc = page_address(e->page) + offset;
-		return e;
-	}
-
-	e = new_writequeue_entry(allocation);
-	if (e) {
-		spin_lock(&ni->writequeue_lock);
-		offset = e->end;
-		e->end += len;
-		e->ni = ni;
-		users = e->users++;
-		list_add_tail(&e->list, &ni->writequeue);
-		spin_unlock(&ni->writequeue_lock);
-		goto got_one;
-	}
-	return NULL;
-}
-
-void dlm_lowcomms_commit_buffer(void *arg)
-{
-	struct writequeue_entry *e = (struct writequeue_entry *) arg;
-	int users;
-	struct nodeinfo *ni = e->ni;
-
-	if (!atomic_read(&accepting))
-		return;
-
-	spin_lock(&ni->writequeue_lock);
-	users = --e->users;
-	if (users)
-		goto out;
-	e->len = e->end - e->offset;
-	kunmap(e->page);
-	spin_unlock(&ni->writequeue_lock);
-
-	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-		spin_lock_bh(&write_nodes_lock);
-		list_add_tail(&ni->write_list, &write_nodes);
-		spin_unlock_bh(&write_nodes_lock);
-		wake_up_process(send_task);
-	}
-	return;
-
-      out:
-	spin_unlock(&ni->writequeue_lock);
-	return;
-}
-
-static void free_entry(struct writequeue_entry *e)
-{
-	__free_page(e->page);
-	kfree(e);
-}
-
-/* Initiate an SCTP association. In theory we could just use sendmsg() on
-   the first IP address and it should work, but this allows us to set up the
-   association before sending any valuable data that we can't afford to lose.
-   It also keeps the send path clean as it can now always use the association ID */
-static void initiate_association(int nodeid)
-{
-	struct sockaddr_storage rem_addr;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-	int addrlen;
-	char buf[1];
-	struct kvec iov[1];
-	struct nodeinfo *ni;
-
-	log_print("Initiating association with node %d", nodeid);
-
-	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-	if (!ni)
-		return;
-
-	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
-		log_print("no address for nodeid %d", nodeid);
-		return;
-	}
-
-	make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
-
-	outmessage.msg_name = &rem_addr;
-	outmessage.msg_namelen = addrlen;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	iov[0].iov_base = buf;
-	iov[0].iov_len = 1;
-
-	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
-	   we can afford to lose */
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
-	if (ret < 0) {
-		log_print("send INIT to node failed: %d", ret);
-		/* Try again later */
-		clear_bit(NI_INIT_PENDING, &ni->flags);
-	}
-}
-
-/* Send a message */
-static int send_to_sock(struct nodeinfo *ni)
-{
-	int ret = 0;
-	struct writequeue_entry *e;
-	int len, offset;
-	struct msghdr outmsg;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	struct kvec iov;
-
-        /* See if we need to init an association before we start
-	   sending precious messages */
-	spin_lock(&ni->lock);
-	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-		return 0;
-	}
-	spin_unlock(&ni->lock);
-
-	outmsg.msg_name = NULL; /* We use assoc_id */
-	outmsg.msg_namelen = 0;
-	outmsg.msg_control = outcmsg;
-	outmsg.msg_controllen = sizeof(outcmsg);
-	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmsg);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-	sinfo->sinfo_assoc_id = ni->assoc_id;
-	outmsg.msg_controllen = cmsg->cmsg_len;
-
-	spin_lock(&ni->writequeue_lock);
-	for (;;) {
-		if (list_empty(&ni->writequeue))
-			break;
-		e = list_entry(ni->writequeue.next, struct writequeue_entry,
-			       list);
-		len = e->len;
-		offset = e->offset;
-		BUG_ON(len == 0 && e->users == 0);
-		spin_unlock(&ni->writequeue_lock);
-		kmap(e->page);
-
-		ret = 0;
-		if (len) {
-			iov.iov_base = page_address(e->page)+offset;
-			iov.iov_len = len;
-
-			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
-					     len);
-			if (ret == -EAGAIN) {
-				sctp_con.eagain_flag = 1;
-				goto out;
-			} else if (ret < 0)
-				goto send_error;
-		} else {
-			/* Don't starve people filling buffers */
-			schedule();
-		}
-
-		spin_lock(&ni->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			free_entry(e);
-			continue;
-		}
-	}
-	spin_unlock(&ni->writequeue_lock);
- out:
-	return ret;
-
- send_error:
-	log_print("Error sending to node %d %d", ni->nodeid, ret);
-	spin_lock(&ni->lock);
-	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		ni->assoc_id = 0;
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-	} else
-		spin_unlock(&ni->lock);
-
-	return ret;
-}
-
-/* Try to send any messages that are pending */
-static void process_output_queue(void)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock_bh(&write_nodes_lock);
-	list_for_each_safe(list, temp, &write_nodes) {
-		struct nodeinfo *ni =
-		    list_entry(list, struct nodeinfo, write_list);
-		clear_bit(NI_WRITE_PENDING, &ni->flags);
-		list_del(&ni->write_list);
-
-		spin_unlock_bh(&write_nodes_lock);
-
-		send_to_sock(ni);
-		spin_lock_bh(&write_nodes_lock);
-	}
-	spin_unlock_bh(&write_nodes_lock);
-}
-
-/* Called after we've had -EAGAIN and been woken up */
-static void refill_write_queue(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-
-		if (ni) {
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-			}
-		}
-	}
-}
-
-static void clean_one_writequeue(struct nodeinfo *ni)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock(&ni->writequeue_lock);
-	list_for_each_safe(list, temp, &ni->writequeue) {
-		struct writequeue_entry *e =
-			list_entry(list, struct writequeue_entry, list);
-		list_del(&e->list);
-		free_entry(e);
-	}
-	spin_unlock(&ni->writequeue_lock);
-}
-
-static void clean_writequeues(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni)
-			clean_one_writequeue(ni);
-	}
-}
-
-
-static void dealloc_nodeinfo(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni) {
-			idr_remove(&nodeinfo_idr, i);
-			kfree(ni);
-		}
-	}
-}
-
-int dlm_lowcomms_close(int nodeid)
-{
-	struct nodeinfo *ni;
-
-	ni = nodeid2nodeinfo(nodeid, 0);
-	if (!ni)
-		return -1;
-
-	spin_lock(&ni->lock);
-	if (ni->assoc_id) {
-		ni->assoc_id = 0;
-		/* Don't send shutdown here, sctp will just queue it
-		   till the node comes back up! */
-	}
-	spin_unlock(&ni->lock);
-
-	clean_one_writequeue(ni);
-	clear_bit(NI_INIT_PENDING, &ni->flags);
-	return 0;
-}
-
-static int write_list_empty(void)
-{
-	int status;
-
-	spin_lock_bh(&write_nodes_lock);
-	status = list_empty(&write_nodes);
-	spin_unlock_bh(&write_nodes_lock);
-
-	return status;
-}
-
-static int dlm_recvd(void *data)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	while (!kthread_should_stop()) {
-		int count = 0;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&lowcomms_recv_wait, &wait);
-		if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
-			schedule();
-		remove_wait_queue(&lowcomms_recv_wait, &wait);
-		set_current_state(TASK_RUNNING);
-
-		if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
-			int ret;
-
-			do {
-				ret = receive_from_sock();
-
-				/* Don't starve out everyone else */
-				if (++count >= MAX_RX_MSG_COUNT) {
-					schedule();
-					count = 0;
-				}
-			} while (!kthread_should_stop() && ret >=0);
-		}
-		schedule();
-	}
-
-	return 0;
-}
-
-static int dlm_sendd(void *data)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
-
-	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (write_list_empty())
-			schedule();
-		set_current_state(TASK_RUNNING);
-
-		if (sctp_con.eagain_flag) {
-			sctp_con.eagain_flag = 0;
-			refill_write_queue();
-		}
-		process_output_queue();
-	}
-
-	remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
-
-	return 0;
-}
-
-static void daemons_stop(void)
-{
-	kthread_stop(recv_task);
-	kthread_stop(send_task);
-}
-
-static int daemons_start(void)
-{
-	struct task_struct *p;
-	int error;
-
-	p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
-	error = IS_ERR(p);
-       	if (error) {
-		log_print("can't start dlm_recvd %d", error);
-		return error;
-	}
-	recv_task = p;
-
-	p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
-	error = IS_ERR(p);
-       	if (error) {
-		log_print("can't start dlm_sendd %d", error);
-		kthread_stop(recv_task);
-		return error;
-	}
-	send_task = p;
-
-	return 0;
-}
-
-/*
- * This is quite likely to sleep...
- */
-int dlm_lowcomms_start(void)
-{
-	int error;
-
-	error = init_sock();
-	if (error)
-		goto fail_sock;
-	error = daemons_start();
-	if (error)
-		goto fail_sock;
-	atomic_set(&accepting, 1);
-	return 0;
-
- fail_sock:
-	close_connection();
-	return error;
-}
-
-/* Set all the activity flags to prevent any socket activity. */
-
-void dlm_lowcomms_stop(void)
-{
-	atomic_set(&accepting, 0);
-	sctp_con.flags = 0x7;
-	daemons_stop();
-	clean_writequeues();
-	close_connection();
-	dealloc_nodeinfo();
-	max_nodeid = 0;
-}
-
-int dlm_lowcomms_init(void)
-{
-	init_waitqueue_head(&lowcomms_recv_wait);
-	spin_lock_init(&write_nodes_lock);
-	INIT_LIST_HEAD(&write_nodes);
-	init_rwsem(&nodeinfo_lock);
-	return 0;
-}
-
-void dlm_lowcomms_exit(void)
-{
-	int i;
-
-	for (i = 0; i < dlm_local_count; i++)
-		kfree(dlm_local_addr[i]);
-	dlm_local_count = 0;
-	dlm_local_nodeid = 0;
-}
-
-- 
cgit v0.10.2


From 520698096436f7da5b9142e63e3bed5580c5f14e Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 2 Nov 2006 09:49:02 -0600
Subject: [DLM] res_recover_locks_count not reset when recover_locks is aborted

Red Hat BZ 213684

If a node sends an lkb to the new master (RCOM_LOCK message) during
recovery and recovery is then aborted on both nodes before it gets a
reply, the res_recover_locks_count needs to be reset to 0 so that when the
subsequent recovery comes along and sends the lkb to the new master again
the assertion doesn't trigger that checks that counter is zero.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index a5e6d18..cf9f683 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -252,6 +252,7 @@ static void recover_list_clear(struct dlm_ls *ls)
 	spin_lock(&ls->ls_recover_list_lock);
 	list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
 		list_del_init(&r->res_recover_list);
+		r->res_recover_locks_count = 0;
 		dlm_put_rsb(r);
 		ls->ls_recover_list_count--;
 	}
-- 
cgit v0.10.2


From 435618b75b82b5ee511cc01fcdda9c44adb2f4bd Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 2 Nov 2006 09:45:56 -0600
Subject: [DLM] status messages ping-pong between unmounted nodes

Red Hat BZ 213682

If two nodes leave the lockspace (while unmounting the fs in the case of
gfs) after one has sent a STATUS message to the other, STATUS/STATUS_REPLY
messages will then ping-pong between the nodes when neither of them can
find the lockspace in question any longer.  We kill this by not sending
another STATUS message when we get a STATUS_REPLY for an unknown
lockspace.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 518239a..87b12f7 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -412,9 +412,10 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
 
 	ls = dlm_find_lockspace_global(hd->h_lockspace);
 	if (!ls) {
-		log_print("lockspace %x from %d not found",
-			  hd->h_lockspace, nodeid);
-		send_ls_not_ready(nodeid, rc);
+		log_print("lockspace %x from %d type %x not found",
+			  hd->h_lockspace, nodeid, rc->rc_type);
+		if (rc->rc_type == DLM_RCOM_STATUS)
+			send_ls_not_ready(nodeid, rc);
 		return;
 	}
 
-- 
cgit v0.10.2


From d4400156d415540086c34a06e5d233122d6bf56a Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 31 Oct 2006 11:55:56 -0600
Subject: [DLM] fix requestqueue race

Red Hat BZ 211914

There's a race between dlm_recoverd (1) enabling locking and (2) clearing
out the requestqueue, and dlm_recvd (1) checking if locking is enabled and
(2) adding a message to the requestqueue.  An order of recoverd(1),
recvd(1), recvd(2), recoverd(2) will result in a message being left on the
requestqueue.  The fix is to have dlm_recvd check if dlm_recoverd has
enabled locking after taking the mutex for the requestqueue and if it has
processing the message instead of queueing it.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 3f2befa..6088a16 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3028,10 +3028,17 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
 
 	while (1) {
 		if (dlm_locking_stopped(ls)) {
-			if (!recovery)
-				dlm_add_requestqueue(ls, nodeid, hd);
-			error = -EINTR;
-			goto out;
+			if (recovery) {
+				error = -EINTR;
+				goto out;
+			}
+			error = dlm_add_requestqueue(ls, nodeid, hd);
+			if (error == -EAGAIN)
+				continue;
+			else {
+				error = -EINTR;
+				goto out;
+			}
 		}
 
 		if (lock_recovery_try(ls))
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7b2b089..0226d2a 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -30,26 +30,39 @@ struct rq_entry {
  * lockspace is enabled on some while still suspended on others.
  */
 
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
 {
 	struct rq_entry *e;
 	int length = hd->h_length;
+	int rv = 0;
 
 	if (dlm_is_removed(ls, nodeid))
-		return;
+		return 0;
 
 	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
 	if (!e) {
 		log_print("dlm_add_requestqueue: out of memory\n");
-		return;
+		return 0;
 	}
 
 	e->nodeid = nodeid;
 	memcpy(e->request, hd, length);
 
+	/* We need to check dlm_locking_stopped() after taking the mutex to
+	   avoid a race where dlm_recoverd enables locking and runs
+	   process_requestqueue between our earlier dlm_locking_stopped check
+	   and this addition to the requestqueue. */
+
 	mutex_lock(&ls->ls_requestqueue_mutex);
-	list_add_tail(&e->list, &ls->ls_requestqueue);
+	if (dlm_locking_stopped(ls))
+		list_add_tail(&e->list, &ls->ls_requestqueue);
+	else {
+		log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
+		kfree(e);
+		rv = -EAGAIN;
+	}
 	mutex_unlock(&ls->ls_requestqueue_mutex);
+	return rv;
 }
 
 int dlm_process_requestqueue(struct dlm_ls *ls)
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
index 349f0d2..6a53ea0 100644
--- a/fs/dlm/requestqueue.h
+++ b/fs/dlm/requestqueue.h
@@ -13,7 +13,7 @@
 #ifndef __REQUESTQUEUE_DOT_H__
 #define __REQUESTQUEUE_DOT_H__
 
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
 int dlm_process_requestqueue(struct dlm_ls *ls);
 void dlm_wait_requestqueue(struct dlm_ls *ls);
 void dlm_purge_requestqueue(struct dlm_ls *ls);
-- 
cgit v0.10.2


From 91c0dc93a1a6bbdd79707ed311e48b4397df177f Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 31 Oct 2006 11:56:01 -0600
Subject: [DLM] fix aborted recovery during node removal

Red Hat BZ 211914

With the new cluster infrastructure, dlm recovery for a node removal can
be aborted and restarted for a node addition.  When this happens, the
restarted recovery isn't aware that it's doing recovery for the earlier
removal as well as the addition.  So, it then skips the recovery steps
only required when nodes are removed.  This can result in locks not being
purged for failed/removed nodes.  The fix is to check for removed nodes
for which recovery has not been completed at the start of a new recovery
sequence.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index a3f7de7..85e2897 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -186,6 +186,14 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 	struct dlm_member *memb, *safe;
 	int i, error, found, pos = 0, neg = 0, low = -1;
 
+	/* previously removed members that we've not finished removing need to
+	   count as a negative change so the "neg" recovery steps will happen */
+
+	list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+		log_debug(ls, "prev removed member %d", memb->nodeid);
+		neg++;
+	}
+
 	/* move departed members from ls_nodes to ls_nodes_gone */
 
 	list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 362e3ef..4a1d602 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -164,6 +164,13 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 		 */
 
 		dlm_recover_rsbs(ls);
+	} else {
+		/*
+		 * Other lockspace members may be going through the "neg" steps
+		 * while also adding us to the lockspace, in which case they'll
+		 * be looking for this status bit during dlm_recover_locks().
+		 */
+		dlm_set_recover_status(ls, DLM_RS_LOCKS);
 	}
 
 	dlm_release_root_list(ls);
-- 
cgit v0.10.2


From 2cdc98aaf072d573df10c503d3b3b0b74e2a6d06 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 31 Oct 2006 11:56:08 -0600
Subject: [DLM] fix stopping unstarted recovery

Red Hat BZ 211914

When many nodes are joining a lockspace simultaneously, the dlm gets a
quick sequence of stop/start events, a pair for adding each node.
dlm_controld in user space sends dlm_recoverd in the kernel each stop and
start event.  dlm_controld will sometimes send the stop before
dlm_recoverd has had a chance to take up the previously queued start.  The
stop aborts the processing of the previous start by setting the
RECOVERY_STOP flag.  dlm_recoverd is erroneously clearing this flag and
ignoring the stop/abort if it happens to take up the start after the stop
meant to abort it.  The fix is to check the sequence number that's
incremented for each stop/start before clearing the flag.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 4a1d602..6e4ee94 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -219,6 +219,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 	return error;
 }
 
+/* The dlm_ls_start() that created the rv we take here may already have been
+   stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
+   flag set. */
+
 static void do_ls_recovery(struct dlm_ls *ls)
 {
 	struct dlm_recover *rv = NULL;
@@ -226,7 +230,8 @@ static void do_ls_recovery(struct dlm_ls *ls)
 	spin_lock(&ls->ls_recover_lock);
 	rv = ls->ls_recover_args;
 	ls->ls_recover_args = NULL;
-	clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+	if (rv && ls->ls_recover_seq == rv->seq)
+		clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
 	spin_unlock(&ls->ls_recover_lock);
 
 	if (rv) {
-- 
cgit v0.10.2


From 4b77f2c93d052adca8cc8690b9b5e7f8798f4ddd Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 1 Nov 2006 09:31:48 -0600
Subject: [DLM] do full recover_locks barrier

Red Hat BZ 211914

The previous patch "[DLM] fix aborted recovery during
node removal" was incomplete as discovered with further testing.  It set
the bit for the RS_LOCKS barrier but did not then wait for the barrier.
This is often ok, but sometimes it will cause yet another recovery hang.
If it's a new node that also has the lowest nodeid that skips the barrier
wait, then it misses the important step of collecting and reporting the
barrier status from the other nodes (which is the job of the low nodeid in
the barrier wait routine).

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 6e4ee94..8bb895f 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -168,9 +168,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 		/*
 		 * Other lockspace members may be going through the "neg" steps
 		 * while also adding us to the lockspace, in which case they'll
-		 * be looking for this status bit during dlm_recover_locks().
+		 * be doing the recover_locks (RS_LOCKS) barrier.
 		 */
 		dlm_set_recover_status(ls, DLM_RS_LOCKS);
+
+		error = dlm_recover_locks_wait(ls);
+		if (error) {
+			log_error(ls, "recover_locks_wait failed %d", error);
+			goto fail;
+		}
 	}
 
 	dlm_release_root_list(ls);
-- 
cgit v0.10.2


From 6f90a8b1b87f97144911790390d56f695b59db9b Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 10 Nov 2006 14:16:27 -0600
Subject: [DLM] clear sbflags on lock master

RH BZ 211622

The ALTMODE flag can be set in the lock master's copy of the lock but
never cleared, so ALTMODE will also be returned in a subsequent conversion
of the lock when it shouldn't be.  This results in lock_dlm incorrectly
switching to the alternate lock mode when returning the result to gfs
which then asserts when it sees the wrong lock state.  The fix is to
propagate the cleared sbflags value to the master node when the lock is
requested.  QA's d_rwrandirectlarge test triggers this bug very quickly.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 6088a16..30878de 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2372,6 +2372,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
 static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	lkb->lkb_exflags = ms->m_exflags;
+	lkb->lkb_sbflags = ms->m_sbflags;
 	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
 		         (ms->m_flags & 0x0000FFFF);
 }
-- 
cgit v0.10.2


From b98c95af01c10827e3443157651eb469071391a3 Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Wed, 15 Nov 2006 12:29:24 -0500
Subject: [DLM] Fix DLM config

The attached patch fixes the DLM config so that it selects the chosen network
transport. It should fix the bug where DLM can be left selected when NET gets
unselected. This incorporates all the comments received about this patch.

Cc: Adrian Bunk <bunk@stusta.de>
Cc: Andrew Morton <akpm@osdl.org>
Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index c5985b8..b5654a2 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,10 +1,11 @@
 menu "Distributed Lock Manager"
-	depends on INET && IP_SCTP && EXPERIMENTAL
+	depends on EXPERIMENTAL && INET
 
 config DLM
 	tristate "Distributed Lock Manager (DLM)"
 	depends on IPV6 || IPV6=n
 	select CONFIGFS_FS
+	select IP_SCTP if DLM_SCTP
 	help
 	A general purpose distributed lock manager for kernel or userspace
 	applications.
-- 
cgit v0.10.2


From ab923031ceb95ec50ef33ccadf28663c660aa94c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 15 Nov 2006 15:17:03 -0500
Subject: [GFS2] Fix memory allocation in glock.c

Change from GFP_KERNEL to GFP_NOFS as this was causing a
slow down when trying to push inodes from cache.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 746347a..edc21c8 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -769,7 +769,7 @@ restart:
 	} else {
 		spin_unlock(&gl->gl_spin);
 
-		new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
+		new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_NOFS);
 		if (!new_gh)
 			return;
 		set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
-- 
cgit v0.10.2


From 4cf1ed8144e740de27c6146c25d5d7ea26679cc5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 15 Nov 2006 15:21:06 -0500
Subject: [GFS2] Tidy up bmap & fix boundary bug

This moves the locking for bmap into the bmap function itself
rather than using a wrapper function. It also fixes a bug where
the boundary flag was set on the wrong bh. Also the flags on
the mapped bh are reset earlier in the function to ensure that
they are 100% correct on the error path.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 06e3447..8240c1f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -423,12 +423,29 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
 	return 0;
 }
 
+static inline void bmap_lock(struct inode *inode, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	if (create)
+		down_write(&ip->i_rw_mutex);
+	else
+		down_read(&ip->i_rw_mutex);
+}
+
+static inline void bmap_unlock(struct inode *inode, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	if (create)
+		up_write(&ip->i_rw_mutex);
+	else
+		up_read(&ip->i_rw_mutex);
+}
+
 /**
- * gfs2_block_pointers - Map a block from an inode to a disk block
+ * gfs2_block_map - Map a block from an inode to a disk block
  * @inode: The inode
  * @lblock: The logical block number
- * @map_bh: The bh to be mapped
- * @mp: metapath to use
+ * @bh_map: The bh to be mapped
  *
  * Find the block number on the current device which corresponds to an
  * inode's block. If the block had to be created, "new" will be set.
@@ -436,8 +453,8 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
  * Returns: errno
  */
 
-static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
-			       struct buffer_head *bh_map, struct metapath *mp)
+int gfs2_block_map(struct inode *inode, u64 lblock, int create,
+		   struct buffer_head *bh_map)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -451,51 +468,55 @@ static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
 	u64 dblock = 0;
 	int boundary;
 	unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+	struct metapath mp;
+	u64 size;
 
 	BUG_ON(maxlen == 0);
 
 	if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
 		return 0;
 
+	bmap_lock(inode, create);
+	clear_buffer_mapped(bh_map);
+	clear_buffer_new(bh_map);
+	clear_buffer_boundary(bh_map);
 	bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
-
-	height = calc_tree_height(ip, (lblock + 1) * bsize);
-	if (ip->i_di.di_height < height) {
-		if (!create)
-			return 0;
-
-		error = build_height(inode, height);
-		if (error)
-			return error;
+	size = (lblock + 1) * bsize;
+
+	if (size > ip->i_di.di_size) {
+		height = calc_tree_height(ip, size);
+		if (ip->i_di.di_height < height) {
+			if (!create)
+				goto out_ok;
+	
+			error = build_height(inode, height);
+			if (error)
+				goto out_fail;
+		}
 	}
 
-	find_metapath(ip, lblock, mp);
+	find_metapath(ip, lblock, &mp);
 	end_of_metadata = ip->i_di.di_height - 1;
-
 	error = gfs2_meta_inode_buffer(ip, &bh);
 	if (error)
-		return error;
+		goto out_fail;
 
 	for (x = 0; x < end_of_metadata; x++) {
-		lookup_block(ip, bh, x, mp, create, &new, &dblock);
+		lookup_block(ip, bh, x, &mp, create, &new, &dblock);
 		brelse(bh);
 		if (!dblock)
-			return 0;
+			goto out_ok;
 
 		error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
 		if (error)
-			return error;
+			goto out_fail;
 	}
 
-	boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
-	clear_buffer_mapped(bh_map);
-	clear_buffer_new(bh_map);
-	clear_buffer_boundary(bh_map);
-
+	boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock);
 	if (dblock) {
 		map_bh(bh_map, inode->i_sb, dblock);
 		if (boundary)
-			set_buffer_boundary(bh);
+			set_buffer_boundary(bh_map);
 		if (new) {
 			struct buffer_head *dibh;
 			error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -510,8 +531,8 @@ static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
 		while(--maxlen && !buffer_boundary(bh_map)) {
 			u64 eblock;
 
-			mp->mp_list[end_of_metadata]++;
-			boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
+			mp.mp_list[end_of_metadata]++;
+			boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock);
 			if (eblock != ++dblock)
 				break;
 			bh_map->b_size += (1 << inode->i_blkbits);
@@ -521,43 +542,15 @@ static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
 	}
 out_brelse:
 	brelse(bh);
-	return 0;
-}
-
-
-static inline void bmap_lock(struct inode *inode, int create)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	if (create)
-		down_write(&ip->i_rw_mutex);
-	else
-		down_read(&ip->i_rw_mutex);
-}
-
-static inline void bmap_unlock(struct inode *inode, int create)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	if (create)
-		up_write(&ip->i_rw_mutex);
-	else
-		up_read(&ip->i_rw_mutex);
-}
-
-int gfs2_block_map(struct inode *inode, u64 lblock, int create,
-		   struct buffer_head *bh)
-{
-	struct metapath mp;
-	int ret;
-
-	bmap_lock(inode, create);
-	ret = gfs2_block_pointers(inode, lblock, create, bh, &mp);
+out_ok:
+	error = 0;
+out_fail:
 	bmap_unlock(inode, create);
-	return ret;
+	return error;
 }
 
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
 {
-	struct metapath mp;
 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
 	int ret;
 	int create = *new;
@@ -567,9 +560,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 	BUG_ON(!new);
 
 	bh.b_size = 1 << (inode->i_blkbits + 5);
-	bmap_lock(inode, create);
-	ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp);
-	bmap_unlock(inode, create);
+	ret = gfs2_block_map(inode, lblock, create, &bh);
 	*extlen = bh.b_size >> inode->i_blkbits;
 	*dblock = bh.b_blocknr;
 	if (buffer_new(&bh))
-- 
cgit v0.10.2


From 175011cf6edddea32e5f5e0e04434104cc348de9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 16 Nov 2006 10:58:55 -0500
Subject: [GFS2] Remove unused sysfs files

Four of the sysfs files are unused and can therefore be removed.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0e0ec98..983eaf1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -426,9 +426,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
 
-TUNE_ATTR(ilimit, 0);
-TUNE_ATTR(ilimit_tries, 0);
-TUNE_ATTR(ilimit_min, 0);
 TUNE_ATTR(demote_secs, 0);
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
@@ -447,7 +444,6 @@ TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(quota_cache_secs, 1);
 TUNE_ATTR(max_atomic_write, 1);
 TUNE_ATTR(stall_secs, 1);
-TUNE_ATTR(entries_per_readdir, 1);
 TUNE_ATTR(greedy_default, 1);
 TUNE_ATTR(greedy_quantum, 1);
 TUNE_ATTR(greedy_max, 1);
@@ -459,9 +455,6 @@ TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
-	&tune_attr_ilimit.attr,
-	&tune_attr_ilimit_tries.attr,
-	&tune_attr_ilimit_min.attr,
 	&tune_attr_demote_secs.attr,
 	&tune_attr_incore_log_blocks.attr,
 	&tune_attr_log_flush_secs.attr,
@@ -478,7 +471,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_quota_cache_secs.attr,
 	&tune_attr_max_atomic_write.attr,
 	&tune_attr_stall_secs.attr,
-	&tune_attr_entries_per_readdir.attr,
 	&tune_attr_greedy_default.attr,
 	&tune_attr_greedy_quantum.attr,
 	&tune_attr_greedy_max.attr,
-- 
cgit v0.10.2


From dcd2479959c79d44f5dd77e71672e70f1f8b1f06 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 16 Nov 2006 11:08:16 -0500
Subject: [GFS2] Remove unused function from inode.c

The gfs2_glock_nq_m_atime function is unused in so far as its only
ever called with num_gh = 1, and this falls through to the
gfs2_glock_nq_atime function, so we might as well call that directly.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index ea9ca23..ce7f833 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1234,92 +1234,6 @@ fail:
 	return error;
 }
 
-/**
- * glock_compare_atime - Compare two struct gfs2_glock structures for sort
- * @arg_a: the first structure
- * @arg_b: the second structure
- *
- * Returns: 1 if A > B
- *         -1 if A < B
- *          0 if A == B
- */
-
-static int glock_compare_atime(const void *arg_a, const void *arg_b)
-{
-	const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
-	const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
-	const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
-	const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
-
-	if (a->ln_number > b->ln_number)
-		return 1;
-	if (a->ln_number < b->ln_number)
-		return -1;
-	if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
-		return 1;
-	if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME))
-		return 1;
-
-	return 0;
-}
-
-/**
- * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
- *      atime update
- * @num_gh: the number of structures
- * @ghs: an array of struct gfs2_holder structures
- *
- * Returns: 0 on success (all glocks acquired),
- *          errno on failure (no glocks acquired)
- */
-
-int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
-{
-	struct gfs2_holder **p;
-	unsigned int x;
-	int error = 0;
-
-	if (!num_gh)
-		return 0;
-
-	if (num_gh == 1) {
-		ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
-		if (ghs->gh_flags & GL_ATIME)
-			error = gfs2_glock_nq_atime(ghs);
-		else
-			error = gfs2_glock_nq(ghs);
-		return error;
-	}
-
-	p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
-	if (!p)
-		return -ENOMEM;
-
-	for (x = 0; x < num_gh; x++)
-		p[x] = &ghs[x];
-
-	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL);
-
-	for (x = 0; x < num_gh; x++) {
-		p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
-
-		if (p[x]->gh_flags & GL_ATIME)
-			error = gfs2_glock_nq_atime(p[x]);
-		else
-			error = gfs2_glock_nq(p[x]);
-
-		if (error) {
-			while (x--)
-				gfs2_glock_dq(p[x]);
-			break;
-		}
-	}
-
-	kfree(p);
-	return error;
-}
-
-
 static int
 __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 46917ed..b57f448 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -50,12 +50,8 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 		   struct gfs2_inode *ip);
 int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
 int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
-
 int gfs2_glock_nq_atime(struct gfs2_holder *gh);
-int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
-
 int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
-
 struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 
 #endif /* __INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 2f7ef98..8676c39 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -217,7 +217,7 @@ static int gfs2_readpage(struct file *file, struct page *page)
 		}
 		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
 		do_unlock = 1;
-		error = gfs2_glock_nq_m_atime(1, &gh);
+		error = gfs2_glock_nq_atime(&gh);
 		if (unlikely(error))
 			goto out_unlock;
 	}
@@ -282,7 +282,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
 		gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
 				 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
 		do_unlock = 1;
-		ret = gfs2_glock_nq_m_atime(1, &gh);
+		ret = gfs2_glock_nq_atime(&gh);
 		if (ret == GLR_TRYFAILED)
 			goto out_noerror;
 		if (unlikely(ret))
@@ -354,7 +354,7 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
 
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh);
-	error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
+	error = gfs2_glock_nq_atime(&ip->i_gh);
 	if (unlikely(error)) {
 		if (error == GLR_TRYFAILED)
 			error = AOP_TRUNCATED_PAGE;
@@ -609,7 +609,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	 * on this path. All we need change is atime.
 	 */
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	rv = gfs2_glock_nq_m_atime(1, &gh);
+	rv = gfs2_glock_nq_atime(&gh);
 	if (rv)
 		goto out;
 
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index eabf6c6..c2be216 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -253,7 +253,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 	u32 fsflags;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	error = gfs2_glock_nq_m_atime(1, &gh);
+	error = gfs2_glock_nq_atime(&gh);
 	if (error)
 		return error;
 
-- 
cgit v0.10.2


From 5e7d65cd9d3819512b059f4260de0119b985454c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 17 Nov 2006 12:27:44 -0500
Subject: [GFS2] Make sentinel dirents compatible with gfs1

When deleting directory entries, we set the inum.no_addr to zero
in a dirent when its the first dirent in a block and thus cannot
be merged into the previous dirent as is the usual case. In gfs1,
inum.no_formal_ino was used instead.

This patch changes gfs2 to set both inum.no_addr and inum.no_formal_ino
to zero. It also changes the test from just looking at inum.no_addr to
look at both inum.no_addr and inum.no_formal_ino and a sentinel is
now considered to be a dirent in which _either_ (or both) of them
is set to zero.

This resolves Red Hat bugzillas: #215809, #211465

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a2923fb..0fdcb77 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -340,10 +340,15 @@ fail:
 	return (copied) ? copied : error;
 }
 
+static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
+{
+	return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0;
+}
+
 static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
 				     const struct qstr *name, int ret)
 {
-	if (dent->de_inum.no_addr != 0 &&
+	if (!gfs2_dirent_sentinel(dent) &&
 	    be32_to_cpu(dent->de_hash) == name->hash &&
 	    be16_to_cpu(dent->de_name_len) == name->len &&
 	    memcmp(dent+1, name->name, name->len) == 0)
@@ -388,7 +393,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
 	unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
 	unsigned totlen = be16_to_cpu(dent->de_rec_len);
 
-	if (!dent->de_inum.no_addr)
+	if (gfs2_dirent_sentinel(dent))
 		actual = GFS2_DIRENT_SIZE(0);
 	if (totlen - actual >= required)
 		return 1;
@@ -405,7 +410,7 @@ static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
 			      void *opaque)
 {
 	struct dirent_gather *g = opaque;
-	if (dent->de_inum.no_addr) {
+	if (!gfs2_dirent_sentinel(dent)) {
 		g->pdent[g->offset++] = dent;
 	}
 	return 0;
@@ -433,10 +438,10 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
 	if (unlikely(offset + size > len))
 		goto error;
 	msg = "zero inode number";
-	if (unlikely(!first && !dent->de_inum.no_addr))
+	if (unlikely(!first && gfs2_dirent_sentinel(dent)))
 		goto error;
 	msg = "name length is greater than space in dirent";
-	if (dent->de_inum.no_addr &&
+	if (!gfs2_dirent_sentinel(dent) &&
 	    unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
 		     size))
 		goto error;
@@ -598,7 +603,7 @@ static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
 		return ret;
 
         /* Only the first dent could ever have de_inum.no_addr == 0 */
-	if (!tmp->de_inum.no_addr) {
+	if (gfs2_dirent_sentinel(tmp)) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -621,7 +626,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
 {
 	u16 cur_rec_len, prev_rec_len;
 
-	if (!cur->de_inum.no_addr) {
+	if (gfs2_dirent_sentinel(cur)) {
 		gfs2_consist_inode(dip);
 		return;
 	}
@@ -633,7 +638,8 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
 	   out the inode number and return.  */
 
 	if (!prev) {
-		cur->de_inum.no_addr = 0;	/* No endianess worries */
+		cur->de_inum.no_addr = 0;
+		cur->de_inum.no_formal_ino = 0;
 		return;
 	}
 
@@ -664,7 +670,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
 	struct gfs2_dirent *ndent;
 	unsigned offset = 0, totlen;
 
-	if (dent->de_inum.no_addr)
+	if (!gfs2_dirent_sentinel(dent))
 		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
 	totlen = be16_to_cpu(dent->de_rec_len);
 	BUG_ON(offset + name->len > totlen);
@@ -1002,7 +1008,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 		if (dirent_next(dip, obh, &next))
 			next = NULL;
 
-		if (dent->de_inum.no_addr &&
+		if (!gfs2_dirent_sentinel(dent) &&
 		    be32_to_cpu(dent->de_hash) < divider) {
 			struct qstr str;
 			str.name = (char*)(dent+1);
-- 
cgit v0.10.2


From fa2ecfc5e11b12f25b67f9c84ac6b0e74a6a0115 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 20 Nov 2006 10:04:49 -0500
Subject: [GFS2] Fix Kconfig wrt CRC32
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GFS2 requires the CRC32 library function. This was reported by
Toralf Förster.

Cc: Toralf Förster <toralf.foerster@gmx.de>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 8c27de8..c0791cb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -2,6 +2,7 @@ config GFS2_FS
 	tristate "GFS2 file system support"
 	depends on EXPERIMENTAL
 	select FS_POSIX_ACL
+	select CRC32
 	help
 	A cluster filesystem.
 
-- 
cgit v0.10.2


From 1a14d3a68f04527546121eb7b45187ff6af63151 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 20 Nov 2006 10:37:45 -0500
Subject: [GFS2] Simplify glops functions

The go_sync callback took two flags, but one of them was set on every
call, so this patch removes once of the flags and makes the previously
conditional operations (on this flag), unconditional.

The go_inval callback took three flags, each of which was set on every
call to it. This patch removes the flags and makes the operations
unconditional, which makes the logic rather more obvious.

Two now unused flags are also removed from incore.h.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index edc21c8..b8ba4d5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -847,12 +847,12 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 
 	if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
 		if (glops->go_inval)
-			glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+			glops->go_inval(gl, DIO_METADATA);
 	} else if (gl->gl_state == LM_ST_DEFERRED) {
 		/* We might not want to do this here.
 		   Look at moving to the inode glops. */
 		if (glops->go_inval)
-			glops->go_inval(gl, DIO_DATA);
+			glops->go_inval(gl, 0);
 	}
 
 	/*  Deal with each possible exit condition  */
@@ -954,7 +954,7 @@ void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
 	gfs2_assert_warn(sdp, state != gl->gl_state);
 
 	if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
-		glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+		glops->go_sync(gl);
 
 	gfs2_glock_hold(gl);
 	gl->gl_req_bh = xmote_bh;
@@ -995,7 +995,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	state_change(gl, LM_ST_UNLOCKED);
 
 	if (glops->go_inval)
-		glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+		glops->go_inval(gl, DIO_METADATA);
 
 	if (gh) {
 		spin_lock(&gl->gl_spin);
@@ -1041,7 +1041,7 @@ void gfs2_glock_drop_th(struct gfs2_glock *gl)
 	gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
 
 	if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
-		glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+		glops->go_sync(gl);
 
 	gfs2_glock_hold(gl);
 	gl->gl_req_bh = drop_bh;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index b92de0a..60561ca 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -173,23 +173,18 @@ static void gfs2_page_writeback(struct gfs2_glock *gl)
 /**
  * meta_go_sync - sync out the metadata for this glock
  * @gl: the glock
- * @flags: DIO_*
  *
  * Called when demoting or unlocking an EX glock.  We must flush
  * to disk all dirty buffers/pages relating to this glock, and must not
  * not return to caller to demote/unlock the glock until I/O is complete.
  */
 
-static void meta_go_sync(struct gfs2_glock *gl, int flags)
+static void meta_go_sync(struct gfs2_glock *gl)
 {
-	if (!(flags & DIO_METADATA))
-		return;
-
 	if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
 		gfs2_log_flush(gl->gl_sbd, gl);
 		gfs2_meta_sync(gl);
-		if (flags & DIO_RELEASE)
-			gfs2_ail_empty_gl(gl);
+		gfs2_ail_empty_gl(gl);
 	}
 
 }
@@ -264,31 +259,18 @@ static void inode_go_drop_th(struct gfs2_glock *gl)
 /**
  * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
  * @gl: the glock protecting the inode
- * @flags:
  *
  */
 
-static void inode_go_sync(struct gfs2_glock *gl, int flags)
+static void inode_go_sync(struct gfs2_glock *gl)
 {
-	int meta = (flags & DIO_METADATA);
-	int data = (flags & DIO_DATA);
-
 	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		if (meta && data) {
-			gfs2_page_writeback(gl);
-			gfs2_log_flush(gl->gl_sbd, gl);
-			gfs2_meta_sync(gl);
-			gfs2_page_wait(gl);
-			clear_bit(GLF_DIRTY, &gl->gl_flags);
-		} else if (meta) {
-			gfs2_log_flush(gl->gl_sbd, gl);
-			gfs2_meta_sync(gl);
-		} else if (data) {
-			gfs2_page_writeback(gl);
-			gfs2_page_wait(gl);
-		}
-		if (flags & DIO_RELEASE)
-			gfs2_ail_empty_gl(gl);
+		gfs2_page_writeback(gl);
+		gfs2_log_flush(gl->gl_sbd, gl);
+		gfs2_meta_sync(gl);
+		gfs2_page_wait(gl);
+		clear_bit(GLF_DIRTY, &gl->gl_flags);
+		gfs2_ail_empty_gl(gl);
 	}
 }
 
@@ -302,15 +284,13 @@ static void inode_go_sync(struct gfs2_glock *gl, int flags)
 static void inode_go_inval(struct gfs2_glock *gl, int flags)
 {
 	int meta = (flags & DIO_METADATA);
-	int data = (flags & DIO_DATA);
 
 	if (meta) {
 		struct gfs2_inode *ip = gl->gl_object;
 		gfs2_meta_inval(gl);
 		set_bit(GIF_INVALID, &ip->i_flags);
 	}
-	if (data)
-		gfs2_page_inval(gl);
+	gfs2_page_inval(gl);
 }
 
 /**
@@ -494,7 +474,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
 	if (gl->gl_state != LM_ST_UNLOCKED &&
 	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
 		gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
-		j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+		j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 
 		error = gfs2_find_jhead(sdp->sd_jdesc, &head);
 		if (error)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 227a74d..734421e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -14,8 +14,6 @@
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
-#define DIO_DATA	0x00000040
-#define DIO_RELEASE	0x00000080
 #define DIO_ALL		0x00000100
 
 struct gfs2_log_operations;
@@ -103,18 +101,17 @@ struct gfs2_bufdata {
 };
 
 struct gfs2_glock_operations {
-	void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
-			     int flags);
-	void (*go_xmote_bh) (struct gfs2_glock * gl);
-	void (*go_drop_th) (struct gfs2_glock * gl);
-	void (*go_drop_bh) (struct gfs2_glock * gl);
-	void (*go_sync) (struct gfs2_glock * gl, int flags);
-	void (*go_inval) (struct gfs2_glock * gl, int flags);
-	int (*go_demote_ok) (struct gfs2_glock * gl);
-	int (*go_lock) (struct gfs2_holder * gh);
-	void (*go_unlock) (struct gfs2_holder * gh);
-	void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
-	void (*go_greedy) (struct gfs2_glock * gl);
+	void (*go_xmote_th) (struct gfs2_glock *gl, unsigned int state, int flags);
+	void (*go_xmote_bh) (struct gfs2_glock *gl);
+	void (*go_drop_th) (struct gfs2_glock *gl);
+	void (*go_drop_bh) (struct gfs2_glock *gl);
+	void (*go_sync) (struct gfs2_glock *gl);
+	void (*go_inval) (struct gfs2_glock *gl, int flags);
+	int (*go_demote_ok) (struct gfs2_glock *gl);
+	int (*go_lock) (struct gfs2_holder *gh);
+	void (*go_unlock) (struct gfs2_holder *gh);
+	void (*go_callback) (struct gfs2_glock *gl, unsigned int state);
+	void (*go_greedy) (struct gfs2_glock *gl);
 	const int go_type;
 };
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0ef8317..1408c5f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -517,7 +517,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 		return error;
 
 	gfs2_meta_cache_flush(ip);
-	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 
 	error = gfs2_find_jhead(sdp->sd_jdesc, &head);
 	if (error)
-- 
cgit v0.10.2


From 28626e2078571c4b776a17eaa486bbd2b7dfe2cd Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 22 Nov 2006 11:13:21 -0500
Subject: [GFS2] Fix glock ordering on inode creation

The lock order here should be parent -> child rather than
numeric order.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index ce7f833..d122074c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -870,33 +870,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock;
 
-	if (inum.no_addr < dip->i_num.no_addr) {
-		gfs2_glock_dq(ghs);
-
-		error = gfs2_glock_nq_num(sdp, inum.no_addr,
-					  &gfs2_inode_glops, LM_ST_EXCLUSIVE,
-					  GL_SKIP, ghs + 1);
-		if (error) {
-			return ERR_PTR(error);
-		}
-
-		gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
-		error = gfs2_glock_nq(ghs);
-		if (error) {
-			gfs2_glock_dq_uninit(ghs + 1);
-			return ERR_PTR(error);
-		}
-
-		error = create_ok(dip, name, mode);
-		if (error)
-			goto fail_gunlock2;
-	} else {
-		error = gfs2_glock_nq_num(sdp, inum.no_addr,
-					  &gfs2_inode_glops, LM_ST_EXCLUSIVE,
-					  GL_SKIP, ghs + 1);
-		if (error)
-			goto fail_gunlock;
-	}
+	error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
+				  LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+	if (error)
+		goto fail_gunlock;
 
 	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev);
 	if (error)
-- 
cgit v0.10.2


From ae619320b22f8e0b2bbe4a3a5ac2f9ccf08d7ec2 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 22 Nov 2006 11:28:47 -0500
Subject: [GFS2] mark_inode_dirty after write to stuffed file

Writes to stuffed files were not being marked dirty correctly.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 8676c39..d8d69a7 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -472,8 +472,10 @@ static int gfs2_commit_write(struct file *file, struct page *page,
 
 		SetPageUptodate(page);
 
-		if (inode->i_size < file_size)
+		if (inode->i_size < file_size) {
 			i_size_write(inode, file_size);
+			mark_inode_dirty(inode);
+		}
 	} else {
 		if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
 		    gfs2_is_jdata(ip))
-- 
cgit v0.10.2


From b004157ab5b374a498a5874cda68c389219d23e7 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 23 Nov 2006 10:51:34 -0500
Subject: [GFS2] Fix journal flush problem

This fixes a bug which resulted in poor performance due to flushing
the journal too often. The code path in question was via the inode_go_sync()
function in glops.c. The solution is not to flush the journal immediately
when inodes are ejected from memory, but batch up the work for glockd to
deal with later on. This means that glocks may now live on beyond the end of
the lifetime of their inodes (but not very much longer in the normal case).

Also fixed in this patch is a bug (which was hidden by the bug mentioned above) in
calculation of the number of free journal blocks.

The gfs2_logd process has been altered to be more responsive to the journal
filling up. We now wake it up when the number of uncommitted journal blocks
has reached the threshold level rather than trying to flush directly at the
end of each transaction. This again means doing fewer, but larger, log
flushes in general.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index cab1f68..683cb5b 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -112,6 +112,7 @@ int gfs2_logd(void *data)
 	struct gfs2_sbd *sdp = data;
 	struct gfs2_holder ji_gh;
 	unsigned long t;
+	int need_flush;
 
 	while (!kthread_should_stop()) {
 		/* Advance the log tail */
@@ -120,8 +121,10 @@ int gfs2_logd(void *data)
 		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
 
 		gfs2_ail1_empty(sdp, DIO_ALL);
-
-		if (time_after_eq(jiffies, t)) {
+		gfs2_log_lock(sdp);
+		need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
+		gfs2_log_unlock(sdp);
+		if (need_flush || time_after_eq(jiffies, t)) {
 			gfs2_log_flush(sdp, NULL);
 			sdp->sd_log_flush_time = jiffies;
 		}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b8ba4d5..3c2ff81 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -785,21 +785,6 @@ out:
 		gfs2_holder_put(new_gh);
 }
 
-void gfs2_glock_inode_squish(struct inode *inode)
-{
-	struct gfs2_holder gh;
-	struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
-	gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
-	set_bit(HIF_DEMOTE, &gh.gh_iflags);
-	spin_lock(&gl->gl_spin);
-	gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
-	list_add_tail(&gh.gh_list, &gl->gl_waiters2);
-	run_queue(gl);
-	spin_unlock(&gl->gl_spin);
-	wait_for_completion(&gh.gh_wait);
-	gfs2_holder_uninit(&gh);
-}
-
 /**
  * state_change - record that the glock is now in a different state
  * @gl: the glock
@@ -1920,7 +1905,7 @@ out:
 
 static void scan_glock(struct gfs2_glock *gl)
 {
-	if (gl->gl_ops == &gfs2_inode_glops)
+	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
 		return;
 
 	if (gfs2_glmutex_trylock(gl)) {
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a331bf8..fb39108 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -106,7 +106,6 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
 			     const struct gfs2_glock_operations *glops,
 			     unsigned int state, int flags);
-void gfs2_glock_inode_squish(struct inode *inode);
 
 /**
  * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 60561ca..b068d10 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -107,70 +107,6 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
 }
 
 /**
- * gfs2_page_inval - Invalidate all pages associated with a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_page_inval(struct gfs2_glock *gl)
-{
-	struct gfs2_inode *ip;
-	struct inode *inode;
-
-	ip = gl->gl_object;
-	inode = &ip->i_inode;
-	if (!ip || !S_ISREG(inode->i_mode))
-		return;
-
-	truncate_inode_pages(inode->i_mapping, 0);
-	gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
-	clear_bit(GIF_PAGED, &ip->i_flags);
-}
-
-/**
- * gfs2_page_wait - Wait for writeback of data
- * @gl: the glock
- *
- * Syncs data (not metadata) for a regular file.
- * No-op for all other types.
- */
-
-static void gfs2_page_wait(struct gfs2_glock *gl)
-{
-	struct gfs2_inode *ip = gl->gl_object;
-	struct inode *inode = &ip->i_inode;
-	struct address_space *mapping = inode->i_mapping;
-	int error;
-
-	if (!S_ISREG(inode->i_mode))
-		return;
-
-	error = filemap_fdatawait(mapping);
-
-	/* Put back any errors cleared by filemap_fdatawait()
-	   so they can be caught by someone who can pass them
-	   up to user space. */
-
-	if (error == -ENOSPC)
-		set_bit(AS_ENOSPC, &mapping->flags);
-	else if (error)
-		set_bit(AS_EIO, &mapping->flags);
-
-}
-
-static void gfs2_page_writeback(struct gfs2_glock *gl)
-{
-	struct gfs2_inode *ip = gl->gl_object;
-	struct inode *inode = &ip->i_inode;
-	struct address_space *mapping = inode->i_mapping;
-
-	if (!S_ISREG(inode->i_mode))
-		return;
-
-	filemap_fdatawrite(mapping);
-}
-
-/**
  * meta_go_sync - sync out the metadata for this glock
  * @gl: the glock
  *
@@ -264,11 +200,24 @@ static void inode_go_drop_th(struct gfs2_glock *gl)
 
 static void inode_go_sync(struct gfs2_glock *gl)
 {
+	struct gfs2_inode *ip = gl->gl_object;
+
+	if (ip && !S_ISREG(ip->i_inode.i_mode))
+		ip = NULL;
+
 	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		gfs2_page_writeback(gl);
 		gfs2_log_flush(gl->gl_sbd, gl);
+		if (ip)
+			filemap_fdatawrite(ip->i_inode.i_mapping);
 		gfs2_meta_sync(gl);
-		gfs2_page_wait(gl);
+		if (ip) {
+			struct address_space *mapping = ip->i_inode.i_mapping;
+			int error = filemap_fdatawait(mapping);
+			if (error == -ENOSPC)
+				set_bit(AS_ENOSPC, &mapping->flags);
+			else if (error)
+				set_bit(AS_EIO, &mapping->flags);
+		}
 		clear_bit(GLF_DIRTY, &gl->gl_flags);
 		gfs2_ail_empty_gl(gl);
 	}
@@ -283,14 +232,20 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
 static void inode_go_inval(struct gfs2_glock *gl, int flags)
 {
+	struct gfs2_inode *ip = gl->gl_object;
 	int meta = (flags & DIO_METADATA);
 
 	if (meta) {
-		struct gfs2_inode *ip = gl->gl_object;
 		gfs2_meta_inval(gl);
-		set_bit(GIF_INVALID, &ip->i_flags);
+		if (ip)
+			set_bit(GIF_INVALID, &ip->i_flags);
+	}
+
+	if (ip && S_ISREG(ip->i_inode.i_mode)) {
+		truncate_inode_pages(ip->i_inode.i_mapping, 0);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !ip->i_inode.i_mapping->nrpages);
+		clear_bit(GIF_PAGED, &ip->i_flags);
 	}
-	gfs2_page_inval(gl);
 }
 
 /**
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 0cace3d..6456fc3 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -261,6 +261,12 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
  * @sdp: The GFS2 superblock
  * @blks: The number of blocks to reserve
  *
+ * Note that we never give out the last 6 blocks of the journal. Thats
+ * due to the fact that there is are a small number of header blocks
+ * associated with each log flush. The exact number can't be known until
+ * flush time, so we ensure that we have just enough free blocks at all
+ * times to avoid running out during a log flush.
+ *
  * Returns: errno
  */
 
@@ -274,7 +280,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 
 	mutex_lock(&sdp->sd_log_reserve_mutex);
 	gfs2_log_lock(sdp);
-	while(sdp->sd_log_blks_free <= blks) {
+	while(sdp->sd_log_blks_free <= (blks + 6)) {
 		gfs2_log_unlock(sdp);
 		gfs2_ail1_empty(sdp, 0);
 		gfs2_log_flush(sdp, NULL);
@@ -643,12 +649,9 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	up_read(&sdp->sd_log_flush_lock);
 
 	gfs2_log_lock(sdp);
-	if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
-		gfs2_log_unlock(sdp);
-		gfs2_log_flush(sdp, NULL);
-	} else {
-		gfs2_log_unlock(sdp);
-	}
+	if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks))
+		wake_up_process(sdp->sd_logd_process);
+	gfs2_log_unlock(sdp);
 }
 
 /**
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 3912d6a..939a09f 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -472,6 +472,9 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 	struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
 	int in_cache = 0;
 
+	BUG_ON(!gl);
+	BUG_ON(!sdp);
+
 	spin_lock(&ip->i_spin);
 	if (*bh_slot && (*bh_slot)->b_blocknr == num) {
 		bh = *bh_slot;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 8635175..7685b46 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -157,7 +157,8 @@ static void gfs2_write_super(struct super_block *sb)
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
 	sb->s_dirt = 0;
-	gfs2_log_flush(sb->s_fs_info, NULL);
+	if (wait)
+		gfs2_log_flush(sb->s_fs_info, NULL);
 	return 0;
 }
 
@@ -293,8 +294,6 @@ static void gfs2_clear_inode(struct inode *inode)
 	 */
 	if (inode->i_private) {
 		struct gfs2_inode *ip = GFS2_I(inode);
-		gfs2_glock_inode_squish(inode);
-		gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
 		ip->i_gl->gl_object = NULL;
 		gfs2_glock_schedule_for_reclaim(ip->i_gl);
 		gfs2_glock_put(ip->i_gl);
@@ -395,7 +394,7 @@ static void gfs2_delete_inode(struct inode *inode)
 	if (!inode->i_private)
 		goto out;
 
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB, &gh);
 	if (unlikely(error)) {
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 		goto out;
-- 
cgit v0.10.2


From a25311c8e0b7071b129ca9a9e49e22eeaf620864 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 23 Nov 2006 11:06:35 -0500
Subject: [GFS2] Move gfs2_meta_syncfs() into log.c

By moving gfs2_meta_syncfs() into log.c, gfs2_ail1_start()
can be made static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6456fc3..7713d59 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -15,6 +15,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/delay.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -142,7 +143,7 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
 	return list_empty(&ai->ai_ail1_list);
 }
 
-void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
 {
 	struct list_head *head = &sdp->sd_ail1_list;
 	u64 sync_gen;
@@ -689,3 +690,21 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 	up_write(&sdp->sd_log_flush_lock);
 }
 
+
+/**
+ * gfs2_meta_syncfs - sync all the buffers in a filesystem
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
+{
+	gfs2_log_flush(sdp, NULL);
+	for (;;) {
+		gfs2_ail1_start(sdp, DIO_ALL);
+		if (gfs2_ail1_empty(sdp, DIO_ALL))
+			break;
+		msleep(10);
+	}
+}
+
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7f5737d..8e7aa0f 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,7 +48,6 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 			    unsigned int ssize);
 
-void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
 int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
@@ -61,5 +60,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
 
 void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
 
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939a09f..fbeba81 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -574,20 +574,3 @@ out:
 	return first_bh;
 }
 
-/**
- * gfs2_meta_syncfs - sync all the buffers in a filesystem
- * @sdp: the filesystem
- *
- */
-
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
-{
-	gfs2_log_flush(sdp, NULL);
-	for (;;) {
-		gfs2_ail1_start(sdp, DIO_ALL);
-		if (gfs2_ail1_empty(sdp, DIO_ALL))
-			break;
-		msleep(10);
-	}
-}
-
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 3ec939e..e037425 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -67,7 +67,6 @@ static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
 }
 
 struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
 
 #define buffer_busy(bh) \
 ((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
-- 
cgit v0.10.2


From cb4c03131836a55bf95e1c165409244ac6b4f39f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 23 Nov 2006 11:16:32 -0500
Subject: [GFS2] Reduce number of arguments to meta_io.c:getbuf()

Since the superblock and the address_space are determined by the
glock, we might as well just pass that as the argument since all
the callers already have that available.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index fbeba81..0e34d99 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -127,17 +127,17 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 
 /**
  * getbuf - Get a buffer with a given address space
- * @sdp: the filesystem
- * @aspace: the address space
+ * @gl: the glock
  * @blkno: the block number (filesystem scope)
  * @create: 1 if the buffer should be created
  *
  * Returns: the buffer
  */
 
-static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
-				  u64 blkno, int create)
+static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
+	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct page *page;
 	struct buffer_head *bh;
 	unsigned int shift;
@@ -150,13 +150,13 @@ static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
 
 	if (create) {
 		for (;;) {
-			page = grab_cache_page(aspace->i_mapping, index);
+			page = grab_cache_page(mapping, index);
 			if (page)
 				break;
 			yield();
 		}
 	} else {
-		page = find_lock_page(aspace->i_mapping, index);
+		page = find_lock_page(mapping, index);
 		if (!page)
 			return NULL;
 	}
@@ -202,7 +202,7 @@ static void meta_prep_new(struct buffer_head *bh)
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 {
 	struct buffer_head *bh;
-	bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
+	bh = getbuf(gl, blkno, CREATE);
 	meta_prep_new(bh);
 	return bh;
 }
@@ -220,7 +220,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		   struct buffer_head **bhp)
 {
-	*bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
+	*bhp = getbuf(gl, blkno, CREATE);
 	if (!buffer_uptodate(*bhp))
 		ll_rw_block(READ_META, 1, bhp);
 	if (flags & DIO_WAIT) {
@@ -379,11 +379,10 @@ void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct inode *aspace = ip->i_gl->gl_aspace;
 	struct buffer_head *bh;
 
 	while (blen) {
-		bh = getbuf(sdp, aspace, bstart, NO_CREATE);
+		bh = getbuf(ip->i_gl, bstart, NO_CREATE);
 		if (bh) {
 			struct gfs2_bufdata *bd = bh->b_private;
 
@@ -484,7 +483,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 	spin_unlock(&ip->i_spin);
 
 	if (!bh)
-		bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE);
+		bh = getbuf(gl, num, CREATE);
 
 	if (!bh)
 		return -ENOBUFS;
@@ -535,7 +534,6 @@ err:
 struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct inode *aspace = gl->gl_aspace;
 	struct buffer_head *first_bh, *bh;
 	u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
 			  sdp->sd_sb.sb_bsize_shift;
@@ -547,7 +545,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (extlen > max_ra)
 		extlen = max_ra;
 
-	first_bh = getbuf(sdp, aspace, dblock, CREATE);
+	first_bh = getbuf(gl, dblock, CREATE);
 
 	if (buffer_uptodate(first_bh))
 		goto out;
@@ -558,7 +556,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	extlen--;
 
 	while (extlen) {
-		bh = getbuf(sdp, aspace, dblock, CREATE);
+		bh = getbuf(gl, dblock, CREATE);
 
 		if (!buffer_uptodate(bh) && !buffer_locked(bh))
 			ll_rw_block(READA, 1, &bh);
-- 
cgit v0.10.2


From 300c7d75f3a5e8edd3e390ccd56b808f3fb14e33 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 27 Nov 2006 09:55:28 -0500
Subject: [GFS2] Fix recursive locking in gfs2_permission

Since gfs2_permission may be called either from the VFS (in which case
we need to obtain a shared glock) or from GFS2 (in which case we already
have a glock) we need to test to see whether or not a lock is required.
The original test was buggy due to a potential race. This one should
be safe.

This fixes Red Hat bugzilla #217129

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index b247f25..fd9fee2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -835,6 +835,10 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
  * @mask:
  * @nd: passed from Linux VFS, ignored by us
  *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so we look to see if the glock is already locked and only
+ * lock the glock if its not already been done.
+ *
  * Returns: errno
  */
 
@@ -843,15 +847,18 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder i_gh;
 	int error;
+	int unlock = 0;
 
-	if (!test_bit(GIF_INVALID, &ip->i_flags))
-		return generic_permission(inode, mask, gfs2_check_acl);
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+		if (error)
+			return error;
+		unlock = 1;
+	}
 
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (!error) {
-		error = generic_permission(inode, mask, gfs2_check_acl_locked);
+	error = generic_permission(inode, mask, gfs2_check_acl_locked);
+	if (unlock)
 		gfs2_glock_dq_uninit(&i_gh);
-	}
 
 	return error;
 }
-- 
cgit v0.10.2


From dcf3dd852f554bb0016aa23892596717cc123a26 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 27 Nov 2006 10:12:05 -0500
Subject: [GFS2] Fix recursive locking in gfs2_getattr

The readdirplus NFS operation can result in gfs2_getattr being
called with the glock already held. In this case we do not want
to try and grab the lock again.

This fixes Red Hat bugzilla #215727

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index fd9fee2..fbe38c1 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -992,6 +992,12 @@ out:
  * @dentry: The dentry to stat
  * @stat: The inode's stats
  *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so we look to see if the glock is already locked and only
+ * lock the glock if its not already been done. Note that its the NFS
+ * readdirplus operation which causes this to be called (from filldir)
+ * with the glock already held.
+ *
  * Returns: errno
  */
 
@@ -1002,14 +1008,20 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int error;
+	int unlock = 0;
 
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-	if (!error) {
-		generic_fillattr(inode, stat);
-		gfs2_glock_dq_uninit(&gh);
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+		if (error)
+			return error;
+		unlock = 1;
 	}
 
-	return error;
+	generic_fillattr(inode, stat);
+	if (unlock);
+		gfs2_glock_dq_uninit(&gh);
+
+	return 0;
 }
 
 static int gfs2_setxattr(struct dentry *dentry, const char *name,
-- 
cgit v0.10.2


From 2896ee37ccc1f9acb244c9b02becb74a43661009 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 27 Nov 2006 11:31:22 -0600
Subject: [DLM] fix add_requestqueue checking nodes list

Requests that arrive after recovery has started are saved in the
requestqueue and processed after recovery is done.  Some of these requests
are purged during recovery if they are from nodes that have been removed.
We move the purging of the requests (dlm_purge_requestqueue) to later in
the recovery sequence which allows the routine saving requests
(dlm_add_requestqueue) to avoid filtering out requests by nodeid since the
same will be done by the purge.  The current code has add_requestqueue
filtering by nodeid but doesn't hold any locks when accessing the list of
current nodes.  This also means that we need to call the purge routine
when the lockspace is being shut down since the add routine will not be
rejecting requests itself any more.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f8842ca..791388b 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -22,6 +22,7 @@
 #include "memory.h"
 #include "lock.h"
 #include "recover.h"
+#include "requestqueue.h"
 
 #ifdef CONFIG_DLM_DEBUG
 int dlm_create_debug_file(struct dlm_ls *ls);
@@ -684,6 +685,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	 * Free structures on any other lists
 	 */
 
+	dlm_purge_requestqueue(ls);
 	kfree(ls->ls_recover_args);
 	dlm_clear_free_entries(ls);
 	dlm_clear_members(ls);
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 8bb895f..9dc2f91 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -94,14 +94,6 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 	}
 
 	/*
-	 * Purge directory-related requests that are saved in requestqueue.
-	 * All dir requests from before recovery are invalid now due to the dir
-	 * rebuild and will be resent by the requesting nodes.
-	 */
-
-	dlm_purge_requestqueue(ls);
-
-	/*
 	 * Wait for all nodes to complete directory rebuild.
 	 */
 
@@ -181,6 +173,14 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	dlm_release_root_list(ls);
 
+	/*
+	 * Purge directory-related requests that are saved in requestqueue.
+	 * All dir requests from before recovery are invalid now due to the dir
+	 * rebuild and will be resent by the requesting nodes.
+	 */
+
+	dlm_purge_requestqueue(ls);
+
 	dlm_set_recover_status(ls, DLM_RS_DONE);
 	error = dlm_recover_done_wait(ls);
 	if (error) {
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 0226d2a..65008d7 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -36,9 +36,6 @@ int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
 	int length = hd->h_length;
 	int rv = 0;
 
-	if (dlm_is_removed(ls, nodeid))
-		return 0;
-
 	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
 	if (!e) {
 		log_print("dlm_add_requestqueue: out of memory\n");
@@ -133,6 +130,10 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
 {
 	uint32_t type = ms->m_type;
 
+	/* the ls is being cleaned up and freed by release_lockspace */
+	if (!ls->ls_count)
+		return 1;
+
 	if (dlm_is_removed(ls, nodeid))
 		return 1;
 
-- 
cgit v0.10.2


From aed3255f2267e2d1d95b9cf7f2995ce24e6c873b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <ryusuke@osrg.net>
Date: Tue, 28 Nov 2006 02:53:22 +0900
Subject: [GFS2] fs/gfs2/log.c:log_bmap() fix printk format warning

Fix a printk format warning in fs/gfs2/log.c:
fs/gfs2/log.c:322: warning: format '%llu' expects type 'long long unsigned int', but argument 3 has type 'sector_t'

Signed-off-by: Ryusuke Konishi <ryusuke@osrg.net>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 7713d59..291415d 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -326,7 +326,8 @@ static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 	bh_map.b_size = 1 << inode->i_blkbits;
 	error = gfs2_block_map(inode, lbn, 0, &bh_map);
 	if (error || !bh_map.b_blocknr)
-		printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, bh_map.b_blocknr, lbn);
+		printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error,
+		       (unsigned long long)bh_map.b_blocknr, lbn);
 	gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
 
 	return bh_map.b_blocknr;
-- 
cgit v0.10.2


From 1babdb453138f17b8ed3d1d5711089c4e2fa5ace Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 27 Nov 2006 13:18:41 -0600
Subject: [DLM] fix size of STATUS_REPLY message

When the not_ready routine sends a "fake" status reply with blank status
flags, it needs to use the correct size for a normal STATUS_REPLY by
including the size of the would-be config parameters.  We also fill in the
non-existant config parameters with an invalid lvblen value so it's easier
to notice if these invalid paratmers are ever being used.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 87b12f7..6ac195c 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -370,9 +370,10 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
 {
 	struct dlm_rcom *rc;
+	struct rcom_config *rf;
 	struct dlm_mhandle *mh;
 	char *mb;
-	int mb_len = sizeof(struct dlm_rcom);
+	int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
 
 	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
 	if (!mh)
@@ -391,6 +392,9 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
 	rc->rc_id = rc_in->rc_id;
 	rc->rc_result = -ESRCH;
 
+	rf = (struct rcom_config *) rc->rc_buf;
+	rf->rf_lvblen = -1;
+
 	dlm_rcom_out(rc);
 	dlm_lowcomms_commit_buffer(mh);
 
-- 
cgit v0.10.2


From 98f176fb32f33795b6d0f83856008b932123ab38 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 27 Nov 2006 13:19:28 -0600
Subject: [DLM] don't accept replies to old recovery messages

We often abort a recovery after sending a status request to a remote node.
We want to ignore any potential status reply we get from the remote node.
If we get one of these unwanted replies, we've often moved on to the next
recovery message and incremented the message sequence counter, so the
reply will be ignored due to the seq number.  In some cases, we've not
moved on to the next message so the seq number of the reply we want to
ignore is still correct, causing the reply to be accepted.  The next
recovery message will then mistake this old reply as a new one.

To fix this, we add the flag RCOM_WAIT to indicate when we can accept a
new reply.  We clear this flag if we abort recovery while waiting for a
reply.  Before the flag is set again (to allow new replies) we know that
any old replies will be rejected due to their sequence number.  We also
initialize the recovery-message sequence number to a random value when a
lockspace is first created.  This makes it clear when messages are being
rejected from an old instance of a lockspace that has since been
recreated.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 1e5cd67..1ee8195 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -471,6 +471,7 @@ struct dlm_ls {
 	char			*ls_recover_buf;
 	int			ls_recover_nodeid; /* for debugging */
 	uint64_t		ls_rcom_seq;
+	spinlock_t		ls_rcom_spin;
 	struct list_head	ls_recover_list;
 	spinlock_t		ls_recover_list_lock;
 	int			ls_recover_list_count;
@@ -488,7 +489,8 @@ struct dlm_ls {
 #define LSFL_RUNNING		1
 #define LSFL_RECOVERY_STOP	2
 #define LSFL_RCOM_READY		3
-#define LSFL_UEVENT_WAIT	4
+#define LSFL_RCOM_WAIT		4
+#define LSFL_UEVENT_WAIT	5
 
 /* much of this is just saving user space pointers associated with the
    lock that we pass back to the user lib with an ast */
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 791388b..59012b0 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -479,6 +479,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	ls->ls_recoverd_task = NULL;
 	mutex_init(&ls->ls_recoverd_active);
 	spin_lock_init(&ls->ls_recover_lock);
+	spin_lock_init(&ls->ls_rcom_spin);
+	get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t));
 	ls->ls_recover_status = 0;
 	ls->ls_recover_seq = 0;
 	ls->ls_recover_args = NULL;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 6ac195c..c42f2db 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -90,13 +90,28 @@ static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
 	return 0;
 }
 
+static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq)
+{
+	spin_lock(&ls->ls_rcom_spin);
+	*new_seq = ++ls->ls_rcom_seq;
+	set_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+	spin_unlock(&ls->ls_rcom_spin);
+}
+
+static void disallow_sync_reply(struct dlm_ls *ls)
+{
+	spin_lock(&ls->ls_rcom_spin);
+	clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
+	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	spin_unlock(&ls->ls_rcom_spin);
+}
+
 int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
 {
 	struct dlm_rcom *rc;
 	struct dlm_mhandle *mh;
 	int error = 0;
 
-	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
 	ls->ls_recover_nodeid = nodeid;
 
 	if (nodeid == dlm_our_nodeid()) {
@@ -108,12 +123,14 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
 	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
 	if (error)
 		goto out;
-	rc->rc_id = ++ls->ls_rcom_seq;
+
+	allow_sync_reply(ls, &rc->rc_id);
+	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
 
 	send_rcom(ls, mh, rc);
 
 	error = dlm_wait_function(ls, &rcom_response);
-	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	disallow_sync_reply(ls);
 	if (error)
 		goto out;
 
@@ -150,14 +167,20 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 
 static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 {
-	if (rc_in->rc_id != ls->ls_rcom_seq) {
-		log_debug(ls, "reject old reply %d got %llx wanted %llx",
-			  rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq);
-		return;
+	spin_lock(&ls->ls_rcom_spin);
+	if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) ||
+	    rc_in->rc_id != ls->ls_rcom_seq) {
+		log_debug(ls, "reject reply %d from %d seq %llx expect %llx",
+			  rc_in->rc_type, rc_in->rc_header.h_nodeid,
+			  rc_in->rc_id, ls->ls_rcom_seq);
+		goto out;
 	}
 	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
 	set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
 	wake_up(&ls->ls_wait_general);
+ out:
+	spin_unlock(&ls->ls_rcom_spin);
 }
 
 static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
@@ -171,7 +194,6 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 	struct dlm_mhandle *mh;
 	int error = 0, len = sizeof(struct dlm_rcom);
 
-	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
 	ls->ls_recover_nodeid = nodeid;
 
 	if (nodeid == dlm_our_nodeid()) {
@@ -185,12 +207,14 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 	if (error)
 		goto out;
 	memcpy(rc->rc_buf, last_name, last_len);
-	rc->rc_id = ++ls->ls_rcom_seq;
+
+	allow_sync_reply(ls, &rc->rc_id);
+	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
 
 	send_rcom(ls, mh, rc);
 
 	error = dlm_wait_function(ls, &rcom_response);
-	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	disallow_sync_reply(ls);
  out:
 	return error;
 }
-- 
cgit v0.10.2


From 0ac230699a0f3f0d15ad4e4ad99446dac5b4a21f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 28 Nov 2006 22:29:19 -0800
Subject: [GFS2] lock function parameter

Fix function parameter typing:
fs/gfs2/glock.c:100: warning: function declaration isn't a prototype

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3c2ff81..f130f98 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -96,7 +96,7 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
 	return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
 }
 #else /* not SMP, so no spinlocks required */
-static inline rwlock_t *gl_lock_addr(x)
+static inline rwlock_t *gl_lock_addr(unsigned int x)
 {
 	return NULL;
 }
-- 
cgit v0.10.2


From 57adf7eede38d315e0e328c52484d6a596e9a238 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <ryusuke@osrg.net>
Date: Wed, 29 Nov 2006 09:33:48 -0500
Subject: [DLM] fix format warnings in rcom.c and recoverd.c

This fixes the following gcc warnings generated on
the architectures where uint64_t != unsigned long long (e.g. ppc64).

fs/dlm/rcom.c:154: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'uint64_t'
fs/dlm/rcom.c:154: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'uint64_t'
fs/dlm/recoverd.c:48: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'uint64_t'
fs/dlm/recoverd.c:202: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'uint64_t'
fs/dlm/recoverd.c:210: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'uint64_t'

Signed-off-by: Ryusuke Konishi <ryusuke@osrg.net>
Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index c42f2db..4cc31be 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -172,7 +172,8 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	    rc_in->rc_id != ls->ls_rcom_seq) {
 		log_debug(ls, "reject reply %d from %d seq %llx expect %llx",
 			  rc_in->rc_type, rc_in->rc_header.h_nodeid,
-			  rc_in->rc_id, ls->ls_rcom_seq);
+			  (unsigned long long)rc_in->rc_id,
+			  (unsigned long long)ls->ls_rcom_seq);
 		goto out;
 	}
 	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 9dc2f91..650536aa 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -45,7 +45,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 	unsigned long start;
 	int error, neg = 0;
 
-	log_debug(ls, "recover %llx", rv->seq);
+	log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
 
 	mutex_lock(&ls->ls_recoverd_active);
 
@@ -212,7 +212,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	dlm_astd_wake();
 
-	log_debug(ls, "recover %llx done: %u ms", rv->seq,
+	log_debug(ls, "recover %llx done: %u ms",
+		  (unsigned long long)rv->seq,
 		  jiffies_to_msecs(jiffies - start));
 	mutex_unlock(&ls->ls_recoverd_active);
 
@@ -220,7 +221,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
  fail:
 	dlm_release_root_list(ls);
-	log_debug(ls, "recover %llx error %d", rv->seq, error);
+	log_debug(ls, "recover %llx error %d",
+		  (unsigned long long)rv->seq, error);
 	mutex_unlock(&ls->ls_recoverd_active);
 	return error;
 }
-- 
cgit v0.10.2


From 77386e1f662f104680da7885d32e068e4b11b882 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 29 Nov 2006 10:41:49 -0500
Subject: [GFS2] Remove gfs2_check_acl()

As pointed out by Adrian Bunk, the gfs2_check_acl() function is no
longer used. This patch removes it and renamed gfs2_check_acl_locked()
to gfs2_check_acl() since we only need one variant of that function now.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Adrian Bunk <bunk@stusta.de>

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3908992..6e80844 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -145,14 +145,14 @@ out:
 }
 
 /**
- * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
+ * gfs2_check_acl - Check an ACL to see if we're allowed to do something
  * @inode: the file we want to do something to
  * @mask: what we want to do
  *
  * Returns: errno
  */
 
-int gfs2_check_acl_locked(struct inode *inode, int mask)
+int gfs2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = NULL;
 	int error;
@@ -170,21 +170,6 @@ int gfs2_check_acl_locked(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int gfs2_check_acl(struct inode *inode, int mask)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder i_gh;
-	int error;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (!error) {
-		error = gfs2_check_acl_locked(inode, mask);
-		gfs2_glock_dq_uninit(&i_gh);
-	}
-
-	return error;
-}
-
 static int munge_mode(struct gfs2_inode *ip, mode_t mode)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 05c294f..6751930 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -31,7 +31,6 @@ int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
 			  struct gfs2_ea_request *er,
 			  int *remove, mode_t *mode);
 int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
-int gfs2_check_acl_locked(struct inode *inode, int mask);
 int gfs2_check_acl(struct inode *inode, int mask);
 int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
 int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index fbe38c1..636dda4 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -856,7 +856,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 		unlock = 1;
 	}
 
-	error = generic_permission(inode, mask, gfs2_check_acl_locked);
+	error = generic_permission(inode, mask, gfs2_check_acl);
 	if (unlock)
 		gfs2_glock_dq_uninit(&i_gh);
 
-- 
cgit v0.10.2


From 0da3585e1ef650d3224b4d6f9799558d1d99fa1e Mon Sep 17 00:00:00 2001
From: Srinivasa Ds <srinivasa@in.ibm.com>
Date: Thu, 30 Nov 2006 15:04:55 +0530
Subject: [GFS2] Mount problem with the GFS2 code

  While mounting the gfs2 filesystem,our test team had a problem and we
got this error message.
=======================================================

GFS2: fsid=: Trying to join cluster "lock_nolock", "dasde1"
GFS2: fsid=dasde1.0: Joined cluster. Now mounting FS...
GFS2: not a GFS2 filesystem
GFS2: fsid=dasde1.0: can't read superblock: -22

==========================================================================
On debugging further we found that problem is while reading the super
block(gfs2_read_super) and comparing the magic number in it.
When I  replace the submit_bio() call(present in gfs2_read_super) with
the sb_getblk() and ll_rw_block(), mount operation succeded.
On further analysis we found that before calling submit_bio(),
bio->bi_sector was set to "sector" variable. This "sector" variable has
the same value of bh->b_blocknr(block number). Hence there is a need to
multiply this valuwith (blocksize >> 9)(9 because,sector size
2^9,samething happens in ll_rw_block also, before calling submit_bio()).
So I have developed the patch which solves this problem. Please let me
know your comments.
================================================================

Signed-off-by: Srinivasa DS <srinivasa@in.ibm.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 1408c5f..3b22727 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -199,7 +199,7 @@ struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
 		return NULL;
 	}
 
-	bio->bi_sector = sector;
+	bio->bi_sector = sector * (sb->s_blocksize >> 9);
 	bio->bi_bdev = sb->s_bdev;
 	bio_add_page(bio, page, PAGE_SIZE, 0);
 
-- 
cgit v0.10.2


From aac1a3c77a46c2d06f297641760dd740ac2a84af Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 30 Nov 2006 10:02:19 -0500
Subject: [GFS2] Add a comment about reading the super block

The comment explains why we use the bio functions to read
the super block.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Srinivasa Ds <srinivasa@in.ibm.com>

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 3b22727..43a24f2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -180,6 +180,24 @@ static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
 	return 0;
 }
 
+/**
+ * gfs2_read_super - Read the gfs2 super block from disk
+ * @sb: The VFS super block
+ * @sector: The location of the super block
+ *
+ * This uses the bio functions to read the super block from disk
+ * because we want to be 100% sure that we never read cached data.
+ * A super block is read twice only during each GFS2 mount and is
+ * never written to by the filesystem. The first time its read no
+ * locks are held, and the only details which are looked at are those
+ * relating to the locking protocol. Once locking is up and working,
+ * the sb is read again under the lock to establish the location of
+ * the master directory (contains pointers to journals etc) and the
+ * root directory.
+ *
+ * Returns: A page containing the sb or NULL
+ */
+
 struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
 {
 	struct page *page;
-- 
cgit v0.10.2


From 33c3de32872ef3c075e4dac04c0de8f86ac39f6f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 30 Nov 2006 10:14:32 -0500
Subject: [GFS2] Don't flush everything on fdatasync

The gfs2_fsync() function was doing a journal flush on each
and every call. While this is correct, its also a lot of
overhead. This patch means that on fdatasync flushes we
rely on the VFS to flush the data for us and we don't do
a journal flush unless we really need to.

We have to do a journal flush for stuffed files though because
they have the data and the inode metadata in the same block.
Journaled files also need a journal flush too of course.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index c2be216..7bd971b 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -22,6 +22,7 @@
 #include <linux/ext2_fs.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/writeback.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -503,16 +504,39 @@ static int gfs2_close(struct inode *inode, struct file *file)
  * @file: the file that points to the dentry (we ignore this)
  * @dentry: the dentry that points to the inode to sync
  *
+ * The VFS will flush "normal" data for us. We only need to worry
+ * about metadata here. For journaled data, we just do a log flush
+ * as we can't avoid it. Otherwise we can just bale out if datasync
+ * is set. For stuffed inodes we must flush the log in order to
+ * ensure that all data is on disk.
+ *
  * Returns: errno
  */
 
 static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct inode *inode = dentry->d_inode;
+	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
+	int ret = 0;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0,
+	};
+
+	if (gfs2_is_jdata(GFS2_I(inode))) {
+		gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+		return 0;
+	}
 
-	gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+	if (sync_state != 0) {
+		if (!datasync)
+			ret = sync_inode(inode, &wbc);
 
-	return 0;
+		if (gfs2_is_stuffed(GFS2_I(inode)))
+			gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+	}
+
+	return ret;
 }
 
 /**
-- 
cgit v0.10.2


From 9cb427b6ff0b3e235c518acf5c1fcbbfc95f0ae2 Mon Sep 17 00:00:00 2001
From: Francois Romieu <romieu@fr.zoreil.com>
Date: Thu, 2 Nov 2006 00:10:16 +0100
Subject: r8169: more magic during initialization of the hardware

Mostly taken from Realtek's driver.

It's a bit yucky but the original is even worse.

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
Signed-off-by: Darren Salt <linux@youmustbejoking.demon.co.uk>

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 45d3ca4..c8fa9b1 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1815,12 +1815,25 @@ static void rtl8169_hw_reset(void __iomem *ioaddr)
 	RTL_R8(ChipCmd);
 }
 
-static void
-rtl8169_hw_start(struct net_device *dev)
+static void rtl8169_set_rx_tx_config_registers(struct rtl8169_private *tp)
+{
+	void __iomem *ioaddr = tp->mmio_addr;
+	u32 cfg = rtl8169_rx_config;
+
+	cfg |= (RTL_R32(RxConfig) & rtl_chip_info[tp->chipset].RxConfigMask);
+	RTL_W32(RxConfig, cfg);
+
+	/* Set DMA burst size and Interframe Gap Time */
+	RTL_W32(TxConfig, (TX_DMA_BURST << TxDMAShift) |
+		(InterFrameGap << TxInterFrameGapShift));
+}
+
+static void rtl8169_hw_start(struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
 	struct pci_dev *pdev = tp->pci_dev;
+	u16 cmd;
 	u32 i;
 
 	/* Soft reset the chip. */
@@ -1833,6 +1846,11 @@ rtl8169_hw_start(struct net_device *dev)
 		msleep_interruptible(1);
 	}
 
+	if (tp->mac_version == RTL_GIGA_MAC_VER_05) {
+		RTL_W16(CPlusCmd, RTL_R16(CPlusCmd) | PCIMulRW);
+		pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE, 0x08);
+	}
+
 	if (tp->mac_version == RTL_GIGA_MAC_VER_13) {
 		pci_write_config_word(pdev, 0x68, 0x00);
 		pci_write_config_word(pdev, 0x69, 0x08);
@@ -1840,8 +1858,6 @@ rtl8169_hw_start(struct net_device *dev)
 
 	/* Undocumented stuff. */
 	if (tp->mac_version == RTL_GIGA_MAC_VER_05) {
-		u16 cmd;
-
 		/* Realtek's r1000_n.c driver uses '&& 0x01' here. Well... */
 		if ((RTL_R8(Config2) & 0x07) & 0x01)
 			RTL_W32(0x7c, 0x0007ffff);
@@ -1853,23 +1869,29 @@ rtl8169_hw_start(struct net_device *dev)
 		pci_write_config_word(pdev, PCI_COMMAND, cmd);
 	}
 
-
 	RTL_W8(Cfg9346, Cfg9346_Unlock);
+	if ((tp->mac_version == RTL_GIGA_MAC_VER_01) ||
+	    (tp->mac_version == RTL_GIGA_MAC_VER_02) ||
+	    (tp->mac_version == RTL_GIGA_MAC_VER_03) ||
+	    (tp->mac_version == RTL_GIGA_MAC_VER_04))
+		RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb);
+
 	RTL_W8(EarlyTxThres, EarlyTxThld);
 
 	/* Low hurts. Let's disable the filtering. */
 	RTL_W16(RxMaxSize, 16383);
 
-	/* Set Rx Config register */
-	i = rtl8169_rx_config |
-		(RTL_R32(RxConfig) & rtl_chip_info[tp->chipset].RxConfigMask);
-	RTL_W32(RxConfig, i);
+	if ((tp->mac_version == RTL_GIGA_MAC_VER_01) ||
+	    (tp->mac_version == RTL_GIGA_MAC_VER_02) ||
+	    (tp->mac_version == RTL_GIGA_MAC_VER_03) ||
+	    (tp->mac_version == RTL_GIGA_MAC_VER_04))
+		RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb);
+		rtl8169_set_rx_tx_config_registers(tp);
 
-	/* Set DMA burst size and Interframe Gap Time */
-	RTL_W32(TxConfig, (TX_DMA_BURST << TxDMAShift) |
-		(InterFrameGap << TxInterFrameGapShift));
+	cmd = RTL_R16(CPlusCmd);
+	RTL_W16(CPlusCmd, cmd);
 
-	tp->cp_cmd |= RTL_R16(CPlusCmd) | PCIMulRW;
+	tp->cp_cmd |= cmd | PCIMulRW;
 
 	if ((tp->mac_version == RTL_GIGA_MAC_VER_02) ||
 	    (tp->mac_version == RTL_GIGA_MAC_VER_03)) {
@@ -1895,7 +1917,15 @@ rtl8169_hw_start(struct net_device *dev)
 	RTL_W32(TxDescStartAddrLow, ((u64) tp->TxPhyAddr & DMA_32BIT_MASK));
 	RTL_W32(RxDescAddrHigh, ((u64) tp->RxPhyAddr >> 32));
 	RTL_W32(RxDescAddrLow, ((u64) tp->RxPhyAddr & DMA_32BIT_MASK));
-	RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb);
+
+	if ((tp->mac_version != RTL_GIGA_MAC_VER_01) &&
+	    (tp->mac_version != RTL_GIGA_MAC_VER_02) &&
+	    (tp->mac_version != RTL_GIGA_MAC_VER_03) &&
+	    (tp->mac_version != RTL_GIGA_MAC_VER_04)) {
+		RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb);
+		rtl8169_set_rx_tx_config_registers(tp);
+	}
+
 	RTL_W8(Cfg9346, Cfg9346_Lock);
 
 	/* Initially a 10 us delay. Turned it into a PCI commit. - FR */
-- 
cgit v0.10.2


From d03902b8864d7814c938f67befade5a3bba68708 Mon Sep 17 00:00:00 2001
From: Francois Romieu <romieu@fr.zoreil.com>
Date: Thu, 23 Nov 2006 00:00:42 +0100
Subject: r8169: tweak the PCI data parity error recovery

The 8110SB based n2100 board signals a lot of what ought to be
PCI data parity errors durint operation of the 8169 as target.
Experiment proved that the driver can ignore the error and
process the packet as if nothing had happened.

Let's add an ad-hoc knob to enable users to fix their system while
avoiding the risks of a wholesale change.

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index c8fa9b1..7438049 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -225,6 +225,7 @@ MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl);
 
 static int rx_copybreak = 200;
 static int use_dac;
+static int ignore_parity_err;
 static struct {
 	u32 msg_enable;
 } debug = { -1 };
@@ -469,6 +470,8 @@ module_param(use_dac, int, 0);
 MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot.");
 module_param_named(debug, debug.msg_enable, int, 0);
 MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 16=all)");
+module_param_named(ignore_parity_err, ignore_parity_err, bool, 0);
+MODULE_PARM_DESC(ignore_parity_err, "Ignore PCI parity error as target. Default: false");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(RTL8169_VERSION);
 
@@ -2380,12 +2383,17 @@ static void rtl8169_pcierr_interrupt(struct net_device *dev)
 	/*
 	 * The recovery sequence below admits a very elaborated explanation:
 	 * - it seems to work;
-	 * - I did not see what else could be done.
+	 * - I did not see what else could be done;
+	 * - it makes iop3xx happy.
 	 *
 	 * Feel free to adjust to your needs.
 	 */
-	pci_write_config_word(pdev, PCI_COMMAND,
-			      pci_cmd | PCI_COMMAND_SERR | PCI_COMMAND_PARITY);
+	if (ignore_parity_err)
+		pci_cmd &= ~PCI_COMMAND_PARITY;
+	else
+		pci_cmd |= PCI_COMMAND_SERR | PCI_COMMAND_PARITY;
+
+	pci_write_config_word(pdev, PCI_COMMAND, pci_cmd);
 
 	pci_write_config_word(pdev, PCI_STATUS,
 		pci_status & (PCI_STATUS_DETECTED_PARITY |
@@ -2399,10 +2407,11 @@ static void rtl8169_pcierr_interrupt(struct net_device *dev)
 		tp->cp_cmd &= ~PCIDAC;
 		RTL_W16(CPlusCmd, tp->cp_cmd);
 		dev->features &= ~NETIF_F_HIGHDMA;
-		rtl8169_schedule_work(dev, rtl8169_reinit_task);
 	}
 
 	rtl8169_hw_reset(ioaddr);
+
+	rtl8169_schedule_work(dev, rtl8169_reinit_task);
 }
 
 static void
-- 
cgit v0.10.2


From 12d86f682e8acad8555718dc7b0082590f2365d0 Mon Sep 17 00:00:00 2001
From: Francois Romieu <romieu@fr.zoreil.com>
Date: Thu, 2 Nov 2006 00:24:52 +0100
Subject: r8169: phy program update

This is commented out in Realtek's driver as well.

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 7438049..0b57050 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1286,11 +1286,6 @@ static void rtl8169_hw_phy_config(struct net_device *dev)
 	/* Shazam ! */
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_04) {
-		mdio_write(ioaddr, 31, 0x0001);
-		mdio_write(ioaddr,  9, 0x273a);
-		mdio_write(ioaddr, 14, 0x7bfb);
-		mdio_write(ioaddr, 27, 0x841e);
-
 		mdio_write(ioaddr, 31, 0x0002);
 		mdio_write(ioaddr,  1, 0x90d0);
 		mdio_write(ioaddr, 31, 0x0000);
-- 
cgit v0.10.2


From cc9f022d97d08e4e36d38661857991fe91447d68 Mon Sep 17 00:00:00 2001
From: Francois Romieu <romieu@fr.zoreil.com>
Date: Fri, 17 Nov 2006 23:15:17 +0100
Subject: r8169: more alignment for the 0x8168

Two thirds of packets are lost because of misalignment. Users of
Asus laptop did apparently not notice it.

Reported on Gigabyte GA-945GM-S2.

Fix for http://bugzilla.kernel.org/show_bug.cgi?id=7517

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 0b57050..2379d83 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2018,7 +2018,7 @@ static int rtl8169_alloc_rx_skb(struct pci_dev *pdev, struct sk_buff **sk_buff,
 	if (!skb)
 		goto err_out;
 
-	skb_reserve(skb, align);
+	skb_reserve(skb, (align - 1) & (u32)skb->data);
 	*sk_buff = skb;
 
 	mapping = pci_map_single(pdev, skb->data, rx_buf_sz,
@@ -2486,7 +2486,7 @@ static inline int rtl8169_try_rx_copy(struct sk_buff **sk_buff, int pkt_size,
 
 		skb = dev_alloc_skb(pkt_size + align);
 		if (skb) {
-			skb_reserve(skb, align);
+			skb_reserve(skb, (align - 1) & (u32)skb->data);
 			eth_copy_and_sum(skb, sk_buff[0]->data, pkt_size, 0);
 			*sk_buff = skb;
 			rtl8169_mark_to_asic(desc, rx_buf_sz);
-- 
cgit v0.10.2


From fbd819766568c6f3d286dbabb9a17bb13e48f40d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 May 2006 23:58:25 -0400
Subject: [PATCH] __iomem annotations: smc91x

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index a864016..9e0fbc5 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -1216,7 +1216,7 @@ static const char * chip_ids[ 16 ] =  {
 		if (SMC_CAN_USE_32BIT) {				\
 			void *__ptr = (p);				\
 			int __len = (l);				\
-			void *__ioaddr = ioaddr;			\
+			void __iomem *__ioaddr = ioaddr;		\
 			if (__len >= 2 && (unsigned long)__ptr & 2) {	\
 				__len -= 2;				\
 				SMC_outw(*(u16 *)__ptr, ioaddr, DATA_REG); \
@@ -1240,7 +1240,7 @@ static const char * chip_ids[ 16 ] =  {
 		if (SMC_CAN_USE_32BIT) {				\
 			void *__ptr = (p);				\
 			int __len = (l);				\
-			void *__ioaddr = ioaddr;			\
+			void __iomem *__ioaddr = ioaddr;		\
 			if ((unsigned long)__ptr & 2) {			\
 				/*					\
 				 * We want 32bit alignment here.	\
-- 
cgit v0.10.2


From 059807755c0d2b2727588bb52951f8ff6cbf07b4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 May 2006 23:59:09 -0400
Subject: [PATCH] mv643xx_eth.c NULL noise removal

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 9997081..21d0137 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -1098,7 +1098,7 @@ static void eth_tx_fill_frag_descs(struct mv643xx_private *mp,
 					 ETH_TX_ENABLE_INTERRUPT;
 			mp->tx_skb[tx_index] = skb;
 		} else
-			mp->tx_skb[tx_index] = 0;
+			mp->tx_skb[tx_index] = NULL;
 
 		desc = &mp->p_tx_desc_area[tx_index];
 		desc->l4i_chk = 0;
@@ -1134,7 +1134,7 @@ static void eth_tx_submit_descs_for_skb(struct mv643xx_private *mp,
 		eth_tx_fill_frag_descs(mp, skb);
 
 		length = skb_headlen(skb);
-		mp->tx_skb[tx_index] = 0;
+		mp->tx_skb[tx_index] = NULL;
 	} else {
 		cmd_sts |= ETH_ZERO_PADDING |
 			   ETH_TX_LAST_DESC |
-- 
cgit v0.10.2


From afc8eb46c0ea2cab8bc28713b2e0614f015a7516 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 14 Jun 2006 18:50:53 -0400
Subject: [PATCH] trivial missing __init in drivers/net/*

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/net/3c501.c b/drivers/net/3c501.c
index 11d170a..06e3378 100644
--- a/drivers/net/3c501.c
+++ b/drivers/net/3c501.c
@@ -922,7 +922,7 @@ int __init init_module(void)
  * and then free up the resources we took when the card was found.
  */
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	struct net_device *dev = dev_3c501;
 	unregister_netdev(dev);
diff --git a/drivers/net/3c503.c b/drivers/net/3c503.c
index a34b220..7e34c4f 100644
--- a/drivers/net/3c503.c
+++ b/drivers/net/3c503.c
@@ -726,7 +726,7 @@ static void cleanup_card(struct net_device *dev)
 		iounmap(ei_status.mem);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
diff --git a/drivers/net/3c505.c b/drivers/net/3c505.c
index 458cb9c..702bfb2 100644
--- a/drivers/net/3c505.c
+++ b/drivers/net/3c505.c
@@ -1670,7 +1670,7 @@ int __init init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/3c507.c b/drivers/net/3c507.c
index aa43563..54e1d5a 100644
--- a/drivers/net/3c507.c
+++ b/drivers/net/3c507.c
@@ -940,7 +940,7 @@ int __init init_module(void)
 	return IS_ERR(dev_3c507) ? PTR_ERR(dev_3c507) : 0;
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	struct net_device *dev = dev_3c507;
diff --git a/drivers/net/3c523.c b/drivers/net/3c523.c
index 9184946..17d61eb 100644
--- a/drivers/net/3c523.c
+++ b/drivers/net/3c523.c
@@ -1302,7 +1302,7 @@ int __init init_module(void)
 	} else return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 	for (this_dev=0; this_dev<MAX_3C523_CARDS; this_dev++) {
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
index f4aca53..6c7437e 100644
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -1659,7 +1659,7 @@ int __init init_module(void)
  *	transmit operations are allowed to start scribbling into memory.
  */
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(this_device);
 	cleanup_card(this_device);
diff --git a/drivers/net/ac3200.c b/drivers/net/ac3200.c
index 0dca8bb..c01f87f 100644
--- a/drivers/net/ac3200.c
+++ b/drivers/net/ac3200.c
@@ -405,7 +405,7 @@ static void cleanup_card(struct net_device *dev)
 	iounmap(ei_status.mem);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
diff --git a/drivers/net/apne.c b/drivers/net/apne.c
index 9164d8c..d4e4081 100644
--- a/drivers/net/apne.c
+++ b/drivers/net/apne.c
@@ -568,7 +568,7 @@ static irqreturn_t apne_interrupt(int irq, void *dev_id)
 #ifdef MODULE
 static struct net_device *apne_dev;
 
-int init_module(void)
+int __init init_module(void)
 {
 	apne_dev = apne_probe(-1);
 	if (IS_ERR(apne_dev))
@@ -576,7 +576,7 @@ int init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(apne_dev);
 
diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c
index cc1a27e..dba5e51 100644
--- a/drivers/net/appletalk/cops.c
+++ b/drivers/net/appletalk/cops.c
@@ -1041,7 +1041,7 @@ int __init init_module(void)
         return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(cops_dev);
 	cleanup_card(cops_dev);
diff --git a/drivers/net/at1700.c b/drivers/net/at1700.c
index 8620a5b..56ae8ba 100644
--- a/drivers/net/at1700.c
+++ b/drivers/net/at1700.c
@@ -908,7 +908,7 @@ int __init init_module(void)
 	return 0;
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	unregister_netdev(dev_at1700);
diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c
index d79489e..7e37ac8 100644
--- a/drivers/net/atarilance.c
+++ b/drivers/net/atarilance.c
@@ -1179,7 +1179,7 @@ static int lance_set_mac_address( struct net_device *dev, void *addr )
 #ifdef MODULE
 static struct net_device *atarilance_dev;
 
-int init_module(void)
+int __init init_module(void)
 {
 	atarilance_dev = atarilance_probe(-1);
 	if (IS_ERR(atarilance_dev))
@@ -1187,7 +1187,7 @@ int init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(atarilance_dev);
 	free_irq(atarilance_dev->irq, atarilance_dev);
diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c
index dec70c2..4612f71 100644
--- a/drivers/net/cs89x0.c
+++ b/drivers/net/cs89x0.c
@@ -1974,7 +1974,7 @@ out:
 	return ret;
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	unregister_netdev(dev_cs89x0);
diff --git a/drivers/net/e2100.c b/drivers/net/e2100.c
index d39e848..c62d9c6 100644
--- a/drivers/net/e2100.c
+++ b/drivers/net/e2100.c
@@ -463,7 +463,7 @@ static void cleanup_card(struct net_device *dev)
 	release_region(dev->base_addr, E21_IO_EXTENT);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
diff --git a/drivers/net/eepro.c b/drivers/net/eepro.c
index a4eb0dc..b446309 100644
--- a/drivers/net/eepro.c
+++ b/drivers/net/eepro.c
@@ -1827,7 +1827,7 @@ int __init init_module(void)
 	return n_eepro ? 0 : -ENODEV;
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int i;
diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c
index e14be02..4a50fcb 100644
--- a/drivers/net/eexpress.c
+++ b/drivers/net/eexpress.c
@@ -1719,7 +1719,7 @@ int __init init_module(void)
 	return -ENXIO;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/es3210.c b/drivers/net/es3210.c
index fd7b32a..2d2ea94 100644
--- a/drivers/net/es3210.c
+++ b/drivers/net/es3210.c
@@ -455,7 +455,7 @@ static void cleanup_card(struct net_device *dev)
 	iounmap(ei_status.mem);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
diff --git a/drivers/net/eth16i.c b/drivers/net/eth16i.c
index b7b8bc2..93283e3 100644
--- a/drivers/net/eth16i.c
+++ b/drivers/net/eth16i.c
@@ -1475,7 +1475,7 @@ int __init init_module(void)
 	return -ENXIO;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/hp-plus.c b/drivers/net/hp-plus.c
index 6abcfd2..99a36cc 100644
--- a/drivers/net/hp-plus.c
+++ b/drivers/net/hp-plus.c
@@ -482,7 +482,7 @@ static void cleanup_card(struct net_device *dev)
 	release_region(dev->base_addr - NIC_OFFSET, HP_IO_EXTENT);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
diff --git a/drivers/net/hp.c b/drivers/net/hp.c
index 2947097..635b13c 100644
--- a/drivers/net/hp.c
+++ b/drivers/net/hp.c
@@ -444,7 +444,7 @@ static void cleanup_card(struct net_device *dev)
 	release_region(dev->base_addr - NIC_OFFSET, HP_IO_EXTENT);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
diff --git a/drivers/net/lance.c b/drivers/net/lance.c
index 4256c13..a384332 100644
--- a/drivers/net/lance.c
+++ b/drivers/net/lance.c
@@ -368,7 +368,7 @@ static void cleanup_card(struct net_device *dev)
 	kfree(lp);
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/lne390.c b/drivers/net/lne390.c
index 5795ee1..0a08d0c 100644
--- a/drivers/net/lne390.c
+++ b/drivers/net/lne390.c
@@ -440,7 +440,7 @@ static void cleanup_card(struct net_device *dev)
 	iounmap(ei_status.mem);
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/mvme147.c b/drivers/net/mvme147.c
index 56a82d8..e246d00 100644
--- a/drivers/net/mvme147.c
+++ b/drivers/net/mvme147.c
@@ -184,7 +184,7 @@ static int m147lance_close(struct net_device *dev)
 MODULE_LICENSE("GPL");
 
 static struct net_device *dev_mvme147_lance;
-int init_module(void)
+int __init init_module(void)
 {
 	dev_mvme147_lance = mvme147lance_probe(-1);
 	if (IS_ERR(dev_mvme147_lance))
@@ -192,7 +192,7 @@ int init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	struct m147lance_private *lp = dev_mvme147_lance->priv;
 	unregister_netdev(dev_mvme147_lance);
diff --git a/drivers/net/ne.c b/drivers/net/ne.c
index 787aa42..a5c4199 100644
--- a/drivers/net/ne.c
+++ b/drivers/net/ne.c
@@ -867,7 +867,7 @@ static void cleanup_card(struct net_device *dev)
 	release_region(dev->base_addr, NE_IO_EXTENT);
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/ne2.c b/drivers/net/ne2.c
index 5fccfea..089b5bb 100644
--- a/drivers/net/ne2.c
+++ b/drivers/net/ne2.c
@@ -813,7 +813,7 @@ static void cleanup_card(struct net_device *dev)
 	release_region(dev->base_addr, NE_IO_EXTENT);
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/ni52.c b/drivers/net/ni52.c
index 26e42f6..196993a 100644
--- a/drivers/net/ni52.c
+++ b/drivers/net/ni52.c
@@ -1335,7 +1335,7 @@ int __init init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(dev_ni52);
 	release_region(dev_ni52->base_addr, NI52_TOTAL_SIZE);
diff --git a/drivers/net/ni65.c b/drivers/net/ni65.c
index 340ad0d..1578f4d 100644
--- a/drivers/net/ni65.c
+++ b/drivers/net/ni65.c
@@ -1259,7 +1259,7 @@ int __init init_module(void)
 	return IS_ERR(dev_ni65) ? PTR_ERR(dev_ni65) : 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
  	unregister_netdev(dev_ni65);
  	cleanup_card(dev_ni65);
diff --git a/drivers/net/seeq8005.c b/drivers/net/seeq8005.c
index d9d0a3a..0d6c95c 100644
--- a/drivers/net/seeq8005.c
+++ b/drivers/net/seeq8005.c
@@ -750,7 +750,7 @@ int __init init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(dev_seeq);
 	release_region(dev_seeq->base_addr, SEEQ8005_IO_EXTENT);
diff --git a/drivers/net/smc-ultra.c b/drivers/net/smc-ultra.c
index 889ef0d..d70bc97 100644
--- a/drivers/net/smc-ultra.c
+++ b/drivers/net/smc-ultra.c
@@ -593,7 +593,7 @@ static void cleanup_card(struct net_device *dev)
 	iounmap(ei_status.mem);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
diff --git a/drivers/net/smc-ultra32.c b/drivers/net/smc-ultra32.c
index e10755e..2c5319c 100644
--- a/drivers/net/smc-ultra32.c
+++ b/drivers/net/smc-ultra32.c
@@ -437,7 +437,7 @@ int __init init_module(void)
 	return -ENXIO;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	int this_dev;
 
diff --git a/drivers/net/smc9194.c b/drivers/net/smc9194.c
index c0d13d6..bd6e845 100644
--- a/drivers/net/smc9194.c
+++ b/drivers/net/smc9194.c
@@ -1616,7 +1616,7 @@ int __init init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(devSMC9194);
 	free_irq(devSMC9194->irq, devSMC9194);
diff --git a/drivers/net/sun3lance.c b/drivers/net/sun3lance.c
index 47a1c09..c62e85d 100644
--- a/drivers/net/sun3lance.c
+++ b/drivers/net/sun3lance.c
@@ -945,7 +945,7 @@ static void set_multicast_list( struct net_device *dev )
 
 static struct net_device *sun3lance_dev;
 
-int init_module(void)
+int __init init_module(void)
 {
 	sun3lance_dev = sun3lance_probe(-1);
 	if (IS_ERR(sun3lance_dev))
@@ -953,7 +953,7 @@ int init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
 	unregister_netdev(sun3lance_dev);
 #ifdef CONFIG_SUN3
diff --git a/drivers/net/tokenring/smctr.c b/drivers/net/tokenring/smctr.c
index 46dabdb..cec282a 100644
--- a/drivers/net/tokenring/smctr.c
+++ b/drivers/net/tokenring/smctr.c
@@ -5706,7 +5706,7 @@ int __init init_module(void)
         return found ? 0 : -ENODEV;
 }
 
-void cleanup_module(void)
+void __exit cleanup_module(void)
 {
         int i;
 
diff --git a/drivers/net/wd.c b/drivers/net/wd.c
index 41f1d67..7f38012 100644
--- a/drivers/net/wd.c
+++ b/drivers/net/wd.c
@@ -538,7 +538,7 @@ static void cleanup_card(struct net_device *dev)
 	iounmap(ei_status.mem);
 }
 
-void
+void __exit
 cleanup_module(void)
 {
 	int this_dev;
-- 
cgit v0.10.2


From bffa2154956da31f59c6050f176fadba630ff53a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 15 Jun 2006 14:23:19 -0400
Subject: [PATCH] drivers/net/arm missing __devinit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/net/arm/ether1.c b/drivers/net/arm/ether1.c
index f3478a3..d6da3ce 100644
--- a/drivers/net/arm/ether1.c
+++ b/drivers/net/arm/ether1.c
@@ -254,7 +254,7 @@ ether1_readbuffer (struct net_device *dev, void *data, unsigned int start, unsig
 	} while (thislen);
 }
 
-static int __init
+static int __devinit
 ether1_ramtest(struct net_device *dev, unsigned char byte)
 {
 	unsigned char *buffer = kmalloc (BUFFER_SIZE, GFP_KERNEL);
@@ -308,7 +308,7 @@ ether1_reset (struct net_device *dev)
 	return BUS_16;
 }
 
-static int __init
+static int __devinit
 ether1_init_2(struct net_device *dev)
 {
 	int i;
@@ -986,7 +986,7 @@ ether1_setmulticastlist (struct net_device *dev)
 
 /* ------------------------------------------------------------------------- */
 
-static void __init ether1_banner(void)
+static void __devinit ether1_banner(void)
 {
 	static unsigned int version_printed = 0;
 
diff --git a/drivers/net/arm/ether3.c b/drivers/net/arm/ether3.c
index 84686c8..4fc2347 100644
--- a/drivers/net/arm/ether3.c
+++ b/drivers/net/arm/ether3.c
@@ -198,7 +198,7 @@ static inline void ether3_ledon(struct net_device *dev)
  * Read the ethernet address string from the on board rom.
  * This is an ascii string!!!
  */
-static int __init
+static int __devinit
 ether3_addr(char *addr, struct expansion_card *ec)
 {
 	struct in_chunk_dir cd;
@@ -223,7 +223,7 @@ ether3_addr(char *addr, struct expansion_card *ec)
 
 /* --------------------------------------------------------------------------- */
 
-static int __init
+static int __devinit
 ether3_ramtest(struct net_device *dev, unsigned char byte)
 {
 	unsigned char *buffer = kmalloc(RX_END, GFP_KERNEL);
@@ -272,7 +272,7 @@ ether3_ramtest(struct net_device *dev, unsigned char byte)
 
 /* ------------------------------------------------------------------------------- */
 
-static int __init ether3_init_2(struct net_device *dev)
+static int __devinit ether3_init_2(struct net_device *dev)
 {
 	int i;
 
@@ -765,7 +765,7 @@ static void ether3_tx(struct net_device *dev)
 	}
 }
 
-static void __init ether3_banner(void)
+static void __devinit ether3_banner(void)
 {
 	static unsigned version_printed = 0;
 
-- 
cgit v0.10.2


From 40f6cff5c47efac2df361fbfc2eb2816729986c8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Nov 2006 13:48:32 -0500
Subject: [PATCH] myri10ge annotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 36350e6..d320bbe 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -89,7 +89,7 @@ MODULE_LICENSE("Dual BSD/GPL");
 #define MYRI10GE_EEPROM_STRINGS_SIZE 256
 #define MYRI10GE_MAX_SEND_DESC_TSO ((65536 / 2048) * 2)
 
-#define MYRI10GE_NO_CONFIRM_DATA 0xffffffff
+#define MYRI10GE_NO_CONFIRM_DATA htonl(0xffffffff)
 #define MYRI10GE_NO_RESPONSE_RESULT 0xffffffff
 
 struct myri10ge_rx_buffer_state {
@@ -156,8 +156,8 @@ struct myri10ge_priv {
 	int sram_size;
 	unsigned long board_span;
 	unsigned long iomem_base;
-	u32 __iomem *irq_claim;
-	u32 __iomem *irq_deassert;
+	__be32 __iomem *irq_claim;
+	__be32 __iomem *irq_deassert;
 	char *mac_addr_string;
 	struct mcp_cmd_response *cmd;
 	dma_addr_t cmd_bus;
@@ -165,10 +165,10 @@ struct myri10ge_priv {
 	dma_addr_t fw_stats_bus;
 	struct pci_dev *pdev;
 	int msi_enabled;
-	unsigned int link_state;
+	__be32 link_state;
 	unsigned int rdma_tags_available;
 	int intr_coal_delay;
-	u32 __iomem *intr_coal_delay_ptr;
+	__be32 __iomem *intr_coal_delay_ptr;
 	int mtrr;
 	int wake_queue;
 	int stop_queue;
@@ -273,6 +273,11 @@ MODULE_PARM_DESC(myri10ge_debug, "Debug level (0=none,...,16=all)");
 
 #define myri10ge_pio_copy(to,from,size) __iowrite64_copy(to,from,size/8)
 
+static inline void put_be32(__be32 val, __be32 __iomem *p)
+{
+	__raw_writel((__force __u32)val, (__force void __iomem *)p);
+}
+
 static int
 myri10ge_send_cmd(struct myri10ge_priv *mgp, u32 cmd,
 		  struct myri10ge_cmd *data, int atomic)
@@ -296,7 +301,7 @@ myri10ge_send_cmd(struct myri10ge_priv *mgp, u32 cmd,
 
 	buf->response_addr.low = htonl(dma_low);
 	buf->response_addr.high = htonl(dma_high);
-	response->result = MYRI10GE_NO_RESPONSE_RESULT;
+	response->result = htonl(MYRI10GE_NO_RESPONSE_RESULT);
 	mb();
 	myri10ge_pio_copy(cmd_addr, buf, sizeof(*buf));
 
@@ -311,14 +316,14 @@ myri10ge_send_cmd(struct myri10ge_priv *mgp, u32 cmd,
 		 * (1ms will be enough for those commands) */
 		for (sleep_total = 0;
 		     sleep_total < 1000
-		     && response->result == MYRI10GE_NO_RESPONSE_RESULT;
+		     && response->result == htonl(MYRI10GE_NO_RESPONSE_RESULT);
 		     sleep_total += 10)
 			udelay(10);
 	} else {
 		/* use msleep for most command */
 		for (sleep_total = 0;
 		     sleep_total < 15
-		     && response->result == MYRI10GE_NO_RESPONSE_RESULT;
+		     && response->result == htonl(MYRI10GE_NO_RESPONSE_RESULT);
 		     sleep_total++)
 			msleep(1);
 	}
@@ -393,7 +398,7 @@ abort:
 static void myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
 {
 	char __iomem *submit;
-	u32 buf[16];
+	__be32 buf[16];
 	u32 dma_low, dma_high;
 	int i;
 
@@ -410,7 +415,7 @@ static void myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
 
 	buf[0] = htonl(dma_high);	/* confirm addr MSW */
 	buf[1] = htonl(dma_low);	/* confirm addr LSW */
-	buf[2] = htonl(MYRI10GE_NO_CONFIRM_DATA);	/* confirm data */
+	buf[2] = MYRI10GE_NO_CONFIRM_DATA;	/* confirm data */
 	buf[3] = htonl(dma_high);	/* dummy addr MSW */
 	buf[4] = htonl(dma_low);	/* dummy addr LSW */
 	buf[5] = htonl(enable);	/* enable? */
@@ -479,7 +484,7 @@ static int myri10ge_load_hotplug_firmware(struct myri10ge_priv *mgp, u32 * size)
 	}
 
 	/* check id */
-	hdr_offset = ntohl(*(u32 *) (fw->data + MCP_HEADER_PTR_OFFSET));
+	hdr_offset = ntohl(*(__be32 *) (fw->data + MCP_HEADER_PTR_OFFSET));
 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->size) {
 		dev_err(dev, "Bad firmware file\n");
 		status = -EINVAL;
@@ -550,7 +555,7 @@ static int myri10ge_adopt_running_firmware(struct myri10ge_priv *mgp)
 static int myri10ge_load_firmware(struct myri10ge_priv *mgp)
 {
 	char __iomem *submit;
-	u32 buf[16];
+	__be32 buf[16];
 	u32 dma_low, dma_high, size;
 	int status, i;
 
@@ -600,7 +605,7 @@ static int myri10ge_load_firmware(struct myri10ge_priv *mgp)
 
 	buf[0] = htonl(dma_high);	/* confirm addr MSW */
 	buf[1] = htonl(dma_low);	/* confirm addr LSW */
-	buf[2] = htonl(MYRI10GE_NO_CONFIRM_DATA);	/* confirm data */
+	buf[2] = MYRI10GE_NO_CONFIRM_DATA;	/* confirm data */
 
 	/* FIX: All newest firmware should un-protect the bottom of
 	 * the sram before handoff. However, the very first interfaces
@@ -705,21 +710,21 @@ static int myri10ge_reset(struct myri10ge_priv *mgp)
 
 	status |=
 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd, 0);
-	mgp->irq_claim = (__iomem u32 *) (mgp->sram + cmd.data0);
+	mgp->irq_claim = (__iomem __be32 *) (mgp->sram + cmd.data0);
 	if (!mgp->msi_enabled) {
 		status |= myri10ge_send_cmd
 		    (mgp, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd, 0);
-		mgp->irq_deassert = (__iomem u32 *) (mgp->sram + cmd.data0);
+		mgp->irq_deassert = (__iomem __be32 *) (mgp->sram + cmd.data0);
 
 	}
 	status |= myri10ge_send_cmd
 	    (mgp, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd, 0);
-	mgp->intr_coal_delay_ptr = (__iomem u32 *) (mgp->sram + cmd.data0);
+	mgp->intr_coal_delay_ptr = (__iomem __be32 *) (mgp->sram + cmd.data0);
 	if (status != 0) {
 		dev_err(&mgp->pdev->dev, "failed set interrupt parameters\n");
 		return status;
 	}
-	__raw_writel(htonl(mgp->intr_coal_delay), mgp->intr_coal_delay_ptr);
+	put_be32(htonl(mgp->intr_coal_delay), mgp->intr_coal_delay_ptr);
 
 	/* Run a small DMA test.
 	 * The magic multipliers to the length tell the firmware
@@ -786,14 +791,14 @@ static inline void
 myri10ge_submit_8rx(struct mcp_kreq_ether_recv __iomem * dst,
 		    struct mcp_kreq_ether_recv *src)
 {
-	u32 low;
+	__be32 low;
 
 	low = src->addr_low;
-	src->addr_low = DMA_32BIT_MASK;
+	src->addr_low = htonl(DMA_32BIT_MASK);
 	myri10ge_pio_copy(dst, src, 8 * sizeof(*src));
 	mb();
 	src->addr_low = low;
-	__raw_writel(low, &dst->addr_low);
+	put_be32(low, &dst->addr_low);
 	mb();
 }
 
@@ -939,11 +944,11 @@ done:
 	return retval;
 }
 
-static inline void myri10ge_vlan_ip_csum(struct sk_buff *skb, u16 hw_csum)
+static inline void myri10ge_vlan_ip_csum(struct sk_buff *skb, __wsum hw_csum)
 {
 	struct vlan_hdr *vh = (struct vlan_hdr *)(skb->data);
 
-	if ((skb->protocol == ntohs(ETH_P_8021Q)) &&
+	if ((skb->protocol == htons(ETH_P_8021Q)) &&
 	    (vh->h_vlan_encapsulated_proto == htons(ETH_P_IP) ||
 	     vh->h_vlan_encapsulated_proto == htons(ETH_P_IPV6))) {
 		skb->csum = hw_csum;
@@ -953,7 +958,7 @@ static inline void myri10ge_vlan_ip_csum(struct sk_buff *skb, u16 hw_csum)
 
 static inline unsigned long
 myri10ge_rx_done(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
-		 int bytes, int len, int csum)
+		 int bytes, int len, __wsum csum)
 {
 	dma_addr_t bus;
 	struct sk_buff *skb;
@@ -986,12 +991,12 @@ myri10ge_rx_done(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
 
 	skb->protocol = eth_type_trans(skb, mgp->dev);
 	if (mgp->csum_flag) {
-		if ((skb->protocol == ntohs(ETH_P_IP)) ||
-		    (skb->protocol == ntohs(ETH_P_IPV6))) {
-			skb->csum = ntohs((u16) csum);
+		if ((skb->protocol == htons(ETH_P_IP)) ||
+		    (skb->protocol == htons(ETH_P_IPV6))) {
+			skb->csum = csum;
 			skb->ip_summed = CHECKSUM_COMPLETE;
 		} else
-			myri10ge_vlan_ip_csum(skb, ntohs((u16) csum));
+			myri10ge_vlan_ip_csum(skb, csum);
 	}
 
 	netif_receive_skb(skb);
@@ -1060,12 +1065,12 @@ static inline void myri10ge_clean_rx_done(struct myri10ge_priv *mgp, int *limit)
 	int idx = rx_done->idx;
 	int cnt = rx_done->cnt;
 	u16 length;
-	u16 checksum;
+	__wsum checksum;
 
 	while (rx_done->entry[idx].length != 0 && *limit != 0) {
 		length = ntohs(rx_done->entry[idx].length);
 		rx_done->entry[idx].length = 0;
-		checksum = ntohs(rx_done->entry[idx].checksum);
+		checksum = csum_unfold(rx_done->entry[idx].checksum);
 		if (length <= mgp->small_bytes)
 			rx_ok = myri10ge_rx_done(mgp, &mgp->rx_small,
 						 mgp->small_bytes,
@@ -1142,7 +1147,7 @@ static int myri10ge_poll(struct net_device *netdev, int *budget)
 
 	if (rx_done->entry[rx_done->idx].length == 0 || !netif_running(netdev)) {
 		netif_rx_complete(netdev);
-		__raw_writel(htonl(3), mgp->irq_claim);
+		put_be32(htonl(3), mgp->irq_claim);
 		return 0;
 	}
 	return 1;
@@ -1166,7 +1171,7 @@ static irqreturn_t myri10ge_intr(int irq, void *arg)
 		netif_rx_schedule(mgp->dev);
 
 	if (!mgp->msi_enabled) {
-		__raw_writel(0, mgp->irq_deassert);
+		put_be32(0, mgp->irq_deassert);
 		if (!myri10ge_deassert_wait)
 			stats->valid = 0;
 		mb();
@@ -1195,7 +1200,7 @@ static irqreturn_t myri10ge_intr(int irq, void *arg)
 
 	myri10ge_check_statblock(mgp);
 
-	__raw_writel(htonl(3), mgp->irq_claim + 1);
+	put_be32(htonl(3), mgp->irq_claim + 1);
 	return (IRQ_HANDLED);
 }
 
@@ -1233,7 +1238,7 @@ myri10ge_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *coal)
 	struct myri10ge_priv *mgp = netdev_priv(netdev);
 
 	mgp->intr_coal_delay = coal->rx_coalesce_usecs;
-	__raw_writel(htonl(mgp->intr_coal_delay), mgp->intr_coal_delay_ptr);
+	put_be32(htonl(mgp->intr_coal_delay), mgp->intr_coal_delay_ptr);
 	return 0;
 }
 
@@ -1748,7 +1753,7 @@ static int myri10ge_open(struct net_device *dev)
 		goto abort_with_rings;
 	}
 
-	mgp->link_state = -1;
+	mgp->link_state = htonl(~0U);
 	mgp->rdma_tags_available = 15;
 
 	netif_poll_enable(mgp->dev);	/* must happen prior to any irq */
@@ -1876,7 +1881,7 @@ myri10ge_submit_req(struct myri10ge_tx_buf *tx, struct mcp_kreq_ether_send *src,
 
 	/* re-write the last 32-bits with the valid flags */
 	src->flags = last_flags;
-	__raw_writel(*((u32 *) src + 3), (u32 __iomem *) dst + 3);
+	put_be32(*((__be32 *) src + 3), (__be32 __iomem *) dst + 3);
 	tx->req += cnt;
 	mb();
 }
@@ -1919,7 +1924,8 @@ static int myri10ge_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct myri10ge_tx_buf *tx = &mgp->tx;
 	struct skb_frag_struct *frag;
 	dma_addr_t bus;
-	u32 low, high_swapped;
+	u32 low;
+	__be32 high_swapped;
 	unsigned int len;
 	int idx, last_idx, avail, frag_cnt, frag_idx, count, mss, max_segments;
 	u16 pseudo_hdr_offset, cksum_offset;
@@ -1964,7 +1970,6 @@ again:
 			cksum_offset = 0;
 			pseudo_hdr_offset = 0;
 		} else {
-			pseudo_hdr_offset = htons(pseudo_hdr_offset);
 			odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
 			flags |= MXGEFW_FLAGS_CKSUM;
 		}
@@ -1986,7 +1991,7 @@ again:
 		/* for TSO, pseudo_hdr_offset holds mss.
 		 * The firmware figures out where to put
 		 * the checksum by parsing the header. */
-		pseudo_hdr_offset = htons(mss);
+		pseudo_hdr_offset = mss;
 	} else
 #endif				/*NETIF_F_TSO */
 		/* Mark small packets, and pad out tiny packets */
@@ -2086,7 +2091,7 @@ again:
 #endif				/* NETIF_F_TSO */
 			req->addr_high = high_swapped;
 			req->addr_low = htonl(low);
-			req->pseudo_hdr_offset = pseudo_hdr_offset;
+			req->pseudo_hdr_offset = htons(pseudo_hdr_offset);
 			req->pad = 0;	/* complete solid 16-byte block; does this matter? */
 			req->rdma_count = 1;
 			req->length = htons(seglen);
@@ -2199,6 +2204,7 @@ static void myri10ge_set_multicast_list(struct net_device *dev)
 	struct myri10ge_cmd cmd;
 	struct myri10ge_priv *mgp;
 	struct dev_mc_list *mc_list;
+	__be32 data[2] = {0, 0};
 	int err;
 
 	mgp = netdev_priv(dev);
@@ -2237,10 +2243,9 @@ static void myri10ge_set_multicast_list(struct net_device *dev)
 
 	/* Walk the multicast list, and add each address */
 	for (mc_list = dev->mc_list; mc_list != NULL; mc_list = mc_list->next) {
-		memcpy(&cmd.data0, &mc_list->dmi_addr, 4);
-		memcpy(&cmd.data1, ((char *)&mc_list->dmi_addr) + 4, 2);
-		cmd.data0 = htonl(cmd.data0);
-		cmd.data1 = htonl(cmd.data1);
+		memcpy(data, &mc_list->dmi_addr, 6);
+		cmd.data0 = ntohl(data[0]);
+		cmd.data1 = ntohl(data[1]);
 		err = myri10ge_send_cmd(mgp, MXGEFW_JOIN_MULTICAST_GROUP,
 					&cmd, 1);
 
diff --git a/drivers/net/myri10ge/myri10ge_mcp.h b/drivers/net/myri10ge/myri10ge_mcp.h
index 9519ae7..29463b3 100644
--- a/drivers/net/myri10ge/myri10ge_mcp.h
+++ b/drivers/net/myri10ge/myri10ge_mcp.h
@@ -6,23 +6,23 @@
 
 /* 8 Bytes */
 struct mcp_dma_addr {
-	u32 high;
-	u32 low;
+	__be32 high;
+	__be32 low;
 };
 
 /* 4 Bytes */
 struct mcp_slot {
-	u16 checksum;
-	u16 length;
+	__sum16 checksum;
+	__be16 length;
 };
 
 /* 64 Bytes */
 struct mcp_cmd {
-	u32 cmd;
-	u32 data0;		/* will be low portion if data > 32 bits */
+	__be32 cmd;
+	__be32 data0;		/* will be low portion if data > 32 bits */
 	/* 8 */
-	u32 data1;		/* will be high portion if data > 32 bits */
-	u32 data2;		/* currently unused.. */
+	__be32 data1;		/* will be high portion if data > 32 bits */
+	__be32 data2;		/* currently unused.. */
 	/* 16 */
 	struct mcp_dma_addr response_addr;
 	/* 24 */
@@ -31,8 +31,8 @@ struct mcp_cmd {
 
 /* 8 Bytes */
 struct mcp_cmd_response {
-	u32 data;
-	u32 result;
+	__be32 data;
+	__be32 result;
 };
 
 /*
@@ -73,10 +73,10 @@ union mcp_pso_or_cumlen {
 
 /* 16 Bytes */
 struct mcp_kreq_ether_send {
-	u32 addr_high;
-	u32 addr_low;
-	u16 pseudo_hdr_offset;
-	u16 length;
+	__be32 addr_high;
+	__be32 addr_low;
+	__be16 pseudo_hdr_offset;
+	__be16 length;
 	u8 pad;
 	u8 rdma_count;
 	u8 cksum_offset;	/* where to start computing cksum */
@@ -85,8 +85,8 @@ struct mcp_kreq_ether_send {
 
 /* 8 Bytes */
 struct mcp_kreq_ether_recv {
-	u32 addr_high;
-	u32 addr_low;
+	__be32 addr_high;
+	__be32 addr_low;
 };
 
 /* Commands */
@@ -219,19 +219,19 @@ enum myri10ge_mcp_cmd_status {
 
 struct mcp_irq_data {
 	/* add new counters at the beginning */
-	u32 future_use[5];
-	u32 dropped_multicast_filtered;
+	__be32 future_use[5];
+	__be32 dropped_multicast_filtered;
 	/* 40 Bytes */
-	u32 send_done_count;
-
-	u32 link_up;
-	u32 dropped_link_overflow;
-	u32 dropped_link_error_or_filtered;
-	u32 dropped_runt;
-	u32 dropped_overrun;
-	u32 dropped_no_small_buffer;
-	u32 dropped_no_big_buffer;
-	u32 rdma_tags_available;
+	__be32 send_done_count;
+
+	__be32 link_up;
+	__be32 dropped_link_overflow;
+	__be32 dropped_link_error_or_filtered;
+	__be32 dropped_runt;
+	__be32 dropped_overrun;
+	__be32 dropped_no_small_buffer;
+	__be32 dropped_no_big_buffer;
+	__be32 rdma_tags_available;
 
 	u8 tx_stopped;
 	u8 link_down;
diff --git a/drivers/net/myri10ge/myri10ge_mcp_gen_header.h b/drivers/net/myri10ge/myri10ge_mcp_gen_header.h
index 487f779..16a810d 100644
--- a/drivers/net/myri10ge/myri10ge_mcp_gen_header.h
+++ b/drivers/net/myri10ge/myri10ge_mcp_gen_header.h
@@ -36,7 +36,7 @@
 struct mcp_gen_header {
 	/* the first 4 fields are filled at compile time */
 	unsigned header_length;
-	unsigned mcp_type;
+	__be32 mcp_type;
 	char version[128];
 	unsigned mcp_globals;	/* pointer to mcp-type specific structure */
 
-- 
cgit v0.10.2


From c69fda4e181fe448c43c2e1cc7b3fa67263d88ca Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Nov 2006 14:12:54 -0500
Subject: [PATCH] ns83820 annotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index b0127c7..eb0e119 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -414,10 +414,10 @@ struct rx_info {
 
 	struct sk_buff	*skbs[NR_RX_DESC];
 
-	u32		*next_rx_desc;
+	__le32		*next_rx_desc;
 	u16		next_rx, next_empty;
 
-	u32		*descs;
+	__le32		*descs;
 	dma_addr_t	phy_descs;
 };
 
@@ -459,7 +459,7 @@ struct ns83820 {
 	struct sk_buff	*tx_skbs[NR_TX_DESC];
 
 	char		pad[16] __attribute__((aligned(16)));
-	u32		*tx_descs;
+	__le32		*tx_descs;
 	dma_addr_t	tx_phy_descs;
 
 	struct timer_list	tx_watchdog;
@@ -533,7 +533,7 @@ static void ns83820_vlan_rx_kill_vid(struct net_device *ndev, unsigned short vid
  * conditions, still route realtime traffic with as low jitter as
  * possible.
  */
-static inline void build_rx_desc(struct ns83820 *dev, u32 *desc, dma_addr_t link, dma_addr_t buf, u32 cmdsts, u32 extsts)
+static inline void build_rx_desc(struct ns83820 *dev, __le32 *desc, dma_addr_t link, dma_addr_t buf, u32 cmdsts, u32 extsts)
 {
 	desc_addr_set(desc + DESC_LINK, link);
 	desc_addr_set(desc + DESC_BUFPTR, buf);
@@ -547,7 +547,7 @@ static inline int ns83820_add_rx_skb(struct ns83820 *dev, struct sk_buff *skb)
 {
 	unsigned next_empty;
 	u32 cmdsts;
-	u32 *sg;
+	__le32 *sg;
 	dma_addr_t buf;
 
 	next_empty = dev->rx_info.next_empty;
@@ -874,7 +874,8 @@ static void fastcall rx_irq(struct net_device *ndev)
 	struct rx_info *info = &dev->rx_info;
 	unsigned next_rx;
 	int rx_rc, len;
-	u32 cmdsts, *desc;
+	u32 cmdsts;
+	__le32 *desc;
 	unsigned long flags;
 	int nr = 0;
 
@@ -1010,7 +1011,8 @@ static inline void kick_tx(struct ns83820 *dev)
 static void do_tx_done(struct net_device *ndev)
 {
 	struct ns83820 *dev = PRIV(ndev);
-	u32 cmdsts, tx_done_idx, *desc;
+	u32 cmdsts, tx_done_idx;
+	__le32 *desc;
 
 	dprintk("do_tx_done(%p)\n", ndev);
 	tx_done_idx = dev->tx_done_idx;
@@ -1077,7 +1079,7 @@ static void ns83820_cleanup_tx(struct ns83820 *dev)
 		struct sk_buff *skb = dev->tx_skbs[i];
 		dev->tx_skbs[i] = NULL;
 		if (skb) {
-			u32 *desc = dev->tx_descs + (i * DESC_SIZE);
+			__le32 *desc = dev->tx_descs + (i * DESC_SIZE);
 			pci_unmap_single(dev->pci_dev,
 					desc_addr_get(desc + DESC_BUFPTR),
 					le32_to_cpu(desc[DESC_CMDSTS]) & CMDSTS_LEN_MASK,
@@ -1107,7 +1109,7 @@ static int ns83820_hard_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	skb_frag_t *frag;
 	int stopped = 0;
 	int do_intr = 0;
-	volatile u32 *first_desc;
+	volatile __le32 *first_desc;
 
 	dprintk("ns83820_hard_start_xmit\n");
 
@@ -1180,7 +1182,7 @@ again:
 	first_desc = dev->tx_descs + (free_idx * DESC_SIZE);
 
 	for (;;) {
-		volatile u32 *desc = dev->tx_descs + (free_idx * DESC_SIZE);
+		volatile __le32 *desc = dev->tx_descs + (free_idx * DESC_SIZE);
 
 		dprintk("frag[%3u]: %4u @ 0x%08Lx\n", free_idx, len,
 			(unsigned long long)buf);
@@ -1455,7 +1457,8 @@ static int ns83820_stop(struct net_device *ndev)
 static void ns83820_tx_timeout(struct net_device *ndev)
 {
 	struct ns83820 *dev = PRIV(ndev);
-        u32 tx_done_idx, *desc;
+        u32 tx_done_idx;
+	__le32 *desc;
 	unsigned long flags;
 
 	spin_lock_irqsave(&dev->tx_lock, flags);
-- 
cgit v0.10.2


From 33fee56ae846cdee67d2ab6d14c3baa879dfc794 Mon Sep 17 00:00:00 2001
From: Deepak Saxena <dsaxena@plexity.net>
Date: Mon, 4 Dec 2006 15:04:46 -0800
Subject: [PATCH] Update smc91x driver with ARM Versatile board info

We need to specify a Versatile-specific SMC_IRQ_FLAGS value or the new
generic IRQ layer will complain thusly:

No IRQF_TRIGGER set_type function for IRQ 25 (<NULL>)

Signed-off-by: Deepak Saxena <dsaxena@plexity.net>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Nicolas Pitre <nico@cam.org>

On Fri, 20 Oct 2006 22:50:40 +0100
Russell King <rmk@arm.linux.org.uk> wrote:

> On Fri, Oct 20, 2006 at 02:42:04PM -0700, akpm@osdl.org wrote:
> > We need to specify a Versatile-specific SMC_IRQ_FLAGS value or the new
> > generic IRQ layer will complain thusly:
>
> I don't think I heard anything back from my previous suggestion that
> the IRQ flags are passed through the platform device IRQ resource.
>
> Doing so would avoid adding yet another platform specific block into
> the file.
>
> BTW, Integrator platforms will also suffer from this, which will add
> another ifdef to this header.
>
> Let's do it right and arrange to pass these flags from the platform
> code.  It's not like they're in a critical path.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index 9e0fbc5..d28adf2 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -434,6 +434,24 @@ static inline void LPD7_SMC_outsw (unsigned char* a, int r,
 
 #define SMC_IRQ_FLAGS		(0)
 
+#elif	defined(CONFIG_ARCH_VERSATILE)
+
+#define SMC_CAN_USE_8BIT	1
+#define SMC_CAN_USE_16BIT	1
+#define SMC_CAN_USE_32BIT	1
+#define SMC_NOWAIT		1
+
+#define SMC_inb(a, r)		readb((a) + (r))
+#define SMC_inw(a, r)		readw((a) + (r))
+#define SMC_inl(a, r)		readl((a) + (r))
+#define SMC_outb(v, a, r)	writeb(v, (a) + (r))
+#define SMC_outw(v, a, r)	writew(v, (a) + (r))
+#define SMC_outl(v, a, r)	writel(v, (a) + (r))
+#define SMC_insl(a, r, p, l)	readsl((a) + (r), p, l)
+#define SMC_outsl(a, r, p, l)	writesl((a) + (r), p, l)
+
+#define SMC_IRQ_FLAGS		(0)
+
 #else
 
 #define SMC_CAN_USE_8BIT	1
-- 
cgit v0.10.2


From 4e1400796c93df5e7f92d766e4a4332d0c98795f Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <andy@greyhouse.net>
Date: Mon, 4 Dec 2006 15:04:54 -0800
Subject: [PATCH] bonding: incorrect bonding state reported via ioctl

This is a small fix-up to finish out the work done by Jay Vosburgh to add
carrier-state support for bonding devices.  The output in
/proc/net/bonding/bondX was correct, but when collecting the same info via
an iotcl it could still be incorrect.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 488d8ed..6482aed 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3684,7 +3684,7 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd
 			mii->val_out = 0;
 			read_lock_bh(&bond->lock);
 			read_lock(&bond->curr_slave_lock);
-			if (bond->curr_active_slave) {
+			if (netif_carrier_ok(bond->dev)) {
 				mii->val_out = BMSR_LSTATUS;
 			}
 			read_unlock(&bond->curr_slave_lock);
-- 
cgit v0.10.2


From 3b6e8fe7eca12fca2cc7fde46ba2a94a86ab0815 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Mon, 4 Dec 2006 15:04:54 -0800
Subject: [PATCH] declance: Fix PMAX and PMAD support

The shared buffer used by the LANCE on the PMAX only supports halfword
(16-bit) accesses.  And the PMAD has the buffer wired differently.  This is
a change to fix these issues.

Tested with a DECstation 2100 (thanks Flo for making this possible) and a
DECstation 5000/133 (both the PMAD and the onboard LANCE).

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/declance.c b/drivers/net/declance.c
index 00e2a8a..1167f8f 100644
--- a/drivers/net/declance.c
+++ b/drivers/net/declance.c
@@ -40,6 +40,10 @@
  *
  *      v0.009: Module support fixes, multiple interfaces support, various
  *              bits. macro
+ *
+ *      v0.010: Fixes for the PMAD mapping of the LANCE buffer and for the
+ *              PMAX requirement to only use halfword accesses to the
+ *              buffer. macro
  */
 
 #include <linux/crc32.h>
@@ -54,6 +58,7 @@
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
+#include <linux/types.h>
 
 #include <asm/addrspace.h>
 #include <asm/system.h>
@@ -67,7 +72,7 @@
 #include <asm/dec/tc.h>
 
 static char version[] __devinitdata =
-"declance.c: v0.009 by Linux MIPS DECstation task force\n";
+"declance.c: v0.010 by Linux MIPS DECstation task force\n";
 
 MODULE_AUTHOR("Linux MIPS DECstation task force");
 MODULE_DESCRIPTION("DEC LANCE (DECstation onboard, PMAD-xx) driver");
@@ -110,24 +115,25 @@ MODULE_LICENSE("GPL");
 #define	LE_C3_BCON	0x1	/* Byte control */
 
 /* Receive message descriptor 1 */
-#define LE_R1_OWN       0x80	/* Who owns the entry */
-#define LE_R1_ERR       0x40	/* Error: if FRA, OFL, CRC or BUF is set */
-#define LE_R1_FRA       0x20	/* FRA: Frame error */
-#define LE_R1_OFL       0x10	/* OFL: Frame overflow */
-#define LE_R1_CRC       0x08	/* CRC error */
-#define LE_R1_BUF       0x04	/* BUF: Buffer error */
-#define LE_R1_SOP       0x02	/* Start of packet */
-#define LE_R1_EOP       0x01	/* End of packet */
-#define LE_R1_POK       0x03	/* Packet is complete: SOP + EOP */
-
-#define LE_T1_OWN       0x80	/* Lance owns the packet */
-#define LE_T1_ERR       0x40	/* Error summary */
-#define LE_T1_EMORE     0x10	/* Error: more than one retry needed */
-#define LE_T1_EONE      0x08	/* Error: one retry needed */
-#define LE_T1_EDEF      0x04	/* Error: deferred */
-#define LE_T1_SOP       0x02	/* Start of packet */
-#define LE_T1_EOP       0x01	/* End of packet */
-#define LE_T1_POK	0x03	/* Packet is complete: SOP + EOP */
+#define LE_R1_OWN	0x8000	/* Who owns the entry */
+#define LE_R1_ERR	0x4000	/* Error: if FRA, OFL, CRC or BUF is set */
+#define LE_R1_FRA	0x2000	/* FRA: Frame error */
+#define LE_R1_OFL	0x1000	/* OFL: Frame overflow */
+#define LE_R1_CRC	0x0800	/* CRC error */
+#define LE_R1_BUF	0x0400	/* BUF: Buffer error */
+#define LE_R1_SOP	0x0200	/* Start of packet */
+#define LE_R1_EOP	0x0100	/* End of packet */
+#define LE_R1_POK	0x0300	/* Packet is complete: SOP + EOP */
+
+/* Transmit message descriptor 1 */
+#define LE_T1_OWN	0x8000	/* Lance owns the packet */
+#define LE_T1_ERR	0x4000	/* Error summary */
+#define LE_T1_EMORE	0x1000	/* Error: more than one retry needed */
+#define LE_T1_EONE	0x0800	/* Error: one retry needed */
+#define LE_T1_EDEF	0x0400	/* Error: deferred */
+#define LE_T1_SOP	0x0200	/* Start of packet */
+#define LE_T1_EOP	0x0100	/* End of packet */
+#define LE_T1_POK	0x0300	/* Packet is complete: SOP + EOP */
 
 #define LE_T3_BUF       0x8000	/* Buffer error */
 #define LE_T3_UFL       0x4000	/* Error underflow */
@@ -156,69 +162,57 @@ MODULE_LICENSE("GPL");
 #undef TEST_HITS
 #define ZERO 0
 
-/* The DS2000/3000 have a linear 64 KB buffer.
-
- * The PMAD-AA has 128 kb buffer on-board.
+/*
+ * The DS2100/3100 have a linear 64 kB buffer which supports halfword
+ * accesses only.  Each halfword of the buffer is word-aligned in the
+ * CPU address space.
+ *
+ * The PMAD-AA has a 128 kB buffer on-board.
  *
- * The IOASIC LANCE devices use a shared memory region. This region as seen
- * from the CPU is (max) 128 KB long and has to be on an 128 KB boundary.
- * The LANCE sees this as a 64 KB long continuous memory region.
+ * The IOASIC LANCE devices use a shared memory region.  This region
+ * as seen from the CPU is (max) 128 kB long and has to be on an 128 kB
+ * boundary.  The LANCE sees this as a 64 kB long continuous memory
+ * region.
  *
- * The LANCE's DMA address is used as an index in this buffer and DMA takes
- * place in bursts of eight 16-Bit words which are packed into four 32-Bit words
- * by the IOASIC. This leads to a strange padding: 16 bytes of valid data followed
- * by a 16 byte gap :-(.
+ * The LANCE's DMA address is used as an index in this buffer and DMA
+ * takes place in bursts of eight 16-bit words which are packed into
+ * four 32-bit words by the IOASIC.  This leads to a strange padding:
+ * 16 bytes of valid data followed by a 16 byte gap :-(.
  */
 
 struct lance_rx_desc {
 	unsigned short rmd0;		/* low address of packet */
-	short gap0;
-	unsigned char rmd1_hadr;	/* high address of packet */
-	unsigned char rmd1_bits;	/* descriptor bits */
-	short gap1;
+	unsigned short rmd1;		/* high address of packet
+					   and descriptor bits */
 	short length;			/* 2s complement (negative!)
 					   of buffer length */
-	short gap2;
 	unsigned short mblength;	/* actual number of bytes received */
-	short gap3;
 };
 
 struct lance_tx_desc {
 	unsigned short tmd0;		/* low address of packet */
-	short gap0;
-	unsigned char tmd1_hadr;	/* high address of packet */
-	unsigned char tmd1_bits;	/* descriptor bits */
-	short gap1;
+	unsigned short tmd1;		/* high address of packet
+					   and descriptor bits */
 	short length;			/* 2s complement (negative!)
 					   of buffer length */
-	short gap2;
 	unsigned short misc;
-	short gap3;
 };
 
 
 /* First part of the LANCE initialization block, described in databook. */
 struct lance_init_block {
 	unsigned short mode;		/* pre-set mode (reg. 15) */
-	short gap0;
 
-	unsigned char phys_addr[12];	/* physical ethernet address
-					   only 0, 1, 4, 5, 8, 9 are valid
-					   2, 3, 6, 7, 10, 11 are gaps */
-	unsigned short filter[8];	/* multicast filter
-					   only 0, 2, 4, 6 are valid
-					   1, 3, 5, 7 are gaps */
+	unsigned short phys_addr[3];	/* physical ethernet address */
+	unsigned short filter[4];	/* multicast filter */
 
 	/* Receive and transmit ring base, along with extra bits. */
 	unsigned short rx_ptr;		/* receive descriptor addr */
-	short gap1;
 	unsigned short rx_len;		/* receive len and high addr */
-	short gap2;
 	unsigned short tx_ptr;		/* transmit descriptor addr */
-	short gap3;
 	unsigned short tx_len;		/* transmit len and high addr */
-	short gap4;
-	short gap5[8];
+
+	short gap[4];
 
 	/* The buffer descriptors */
 	struct lance_rx_desc brx_ring[RX_RING_SIZE];
@@ -226,15 +220,28 @@ struct lance_init_block {
 };
 
 #define BUF_OFFSET_CPU sizeof(struct lance_init_block)
-#define BUF_OFFSET_LNC (sizeof(struct lance_init_block)>>1)
+#define BUF_OFFSET_LNC sizeof(struct lance_init_block)
 
-#define libdesc_offset(rt, elem) \
-((__u32)(((unsigned long)(&(((struct lance_init_block *)0)->rt[elem])))))
+#define shift_off(off, type)						\
+	(type == ASIC_LANCE || type == PMAX_LANCE ? off << 1 : off)
 
-/*
- * This works *only* for the ring descriptors
- */
-#define LANCE_ADDR(x) (CPHYSADDR(x) >> 1)
+#define lib_off(rt, type)						\
+	shift_off(offsetof(struct lance_init_block, rt), type)
+
+#define lib_ptr(ib, rt, type) 						\
+	((volatile u16 *)((u8 *)(ib) + lib_off(rt, type)))
+
+#define rds_off(rt, type)						\
+	shift_off(offsetof(struct lance_rx_desc, rt), type)
+
+#define rds_ptr(rd, rt, type) 						\
+	((volatile u16 *)((u8 *)(rd) + rds_off(rt, type)))
+
+#define tds_off(rt, type)						\
+	shift_off(offsetof(struct lance_tx_desc, rt), type)
+
+#define tds_ptr(td, rt, type) 						\
+	((volatile u16 *)((u8 *)(td) + tds_off(rt, type)))
 
 struct lance_private {
 	struct net_device *next;
@@ -242,7 +249,6 @@ struct lance_private {
 	int slot;
 	int dma_irq;
 	volatile struct lance_regs *ll;
-	volatile struct lance_init_block *init_block;
 
 	spinlock_t	lock;
 
@@ -260,8 +266,8 @@ struct lance_private {
 	char *tx_buf_ptr_cpu[TX_RING_SIZE];
 
 	/* Pointers to the ring buffers as seen from the LANCE */
-	char *rx_buf_ptr_lnc[RX_RING_SIZE];
-	char *tx_buf_ptr_lnc[TX_RING_SIZE];
+	uint rx_buf_ptr_lnc[RX_RING_SIZE];
+	uint tx_buf_ptr_lnc[TX_RING_SIZE];
 };
 
 #define TX_BUFFS_AVAIL ((lp->tx_old<=lp->tx_new)?\
@@ -294,7 +300,7 @@ static inline void writereg(volatile unsigned short *regptr, short value)
 static void load_csrs(struct lance_private *lp)
 {
 	volatile struct lance_regs *ll = lp->ll;
-	int leptr;
+	uint leptr;
 
 	/* The address space as seen from the LANCE
 	 * begins at address 0. HK
@@ -316,12 +322,14 @@ static void load_csrs(struct lance_private *lp)
  * Our specialized copy routines
  *
  */
-void cp_to_buf(const int type, void *to, const void *from, int len)
+static void cp_to_buf(const int type, void *to, const void *from, int len)
 {
 	unsigned short *tp, *fp, clen;
 	unsigned char *rtp, *rfp;
 
-	if (type == PMAX_LANCE) {
+	if (type == PMAD_LANCE) {
+		memcpy(to, from, len);
+	} else if (type == PMAX_LANCE) {
 		clen = len >> 1;
 		tp = (unsigned short *) to;
 		fp = (unsigned short *) from;
@@ -370,12 +378,14 @@ void cp_to_buf(const int type, void *to, const void *from, int len)
 	iob();
 }
 
-void cp_from_buf(const int type, void *to, const void *from, int len)
+static void cp_from_buf(const int type, void *to, const void *from, int len)
 {
 	unsigned short *tp, *fp, clen;
 	unsigned char *rtp, *rfp;
 
-	if (type == PMAX_LANCE) {
+	if (type == PMAD_LANCE) {
+		memcpy(to, from, len);
+	} else if (type == PMAX_LANCE) {
 		clen = len >> 1;
 		tp = (unsigned short *) to;
 		fp = (unsigned short *) from;
@@ -431,12 +441,10 @@ void cp_from_buf(const int type, void *to, const void *from, int len)
 static void lance_init_ring(struct net_device *dev)
 {
 	struct lance_private *lp = netdev_priv(dev);
-	volatile struct lance_init_block *ib;
-	int leptr;
+	volatile u16 *ib = (volatile u16 *)dev->mem_start;
+	uint leptr;
 	int i;
 
-	ib = (struct lance_init_block *) (dev->mem_start);
-
 	/* Lock out other processes while setting up hardware */
 	netif_stop_queue(dev);
 	lp->rx_new = lp->tx_new = 0;
@@ -445,55 +453,64 @@ static void lance_init_ring(struct net_device *dev)
 	/* Copy the ethernet address to the lance init block.
 	 * XXX bit 0 of the physical address registers has to be zero
 	 */
-	ib->phys_addr[0] = dev->dev_addr[0];
-	ib->phys_addr[1] = dev->dev_addr[1];
-	ib->phys_addr[4] = dev->dev_addr[2];
-	ib->phys_addr[5] = dev->dev_addr[3];
-	ib->phys_addr[8] = dev->dev_addr[4];
-	ib->phys_addr[9] = dev->dev_addr[5];
+	*lib_ptr(ib, phys_addr[0], lp->type) = (dev->dev_addr[1] << 8) |
+				     dev->dev_addr[0];
+	*lib_ptr(ib, phys_addr[1], lp->type) = (dev->dev_addr[3] << 8) |
+				     dev->dev_addr[2];
+	*lib_ptr(ib, phys_addr[2], lp->type) = (dev->dev_addr[5] << 8) |
+				     dev->dev_addr[4];
 	/* Setup the initialization block */
 
 	/* Setup rx descriptor pointer */
-	leptr = LANCE_ADDR(libdesc_offset(brx_ring, 0));
-	ib->rx_len = (LANCE_LOG_RX_BUFFERS << 13) | (leptr >> 16);
-	ib->rx_ptr = leptr;
+	leptr = offsetof(struct lance_init_block, brx_ring);
+	*lib_ptr(ib, rx_len, lp->type) = (LANCE_LOG_RX_BUFFERS << 13) |
+					 (leptr >> 16);
+	*lib_ptr(ib, rx_ptr, lp->type) = leptr;
 	if (ZERO)
-		printk("RX ptr: %8.8x(%8.8x)\n", leptr, libdesc_offset(brx_ring, 0));
+		printk("RX ptr: %8.8x(%8.8x)\n",
+		       leptr, lib_off(brx_ring, lp->type));
 
 	/* Setup tx descriptor pointer */
-	leptr = LANCE_ADDR(libdesc_offset(btx_ring, 0));
-	ib->tx_len = (LANCE_LOG_TX_BUFFERS << 13) | (leptr >> 16);
-	ib->tx_ptr = leptr;
+	leptr = offsetof(struct lance_init_block, btx_ring);
+	*lib_ptr(ib, tx_len, lp->type) = (LANCE_LOG_TX_BUFFERS << 13) |
+					 (leptr >> 16);
+	*lib_ptr(ib, tx_ptr, lp->type) = leptr;
 	if (ZERO)
-		printk("TX ptr: %8.8x(%8.8x)\n", leptr, libdesc_offset(btx_ring, 0));
+		printk("TX ptr: %8.8x(%8.8x)\n",
+		       leptr, lib_off(btx_ring, lp->type));
 
 	if (ZERO)
 		printk("TX rings:\n");
 
 	/* Setup the Tx ring entries */
 	for (i = 0; i < TX_RING_SIZE; i++) {
-		leptr = (int) lp->tx_buf_ptr_lnc[i];
-		ib->btx_ring[i].tmd0 = leptr;
-		ib->btx_ring[i].tmd1_hadr = leptr >> 16;
-		ib->btx_ring[i].tmd1_bits = 0;
-		ib->btx_ring[i].length = 0xf000;	/* The ones required by tmd2 */
-		ib->btx_ring[i].misc = 0;
+		leptr = lp->tx_buf_ptr_lnc[i];
+		*lib_ptr(ib, btx_ring[i].tmd0, lp->type) = leptr;
+		*lib_ptr(ib, btx_ring[i].tmd1, lp->type) = (leptr >> 16) &
+							   0xff;
+		*lib_ptr(ib, btx_ring[i].length, lp->type) = 0xf000;
+						/* The ones required by tmd2 */
+		*lib_ptr(ib, btx_ring[i].misc, lp->type) = 0;
 		if (i < 3 && ZERO)
-			printk("%d: 0x%8.8x(0x%8.8x)\n", i, leptr, (int) lp->tx_buf_ptr_cpu[i]);
+			printk("%d: 0x%8.8x(0x%8.8x)\n",
+			       i, leptr, (uint)lp->tx_buf_ptr_cpu[i]);
 	}
 
 	/* Setup the Rx ring entries */
 	if (ZERO)
 		printk("RX rings:\n");
 	for (i = 0; i < RX_RING_SIZE; i++) {
-		leptr = (int) lp->rx_buf_ptr_lnc[i];
-		ib->brx_ring[i].rmd0 = leptr;
-		ib->brx_ring[i].rmd1_hadr = leptr >> 16;
-		ib->brx_ring[i].rmd1_bits = LE_R1_OWN;
-		ib->brx_ring[i].length = -RX_BUFF_SIZE | 0xf000;
-		ib->brx_ring[i].mblength = 0;
+		leptr = lp->rx_buf_ptr_lnc[i];
+		*lib_ptr(ib, brx_ring[i].rmd0, lp->type) = leptr;
+		*lib_ptr(ib, brx_ring[i].rmd1, lp->type) = ((leptr >> 16) &
+							    0xff) |
+							   LE_R1_OWN;
+		*lib_ptr(ib, brx_ring[i].length, lp->type) = -RX_BUFF_SIZE |
+							     0xf000;
+		*lib_ptr(ib, brx_ring[i].mblength, lp->type) = 0;
 		if (i < 3 && ZERO)
-			printk("%d: 0x%8.8x(0x%8.8x)\n", i, leptr, (int) lp->rx_buf_ptr_cpu[i]);
+			printk("%d: 0x%8.8x(0x%8.8x)\n",
+			       i, leptr, (uint)lp->rx_buf_ptr_cpu[i]);
 	}
 	iob();
 }
@@ -511,11 +528,13 @@ static int init_restart_lance(struct lance_private *lp)
 		udelay(10);
 	}
 	if ((i == 100) || (ll->rdp & LE_C0_ERR)) {
-		printk("LANCE unopened after %d ticks, csr0=%4.4x.\n", i, ll->rdp);
+		printk("LANCE unopened after %d ticks, csr0=%4.4x.\n",
+		       i, ll->rdp);
 		return -1;
 	}
 	if ((ll->rdp & LE_C0_ERR)) {
-		printk("LANCE unopened after %d ticks, csr0=%4.4x.\n", i, ll->rdp);
+		printk("LANCE unopened after %d ticks, csr0=%4.4x.\n",
+		       i, ll->rdp);
 		return -1;
 	}
 	writereg(&ll->rdp, LE_C0_IDON);
@@ -528,12 +547,11 @@ static int init_restart_lance(struct lance_private *lp)
 static int lance_rx(struct net_device *dev)
 {
 	struct lance_private *lp = netdev_priv(dev);
-	volatile struct lance_init_block *ib;
-	volatile struct lance_rx_desc *rd = 0;
-	unsigned char bits;
-	int len = 0;
-	struct sk_buff *skb = 0;
-	ib = (struct lance_init_block *) (dev->mem_start);
+	volatile u16 *ib = (volatile u16 *)dev->mem_start;
+	volatile u16 *rd;
+	unsigned short bits;
+	int entry, len;
+	struct sk_buff *skb;
 
 #ifdef TEST_HITS
 	{
@@ -542,19 +560,22 @@ static int lance_rx(struct net_device *dev)
 		printk("[");
 		for (i = 0; i < RX_RING_SIZE; i++) {
 			if (i == lp->rx_new)
-				printk("%s", ib->brx_ring[i].rmd1_bits &
+				printk("%s", *lib_ptr(ib, brx_ring[i].rmd1,
+						      lp->type) &
 					     LE_R1_OWN ? "_" : "X");
 			else
-				printk("%s", ib->brx_ring[i].rmd1_bits &
+				printk("%s", *lib_ptr(ib, brx_ring[i].rmd1,
+						      lp->type) &
 					     LE_R1_OWN ? "." : "1");
 		}
 		printk("]");
 	}
 #endif
 
-	for (rd = &ib->brx_ring[lp->rx_new];
-	     !((bits = rd->rmd1_bits) & LE_R1_OWN);
-	     rd = &ib->brx_ring[lp->rx_new]) {
+	for (rd = lib_ptr(ib, brx_ring[lp->rx_new], lp->type);
+	     !((bits = *rds_ptr(rd, rmd1, lp->type)) & LE_R1_OWN);
+	     rd = lib_ptr(ib, brx_ring[lp->rx_new], lp->type)) {
+		entry = lp->rx_new;
 
 		/* We got an incomplete frame? */
 		if ((bits & LE_R1_POK) != LE_R1_POK) {
@@ -575,16 +596,18 @@ static int lance_rx(struct net_device *dev)
 			if (bits & LE_R1_EOP)
 				lp->stats.rx_errors++;
 		} else {
-			len = (rd->mblength & 0xfff) - 4;
+			len = (*rds_ptr(rd, mblength, lp->type) & 0xfff) - 4;
 			skb = dev_alloc_skb(len + 2);
 
 			if (skb == 0) {
 				printk("%s: Memory squeeze, deferring packet.\n",
 				       dev->name);
 				lp->stats.rx_dropped++;
-				rd->mblength = 0;
-				rd->rmd1_bits = LE_R1_OWN;
-				lp->rx_new = (lp->rx_new + 1) & RX_RING_MOD_MASK;
+				*rds_ptr(rd, mblength, lp->type) = 0;
+				*rds_ptr(rd, rmd1, lp->type) =
+					((lp->rx_buf_ptr_lnc[entry] >> 16) &
+					 0xff) | LE_R1_OWN;
+				lp->rx_new = (entry + 1) & RX_RING_MOD_MASK;
 				return 0;
 			}
 			lp->stats.rx_bytes += len;
@@ -594,8 +617,7 @@ static int lance_rx(struct net_device *dev)
 			skb_put(skb, len);	/* make room */
 
 			cp_from_buf(lp->type, skb->data,
-				    (char *)lp->rx_buf_ptr_cpu[lp->rx_new],
-				    len);
+				    (char *)lp->rx_buf_ptr_cpu[entry], len);
 
 			skb->protocol = eth_type_trans(skb, dev);
 			netif_rx(skb);
@@ -604,10 +626,11 @@ static int lance_rx(struct net_device *dev)
 		}
 
 		/* Return the packet to the pool */
-		rd->mblength = 0;
-		rd->length = -RX_BUFF_SIZE | 0xf000;
-		rd->rmd1_bits = LE_R1_OWN;
-		lp->rx_new = (lp->rx_new + 1) & RX_RING_MOD_MASK;
+		*rds_ptr(rd, mblength, lp->type) = 0;
+		*rds_ptr(rd, length, lp->type) = -RX_BUFF_SIZE | 0xf000;
+		*rds_ptr(rd, rmd1, lp->type) =
+			((lp->rx_buf_ptr_lnc[entry] >> 16) & 0xff) | LE_R1_OWN;
+		lp->rx_new = (entry + 1) & RX_RING_MOD_MASK;
 	}
 	return 0;
 }
@@ -615,24 +638,24 @@ static int lance_rx(struct net_device *dev)
 static void lance_tx(struct net_device *dev)
 {
 	struct lance_private *lp = netdev_priv(dev);
-	volatile struct lance_init_block *ib;
+	volatile u16 *ib = (volatile u16 *)dev->mem_start;
 	volatile struct lance_regs *ll = lp->ll;
-	volatile struct lance_tx_desc *td;
+	volatile u16 *td;
 	int i, j;
 	int status;
-	ib = (struct lance_init_block *) (dev->mem_start);
+
 	j = lp->tx_old;
 
 	spin_lock(&lp->lock);
 
 	for (i = j; i != lp->tx_new; i = j) {
-		td = &ib->btx_ring[i];
+		td = lib_ptr(ib, btx_ring[i], lp->type);
 		/* If we hit a packet not owned by us, stop */
-		if (td->tmd1_bits & LE_T1_OWN)
+		if (*tds_ptr(td, tmd1, lp->type) & LE_T1_OWN)
 			break;
 
-		if (td->tmd1_bits & LE_T1_ERR) {
-			status = td->misc;
+		if (*tds_ptr(td, tmd1, lp->type) & LE_T1_ERR) {
+			status = *tds_ptr(td, misc, lp->type);
 
 			lp->stats.tx_errors++;
 			if (status & LE_T3_RTY)
@@ -667,18 +690,19 @@ static void lance_tx(struct net_device *dev)
 				init_restart_lance(lp);
 				goto out;
 			}
-		} else if ((td->tmd1_bits & LE_T1_POK) == LE_T1_POK) {
+		} else if ((*tds_ptr(td, tmd1, lp->type) & LE_T1_POK) ==
+			   LE_T1_POK) {
 			/*
 			 * So we don't count the packet more than once.
 			 */
-			td->tmd1_bits &= ~(LE_T1_POK);
+			*tds_ptr(td, tmd1, lp->type) &= ~(LE_T1_POK);
 
 			/* One collision before packet was sent. */
-			if (td->tmd1_bits & LE_T1_EONE)
+			if (*tds_ptr(td, tmd1, lp->type) & LE_T1_EONE)
 				lp->stats.collisions++;
 
 			/* More than one collision, be optimistic. */
-			if (td->tmd1_bits & LE_T1_EMORE)
+			if (*tds_ptr(td, tmd1, lp->type) & LE_T1_EMORE)
 				lp->stats.collisions += 2;
 
 			lp->stats.tx_packets++;
@@ -752,7 +776,7 @@ struct net_device *last_dev = 0;
 
 static int lance_open(struct net_device *dev)
 {
-	volatile struct lance_init_block *ib = (struct lance_init_block *) (dev->mem_start);
+	volatile u16 *ib = (volatile u16 *)dev->mem_start;
 	struct lance_private *lp = netdev_priv(dev);
 	volatile struct lance_regs *ll = lp->ll;
 	int status = 0;
@@ -769,11 +793,11 @@ static int lance_open(struct net_device *dev)
 	 *
 	 * BTW it is common bug in all lance drivers! --ANK
 	 */
-	ib->mode = 0;
-	ib->filter [0] = 0;
-	ib->filter [2] = 0;
-	ib->filter [4] = 0;
-	ib->filter [6] = 0;
+	*lib_ptr(ib, mode, lp->type) = 0;
+	*lib_ptr(ib, filter[0], lp->type) = 0;
+	*lib_ptr(ib, filter[1], lp->type) = 0;
+	*lib_ptr(ib, filter[2], lp->type) = 0;
+	*lib_ptr(ib, filter[3], lp->type) = 0;
 
 	lance_init_ring(dev);
 	load_csrs(lp);
@@ -874,12 +898,10 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct lance_private *lp = netdev_priv(dev);
 	volatile struct lance_regs *ll = lp->ll;
-	volatile struct lance_init_block *ib = (struct lance_init_block *) (dev->mem_start);
-	int entry, skblen, len;
-
-	skblen = skb->len;
+	volatile u16 *ib = (volatile u16 *)dev->mem_start;
+	int entry, len;
 
-	len = skblen;
+	len = skb->len;
 
 	if (len < ETH_ZLEN) {
 		if (skb_padto(skb, ETH_ZLEN))
@@ -889,23 +911,17 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	lp->stats.tx_bytes += len;
 
-	entry = lp->tx_new & TX_RING_MOD_MASK;
-	ib->btx_ring[entry].length = (-len);
-	ib->btx_ring[entry].misc = 0;
+	entry = lp->tx_new;
+	*lib_ptr(ib, btx_ring[entry].length, lp->type) = (-len);
+	*lib_ptr(ib, btx_ring[entry].misc, lp->type) = 0;
 
-	cp_to_buf(lp->type, (char *)lp->tx_buf_ptr_cpu[entry], skb->data,
-		  skblen);
-
-	/* Clear the slack of the packet, do I need this? */
-	/* For a firewall it's a good idea - AC */
-/*
-   if (len != skblen)
-   memset ((char *) &ib->tx_buf [entry][skblen], 0, (len - skblen) << 1);
- */
+	cp_to_buf(lp->type, (char *)lp->tx_buf_ptr_cpu[entry], skb->data, len);
 
 	/* Now, give the packet to the lance */
-	ib->btx_ring[entry].tmd1_bits = (LE_T1_POK | LE_T1_OWN);
-	lp->tx_new = (lp->tx_new + 1) & TX_RING_MOD_MASK;
+	*lib_ptr(ib, btx_ring[entry].tmd1, lp->type) =
+		((lp->tx_buf_ptr_lnc[entry] >> 16) & 0xff) |
+		(LE_T1_POK | LE_T1_OWN);
+	lp->tx_new = (entry + 1) & TX_RING_MOD_MASK;
 
 	if (TX_BUFFS_AVAIL <= 0)
 		netif_stop_queue(dev);
@@ -930,8 +946,8 @@ static struct net_device_stats *lance_get_stats(struct net_device *dev)
 
 static void lance_load_multicast(struct net_device *dev)
 {
-	volatile struct lance_init_block *ib = (struct lance_init_block *) (dev->mem_start);
-	volatile u16 *mcast_table = (u16 *) & ib->filter;
+	struct lance_private *lp = netdev_priv(dev);
+	volatile u16 *ib = (volatile u16 *)dev->mem_start;
 	struct dev_mc_list *dmi = dev->mc_list;
 	char *addrs;
 	int i;
@@ -939,17 +955,17 @@ static void lance_load_multicast(struct net_device *dev)
 
 	/* set all multicast bits */
 	if (dev->flags & IFF_ALLMULTI) {
-		ib->filter[0] = 0xffff;
-		ib->filter[2] = 0xffff;
-		ib->filter[4] = 0xffff;
-		ib->filter[6] = 0xffff;
+		*lib_ptr(ib, filter[0], lp->type) = 0xffff;
+		*lib_ptr(ib, filter[1], lp->type) = 0xffff;
+		*lib_ptr(ib, filter[2], lp->type) = 0xffff;
+		*lib_ptr(ib, filter[3], lp->type) = 0xffff;
 		return;
 	}
 	/* clear the multicast filter */
-	ib->filter[0] = 0;
-	ib->filter[2] = 0;
-	ib->filter[4] = 0;
-	ib->filter[6] = 0;
+	*lib_ptr(ib, filter[0], lp->type) = 0;
+	*lib_ptr(ib, filter[1], lp->type) = 0;
+	*lib_ptr(ib, filter[2], lp->type) = 0;
+	*lib_ptr(ib, filter[3], lp->type) = 0;
 
 	/* Add addresses */
 	for (i = 0; i < dev->mc_count; i++) {
@@ -962,7 +978,7 @@ static void lance_load_multicast(struct net_device *dev)
 
 		crc = ether_crc_le(ETH_ALEN, addrs);
 		crc = crc >> 26;
-		mcast_table[2 * (crc >> 4)] |= 1 << (crc & 0xf);
+		*lib_ptr(ib, filter[crc >> 4], lp->type) |= 1 << (crc & 0xf);
 	}
 	return;
 }
@@ -970,11 +986,9 @@ static void lance_load_multicast(struct net_device *dev)
 static void lance_set_multicast(struct net_device *dev)
 {
 	struct lance_private *lp = netdev_priv(dev);
-	volatile struct lance_init_block *ib;
+	volatile u16 *ib = (volatile u16 *)dev->mem_start;
 	volatile struct lance_regs *ll = lp->ll;
 
-	ib = (struct lance_init_block *) (dev->mem_start);
-
 	if (!netif_running(dev))
 		return;
 
@@ -992,9 +1006,9 @@ static void lance_set_multicast(struct net_device *dev)
 	lance_init_ring(dev);
 
 	if (dev->flags & IFF_PROMISC) {
-		ib->mode |= LE_MO_PROM;
+		*lib_ptr(ib, mode, lp->type) |= LE_MO_PROM;
 	} else {
-		ib->mode &= ~LE_MO_PROM;
+		*lib_ptr(ib, mode, lp->type) &= ~LE_MO_PROM;
 		lance_load_multicast(dev);
 	}
 	load_csrs(lp);
@@ -1073,20 +1087,20 @@ static int __init dec_lance_init(const int type, const int slot)
 		 */
 		for (i = 0; i < RX_RING_SIZE; i++) {
 			lp->rx_buf_ptr_cpu[i] =
-				(char *)(dev->mem_start + BUF_OFFSET_CPU +
+				(char *)(dev->mem_start + 2 * BUF_OFFSET_CPU +
 					 2 * i * RX_BUFF_SIZE);
 			lp->rx_buf_ptr_lnc[i] =
-				(char *)(BUF_OFFSET_LNC + i * RX_BUFF_SIZE);
+				(BUF_OFFSET_LNC + i * RX_BUFF_SIZE);
 		}
 		for (i = 0; i < TX_RING_SIZE; i++) {
 			lp->tx_buf_ptr_cpu[i] =
-				(char *)(dev->mem_start + BUF_OFFSET_CPU +
+				(char *)(dev->mem_start + 2 * BUF_OFFSET_CPU +
 					 2 * RX_RING_SIZE * RX_BUFF_SIZE +
 					 2 * i * TX_BUFF_SIZE);
 			lp->tx_buf_ptr_lnc[i] =
-				(char *)(BUF_OFFSET_LNC +
-					 RX_RING_SIZE * RX_BUFF_SIZE +
-					 i * TX_BUFF_SIZE);
+				(BUF_OFFSET_LNC +
+				 RX_RING_SIZE * RX_BUFF_SIZE +
+				 i * TX_BUFF_SIZE);
 		}
 
 		/* Setup I/O ASIC LANCE DMA.  */
@@ -1100,6 +1114,7 @@ static int __init dec_lance_init(const int type, const int slot)
 		claim_tc_card(slot);
 
 		dev->mem_start = CKSEG1ADDR(get_tc_base_addr(slot));
+		dev->mem_end = dev->mem_start + 0x100000;
 		dev->base_addr = dev->mem_start + 0x100000;
 		dev->irq = get_tc_irq_nr(slot);
 		esar_base = dev->mem_start + 0x1c0002;
@@ -1110,7 +1125,7 @@ static int __init dec_lance_init(const int type, const int slot)
 				(char *)(dev->mem_start + BUF_OFFSET_CPU +
 					 i * RX_BUFF_SIZE);
 			lp->rx_buf_ptr_lnc[i] =
-				(char *)(BUF_OFFSET_LNC + i * RX_BUFF_SIZE);
+				(BUF_OFFSET_LNC + i * RX_BUFF_SIZE);
 		}
 		for (i = 0; i < TX_RING_SIZE; i++) {
 			lp->tx_buf_ptr_cpu[i] =
@@ -1118,9 +1133,9 @@ static int __init dec_lance_init(const int type, const int slot)
 					 RX_RING_SIZE * RX_BUFF_SIZE +
 					 i * TX_BUFF_SIZE);
 			lp->tx_buf_ptr_lnc[i] =
-				(char *)(BUF_OFFSET_LNC +
-					 RX_RING_SIZE * RX_BUFF_SIZE +
-					 i * TX_BUFF_SIZE);
+				(BUF_OFFSET_LNC +
+				 RX_RING_SIZE * RX_BUFF_SIZE +
+				 i * TX_BUFF_SIZE);
 		}
 
 		break;
@@ -1130,6 +1145,7 @@ static int __init dec_lance_init(const int type, const int slot)
 		dev->irq = dec_interrupt[DEC_IRQ_LANCE];
 		dev->base_addr = CKSEG1ADDR(KN01_SLOT_BASE + KN01_LANCE);
 		dev->mem_start = CKSEG1ADDR(KN01_SLOT_BASE + KN01_LANCE_MEM);
+		dev->mem_end = dev->mem_start + KN01_SLOT_SIZE;
 		esar_base = CKSEG1ADDR(KN01_SLOT_BASE + KN01_ESAR + 1);
 		lp->dma_irq = -1;
 
@@ -1138,20 +1154,20 @@ static int __init dec_lance_init(const int type, const int slot)
 		 */
 		for (i = 0; i < RX_RING_SIZE; i++) {
 			lp->rx_buf_ptr_cpu[i] =
-				(char *)(dev->mem_start + BUF_OFFSET_CPU +
+				(char *)(dev->mem_start + 2 * BUF_OFFSET_CPU +
 					 2 * i * RX_BUFF_SIZE);
 			lp->rx_buf_ptr_lnc[i] =
-				(char *)(BUF_OFFSET_LNC + i * RX_BUFF_SIZE);
+				(BUF_OFFSET_LNC + i * RX_BUFF_SIZE);
 		}
 		for (i = 0; i < TX_RING_SIZE; i++) {
 			lp->tx_buf_ptr_cpu[i] =
-				(char *)(dev->mem_start + BUF_OFFSET_CPU +
+				(char *)(dev->mem_start + 2 * BUF_OFFSET_CPU +
 					 2 * RX_RING_SIZE * RX_BUFF_SIZE +
 					 2 * i * TX_BUFF_SIZE);
 			lp->tx_buf_ptr_lnc[i] =
-				(char *)(BUF_OFFSET_LNC +
-					 RX_RING_SIZE * RX_BUFF_SIZE +
-					 i * TX_BUFF_SIZE);
+				(BUF_OFFSET_LNC +
+				 RX_RING_SIZE * RX_BUFF_SIZE +
+				 i * TX_BUFF_SIZE);
 		}
 
 		break;
-- 
cgit v0.10.2


From e8f7f7f11d07c3fe3316d57790bae4c561064c33 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Mon, 4 Dec 2006 15:04:55 -0800
Subject: [PATCH] declance: Support the I/O ASIC LANCE w/o TURBOchannel

The onboard LANCE of I/O ASIC systems is not a TURBOchannel device, at
least from the software point of view.  Therefore it does not rely on any
kernel TURBOchannel bus services and can be supported even if support for
TURBOchannel has not been enabled in the configuration.

Tested with the onboard LANCE of a DECstation 5000/133.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/declance.c b/drivers/net/declance.c
index 1167f8f..4ae0fed 100644
--- a/drivers/net/declance.c
+++ b/drivers/net/declance.c
@@ -1065,7 +1065,6 @@ static int __init dec_lance_init(const int type, const int slot)
 	lp->type = type;
 	lp->slot = slot;
 	switch (type) {
-#ifdef CONFIG_TC
 	case ASIC_LANCE:
 		dev->base_addr = CKSEG1ADDR(dec_kn_slot_base + IOASIC_LANCE);
 
@@ -1109,7 +1108,7 @@ static int __init dec_lance_init(const int type, const int slot)
 			     CPHYSADDR(dev->mem_start) << 3);
 
 		break;
-
+#ifdef CONFIG_TC
 	case PMAD_LANCE:
 		claim_tc_card(slot);
 
@@ -1140,7 +1139,6 @@ static int __init dec_lance_init(const int type, const int slot)
 
 		break;
 #endif
-
 	case PMAX_LANCE:
 		dev->irq = dec_interrupt[DEC_IRQ_LANCE];
 		dev->base_addr = CKSEG1ADDR(KN01_SLOT_BASE + KN01_LANCE);
@@ -1295,10 +1293,8 @@ static int __init dec_lance_probe(void)
 	/* Then handle onboard devices. */
 	if (dec_interrupt[DEC_IRQ_LANCE] >= 0) {
 		if (dec_interrupt[DEC_IRQ_LANCE_MERR] >= 0) {
-#ifdef CONFIG_TC
 			if (dec_lance_init(ASIC_LANCE, -1) >= 0)
 				count++;
-#endif
 		} else if (!TURBOCHANNEL) {
 			if (dec_lance_init(PMAX_LANCE, -1) >= 0)
 				count++;
-- 
cgit v0.10.2


From 043d58064ac6556a3abd3a74201831f3e9a5b6e8 Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Mon, 4 Dec 2006 15:04:56 -0800
Subject: [PATCH] sk98lin debug build fix

Fix parenthesis mismatch.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/sk98lin/skgesirq.c b/drivers/net/sk98lin/skgesirq.c
index ab66d80..3e7aa49 100644
--- a/drivers/net/sk98lin/skgesirq.c
+++ b/drivers/net/sk98lin/skgesirq.c
@@ -1319,7 +1319,7 @@ SK_BOOL	AutoNeg)	/* Is Auto-negotiation used ? */
 	SkXmPhyRead(pAC, IoC, Port, PHY_BCOM_INT_STAT, &Isrc);
 
 #ifdef xDEBUG
-	if ((Isrc & ~(PHY_B_IS_HCT | PHY_B_IS_LCT) ==
+	if ((Isrc & ~(PHY_B_IS_HCT | PHY_B_IS_LCT)) ==
 		(PHY_B_IS_SCR_S_ER | PHY_B_IS_RRS_CHANGE | PHY_B_IS_LRS_CHANGE)) {
 
 		SK_U32	Stat1, Stat2, Stat3;
-- 
cgit v0.10.2


From 59dc76a4e3bed66f5be474dcdc81cc39c7290cec Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Mon, 4 Dec 2006 15:04:56 -0800
Subject: [PATCH] net: smc91x add missing bracket

Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index d28adf2..9367c57 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -238,7 +238,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 #define SMC_CAN_USE_16BIT	1
 #define SMC_CAN_USE_32BIT	0
 
-#define SMC_inb(a, r)		inb((u32)a) + (r))
+#define SMC_inb(a, r)		inb(((u32)a) + (r))
 #define SMC_inw(a, r)		inw(((u32)a) + (r))
 #define SMC_outb(v, a, r)	outb(v, ((u32)a) + (r))
 #define SMC_outw(v, a, r)	outw(v, ((u32)a) + (r))
-- 
cgit v0.10.2


From 80922fbcb6f00127e91580e7565bb665947ac5d3 Mon Sep 17 00:00:00 2001
From: "Amit S. Kale" <amitkale@netxen.com>
Date: Mon, 4 Dec 2006 09:18:00 -0800
Subject: [PATCH] NetXen: whitespace cleaup and more cleanup fixes

Signed-off-by: Amit S. Kale <amitkale@netxen.com>

 netxen_nic.h         |   56 ++++++++++++++++++++--------------------------
 netxen_nic_ethtool.c |   53 +++++++++++++++++++++-----------------------
 netxen_nic_hdr.h     |    6 ++---
 netxen_nic_hw.c      |   54 +++++++++++++++++++++------------------------
 netxen_nic_hw.h      |   10 ++++----
 netxen_nic_init.c    |   61 +++++++++++++++++++++++++--------------------------
 netxen_nic_ioctl.h   |    6 ++---
 netxen_nic_isr.c     |   48 +++++++++++++++++-----------------------
 netxen_nic_main.c    |   54 +++++++++++++++++----------------------------
 netxen_nic_niu.c     |   10 ++++----
 10 files changed, 165 insertions(+), 193 deletions(-)
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h
index d925053..d51f437 100644
--- a/drivers/net/netxen/netxen_nic.h
+++ b/drivers/net/netxen/netxen_nic.h
@@ -1,25 +1,25 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
  * of the License, or (at your option) any later version.
- *                            
+ *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *                                   
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -89,8 +89,8 @@
  * normalize a 64MB crb address to 32MB PCI window 
  * To use NETXEN_CRB_NORMALIZE, window _must_ be set to 1
  */
-#define NETXEN_CRB_NORMAL(reg)        \
-	(reg) - NETXEN_CRB_PCIX_HOST2 + NETXEN_CRB_PCIX_HOST
+#define NETXEN_CRB_NORMAL(reg)	\
+	((reg) - NETXEN_CRB_PCIX_HOST2 + NETXEN_CRB_PCIX_HOST)
 
 #define NETXEN_CRB_NORMALIZE(adapter, reg) \
 	pci_base_offset(adapter, NETXEN_CRB_NORMAL(reg))
@@ -164,7 +164,7 @@ enum {
 
 #define MAX_CMD_DESCRIPTORS		1024
 #define MAX_RCV_DESCRIPTORS		32768
-#define MAX_JUMBO_RCV_DESCRIPTORS	1024
+#define MAX_JUMBO_RCV_DESCRIPTORS	4096
 #define MAX_RCVSTATUS_DESCRIPTORS	MAX_RCV_DESCRIPTORS
 #define MAX_JUMBO_RCV_DESC	MAX_JUMBO_RCV_DESCRIPTORS
 #define MAX_RCV_DESC		MAX_RCV_DESCRIPTORS
@@ -559,7 +559,7 @@ typedef enum {
 #define PRIMARY_START 		(BOOTLD_START)
 #define FLASH_CRBINIT_SIZE 	(0x4000)
 #define FLASH_BRDCFG_SIZE 	(sizeof(struct netxen_board_info))
-#define FLASH_USER_SIZE		(sizeof(netxen_user_info)/sizeof(u32))
+#define FLASH_USER_SIZE		(sizeof(struct netxen_user_info)/sizeof(u32))
 #define FLASH_SECONDARY_SIZE 	(USER_START-SECONDARY_START)
 #define NUM_PRIMARY_SECTORS	(0x20)
 #define NUM_CONFIG_SECTORS 	(1)
@@ -572,7 +572,7 @@ typedef enum {
 #else
 #define DPRINTK(klevel, fmt, args...)	do { \
 	printk(KERN_##klevel PFX "%s: %s: " fmt, __FUNCTION__,\
-		(adapter != NULL && adapter->port != NULL && \
+		(adapter != NULL && \
 		adapter->port[0] != NULL && \
 		adapter->port[0]->netdev != NULL) ? \
 		adapter->port[0]->netdev->name : NULL, \
@@ -703,8 +703,6 @@ struct netxen_recv_context {
 
 #define NETXEN_NIC_MSI_ENABLED 0x02
 
-struct netxen_drvops;
-
 struct netxen_adapter {
 	struct netxen_hardware_context ahw;
 	int port_count;		/* Number of configured ports  */
@@ -746,8 +744,21 @@ struct netxen_adapter {
 	struct netxen_recv_context recv_ctx[MAX_RCV_CTX];
 
 	int is_up;
-	int work_done;
-	struct netxen_drvops *ops;
+	int (*enable_phy_interrupts) (struct netxen_adapter *, int);
+	int (*disable_phy_interrupts) (struct netxen_adapter *, int);
+	void (*handle_phy_intr) (struct netxen_adapter *);
+	int (*macaddr_set) (struct netxen_port *, netxen_ethernet_macaddr_t);
+	int (*set_mtu) (struct netxen_port *, int);
+	int (*set_promisc) (struct netxen_adapter *, int,
+			    netxen_niu_prom_mode_t);
+	int (*unset_promisc) (struct netxen_adapter *, int,
+			      netxen_niu_prom_mode_t);
+	int (*phy_read) (struct netxen_adapter *, long phy, long reg, u32 *);
+	int (*phy_write) (struct netxen_adapter *, long phy, long reg, u32 val);
+	int (*init_port) (struct netxen_adapter *, int);
+	void (*init_niu) (struct netxen_adapter *);
+	int (*stop_port) (struct netxen_adapter *, int);
+
 };				/* netxen_adapter structure */
 
 /* Max number of xmit producer threads that can run simultaneously */
@@ -829,23 +840,6 @@ static inline void __iomem *pci_base(struct netxen_adapter *adapter,
 	return NULL;
 }
 
-struct netxen_drvops {
-	int (*enable_phy_interrupts) (struct netxen_adapter *, int);
-	int (*disable_phy_interrupts) (struct netxen_adapter *, int);
-	void (*handle_phy_intr) (struct netxen_adapter *);
-	int (*macaddr_set) (struct netxen_port *, netxen_ethernet_macaddr_t);
-	int (*set_mtu) (struct netxen_port *, int);
-	int (*set_promisc) (struct netxen_adapter *, int,
-			    netxen_niu_prom_mode_t);
-	int (*unset_promisc) (struct netxen_adapter *, int,
-			      netxen_niu_prom_mode_t);
-	int (*phy_read) (struct netxen_adapter *, long phy, long reg, u32 *);
-	int (*phy_write) (struct netxen_adapter *, long phy, long reg, u32 val);
-	int (*init_port) (struct netxen_adapter *, int);
-	void (*init_niu) (struct netxen_adapter *);
-	int (*stop_port) (struct netxen_adapter *, int);
-};
-
 extern char netxen_nic_driver_name[];
 
 int netxen_niu_xgbe_enable_phy_interrupts(struct netxen_adapter *adapter,
diff --git a/drivers/net/netxen/netxen_nic_ethtool.c b/drivers/net/netxen/netxen_nic_ethtool.c
index 9a914ae..c7fcbf3 100644
--- a/drivers/net/netxen/netxen_nic_ethtool.c
+++ b/drivers/net/netxen/netxen_nic_ethtool.c
@@ -1,25 +1,25 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
  * of the License, or (at your option) any later version.
- *                            
+ *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *                                   
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -118,7 +118,7 @@ netxen_nic_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
 	u32 fw_minor = 0;
 	u32 fw_build = 0;
 
-	strncpy(drvinfo->driver, "netxen_nic", 32);
+	strncpy(drvinfo->driver, netxen_nic_driver_name, 32);
 	strncpy(drvinfo->version, NETXEN_NIC_LINUX_VERSIONID, 32);
 	fw_major = readl(NETXEN_CRB_NORMALIZE(adapter,
 					      NETXEN_FW_VERSION_MAJOR));
@@ -210,7 +210,6 @@ netxen_nic_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
 		printk(KERN_ERR "netxen-nic: Unsupported board model %d\n",
 		       (netxen_brdtype_t) boardinfo->board_type);
 		return -EIO;
-
 	}
 
 	return 0;
@@ -226,18 +225,18 @@ netxen_nic_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
 	/* read which mode */
 	if (adapter->ahw.board_type == NETXEN_NIC_GBE) {
 		/* autonegotiation */
-		if (adapter->ops->phy_write
-		    && adapter->ops->phy_write(adapter, port->portnum,
-					       NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG,
-					       (__le32) ecmd->autoneg) != 0)
+		if (adapter->phy_write
+		    && adapter->phy_write(adapter, port->portnum,
+					  NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG,
+					  (__le32) ecmd->autoneg) != 0)
 			return -EIO;
 		else
 			port->link_autoneg = ecmd->autoneg;
 
-		if (adapter->ops->phy_read
-		    && adapter->ops->phy_read(adapter, port->portnum,
-					      NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
-					      &status) != 0)
+		if (adapter->phy_read
+		    && adapter->phy_read(adapter, port->portnum,
+					 NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
+					 &status) != 0)
 			return -EIO;
 
 		/* speed */
@@ -257,10 +256,10 @@ netxen_nic_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
 			netxen_clear_phy_duplex(status);
 		if (ecmd->duplex == DUPLEX_FULL)
 			netxen_set_phy_duplex(status);
-		if (adapter->ops->phy_write
-		    && adapter->ops->phy_write(adapter, port->portnum,
-					       NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
-					       *((int *)&status)) != 0)
+		if (adapter->phy_write
+		    && adapter->phy_write(adapter, port->portnum,
+					  NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
+					  *((int *)&status)) != 0)
 			return -EIO;
 		else {
 			port->link_speed = ecmd->speed;
@@ -422,10 +421,10 @@ static u32 netxen_nic_get_link(struct net_device *dev)
 
 	/* read which mode */
 	if (adapter->ahw.board_type == NETXEN_NIC_GBE) {
-		if (adapter->ops->phy_read
-		    && adapter->ops->phy_read(adapter, port->portnum,
-					      NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
-					      &status) != 0)
+		if (adapter->phy_read
+		    && adapter->phy_read(adapter, port->portnum,
+					 NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
+					 &status) != 0)
 			return -EIO;
 		else
 			return (netxen_get_phy_link(status));
@@ -526,10 +525,10 @@ netxen_nic_set_pauseparam(struct net_device *dev,
 				    *(u32 *) (&val));
 		/* set autoneg */
 		autoneg = pause->autoneg;
-		if (adapter->ops->phy_write
-		    && adapter->ops->phy_write(adapter, port->portnum,
-					       NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG,
-					       (__le32) autoneg) != 0)
+		if (adapter->phy_write
+		    && adapter->phy_write(adapter, port->portnum,
+					  NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG,
+					  (__le32) autoneg) != 0)
 			return -EIO;
 		else {
 			port->link_autoneg = pause->autoneg;
diff --git a/drivers/net/netxen/netxen_nic_hdr.h b/drivers/net/netxen/netxen_nic_hdr.h
index 72c6ec4..fe8b675 100644
--- a/drivers/net/netxen/netxen_nic_hdr.h
+++ b/drivers/net/netxen/netxen_nic_hdr.h
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
@@ -16,10 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
diff --git a/drivers/net/netxen/netxen_nic_hw.c b/drivers/net/netxen/netxen_nic_hw.c
index 105c24f..7470852 100644
--- a/drivers/net/netxen/netxen_nic_hw.c
+++ b/drivers/net/netxen/netxen_nic_hw.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
@@ -16,10 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -81,8 +81,8 @@ int netxen_nic_set_mac(struct net_device *netdev, void *p)
 	DPRINTK(INFO, "valid ether addr\n");
 	memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len);
 
-	if (adapter->ops->macaddr_set)
-		adapter->ops->macaddr_set(port, addr->sa_data);
+	if (adapter->macaddr_set)
+		adapter->macaddr_set(port, addr->sa_data);
 
 	return 0;
 }
@@ -99,17 +99,17 @@ void netxen_nic_set_multi(struct net_device *netdev)
 
 	mc_ptr = netdev->mc_list;
 	if (netdev->flags & IFF_PROMISC) {
-		if (adapter->ops->set_promisc)
-			adapter->ops->set_promisc(adapter,
-						  port->portnum,
-						  NETXEN_NIU_PROMISC_MODE);
+		if (adapter->set_promisc)
+			adapter->set_promisc(adapter,
+					     port->portnum,
+					     NETXEN_NIU_PROMISC_MODE);
 	} else {
-		if (adapter->ops->unset_promisc &&
+		if (adapter->unset_promisc &&
 		    adapter->ahw.boardcfg.board_type
 		    != NETXEN_BRDTYPE_P2_SB31_10G_IMEZ)
-			adapter->ops->unset_promisc(adapter,
-						    port->portnum,
-						    NETXEN_NIU_NON_PROMISC_MODE);
+			adapter->unset_promisc(adapter,
+					       port->portnum,
+					       NETXEN_NIU_NON_PROMISC_MODE);
 	}
 	if (adapter->ahw.board_type == NETXEN_NIC_XGBE) {
 		netxen_nic_mcr_set_mode_select(netxen_mac_addr_cntl_data, 0x03);
@@ -160,8 +160,8 @@ int netxen_nic_change_mtu(struct net_device *netdev, int mtu)
 		return -EINVAL;
 	}
 
-	if (adapter->ops->set_mtu)
-		adapter->ops->set_mtu(port, mtu);
+	if (adapter->set_mtu)
+		adapter->set_mtu(port, mtu);
 	netdev->mtu = mtu;
 
 	return 0;
@@ -184,14 +184,12 @@ int netxen_nic_hw_resources(struct netxen_adapter *adapter)
 	struct netxen_recv_context *recv_ctx;
 	struct netxen_rcv_desc_ctx *rcv_desc;
 
-	DPRINTK(INFO, "crb_base: %lx %lx", NETXEN_PCI_CRBSPACE,
+	DPRINTK(INFO, "crb_base: %lx %x", NETXEN_PCI_CRBSPACE,
 		PCI_OFFSET_SECOND_RANGE(adapter, NETXEN_PCI_CRBSPACE));
-	DPRINTK(INFO, "cam base: %lx %lx", NETXEN_CRB_CAM,
+	DPRINTK(INFO, "cam base: %lx %x", NETXEN_CRB_CAM,
 		pci_base_offset(adapter, NETXEN_CRB_CAM));
-	DPRINTK(INFO, "cam RAM: %lx %lx", NETXEN_CAM_RAM_BASE,
+	DPRINTK(INFO, "cam RAM: %lx %x", NETXEN_CAM_RAM_BASE,
 		pci_base_offset(adapter, NETXEN_CAM_RAM_BASE));
-	DPRINTK(INFO, "NIC base:%lx %lx\n", NIC_CRB_BASE_PORT1,
-		pci_base_offset(adapter, NIC_CRB_BASE_PORT1));
 
 	/* Window 1 call */
 	card_cmdring = readl(NETXEN_CRB_NORMALIZE(adapter, CRB_CMDPEG_CMDRING));
@@ -648,7 +646,7 @@ void netxen_nic_reg_write(struct netxen_adapter *adapter, u64 off, u32 val)
 
 	addr = NETXEN_CRB_NORMALIZE(adapter, off);
 	DPRINTK(INFO, "writing to base %lx offset %llx addr %p data %x\n",
-		pci_base(adapter, off), off, addr);
+		pci_base(adapter, off), off, addr, val);
 	writel(val, addr);
 
 }
@@ -660,7 +658,7 @@ int netxen_nic_reg_read(struct netxen_adapter *adapter, u64 off)
 
 	addr = NETXEN_CRB_NORMALIZE(adapter, off);
 	DPRINTK(INFO, "reading from base %lx offset %llx addr %p\n",
-		adapter->ahw.pci_base, off, addr);
+		pci_base(adapter, off), off, addr);
 	val = readl(addr);
 	writel(val, addr);
 
@@ -848,8 +846,8 @@ void netxen_nic_stop_all_ports(struct netxen_adapter *adapter)
 
 	for (port_nr = 0; port_nr < adapter->ahw.max_ports; port_nr++) {
 		port = adapter->port[port_nr];
-		if (adapter->ops->stop_port)
-			adapter->ops->stop_port(adapter, port->portnum);
+		if (adapter->stop_port)
+			adapter->stop_port(adapter, port->portnum);
 	}
 }
 
@@ -878,8 +876,8 @@ void netxen_nic_set_link_parameters(struct netxen_port *port)
 
 	netxen_nic_read_w0(adapter, NETXEN_NIU_MODE, &mode);
 	if (netxen_get_niu_enable_ge(mode)) {	/* Gb 10/100/1000 Mbps mode */
-		if (adapter->ops->phy_read
-		    && adapter->ops->
+		if (adapter->phy_read
+		    && adapter->
 		    phy_read(adapter, port->portnum,
 			     NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
 			     &status) == 0) {
@@ -909,8 +907,8 @@ void netxen_nic_set_link_parameters(struct netxen_port *port)
 					port->link_duplex = -1;
 					break;
 				}
-				if (adapter->ops->phy_read
-				    && adapter->ops->
+				if (adapter->phy_read
+				    && adapter->
 				    phy_read(adapter, port->portnum,
 					     NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG,
 					     (__le32 *) & autoneg) != 0)
diff --git a/drivers/net/netxen/netxen_nic_hw.h b/drivers/net/netxen/netxen_nic_hw.h
index 201a636..0685633 100644
--- a/drivers/net/netxen/netxen_nic_hw.h
+++ b/drivers/net/netxen/netxen_nic_hw.h
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
@@ -16,10 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -83,8 +83,8 @@ struct netxen_adapter;
 #define NETXEN_PCI_MAPSIZE_BYTES  (NETXEN_PCI_MAPSIZE << 20)
 
 #define NETXEN_NIC_LOCKED_READ_REG(X, Y)	\
-	addr = pci_base_offset(adapter, (X));	\
-	*(u32 *)Y = readl(addr);
+	addr = pci_base_offset(adapter, X);	\
+	*(u32 *)Y = readl((void __iomem*) addr);
 
 struct netxen_port;
 void netxen_nic_set_link_parameters(struct netxen_port *port);
diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c
index 0dca029..deac1a3 100644
--- a/drivers/net/netxen/netxen_nic_init.c
+++ b/drivers/net/netxen/netxen_nic_init.c
@@ -1,25 +1,25 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
  * of the License, or (at your option) any later version.
- *                            
+ *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *                                   
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -212,37 +212,36 @@ void netxen_initialize_adapter_hw(struct netxen_adapter *adapter)
 
 void netxen_initialize_adapter_ops(struct netxen_adapter *adapter)
 {
-	struct netxen_drvops *ops = adapter->ops;
 	switch (adapter->ahw.board_type) {
 	case NETXEN_NIC_GBE:
-		ops->enable_phy_interrupts =
+		adapter->enable_phy_interrupts =
 		    netxen_niu_gbe_enable_phy_interrupts;
-		ops->disable_phy_interrupts =
+		adapter->disable_phy_interrupts =
 		    netxen_niu_gbe_disable_phy_interrupts;
-		ops->handle_phy_intr = netxen_nic_gbe_handle_phy_intr;
-		ops->macaddr_set = netxen_niu_macaddr_set;
-		ops->set_mtu = netxen_nic_set_mtu_gb;
-		ops->set_promisc = netxen_niu_set_promiscuous_mode;
-		ops->unset_promisc = netxen_niu_set_promiscuous_mode;
-		ops->phy_read = netxen_niu_gbe_phy_read;
-		ops->phy_write = netxen_niu_gbe_phy_write;
-		ops->init_port = netxen_niu_gbe_init_port;
-		ops->init_niu = netxen_nic_init_niu_gb;
-		ops->stop_port = netxen_niu_disable_gbe_port;
+		adapter->handle_phy_intr = netxen_nic_gbe_handle_phy_intr;
+		adapter->macaddr_set = netxen_niu_macaddr_set;
+		adapter->set_mtu = netxen_nic_set_mtu_gb;
+		adapter->set_promisc = netxen_niu_set_promiscuous_mode;
+		adapter->unset_promisc = netxen_niu_set_promiscuous_mode;
+		adapter->phy_read = netxen_niu_gbe_phy_read;
+		adapter->phy_write = netxen_niu_gbe_phy_write;
+		adapter->init_port = netxen_niu_gbe_init_port;
+		adapter->init_niu = netxen_nic_init_niu_gb;
+		adapter->stop_port = netxen_niu_disable_gbe_port;
 		break;
 
 	case NETXEN_NIC_XGBE:
-		ops->enable_phy_interrupts =
+		adapter->enable_phy_interrupts =
 		    netxen_niu_xgbe_enable_phy_interrupts;
-		ops->disable_phy_interrupts =
+		adapter->disable_phy_interrupts =
 		    netxen_niu_xgbe_disable_phy_interrupts;
-		ops->handle_phy_intr = netxen_nic_xgbe_handle_phy_intr;
-		ops->macaddr_set = netxen_niu_xg_macaddr_set;
-		ops->set_mtu = netxen_nic_set_mtu_xgb;
-		ops->init_port = netxen_niu_xg_init_port;
-		ops->set_promisc = netxen_niu_xg_set_promiscuous_mode;
-		ops->unset_promisc = netxen_niu_xg_set_promiscuous_mode;
-		ops->stop_port = netxen_niu_disable_xg_port;
+		adapter->handle_phy_intr = netxen_nic_xgbe_handle_phy_intr;
+		adapter->macaddr_set = netxen_niu_xg_macaddr_set;
+		adapter->set_mtu = netxen_nic_set_mtu_xgb;
+		adapter->init_port = netxen_niu_xg_init_port;
+		adapter->set_promisc = netxen_niu_xg_set_promiscuous_mode;
+		adapter->unset_promisc = netxen_niu_xg_set_promiscuous_mode;
+		adapter->stop_port = netxen_niu_disable_xg_port;
 		break;
 
 	default:
@@ -383,8 +382,8 @@ int netxen_rom_wip_poll(struct netxen_adapter *adapter)
 	return 0;
 }
 
-static inline int do_rom_fast_write(struct netxen_adapter *adapter,
-				    int addr, int data)
+static inline int do_rom_fast_write(struct netxen_adapter *adapter, int addr,
+				    int data)
 {
 	if (netxen_rom_wren(adapter)) {
 		return -1;
@@ -734,8 +733,8 @@ void netxen_watchdog_task(unsigned long v)
 			netif_wake_queue(netdev);
 	}
 
-	if (adapter->ops->handle_phy_intr)
-		adapter->ops->handle_phy_intr(adapter);
+	if (adapter->handle_phy_intr)
+		adapter->handle_phy_intr(adapter);
 	mod_timer(&adapter->watchdog_timer, jiffies + 2 * HZ);
 }
 
diff --git a/drivers/net/netxen/netxen_nic_ioctl.h b/drivers/net/netxen/netxen_nic_ioctl.h
index 23e53ad..8eef139 100644
--- a/drivers/net/netxen/netxen_nic_ioctl.h
+++ b/drivers/net/netxen/netxen_nic_ioctl.h
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
@@ -16,10 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
diff --git a/drivers/net/netxen/netxen_nic_isr.c b/drivers/net/netxen/netxen_nic_isr.c
index ae180fe..0f6e7b8 100644
--- a/drivers/net/netxen/netxen_nic_isr.c
+++ b/drivers/net/netxen/netxen_nic_isr.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
@@ -16,10 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -84,46 +84,41 @@ void netxen_handle_port_int(struct netxen_adapter *adapter, u32 portno,
 	struct netxen_port *port;
 
 	/*  This should clear the interrupt source */
-	if (adapter->ops->phy_read)
-		adapter->ops->phy_read(adapter, portno,
-				       NETXEN_NIU_GB_MII_MGMT_ADDR_INT_STATUS,
-				       &int_src);
+	if (adapter->phy_read)
+		adapter->phy_read(adapter, portno,
+				  NETXEN_NIU_GB_MII_MGMT_ADDR_INT_STATUS,
+				  &int_src);
 	if (int_src == 0) {
 		DPRINTK(INFO, "No phy interrupts for port #%d\n", portno);
 		return;
 	}
-	if (adapter->ops->disable_phy_interrupts)
-		adapter->ops->disable_phy_interrupts(adapter, portno);
+	if (adapter->disable_phy_interrupts)
+		adapter->disable_phy_interrupts(adapter, portno);
 
 	port = adapter->port[portno];
 
 	if (netxen_get_phy_int_jabber(int_src))
-		DPRINTK(INFO, "NetXen: %s Jabber interrupt \n",
-			port->netdev->name);
+		DPRINTK(INFO, "Jabber interrupt \n");
 
 	if (netxen_get_phy_int_polarity_changed(int_src))
-		DPRINTK(INFO, "NetXen: %s POLARITY CHANGED int \n",
-			port->netdev->name);
+		DPRINTK(INFO, "POLARITY CHANGED int \n");
 
 	if (netxen_get_phy_int_energy_detect(int_src))
-		DPRINTK(INFO, "NetXen: %s ENERGY DETECT INT \n",
-			port->netdev->name);
+		DPRINTK(INFO, "ENERGY DETECT INT \n");
 
 	if (netxen_get_phy_int_downshift(int_src))
-		DPRINTK(INFO, "NetXen: %s DOWNSHIFT INT \n",
-			port->netdev->name);
+		DPRINTK(INFO, "DOWNSHIFT INT \n");
 	/* write it down later.. */
 	if ((netxen_get_phy_int_speed_changed(int_src))
 	    || (netxen_get_phy_int_link_status_changed(int_src))) {
 		__le32 status;
 
-		DPRINTK(INFO, "NetXen: %s SPEED CHANGED OR"
-			" LINK STATUS CHANGED \n", port->netdev->name);
+		DPRINTK(INFO, "SPEED CHANGED OR LINK STATUS CHANGED \n");
 
-		if (adapter->ops->phy_read
-		    && adapter->ops->phy_read(adapter, portno,
-					      NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
-					      &status) == 0) {
+		if (adapter->phy_read
+		    && adapter->phy_read(adapter, portno,
+					 NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS,
+					 &status) == 0) {
 			if (netxen_get_phy_int_link_status_changed(int_src)) {
 				if (netxen_get_phy_link(status)) {
 					netxen_niu_gbe_init_port(adapter,
@@ -143,8 +138,8 @@ void netxen_handle_port_int(struct netxen_adapter *adapter, u32 portno,
 			}
 		}
 	}
-	if (adapter->ops->enable_phy_interrupts)
-		adapter->ops->enable_phy_interrupts(adapter, portno);
+	if (adapter->enable_phy_interrupts)
+		adapter->enable_phy_interrupts(adapter, portno);
 }
 
 void netxen_nic_isr_other(struct netxen_adapter *adapter)
@@ -159,8 +154,7 @@ void netxen_nic_isr_other(struct netxen_adapter *adapter)
 
 	qg_linksup = adapter->ahw.qg_linksup;
 	adapter->ahw.qg_linksup = val;
-	DPRINTK(1, INFO, "%s: link update 0x%08x\n", netxen_nic_driver_name,
-		val);
+	DPRINTK(INFO, "link update 0x%08x\n", val);
 	for (portno = 0; portno < NETXEN_NIU_MAX_GBE_PORTS; portno++) {
 		linkup = val & 1;
 		if (linkup != (qg_linksup & 1)) {
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index 1cb662d..6dbdc8b 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -1,25 +1,25 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
  * of the License, or (at your option) any later version.
- *                            
+ *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *                                   
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -233,16 +233,6 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	}
 
-	adapter->ops = kzalloc(sizeof(struct netxen_drvops), GFP_KERNEL);
-	if (adapter->ops == NULL) {
-		printk(KERN_ERR
-		       "%s: Could not allocate memory for adapter->ops:%d\n",
-		       netxen_nic_driver_name,
-		       (int)sizeof(struct netxen_adapter));
-		err = -ENOMEM;
-		goto err_out_free_rx_buffer;
-	}
-
 	adapter->cmd_buf_arr = cmd_buf_arr;
 	adapter->ahw.pci_base0 = mem_ptr0;
 	adapter->ahw.pci_base1 = mem_ptr1;
@@ -373,10 +363,9 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 				       netdev->dev_addr[4],
 				       netdev->dev_addr[5]);
 			} else {
-				if (adapter->ops->macaddr_set)
-					adapter->ops->macaddr_set(port,
-								  netdev->
-								  dev_addr);
+				if (adapter->macaddr_set)
+					adapter->macaddr_set(port,
+							     netdev->dev_addr);
 			}
 		}
 		INIT_WORK(&adapter->tx_timeout_task,
@@ -427,7 +416,6 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			free_netdev(port->netdev);
 		}
 	}
-	kfree(adapter->ops);
 
       err_out_free_rx_buffer:
 	for (i = 0; i < MAX_RCV_CTX; ++i) {
@@ -525,7 +513,6 @@ static void __devexit netxen_nic_remove(struct pci_dev *pdev)
 	}
 
 	vfree(adapter->cmd_buf_arr);
-	kfree(adapter->ops);
 	kfree(adapter);
 }
 
@@ -557,15 +544,15 @@ static int netxen_nic_open(struct net_device *netdev)
 			       err);
 			return err;
 		}
-		if (adapter->ops->init_port
-		    && adapter->ops->init_port(adapter, port->portnum) != 0) {
+		if (adapter->init_port
+		    && adapter->init_port(adapter, port->portnum) != 0) {
 			printk(KERN_ERR "%s: Failed to initialize port %d\n",
 			       netxen_nic_driver_name, port->portnum);
 			netxen_free_hw_resources(adapter);
 			return -EIO;
 		}
-		if (adapter->ops->init_niu)
-			adapter->ops->init_niu(adapter);
+		if (adapter->init_niu)
+			adapter->init_niu(adapter);
 		for (ctx = 0; ctx < MAX_RCV_CTX; ++ctx) {
 			for (ring = 0; ring < NUM_RCV_DESC_RINGS; ring++)
 				netxen_post_rx_buffers(adapter, ctx, ring);
@@ -591,8 +578,8 @@ static int netxen_nic_open(struct net_device *netdev)
 
 	/* Done here again so that even if phantom sw overwrote it,
 	 * we set it */
-	if (adapter->ops->macaddr_set)
-		adapter->ops->macaddr_set(port, netdev->dev_addr);
+	if (adapter->macaddr_set)
+		adapter->macaddr_set(port, netdev->dev_addr);
 	netxen_nic_set_link_parameters(port);
 
 	netxen_nic_set_multi(netdev);
@@ -1039,11 +1026,12 @@ static int netxen_nic_poll(struct net_device *netdev, int *budget)
 	int done = 1;
 	int ctx;
 	int this_work_done;
+	int work_done = 0;
 
 	DPRINTK(INFO, "polling for %d descriptors\n", *budget);
 	port->stats.polled++;
 
-	adapter->work_done = 0;
+	work_done = 0;
 	for (ctx = 0; ctx < MAX_RCV_CTX; ++ctx) {
 		/*
 		 * Fairness issue. This will give undue weight to the
@@ -1060,20 +1048,20 @@ static int netxen_nic_poll(struct net_device *netdev, int *budget)
 		this_work_done = netxen_process_rcv_ring(adapter, ctx,
 							 work_to_do /
 							 MAX_RCV_CTX);
-		adapter->work_done += this_work_done;
+		work_done += this_work_done;
 	}
 
-	netdev->quota -= adapter->work_done;
-	*budget -= adapter->work_done;
+	netdev->quota -= work_done;
+	*budget -= work_done;
 
-	if (adapter->work_done >= work_to_do
+	if (work_done >= work_to_do
 	    && netxen_nic_rx_has_work(adapter) != 0)
 		done = 0;
 
 	netxen_process_cmd_ring((unsigned long)adapter);
 
 	DPRINTK(INFO, "new work_done: %d work_to_do: %d\n",
-		adapter->work_done, work_to_do);
+		work_done, work_to_do);
 	if (done) {
 		netif_rx_complete(netdev);
 		netxen_nic_enable_int(adapter);
diff --git a/drivers/net/netxen/netxen_nic_niu.c b/drivers/net/netxen/netxen_nic_niu.c
index 7950a04..ff74f1e 100644
--- a/drivers/net/netxen/netxen_nic_niu.c
+++ b/drivers/net/netxen/netxen_nic_niu.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 - 2006 NetXen, Inc.
  * All rights reserved.
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
@@ -16,10 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  * MA  02111-1307, USA.
- * 
+ *
  * The full GNU General Public License is included in this distribution
  * in the file called LICENSE.
- * 
+ *
  * Contact Information:
  *    info@netxen.com
  * NetXen,
@@ -399,8 +399,8 @@ int netxen_niu_gbe_init_port(struct netxen_adapter *adapter, int port)
 {
 	int result = 0;
 	__le32 status;
-	if (adapter->ops->disable_phy_interrupts)
-		adapter->ops->disable_phy_interrupts(adapter, port);
+	if (adapter->disable_phy_interrupts)
+		adapter->disable_phy_interrupts(adapter, port);
 	mdelay(2);
 
 	if (0 ==
-- 
cgit v0.10.2


From ed25ffa16434724f5ed825aa48734c7f3aefa203 Mon Sep 17 00:00:00 2001
From: "Amit S. Kale" <amitkale@netxen.com>
Date: Mon, 4 Dec 2006 09:23:25 -0800
Subject: [PATCH] NetXen: multiport firmware support, ioctl interface

NetXen: 1G/10G Ethernet driver updates
	- Multiport and newer firmware support
	- ioctl interface for user level tools
	- Cast error fix for multiport

Signed-off-by: Amit S. Kale <amitkale@netxen.com>

 netxen_nic.h          |  281 +++++++++++++++++++++++++-------
 netxen_nic_ethtool.c  |   12 -
 netxen_nic_hw.c       |  429 +++++++++++++++++++++++++++++++++++++++++---------
 netxen_nic_init.c     |  301 ++++++++++++++++++++++++++++++-----
 netxen_nic_ioctl.h    |    2
 netxen_nic_isr.c      |    3
 netxen_nic_main.c     |  260 ++++++++++++++++++------------
 netxen_nic_niu.c      |   22 +-
 netxen_nic_phan_reg.h |  228 ++++++++++++++++----------
 9 files changed, 1161 insertions(+), 377 deletions(-)
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h
index d51f437..3151aaa 100644
--- a/drivers/net/netxen/netxen_nic.h
+++ b/drivers/net/netxen/netxen_nic.h
@@ -63,27 +63,49 @@
 
 #include "netxen_nic_hw.h"
 
-#define NETXEN_NIC_BUILD_NO     "5"
-#define _NETXEN_NIC_LINUX_MAJOR 2
+#define NETXEN_NIC_BUILD_NO     "1"
+#define _NETXEN_NIC_LINUX_MAJOR 3
 #define _NETXEN_NIC_LINUX_MINOR 3
-#define _NETXEN_NIC_LINUX_SUBVERSION 59
-#define NETXEN_NIC_LINUX_VERSIONID  "2.3.59" "-" NETXEN_NIC_BUILD_NO
-#define NETXEN_NIC_FW_VERSIONID "2.3.59"
+#define _NETXEN_NIC_LINUX_SUBVERSION 2
+#define NETXEN_NIC_LINUX_VERSIONID  "3.3.2" "-" NETXEN_NIC_BUILD_NO
+#define NETXEN_NIC_FW_VERSIONID "3.3.2"
 
 #define RCV_DESC_RINGSIZE	\
 	(sizeof(struct rcv_desc) * adapter->max_rx_desc_count)
 #define STATUS_DESC_RINGSIZE	\
 	(sizeof(struct status_desc)* adapter->max_rx_desc_count)
+#define LRO_DESC_RINGSIZE	\
+	(sizeof(rcvDesc_t) * adapter->max_lro_rx_desc_count)
 #define TX_RINGSIZE	\
 	(sizeof(struct netxen_cmd_buffer) * adapter->max_tx_desc_count)
 #define RCV_BUFFSIZE	\
 	(sizeof(struct netxen_rx_buffer) * rcv_desc->max_rx_desc_count)
 #define find_diff_among(a,b,range) ((a)<(b)?((b)-(a)):((b)+(range)-(a)))
 
-#define NETXEN_NETDEV_STATUS 0x1
+#define NETXEN_NETDEV_STATUS		0x1
+#define NETXEN_RCV_PRODUCER_OFFSET	0
+#define NETXEN_RCV_PEG_DB_ID		2
+#define NETXEN_HOST_DUMMY_DMA_SIZE 1024
 
 #define ADDR_IN_WINDOW1(off)	\
 	((off > NETXEN_CRB_PCIX_HOST2) && (off < NETXEN_CRB_MAX)) ? 1 : 0
+/*
+ * In netxen_nic_down(), we must wait for any pending callback requests into
+ * netxen_watchdog_task() to complete; eg otherwise the watchdog_timer could be
+ * reenabled right after it is deleted in netxen_nic_down(). FLUSH_SCHEDULED_WORK()
+ * does this synchronization.
+ *
+ * Normally, schedule_work()/flush_scheduled_work() could have worked, but
+ * netxen_nic_close() is invoked with kernel rtnl lock held. netif_carrier_off()
+ * call in netxen_nic_close() triggers a schedule_work(&linkwatch_work), and a
+ * subsequent call to flush_scheduled_work() in netxen_nic_down() would cause
+ * linkwatch_event() to be executed which also attempts to acquire the rtnl
+ * lock thus causing a deadlock.
+ */
+
+#define SCHEDULE_WORK(tp)	queue_work(netxen_workq, tp)
+#define FLUSH_SCHEDULED_WORK()	flush_workqueue(netxen_workq)
+extern struct workqueue_struct *netxen_workq;
 
 /* 
  * normalize a 64MB crb address to 32MB PCI window 
@@ -95,8 +117,14 @@
 #define NETXEN_CRB_NORMALIZE(adapter, reg) \
 	pci_base_offset(adapter, NETXEN_CRB_NORMAL(reg))
 
+#define DB_NORMALIZE(adapter, off) \
+	(adapter->ahw.db_base + (off))
+
+#define NX_P2_C0		0x24
+#define NX_P2_C1		0x25
+
 #define FIRST_PAGE_GROUP_START	0
-#define FIRST_PAGE_GROUP_END	0x400000
+#define FIRST_PAGE_GROUP_END	0x100000
 
 #define SECOND_PAGE_GROUP_START	0x4000000
 #define SECOND_PAGE_GROUP_END	0x66BC000
@@ -108,11 +136,13 @@
 #define SECOND_PAGE_GROUP_SIZE SECOND_PAGE_GROUP_END - SECOND_PAGE_GROUP_START
 #define THIRD_PAGE_GROUP_SIZE  THIRD_PAGE_GROUP_END - THIRD_PAGE_GROUP_START
 
-#define MAX_RX_BUFFER_LENGTH		2000
+#define MAX_RX_BUFFER_LENGTH		1760
 #define MAX_RX_JUMBO_BUFFER_LENGTH 	9046
-#define RX_DMA_MAP_LEN			(MAX_RX_BUFFER_LENGTH - NET_IP_ALIGN)
+#define MAX_RX_LRO_BUFFER_LENGTH	((48*1024)-512)
+#define RX_DMA_MAP_LEN			(MAX_RX_BUFFER_LENGTH - 2)
 #define RX_JUMBO_DMA_MAP_LEN	\
-	(MAX_RX_JUMBO_BUFFER_LENGTH - NET_IP_ALIGN)
+	(MAX_RX_JUMBO_BUFFER_LENGTH - 2)
+#define RX_LRO_DMA_MAP_LEN		(MAX_RX_LRO_BUFFER_LENGTH - 2)
 #define NETXEN_ROM_ROUNDUP		0x80000000ULL
 
 /*
@@ -151,30 +181,38 @@ enum {
 /* Host writes the following to notify that it has done the init-handshake */
 #define PHAN_INITIALIZE_ACK	0xf00f
 
-#define NUM_RCV_DESC_RINGS	2	/* No of Rcv Descriptor contexts */
+#define NUM_RCV_DESC_RINGS	3	/* No of Rcv Descriptor contexts */
 
 /* descriptor types */
 #define RCV_DESC_NORMAL		0x01
 #define RCV_DESC_JUMBO		0x02
+#define RCV_DESC_LRO		0x04
 #define RCV_DESC_NORMAL_CTXID	0
 #define RCV_DESC_JUMBO_CTXID	1
+#define RCV_DESC_LRO_CTXID	2
 
 #define RCV_DESC_TYPE(ID) \
-	((ID == RCV_DESC_JUMBO_CTXID) ? RCV_DESC_JUMBO : RCV_DESC_NORMAL)
+	((ID == RCV_DESC_JUMBO_CTXID)	\
+		? RCV_DESC_JUMBO	\
+		: ((ID == RCV_DESC_LRO_CTXID)	\
+			? RCV_DESC_LRO :	\
+			(RCV_DESC_NORMAL)))
 
 #define MAX_CMD_DESCRIPTORS		1024
 #define MAX_RCV_DESCRIPTORS		32768
 #define MAX_JUMBO_RCV_DESCRIPTORS	4096
+#define MAX_LRO_RCV_DESCRIPTORS		2048
 #define MAX_RCVSTATUS_DESCRIPTORS	MAX_RCV_DESCRIPTORS
 #define MAX_JUMBO_RCV_DESC	MAX_JUMBO_RCV_DESCRIPTORS
 #define MAX_RCV_DESC		MAX_RCV_DESCRIPTORS
 #define MAX_RCVSTATUS_DESC	MAX_RCV_DESCRIPTORS
-#define NUM_RCV_DESC		(MAX_RCV_DESC + MAX_JUMBO_RCV_DESCRIPTORS)
 #define MAX_EPG_DESCRIPTORS	(MAX_CMD_DESCRIPTORS * 8)
-
+#define NUM_RCV_DESC		(MAX_RCV_DESC + MAX_JUMBO_RCV_DESCRIPTORS + \
+				 MAX_LRO_RCV_DESCRIPTORS)
 #define MIN_TX_COUNT	4096
 #define MIN_RX_COUNT	4096
-
+#define NETXEN_CTX_SIGNATURE	0xdee0
+#define NETXEN_RCV_PRODUCER(ringid)	(ringid)
 #define MAX_FRAME_SIZE	0x10000	/* 64K MAX size for LSO */
 
 #define PHAN_PEG_RCV_INITIALIZED	0xff01
@@ -186,6 +224,67 @@ enum {
 #define get_index_range(index,length,count)	\
 	(((index) + (count)) & ((length) - 1))
 
+#define MPORT_SINGLE_FUNCTION_MODE 0x1111
+
+extern unsigned long long netxen_dma_mask;
+
+/*
+ * NetXen host-peg signal message structure
+ *
+ *	Bit 0-1		: peg_id => 0x2 for tx and 01 for rx
+ *	Bit 2		: priv_id => must be 1
+ *	Bit 3-17	: count => for doorbell
+ *	Bit 18-27	: ctx_id => Context id
+ *	Bit 28-31	: opcode
+ */
+
+typedef u32 netxen_ctx_msg;
+
+#define _netxen_set_bits(config_word, start, bits, val)	{\
+	unsigned long long mask = (((1ULL << (bits)) - 1) << (start));	\
+	unsigned long long value = (val);	\
+	(config_word) &= ~mask;	\
+	(config_word) |= (((value) << (start)) & mask); \
+}
+
+#define netxen_set_msg_peg_id(config_word, val)	\
+	_netxen_set_bits(config_word, 0, 2, val)
+#define netxen_set_msg_privid(config_word)	\
+	set_bit(2, (unsigned long*)&config_word)
+#define netxen_set_msg_count(config_word, val)	\
+	_netxen_set_bits(config_word, 3, 15, val)
+#define netxen_set_msg_ctxid(config_word, val)	\
+	_netxen_set_bits(config_word, 18, 10, val)
+#define netxen_set_msg_opcode(config_word, val)	\
+	_netxen_set_bits(config_word, 28, 4, val)
+
+struct netxen_rcv_context {
+	u32 rcv_ring_addr_lo;
+	u32 rcv_ring_addr_hi;
+	u32 rcv_ring_size;
+	u32 rsrvd;
+};
+
+struct netxen_ring_ctx {
+
+	/* one command ring */
+	u64 cmd_consumer_offset;
+	u32 cmd_ring_addr_lo;
+	u32 cmd_ring_addr_hi;
+	u32 cmd_ring_size;
+	u32 rsrvd;
+
+	/* three receive rings */
+	struct netxen_rcv_context rcv_ctx[3];
+
+	/* one status ring */
+	u32 sts_ring_addr_lo;
+	u32 sts_ring_addr_hi;
+	u32 sts_ring_size;
+
+	u32 ctx_id;
+} __attribute__ ((aligned(64)));
+
 /*
  * Following data structures describe the descriptors that will be used.
  * Added fileds of tcpHdrSize and ipHdrSize, The driver needs to do it only when
@@ -203,22 +302,32 @@ enum {
 #define FLAGS_IPSEC_SA_DELETE	0x08
 #define FLAGS_VLAN_TAGGED	0x10
 
-#define CMD_DESC_TOTAL_LENGTH(cmd_desc)	\
-		((cmd_desc)->length_tcp_hdr & 0x00FFFFFF)
-#define CMD_DESC_TCP_HDR_OFFSET(cmd_desc)	\
-		(((cmd_desc)->length_tcp_hdr >> 24) & 0x0FF)
-#define CMD_DESC_PORT(cmd_desc)		((cmd_desc)->port_ctxid & 0x0F)
-#define CMD_DESC_CTX_ID(cmd_desc)	(((cmd_desc)->port_ctxid >> 4) & 0x0F)
+#define netxen_set_cmd_desc_port(cmd_desc, var)	\
+	((cmd_desc)->port_ctxid |= ((var) & 0x0F))
 
-#define CMD_DESC_TOTAL_LENGTH_WRT(cmd_desc, var)	\
-		((cmd_desc)->length_tcp_hdr |= ((var) & 0x00FFFFFF))
-#define CMD_DESC_TCP_HDR_OFFSET_WRT(cmd_desc, var)	\
-		((cmd_desc)->length_tcp_hdr |= (((var) << 24) & 0xFF000000))
-#define CMD_DESC_PORT_WRT(cmd_desc, var)	\
-		((cmd_desc)->port_ctxid |= ((var) & 0x0F))
+#define netxen_set_cmd_desc_flags(cmd_desc, val)	\
+	_netxen_set_bits((cmd_desc)->flags_opcode, 0, 7, val)
+#define netxen_set_cmd_desc_opcode(cmd_desc, val)	\
+	_netxen_set_bits((cmd_desc)->flags_opcode, 7, 6, val)
+
+#define netxen_set_cmd_desc_num_of_buff(cmd_desc, val)	\
+	_netxen_set_bits((cmd_desc)->num_of_buffers_total_length, 0, 8, val);
+#define netxen_set_cmd_desc_totallength(cmd_desc, val)	\
+	_netxen_set_bits((cmd_desc)->num_of_buffers_total_length, 8, 24, val);
+
+#define netxen_get_cmd_desc_opcode(cmd_desc)	\
+	(((cmd_desc)->flags_opcode >> 7) & 0x003F)
+#define netxen_get_cmd_desc_totallength(cmd_desc)	\
+	(((cmd_desc)->num_of_buffers_total_length >> 8) & 0x0FFFFFF)
 
 struct cmd_desc_type0 {
-	u64 netxen_next;	/* for fragments handled by Phantom */
+	u8 tcp_hdr_offset;	/* For LSO only */
+	u8 ip_hdr_offset;	/* For LSO only */
+	/* Bit pattern: 0-6 flags, 7-12 opcode, 13-15 unused */
+	u16 flags_opcode;
+	/* Bit pattern: 0-7 total number of segments,
+	   8-31 Total size of the packet */
+	u32 num_of_buffers_total_length;
 	union {
 		struct {
 			u32 addr_low_part2;
@@ -227,13 +336,6 @@ struct cmd_desc_type0 {
 		u64 addr_buffer2;
 	};
 
-	/* Bit pattern: 0-23 total length, 24-32 tcp header offset */
-	u32 length_tcp_hdr;
-	u8 ip_hdr_offset;	/* For LSO only */
-	u8 num_of_buffers;	/* total number of segments */
-	u8 flags;		/* as defined above */
-	u8 opcode;
-
 	u16 reference_handle;	/* changed to u16 to add mss */
 	u16 mss;		/* passed by NDIS_PACKET for LSO */
 	/* Bit pattern 0-3 port, 0-3 ctx id */
@@ -248,7 +350,6 @@ struct cmd_desc_type0 {
 		};
 		u64 addr_buffer3;
 	};
-
 	union {
 		struct {
 			u32 addr_low_part1;
@@ -270,6 +371,8 @@ struct cmd_desc_type0 {
 		u64 addr_buffer4;
 	};
 
+	u64 unused;
+
 } __attribute__ ((aligned(64)));
 
 /* Note: sizeof(rcv_desc) should always be a mutliple of 2 */
@@ -296,22 +399,49 @@ struct rcv_desc {
 #define NETXEN_PROT_UNKNOWN	(0)
 
 /* Note: sizeof(status_desc) should always be a mutliple of 2 */
-#define STATUS_DESC_PORT(status_desc)	\
-		((status_desc)->port_status_type_op & 0x0F)
-#define STATUS_DESC_STATUS(status_desc)	\
-		(((status_desc)->port_status_type_op >> 4) & 0x0F)
-#define STATUS_DESC_TYPE(status_desc)	\
-		(((status_desc)->port_status_type_op >> 8) & 0x0F)
-#define STATUS_DESC_OPCODE(status_desc)	\
-		(((status_desc)->port_status_type_op >> 12) & 0x0F)
+
+#define netxen_get_sts_desc_lro_cnt(status_desc)	\
+	((status_desc)->lro & 0x7F)
+#define netxen_get_sts_desc_lro_last_frag(status_desc)	\
+	(((status_desc)->lro & 0x80) >> 7)
+
+#define netxen_get_sts_port(status_desc)	\
+	((status_desc)->status_desc_data & 0x0F)
+#define netxen_get_sts_status(status_desc)	\
+	(((status_desc)->status_desc_data >> 4) & 0x0F)
+#define netxen_get_sts_type(status_desc)	\
+	(((status_desc)->status_desc_data >> 8) & 0x0F)
+#define netxen_get_sts_totallength(status_desc)	\
+	(((status_desc)->status_desc_data >> 12) & 0xFFFF)
+#define netxen_get_sts_refhandle(status_desc)	\
+	(((status_desc)->status_desc_data >> 28) & 0xFFFF)
+#define netxen_get_sts_prot(status_desc)	\
+	(((status_desc)->status_desc_data >> 44) & 0x0F)
+#define netxen_get_sts_owner(status_desc)	\
+	(((status_desc)->status_desc_data >> 56) & 0x03)
+#define netxen_get_sts_opcode(status_desc)	\
+	(((status_desc)->status_desc_data >> 58) & 0x03F)
+
+#define netxen_clear_sts_owner(status_desc)	\
+	((status_desc)->status_desc_data &=	\
+	~(((unsigned long long)3) << 56 ))
+#define netxen_set_sts_owner(status_desc, val)	\
+	((status_desc)->status_desc_data |=	\
+	(((unsigned long long)((val) & 0x3)) << 56 ))
 
 struct status_desc {
-	/* Bit pattern: 0-3 port, 4-7 status, 8-11 type, 12-15 opcode */
-	u16 port_status_type_op;
-	u16 total_length;	/* NIC mode */
-	u16 reference_handle;	/* handle for the associated packet */
-	/* Bit pattern: 0-1 owner, 2-5 protocol */
-	u16 owner;		/* Owner of the descriptor */
+	/* Bit pattern: 0-3 port, 4-7 status, 8-11 type, 12-27 total_length
+	   28-43 reference_handle, 44-47 protocol, 48-52 unused
+	   53-55 desc_cnt, 56-57 owner, 58-63 opcode
+	 */
+	u64 status_desc_data;
+	u32 hash_value;
+	u8 hash_type;
+	u8 msg_type;
+	u8 unused;
+	/* Bit pattern: 0-6 lro_count indicates frag sequence,
+	   7 last_frag indicates last frag */
+	u8 lro;
 } __attribute__ ((aligned(8)));
 
 enum {
@@ -563,7 +693,8 @@ typedef enum {
 #define FLASH_SECONDARY_SIZE 	(USER_START-SECONDARY_START)
 #define NUM_PRIMARY_SECTORS	(0x20)
 #define NUM_CONFIG_SECTORS 	(1)
-#define PFX "netxen: "
+#define PFX "NetXen: "
+extern char netxen_nic_driver_name[];
 
 /* Note: Make sure to not call this before adapter->port is valid */
 #if !defined(NETXEN_DEBUG)
@@ -609,7 +740,6 @@ struct netxen_cmd_buffer {
 	u8 frag_count;
 	unsigned long time_stamp;
 	u32 state;
-	u32 no_of_descriptors;
 };
 
 /* In rx_buffer, we do not need multiple fragments as is a single buffer */
@@ -618,6 +748,9 @@ struct netxen_rx_buffer {
 	u64 dma;
 	u16 ref_handle;
 	u16 state;
+	u32 lro_expected_frags;
+	u32 lro_current_frags;
+	u32 lro_length;
 };
 
 /* Board types */
@@ -633,6 +766,8 @@ struct netxen_hardware_context {
 	void __iomem *pci_base0;
 	void __iomem *pci_base1;
 	void __iomem *pci_base2;
+	void __iomem *db_base;
+	unsigned long db_len;
 
 	u8 revision_id;
 	u16 board_type;
@@ -642,14 +777,13 @@ struct netxen_hardware_context {
 	u32 qg_linksup;
 	/* Address of cmd ring in Phantom */
 	struct cmd_desc_type0 *cmd_desc_head;
-	char *pauseaddr;
 	struct pci_dev *cmd_desc_pdev;
 	dma_addr_t cmd_desc_phys_addr;
-	dma_addr_t pause_physaddr;
-	struct pci_dev *pause_pdev;
 	struct netxen_adapter *adapter;
 };
 
+#define RCV_RING_LRO	RCV_DESC_LRO
+
 #define MINIMUM_ETHERNET_FRAME_SIZE	64	/* With FCS */
 #define ETHERNET_FCS_SIZE		4
 
@@ -702,6 +836,13 @@ struct netxen_recv_context {
 };
 
 #define NETXEN_NIC_MSI_ENABLED 0x02
+#define NETXEN_DMA_MASK	0xfffffffe
+#define NETXEN_DB_MAPSIZE_BYTES    0x1000
+
+struct netxen_dummy_dma {
+	void *addr;
+	dma_addr_t phys_addr;
+};
 
 struct netxen_adapter {
 	struct netxen_hardware_context ahw;
@@ -711,18 +852,19 @@ struct netxen_adapter {
 	spinlock_t tx_lock;
 	spinlock_t lock;
 	struct work_struct watchdog_task;
-	struct work_struct tx_timeout_task;
+	struct work_struct tx_timeout_task[NETXEN_MAX_PORTS];
 	struct timer_list watchdog_timer;
 
 	u32 curr_window;
 
 	u32 cmd_producer;
-	u32 cmd_consumer;
+	u32 *cmd_consumer;
 
 	u32 last_cmd_consumer;
 	u32 max_tx_desc_count;
 	u32 max_rx_desc_count;
 	u32 max_jumbo_rx_desc_count;
+	u32 max_lro_rx_desc_count;
 	/* Num of instances active on cmd buffer ring */
 	u32 proc_cmd_buf_counter;
 
@@ -744,6 +886,13 @@ struct netxen_adapter {
 	struct netxen_recv_context recv_ctx[MAX_RCV_CTX];
 
 	int is_up;
+	int number;
+	struct netxen_dummy_dma dummy_dma;
+
+	/* Context interface shared between card and host */
+	struct netxen_ring_ctx *ctx_desc;
+	struct pci_dev *ctx_desc_pdev;
+	dma_addr_t ctx_desc_phys_addr;
 	int (*enable_phy_interrupts) (struct netxen_adapter *, int);
 	int (*disable_phy_interrupts) (struct netxen_adapter *, int);
 	void (*handle_phy_intr) (struct netxen_adapter *);
@@ -758,7 +907,6 @@ struct netxen_adapter {
 	int (*init_port) (struct netxen_adapter *, int);
 	void (*init_niu) (struct netxen_adapter *);
 	int (*stop_port) (struct netxen_adapter *, int);
-
 };				/* netxen_adapter structure */
 
 /* Max number of xmit producer threads that can run simultaneously */
@@ -840,8 +988,6 @@ static inline void __iomem *pci_base(struct netxen_adapter *adapter,
 	return NULL;
 }
 
-extern char netxen_nic_driver_name[];
-
 int netxen_niu_xgbe_enable_phy_interrupts(struct netxen_adapter *adapter,
 					  int port);
 int netxen_niu_gbe_enable_phy_interrupts(struct netxen_adapter *adapter,
@@ -880,10 +1026,20 @@ int netxen_nic_hw_read_wx(struct netxen_adapter *adapter, u64 off, void *data,
 			  int len);
 int netxen_nic_hw_write_wx(struct netxen_adapter *adapter, u64 off, void *data,
 			   int len);
+int netxen_nic_hw_read_ioctl(struct netxen_adapter *adapter, u64 off,
+			     void *data, int len);
+int netxen_nic_hw_write_ioctl(struct netxen_adapter *adapter, u64 off,
+			      void *data, int len);
+int netxen_nic_pci_mem_write_ioctl(struct netxen_adapter *adapter,
+				   u64 off, void *data, int size);
+int netxen_nic_pci_mem_read_ioctl(struct netxen_adapter *adapter,
+				  u64 off, void *data, int size);
 void netxen_crb_writelit_adapter(struct netxen_adapter *adapter,
 				 unsigned long off, int data);
 
 /* Functions from netxen_nic_init.c */
+void netxen_free_adapter_offload(struct netxen_adapter *adapter);
+int netxen_initialize_adapter_offload(struct netxen_adapter *adapter);
 void netxen_phantom_init(struct netxen_adapter *adapter, int pegtune_val);
 void netxen_load_firmware(struct netxen_adapter *adapter);
 int netxen_pinit_from_rom(struct netxen_adapter *adapter, int verbose);
@@ -918,7 +1074,9 @@ int netxen_nic_tx_has_work(struct netxen_adapter *adapter);
 void netxen_watchdog_task(unsigned long v);
 void netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ctx,
 			    u32 ringid);
-void netxen_process_cmd_ring(unsigned long data);
+void netxen_post_rx_buffers_nodb(struct netxen_adapter *adapter, u32 ctx,
+				 u32 ringid);
+int netxen_process_cmd_ring(unsigned long data);
 u32 netxen_process_rcv_ring(struct netxen_adapter *adapter, int ctx, int max);
 void netxen_nic_set_multi(struct net_device *netdev);
 int netxen_nic_change_mtu(struct net_device *netdev, int new_mtu);
@@ -1012,7 +1170,6 @@ static inline void get_brd_name_by_type(u32 type, char *name)
 
 int netxen_is_flash_supported(struct netxen_adapter *adapter);
 int netxen_get_flash_mac_addr(struct netxen_adapter *adapter, u64 mac[]);
-
 extern void netxen_change_ringparam(struct netxen_adapter *adapter);
 extern int netxen_rom_fast_read(struct netxen_adapter *adapter, int addr,
 				int *valp);
diff --git a/drivers/net/netxen/netxen_nic_ethtool.c b/drivers/net/netxen/netxen_nic_ethtool.c
index c7fcbf3..2ab4885 100644
--- a/drivers/net/netxen/netxen_nic_ethtool.c
+++ b/drivers/net/netxen/netxen_nic_ethtool.c
@@ -459,20 +459,22 @@ netxen_nic_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring)
 {
 	struct netxen_port *port = netdev_priv(dev);
 	struct netxen_adapter *adapter = port->adapter;
-	int i, j;
+	int i;
 
 	ring->rx_pending = 0;
+	ring->rx_jumbo_pending = 0;
 	for (i = 0; i < MAX_RCV_CTX; ++i) {
-		for (j = 0; j < NUM_RCV_DESC_RINGS; j++)
-			ring->rx_pending +=
-			    adapter->recv_ctx[i].rcv_desc[j].rcv_pending;
+		ring->rx_pending += adapter->recv_ctx[i].
+		    rcv_desc[RCV_DESC_NORMAL_CTXID].rcv_pending;
+		ring->rx_jumbo_pending += adapter->recv_ctx[i].
+		    rcv_desc[RCV_DESC_JUMBO_CTXID].rcv_pending;
 	}
 
 	ring->rx_max_pending = adapter->max_rx_desc_count;
 	ring->tx_max_pending = adapter->max_tx_desc_count;
+	ring->rx_jumbo_max_pending = adapter->max_jumbo_rx_desc_count;
 	ring->rx_mini_max_pending = 0;
 	ring->rx_mini_pending = 0;
-	ring->rx_jumbo_max_pending = 0;
 	ring->rx_jumbo_pending = 0;
 }
 
diff --git a/drivers/net/netxen/netxen_nic_hw.c b/drivers/net/netxen/netxen_nic_hw.c
index 7470852..9147b60 100644
--- a/drivers/net/netxen/netxen_nic_hw.c
+++ b/drivers/net/netxen/netxen_nic_hw.c
@@ -42,7 +42,7 @@
 
 #define NETXEN_FLASH_BASE	(BOOTLD_START)
 #define NETXEN_PHANTOM_MEM_BASE	(NETXEN_FLASH_BASE)
-#define NETXEN_MAX_MTU		8000
+#define NETXEN_MAX_MTU		8000 + NETXEN_ENET_HEADER_SIZE + NETXEN_ETH_FCS_SIZE
 #define NETXEN_MIN_MTU		64
 #define NETXEN_ETH_FCS_SIZE     4
 #define NETXEN_ENET_HEADER_SIZE 14
@@ -176,11 +176,9 @@ int netxen_nic_hw_resources(struct netxen_adapter *adapter)
 	struct netxen_hardware_context *hw = &adapter->ahw;
 	u32 state = 0;
 	void *addr;
-	void *pause_addr;
 	int loops = 0, err = 0;
 	int ctx, ring;
 	u32 card_cmdring = 0;
-	struct netxen_rcv_desc_crb *rcv_desc_crb = NULL;
 	struct netxen_recv_context *recv_ctx;
 	struct netxen_rcv_desc_ctx *rcv_desc;
 
@@ -224,33 +222,42 @@ int netxen_nic_hw_resources(struct netxen_adapter *adapter)
 	DPRINTK(INFO, "Recieve Peg ready too. starting stuff\n");
 
 	addr = netxen_alloc(adapter->ahw.pdev,
-			    sizeof(struct cmd_desc_type0) *
-			    adapter->max_tx_desc_count,
-			    &hw->cmd_desc_phys_addr, &hw->cmd_desc_pdev);
+			    sizeof(struct netxen_ring_ctx) +
+			    sizeof(uint32_t),
+			    (dma_addr_t *) & adapter->ctx_desc_phys_addr,
+			    &adapter->ctx_desc_pdev);
 
+	printk("ctx_desc_phys_addr: 0x%llx\n",
+	       (u64) adapter->ctx_desc_phys_addr);
 	if (addr == NULL) {
 		DPRINTK(ERR, "bad return from pci_alloc_consistent\n");
-		return -ENOMEM;
+		err = -ENOMEM;
+		return err;
 	}
+	memset(addr, 0, sizeof(struct netxen_ring_ctx));
+	adapter->ctx_desc = (struct netxen_ring_ctx *)addr;
+	adapter->ctx_desc->cmd_consumer_offset = adapter->ctx_desc_phys_addr
+	    + sizeof(struct netxen_ring_ctx);
+	adapter->cmd_consumer = (uint32_t *) (((char *)addr) +
+					      sizeof(struct netxen_ring_ctx));
+
+	addr = pci_alloc_consistent(adapter->ahw.pdev,
+				    sizeof(struct cmd_desc_type0) *
+				    adapter->max_tx_desc_count,
+				    (dma_addr_t *) & hw->cmd_desc_phys_addr);
+	printk("cmd_desc_phys_addr: 0x%llx\n", (u64) hw->cmd_desc_phys_addr);
 
-	pause_addr = netxen_alloc(adapter->ahw.pdev, 512,
-				  (dma_addr_t *) & hw->pause_physaddr,
-				  &hw->pause_pdev);
-	if (pause_addr == NULL) {
-		DPRINTK(1, ERR, "bad return from pci_alloc_consistent\n");
+	if (addr == NULL) {
+		DPRINTK(ERR, "bad return from pci_alloc_consistent\n");
+		netxen_free_hw_resources(adapter);
 		return -ENOMEM;
 	}
 
-	hw->pauseaddr = (char *)pause_addr;
-	{
-		u64 *ptr = (u64 *) pause_addr;
-		*ptr++ = NETXEN_NIC_ZERO_PAUSE_ADDR;
-		*ptr++ = NETXEN_NIC_ZERO_PAUSE_ADDR;
-		*ptr++ = NETXEN_NIC_UNIT_PAUSE_ADDR;
-		*ptr++ = NETXEN_NIC_ZERO_PAUSE_ADDR;
-		*ptr++ = NETXEN_NIC_EPG_PAUSE_ADDR1;
-		*ptr++ = NETXEN_NIC_EPG_PAUSE_ADDR2;
-	}
+	adapter->ctx_desc->cmd_ring_addr_lo =
+	    hw->cmd_desc_phys_addr & 0xffffffffUL;
+	adapter->ctx_desc->cmd_ring_addr_hi =
+	    ((u64) hw->cmd_desc_phys_addr >> 32);
+	adapter->ctx_desc->cmd_ring_size = adapter->max_tx_desc_count;
 
 	hw->cmd_desc_head = (struct cmd_desc_type0 *)addr;
 
@@ -271,6 +278,12 @@ int netxen_nic_hw_resources(struct netxen_adapter *adapter)
 				return err;
 			}
 			rcv_desc->desc_head = (struct rcv_desc *)addr;
+			adapter->ctx_desc->rcv_ctx[ring].rcv_ring_addr_lo =
+			    rcv_desc->phys_addr & 0xffffffffUL;
+			adapter->ctx_desc->rcv_ctx[ring].rcv_ring_addr_hi =
+			    ((u64) rcv_desc->phys_addr >> 32);
+			adapter->ctx_desc->rcv_ctx[ring].rcv_ring_size =
+			    rcv_desc->max_rx_desc_count;
 		}
 
 		addr = netxen_alloc(adapter->ahw.pdev, STATUS_DESC_RINGSIZE,
@@ -284,47 +297,21 @@ int netxen_nic_hw_resources(struct netxen_adapter *adapter)
 			return err;
 		}
 		recv_ctx->rcv_status_desc_head = (struct status_desc *)addr;
-		for (ring = 0; ring < NUM_RCV_DESC_RINGS; ring++) {
-			rcv_desc = &recv_ctx->rcv_desc[ring];
-			rcv_desc_crb =
-			    &recv_crb_registers[ctx].rcv_desc_crb[ring];
-			DPRINTK(INFO, "ring #%d crb global ring reg 0x%x\n",
-				ring, rcv_desc_crb->crb_globalrcv_ring);
-			/* Window = 1 */
-			writel(lower32(rcv_desc->phys_addr),
-			       NETXEN_CRB_NORMALIZE(adapter,
-						    rcv_desc_crb->
-						    crb_globalrcv_ring));
-			DPRINTK(INFO, "GLOBAL_RCV_RING ctx %d, addr 0x%x"
-				" val 0x%llx,"
-				" virt %p\n", ctx,
-				rcv_desc_crb->crb_globalrcv_ring,
-				(unsigned long long)rcv_desc->phys_addr,
-				+rcv_desc->desc_head);
-		}
+		adapter->ctx_desc->sts_ring_addr_lo =
+		    recv_ctx->rcv_status_desc_phys_addr & 0xffffffffUL;
+		adapter->ctx_desc->sts_ring_addr_hi =
+		    ((u64) recv_ctx->rcv_status_desc_phys_addr >> 32);
+		adapter->ctx_desc->sts_ring_size = adapter->max_rx_desc_count;
 
-		/* Window = 1 */
-		writel(lower32(recv_ctx->rcv_status_desc_phys_addr),
-		       NETXEN_CRB_NORMALIZE(adapter,
-					    recv_crb_registers[ctx].
-					    crb_rcvstatus_ring));
-		DPRINTK(INFO, "RCVSTATUS_RING, ctx %d, addr 0x%x,"
-			" val 0x%x,virt%p\n",
-			ctx,
-			recv_crb_registers[ctx].crb_rcvstatus_ring,
-			(unsigned long long)recv_ctx->rcv_status_desc_phys_addr,
-			recv_ctx->rcv_status_desc_head);
 	}
 	/* Window = 1 */
-	writel(lower32(hw->pause_physaddr),
-	       NETXEN_CRB_NORMALIZE(adapter, CRB_PAUSE_ADDR_LO));
-	writel(upper32(hw->pause_physaddr),
-	       NETXEN_CRB_NORMALIZE(adapter, CRB_PAUSE_ADDR_HI));
-
-	writel(lower32(hw->cmd_desc_phys_addr),
-	       NETXEN_CRB_NORMALIZE(adapter, CRB_HOST_CMD_ADDR_LO));
-	writel(upper32(hw->cmd_desc_phys_addr),
-	       NETXEN_CRB_NORMALIZE(adapter, CRB_HOST_CMD_ADDR_HI));
+
+	writel(lower32(adapter->ctx_desc_phys_addr),
+	       NETXEN_CRB_NORMALIZE(adapter, CRB_CTX_ADDR_REG_LO));
+	writel(upper32(adapter->ctx_desc_phys_addr),
+	       NETXEN_CRB_NORMALIZE(adapter, CRB_CTX_ADDR_REG_HI));
+	writel(NETXEN_CTX_SIGNATURE,
+	       NETXEN_CRB_NORMALIZE(adapter, CRB_CTX_SIGNATURE_REG));
 	return err;
 }
 
@@ -334,6 +321,15 @@ void netxen_free_hw_resources(struct netxen_adapter *adapter)
 	struct netxen_rcv_desc_ctx *rcv_desc;
 	int ctx, ring;
 
+	if (adapter->ctx_desc != NULL) {
+		pci_free_consistent(adapter->ctx_desc_pdev,
+				    sizeof(struct netxen_ring_ctx) +
+				    sizeof(uint32_t),
+				    adapter->ctx_desc,
+				    adapter->ctx_desc_phys_addr);
+		adapter->ctx_desc = NULL;
+	}
+
 	if (adapter->ahw.cmd_desc_head != NULL) {
 		pci_free_consistent(adapter->ahw.cmd_desc_pdev,
 				    sizeof(struct cmd_desc_type0) *
@@ -342,11 +338,9 @@ void netxen_free_hw_resources(struct netxen_adapter *adapter)
 				    adapter->ahw.cmd_desc_phys_addr);
 		adapter->ahw.cmd_desc_head = NULL;
 	}
-	if (adapter->ahw.pauseaddr != NULL) {
-		pci_free_consistent(adapter->ahw.pause_pdev, 512,
-				    adapter->ahw.pauseaddr,
-				    adapter->ahw.pause_physaddr);
-		adapter->ahw.pauseaddr = NULL;
+	/* Special handling: there are 2 ports on this board */
+	if (adapter->ahw.boardcfg.board_type == NETXEN_BRDTYPE_P2_SB31_10G_IMEZ) {
+		adapter->ahw.max_ports = 2;
 	}
 
 	for (ctx = 0; ctx < MAX_RCV_CTX; ++ctx) {
@@ -381,19 +375,22 @@ void netxen_tso_check(struct netxen_adapter *adapter,
 		desc->total_hdr_length = sizeof(struct ethhdr) +
 		    ((skb->nh.iph)->ihl * sizeof(u32)) +
 		    ((skb->h.th)->doff * sizeof(u32));
-		desc->opcode = TX_TCP_LSO;
+		netxen_set_cmd_desc_opcode(desc, TX_TCP_LSO);
 	} else if (skb->ip_summed == CHECKSUM_COMPLETE) {
 		if (skb->nh.iph->protocol == IPPROTO_TCP) {
-			desc->opcode = TX_TCP_PKT;
+			netxen_set_cmd_desc_opcode(desc, TX_TCP_PKT);
 		} else if (skb->nh.iph->protocol == IPPROTO_UDP) {
-			desc->opcode = TX_UDP_PKT;
+			netxen_set_cmd_desc_opcode(desc, TX_UDP_PKT);
 		} else {
 			return;
 		}
 	}
 	adapter->stats.xmitcsummed++;
-	CMD_DESC_TCP_HDR_OFFSET_WRT(desc, skb->h.raw - skb->data);
-	desc->length_tcp_hdr = cpu_to_le32(desc->length_tcp_hdr);
+	desc->tcp_hdr_offset = skb->h.raw - skb->data;
+	netxen_set_cmd_desc_totallength(desc,
+					cpu_to_le32
+					(netxen_get_cmd_desc_totallength
+					 (desc)));
 	desc->ip_hdr_offset = skb->nh.raw - skb->data;
 }
 
@@ -871,7 +868,7 @@ void netxen_nic_set_link_parameters(struct netxen_port *port)
 {
 	struct netxen_adapter *adapter = port->adapter;
 	__le32 status;
-	u16 autoneg;
+	__le32 autoneg;
 	__le32 mode;
 
 	netxen_nic_read_w0(adapter, NETXEN_NIU_MODE, &mode);
@@ -911,7 +908,7 @@ void netxen_nic_set_link_parameters(struct netxen_port *port)
 				    && adapter->
 				    phy_read(adapter, port->portnum,
 					     NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG,
-					     (__le32 *) & autoneg) != 0)
+					     &autoneg) != 0)
 					port->link_autoneg = autoneg;
 			} else
 				goto link_down;
@@ -1006,3 +1003,291 @@ int netxen_crb_read_val(struct netxen_adapter *adapter, unsigned long off)
 	netxen_nic_hw_read_wx(adapter, off, &data, 4);
 	return data;
 }
+
+int netxen_nic_hw_write_ioctl(struct netxen_adapter *adapter, u64 off,
+			      void *data, int len)
+{
+	void *addr;
+	u64 offset = off;
+	u8 *mem_ptr = NULL;
+	unsigned long mem_base;
+	unsigned long mem_page;
+
+	if (ADDR_IN_WINDOW1(off)) {
+		addr = NETXEN_CRB_NORMALIZE(adapter, off);
+		if (!addr) {
+			mem_base = pci_resource_start(adapter->ahw.pdev, 0);
+			offset = NETXEN_CRB_NORMAL(off);
+			mem_page = offset & PAGE_MASK;
+			if (mem_page != ((offset + len - 1) & PAGE_MASK))
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE * 2);
+			else
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE);
+			if (mem_ptr == 0UL) {
+				return 1;
+			}
+			addr = mem_ptr;
+			addr += offset & (PAGE_SIZE - 1);
+		}
+	} else {
+		addr = pci_base_offset(adapter, off);
+		if (!addr) {
+			mem_base = pci_resource_start(adapter->ahw.pdev, 0);
+			mem_page = off & PAGE_MASK;
+			if (mem_page != ((off + len - 1) & PAGE_MASK))
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE * 2);
+			else
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE);
+			if (mem_ptr == 0UL) {
+				return 1;
+			}
+			addr = mem_ptr;
+			addr += off & (PAGE_SIZE - 1);
+		}
+		netxen_nic_pci_change_crbwindow(adapter, 0);
+	}
+	switch (len) {
+	case 1:
+		writeb(*(u8 *) data, addr);
+		break;
+	case 2:
+		writew(*(u16 *) data, addr);
+		break;
+	case 4:
+		writel(*(u32 *) data, addr);
+		break;
+	case 8:
+		writeq(*(u64 *) data, addr);
+		break;
+	default:
+		DPRINTK(INFO,
+			"writing data %lx to offset %llx, num words=%d\n",
+			*(unsigned long *)data, off, (len >> 3));
+
+		netxen_nic_hw_block_write64((u64 __iomem *) data, addr,
+					    (len >> 3));
+		break;
+	}
+
+	if (!ADDR_IN_WINDOW1(off))
+		netxen_nic_pci_change_crbwindow(adapter, 1);
+	if (mem_ptr)
+		iounmap(mem_ptr);
+	return 0;
+}
+
+int netxen_nic_hw_read_ioctl(struct netxen_adapter *adapter, u64 off,
+			     void *data, int len)
+{
+	void *addr;
+	u64 offset;
+	u8 *mem_ptr = NULL;
+	unsigned long mem_base;
+	unsigned long mem_page;
+
+	if (ADDR_IN_WINDOW1(off)) {
+		addr = NETXEN_CRB_NORMALIZE(adapter, off);
+		if (!addr) {
+			mem_base = pci_resource_start(adapter->ahw.pdev, 0);
+			offset = NETXEN_CRB_NORMAL(off);
+			mem_page = offset & PAGE_MASK;
+			if (mem_page != ((offset + len - 1) & PAGE_MASK))
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE * 2);
+			else
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE);
+			if (mem_ptr == 0UL) {
+				*(u8 *) data = 0;
+				return 1;
+			}
+			addr = mem_ptr;
+			addr += offset & (PAGE_SIZE - 1);
+		}
+	} else {
+		addr = pci_base_offset(adapter, off);
+		if (!addr) {
+			mem_base = pci_resource_start(adapter->ahw.pdev, 0);
+			mem_page = off & PAGE_MASK;
+			if (mem_page != ((off + len - 1) & PAGE_MASK))
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE * 2);
+			else
+				mem_ptr =
+				    ioremap(mem_base + mem_page, PAGE_SIZE);
+			if (mem_ptr == 0UL)
+				return 1;
+			addr = mem_ptr;
+			addr += off & (PAGE_SIZE - 1);
+		}
+		netxen_nic_pci_change_crbwindow(adapter, 0);
+	}
+	switch (len) {
+	case 1:
+		*(u8 *) data = readb(addr);
+		break;
+	case 2:
+		*(u16 *) data = readw(addr);
+		break;
+	case 4:
+		*(u32 *) data = readl(addr);
+		break;
+	case 8:
+		*(u64 *) data = readq(addr);
+		break;
+	default:
+		netxen_nic_hw_block_read64((u64 __iomem *) data, addr,
+					   (len >> 3));
+		break;
+	}
+	if (!ADDR_IN_WINDOW1(off))
+		netxen_nic_pci_change_crbwindow(adapter, 1);
+	if (mem_ptr)
+		iounmap(mem_ptr);
+	return 0;
+}
+
+int netxen_nic_pci_mem_write_ioctl(struct netxen_adapter *adapter, u64 off,
+				   void *data, int size)
+{
+	void *addr;
+	int ret = 0;
+	u8 *mem_ptr = NULL;
+	unsigned long mem_base;
+	unsigned long mem_page;
+
+	if (data == NULL || off > (128 * 1024 * 1024)) {
+		printk(KERN_ERR "%s: data: %p off:%llx\n",
+		       netxen_nic_driver_name, data, off);
+		return 1;
+	}
+	off = netxen_nic_pci_set_window(adapter, off);
+	/* Corner case : Malicious user tried to break the driver by reading
+	   last few bytes in ranges and tries to read further addresses.
+	 */
+	if (!pci_base(adapter, off + size - 1) && pci_base(adapter, off)) {
+		printk(KERN_ERR "%s: Invalid access to memory address range"
+		       " 0x%llx - 0x%llx\n", netxen_nic_driver_name, off,
+		       off + size);
+		return 1;
+	}
+	addr = pci_base_offset(adapter, off);
+	DPRINTK(INFO, "writing data %llx to offset %llx\n",
+		*(unsigned long long *)data, off);
+	if (!addr) {
+		mem_base = pci_resource_start(adapter->ahw.pdev, 0);
+		mem_page = off & PAGE_MASK;
+		/* Map two pages whenever user tries to access addresses in two
+		   consecutive pages.
+		 */
+		if (mem_page != ((off + size - 1) & PAGE_MASK))
+			mem_ptr = ioremap(mem_base + mem_page, PAGE_SIZE * 2);
+		else
+			mem_ptr = ioremap(mem_base + mem_page, PAGE_SIZE);
+		if (mem_ptr == 0UL) {
+			return 1;
+		}
+		addr = mem_ptr;
+		addr += off & (PAGE_SIZE - 1);
+	}
+	switch (size) {
+	case 1:
+		writeb(*(u8 *) data, addr);
+		break;
+	case 2:
+		writew(*(u16 *) data, addr);
+		break;
+	case 4:
+		writel(*(u32 *) data, addr);
+		break;
+	case 8:
+		writeq(*(u64 *) data, addr);
+		break;
+	default:
+		DPRINTK(INFO,
+			"writing data %lx to offset %llx, num words=%d\n",
+			*(unsigned long *)data, off, (size >> 3));
+
+		netxen_nic_hw_block_write64((u64 __iomem *) data, addr,
+					    (size >> 3));
+		break;
+	}
+
+	if (mem_ptr)
+		iounmap(mem_ptr);
+	DPRINTK(INFO, "wrote %llx\n", *(unsigned long long *)data);
+
+	return ret;
+}
+
+int netxen_nic_pci_mem_read_ioctl(struct netxen_adapter *adapter,
+				  u64 off, void *data, int size)
+{
+	void *addr;
+	int ret = 0;
+	u8 *mem_ptr = NULL;
+	unsigned long mem_base;
+	unsigned long mem_page;
+
+	if (data == NULL || off > (128 * 1024 * 1024)) {
+		printk(KERN_ERR "%s: data: %p off:%llx\n",
+		       netxen_nic_driver_name, data, off);
+		return 1;
+	}
+	off = netxen_nic_pci_set_window(adapter, off);
+	/* Corner case : Malicious user tried to break the driver by reading
+	   last few bytes in ranges and tries to read further addresses.
+	 */
+	if (!pci_base(adapter, off + size - 1) && pci_base(adapter, off)) {
+		printk(KERN_ERR "%s: Invalid access to memory address range"
+		       " 0x%llx - 0x%llx\n", netxen_nic_driver_name, off,
+		       off + size);
+		return 1;
+	}
+	addr = pci_base_offset(adapter, off);
+	if (!addr) {
+		mem_base = pci_resource_start(adapter->ahw.pdev, 0);
+		mem_page = off & PAGE_MASK;
+		/* Map two pages whenever user tries to access addresses in two
+		   consecutive pages.
+		 */
+		if (mem_page != ((off + size - 1) & PAGE_MASK))
+			mem_ptr = ioremap(mem_base + mem_page, PAGE_SIZE * 2);
+		else
+			mem_ptr = ioremap(mem_base + mem_page, PAGE_SIZE);
+		if (mem_ptr == 0UL) {
+			*(u8 *) data = 0;
+			return 1;
+		}
+		addr = mem_ptr;
+		addr += off & (PAGE_SIZE - 1);
+	}
+	switch (size) {
+	case 1:
+		*(u8 *) data = readb(addr);
+		break;
+	case 2:
+		*(u16 *) data = readw(addr);
+		break;
+	case 4:
+		*(u32 *) data = readl(addr);
+		break;
+	case 8:
+		*(u64 *) data = readq(addr);
+		break;
+	default:
+		netxen_nic_hw_block_read64((u64 __iomem *) data, addr,
+					   (size >> 3));
+		break;
+	}
+
+	if (mem_ptr)
+		iounmap(mem_ptr);
+	DPRINTK(INFO, "read %llx\n", *(unsigned long long *)data);
+
+	return ret;
+}
diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c
index deac1a3..f786680 100644
--- a/drivers/net/netxen/netxen_nic_init.c
+++ b/drivers/net/netxen/netxen_nic_init.c
@@ -137,6 +137,8 @@ int netxen_init_firmware(struct netxen_adapter *adapter)
 		return err;
 	}
 	/* Window 1 call */
+	writel(MPORT_SINGLE_FUNCTION_MODE,
+	       NETXEN_CRB_NORMALIZE(adapter, CRB_MPORT_MODE));
 	writel(PHAN_INITIALIZE_ACK,
 	       NETXEN_CRB_NORMALIZE(adapter, CRB_CMDPEG_STATE));
 
@@ -184,15 +186,12 @@ void netxen_initialize_adapter_sw(struct netxen_adapter *adapter)
 			for (i = 0; i < num_rx_bufs; i++) {
 				rx_buf->ref_handle = i;
 				rx_buf->state = NETXEN_BUFFER_FREE;
-
 				DPRINTK(INFO, "Rx buf:ctx%d i(%d) rx_buf:"
 					"%p\n", ctxid, i, rx_buf);
 				rx_buf++;
 			}
 		}
 	}
-	DPRINTK(INFO, "initialized buffers for %s and %s\n",
-		"adapter->free_cmd_buf_list", "adapter->free_rxbuf");
 }
 
 void netxen_initialize_adapter_hw(struct netxen_adapter *adapter)
@@ -621,6 +620,43 @@ int netxen_pinit_from_rom(struct netxen_adapter *adapter, int verbose)
 	return 0;
 }
 
+int netxen_initialize_adapter_offload(struct netxen_adapter *adapter)
+{
+	uint64_t addr;
+	uint32_t hi;
+	uint32_t lo;
+
+	adapter->dummy_dma.addr =
+	    pci_alloc_consistent(adapter->ahw.pdev,
+				 NETXEN_HOST_DUMMY_DMA_SIZE,
+				 &adapter->dummy_dma.phys_addr);
+	if (adapter->dummy_dma.addr == NULL) {
+		printk("%s: ERROR: Could not allocate dummy DMA memory\n",
+		       __FUNCTION__);
+		return -ENOMEM;
+	}
+
+	addr = (uint64_t) adapter->dummy_dma.phys_addr;
+	hi = (addr >> 32) & 0xffffffff;
+	lo = addr & 0xffffffff;
+
+	writel(hi, NETXEN_CRB_NORMALIZE(adapter, CRB_HOST_DUMMY_BUF_ADDR_HI));
+	writel(lo, NETXEN_CRB_NORMALIZE(adapter, CRB_HOST_DUMMY_BUF_ADDR_LO));
+
+	return 0;
+}
+
+void netxen_free_adapter_offload(struct netxen_adapter *adapter)
+{
+	if (adapter->dummy_dma.addr) {
+		pci_free_consistent(adapter->ahw.pdev,
+				    NETXEN_HOST_DUMMY_DMA_SIZE,
+				    adapter->dummy_dma.addr,
+				    adapter->dummy_dma.phys_addr);
+		adapter->dummy_dma.addr = NULL;
+	}
+}
+
 void netxen_phantom_init(struct netxen_adapter *adapter, int pegtune_val)
 {
 	u32 val = 0;
@@ -655,7 +691,8 @@ int netxen_nic_rx_has_work(struct netxen_adapter *adapter)
 		desc_head = recv_ctx->rcv_status_desc_head;
 		desc = &desc_head[consumer];
 
-		if (((le16_to_cpu(desc->owner)) & STATUS_OWNER_HOST))
+		if (((le16_to_cpu(netxen_get_sts_owner(desc)))
+		     & STATUS_OWNER_HOST))
 			return 1;
 	}
 
@@ -747,19 +784,19 @@ void
 netxen_process_rcv(struct netxen_adapter *adapter, int ctxid,
 		   struct status_desc *desc)
 {
-	struct netxen_port *port = adapter->port[STATUS_DESC_PORT(desc)];
+	struct netxen_port *port = adapter->port[netxen_get_sts_port(desc)];
 	struct pci_dev *pdev = port->pdev;
 	struct net_device *netdev = port->netdev;
-	int index = le16_to_cpu(desc->reference_handle);
+	int index = le16_to_cpu(netxen_get_sts_refhandle(desc));
 	struct netxen_recv_context *recv_ctx = &(adapter->recv_ctx[ctxid]);
 	struct netxen_rx_buffer *buffer;
 	struct sk_buff *skb;
-	u32 length = le16_to_cpu(desc->total_length);
+	u32 length = le16_to_cpu(netxen_get_sts_totallength(desc));
 	u32 desc_ctx;
 	struct netxen_rcv_desc_ctx *rcv_desc;
 	int ret;
 
-	desc_ctx = STATUS_DESC_TYPE(desc);
+	desc_ctx = netxen_get_sts_type(desc);
 	if (unlikely(desc_ctx >= NUM_RCV_DESC_RINGS)) {
 		printk("%s: %s Bad Rcv descriptor ring\n",
 		       netxen_nic_driver_name, netdev->name);
@@ -767,20 +804,49 @@ netxen_process_rcv(struct netxen_adapter *adapter, int ctxid,
 	}
 
 	rcv_desc = &recv_ctx->rcv_desc[desc_ctx];
+	if (unlikely(index > rcv_desc->max_rx_desc_count)) {
+		DPRINTK(ERR, "Got a buffer index:%x Max is %x\n",
+			index, rcv_desc->max_rx_desc_count);
+		return;
+	}
 	buffer = &rcv_desc->rx_buf_arr[index];
+	if (desc_ctx == RCV_DESC_LRO_CTXID) {
+		buffer->lro_current_frags++;
+		if (netxen_get_sts_desc_lro_last_frag(desc)) {
+			buffer->lro_expected_frags =
+			    netxen_get_sts_desc_lro_cnt(desc);
+			buffer->lro_length = length;
+		}
+		if (buffer->lro_current_frags != buffer->lro_expected_frags) {
+			if (buffer->lro_expected_frags != 0) {
+				printk("LRO: (refhandle:%x) recv frag."
+				       "wait for last. flags: %x expected:%d"
+				       "have:%d\n", index,
+				       netxen_get_sts_desc_lro_last_frag(desc),
+				       buffer->lro_expected_frags,
+				       buffer->lro_current_frags);
+			}
+			return;
+		}
+	}
 
 	pci_unmap_single(pdev, buffer->dma, rcv_desc->dma_size,
 			 PCI_DMA_FROMDEVICE);
 
 	skb = (struct sk_buff *)buffer->skb;
 
-	if (likely(STATUS_DESC_STATUS(desc) == STATUS_CKSUM_OK)) {
+	if (likely(netxen_get_sts_status(desc) == STATUS_CKSUM_OK)) {
 		port->stats.csummed++;
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	} else
-		skb->ip_summed = CHECKSUM_NONE;
+	}
 	skb->dev = netdev;
-	skb_put(skb, length);
+	if (desc_ctx == RCV_DESC_LRO_CTXID) {
+		/* True length was only available on the last pkt */
+		skb_put(skb, buffer->lro_length);
+	} else {
+		skb_put(skb, length);
+	}
+
 	skb->protocol = eth_type_trans(skb, netdev);
 
 	ret = netif_receive_skb(skb);
@@ -826,6 +892,8 @@ netxen_process_rcv(struct netxen_adapter *adapter, int ctxid,
 	adapter->stats.post_called++;
 	buffer->skb = NULL;
 	buffer->state = NETXEN_BUFFER_FREE;
+	buffer->lro_current_frags = 0;
+	buffer->lro_expected_frags = 0;
 
 	port->stats.no_rcv++;
 	port->stats.rxbytes += length;
@@ -838,6 +906,7 @@ u32 netxen_process_rcv_ring(struct netxen_adapter *adapter, int ctxid, int max)
 	struct status_desc *desc_head = recv_ctx->rcv_status_desc_head;
 	struct status_desc *desc;	/* used to read status desc here */
 	u32 consumer = recv_ctx->status_rx_consumer;
+	u32 producer = 0;
 	int count = 0, ring;
 
 	DPRINTK(INFO, "procesing receive\n");
@@ -849,18 +918,22 @@ u32 netxen_process_rcv_ring(struct netxen_adapter *adapter, int ctxid, int max)
 	 */
 	while (count < max) {
 		desc = &desc_head[consumer];
-		if (!((le16_to_cpu(desc->owner)) & STATUS_OWNER_HOST)) {
-			DPRINTK(ERR, "desc %p ownedby %x\n", desc, desc->owner);
+		if (!
+		    (le16_to_cpu(netxen_get_sts_owner(desc)) &
+		     STATUS_OWNER_HOST)) {
+			DPRINTK(ERR, "desc %p ownedby %x\n", desc,
+				netxen_get_sts_owner(desc));
 			break;
 		}
 		netxen_process_rcv(adapter, ctxid, desc);
-		desc->owner = STATUS_OWNER_PHANTOM;
+		netxen_clear_sts_owner(desc);
+		netxen_set_sts_owner(desc, STATUS_OWNER_PHANTOM);
 		consumer = (consumer + 1) & (adapter->max_rx_desc_count - 1);
 		count++;
 	}
 	if (count) {
 		for (ring = 0; ring < NUM_RCV_DESC_RINGS; ring++) {
-			netxen_post_rx_buffers(adapter, ctxid, ring);
+			netxen_post_rx_buffers_nodb(adapter, ctxid, ring);
 		}
 	}
 
@@ -868,6 +941,7 @@ u32 netxen_process_rcv_ring(struct netxen_adapter *adapter, int ctxid, int max)
 	if (count) {
 		adapter->stats.process_rcv++;
 		recv_ctx->status_rx_consumer = consumer;
+		recv_ctx->status_rx_producer = producer;
 
 		/* Window = 1 */
 		writel(consumer,
@@ -880,12 +954,13 @@ u32 netxen_process_rcv_ring(struct netxen_adapter *adapter, int ctxid, int max)
 }
 
 /* Process Command status ring */
-void netxen_process_cmd_ring(unsigned long data)
+int netxen_process_cmd_ring(unsigned long data)
 {
 	u32 last_consumer;
 	u32 consumer;
 	struct netxen_adapter *adapter = (struct netxen_adapter *)data;
-	int count = 0;
+	int count1 = 0;
+	int count2 = 0;
 	struct netxen_cmd_buffer *buffer;
 	struct netxen_port *port;	/* port #1 */
 	struct netxen_port *nport;
@@ -894,6 +969,7 @@ void netxen_process_cmd_ring(unsigned long data)
 	u32 i;
 	struct sk_buff *skb = NULL;
 	int p;
+	int done;
 
 	spin_lock(&adapter->tx_lock);
 	last_consumer = adapter->last_cmd_consumer;
@@ -903,14 +979,13 @@ void netxen_process_cmd_ring(unsigned long data)
 	 * number as part of the descriptor. This way we will be able to get
 	 * the netdev which is associated with that device.
 	 */
-	consumer =
-	    readl(NETXEN_CRB_NORMALIZE(adapter, CRB_CMD_CONSUMER_OFFSET));
 
+	consumer = *(adapter->cmd_consumer);
 	if (last_consumer == consumer) {	/* Ring is empty    */
 		DPRINTK(INFO, "last_consumer %d == consumer %d\n",
 			last_consumer, consumer);
 		spin_unlock(&adapter->tx_lock);
-		return;
+		return 1;
 	}
 
 	adapter->proc_cmd_buf_counter++;
@@ -921,7 +996,7 @@ void netxen_process_cmd_ring(unsigned long data)
 	 */
 	spin_unlock(&adapter->tx_lock);
 
-	while ((last_consumer != consumer) && (count < MAX_STATUS_HANDLE)) {
+	while ((last_consumer != consumer) && (count1 < MAX_STATUS_HANDLE)) {
 		buffer = &adapter->cmd_buf_arr[last_consumer];
 		port = adapter->port[buffer->port];
 		pdev = port->pdev;
@@ -947,24 +1022,25 @@ void netxen_process_cmd_ring(unsigned long data)
 			     && netif_carrier_ok(port->netdev))
 		    && ((jiffies - port->netdev->trans_start) >
 			port->netdev->watchdog_timeo)) {
-			schedule_work(&port->adapter->tx_timeout_task);
+			SCHEDULE_WORK(port->adapter->tx_timeout_task
+				      + port->portnum);
 		}
 
 		last_consumer = get_next_index(last_consumer,
 					       adapter->max_tx_desc_count);
-		count++;
+		count1++;
 	}
-	adapter->stats.noxmitdone += count;
+	adapter->stats.noxmitdone += count1;
 
-	count = 0;
+	count2 = 0;
 	spin_lock(&adapter->tx_lock);
 	if ((--adapter->proc_cmd_buf_counter) == 0) {
 		adapter->last_cmd_consumer = last_consumer;
 		while ((adapter->last_cmd_consumer != consumer)
-		       && (count < MAX_STATUS_HANDLE)) {
+		       && (count2 < MAX_STATUS_HANDLE)) {
 			buffer =
 			    &adapter->cmd_buf_arr[adapter->last_cmd_consumer];
-			count++;
+			count2++;
 			if (buffer->skb)
 				break;
 			else
@@ -973,7 +1049,7 @@ void netxen_process_cmd_ring(unsigned long data)
 						   adapter->max_tx_desc_count);
 		}
 	}
-	if (count) {
+	if (count1 || count2) {
 		for (p = 0; p < adapter->ahw.max_ports; p++) {
 			nport = adapter->port[p];
 			if (netif_queue_stopped(nport->netdev)
@@ -983,10 +1059,30 @@ void netxen_process_cmd_ring(unsigned long data)
 			}
 		}
 	}
+	/*
+	 * If everything is freed up to consumer then check if the ring is full
+	 * If the ring is full then check if more needs to be freed and
+	 * schedule the call back again.
+	 *
+	 * This happens when there are 2 CPUs. One could be freeing and the
+	 * other filling it. If the ring is full when we get out of here and
+	 * the card has already interrupted the host then the host can miss the
+	 * interrupt.
+	 *
+	 * There is still a possible race condition and the host could miss an
+	 * interrupt. The card has to take care of this.
+	 */
+	if (adapter->last_cmd_consumer == consumer &&
+	    (((adapter->cmd_producer + 1) %
+	      adapter->max_tx_desc_count) == adapter->last_cmd_consumer)) {
+		consumer = *(adapter->cmd_consumer);
+	}
+	done = (adapter->last_cmd_consumer == consumer);
 
 	spin_unlock(&adapter->tx_lock);
 	DPRINTK(INFO, "last consumer is %d in %s\n", last_consumer,
 		__FUNCTION__);
+	return (done);
 }
 
 /*
@@ -998,17 +1094,16 @@ void netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ctx, u32 ringid)
 	struct sk_buff *skb;
 	struct netxen_recv_context *recv_ctx = &(adapter->recv_ctx[ctx]);
 	struct netxen_rcv_desc_ctx *rcv_desc = NULL;
-	struct netxen_recv_crb *crbarea = &recv_crb_registers[ctx];
-	struct netxen_rcv_desc_crb *rcv_desc_crb = NULL;
-	u32 producer;
+	uint producer;
 	struct rcv_desc *pdesc;
 	struct netxen_rx_buffer *buffer;
 	int count = 0;
 	int index = 0;
+	netxen_ctx_msg msg = 0;
+	dma_addr_t dma;
 
 	adapter->stats.post_called++;
 	rcv_desc = &recv_ctx->rcv_desc[ringid];
-	rcv_desc_crb = &crbarea->rcv_desc_crb[ringid];
 
 	producer = rcv_desc->producer;
 	index = rcv_desc->begin_alloc;
@@ -1018,6 +1113,7 @@ void netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ctx, u32 ringid)
 		skb = dev_alloc_skb(rcv_desc->skb_size);
 		if (unlikely(!skb)) {
 			/*
+			 * TODO
 			 * We need to schedule the posting of buffers to the pegs.
 			 */
 			rcv_desc->begin_alloc = index;
@@ -1025,9 +1121,105 @@ void netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ctx, u32 ringid)
 				" allocated only %d buffers\n", count);
 			break;
 		}
+
+		count++;	/* now there should be no failure */
+		pdesc = &rcv_desc->desc_head[producer];
+
+#if defined(XGB_DEBUG)
+		*(unsigned long *)(skb->head) = 0xc0debabe;
+		if (skb_is_nonlinear(skb)) {
+			printk("Allocated SKB @%p is nonlinear\n");
+		}
+#endif
+		skb_reserve(skb, 2);
+		/* This will be setup when we receive the
+		 * buffer after it has been filled  FSL  TBD TBD
+		 * skb->dev = netdev;
+		 */
+		dma = pci_map_single(pdev, skb->data, rcv_desc->dma_size,
+				     PCI_DMA_FROMDEVICE);
+		pdesc->addr_buffer = dma;
+		buffer->skb = skb;
+		buffer->state = NETXEN_BUFFER_BUSY;
+		buffer->dma = dma;
+		/* make a rcv descriptor  */
+		pdesc->reference_handle = buffer->ref_handle;
+		pdesc->buffer_length = rcv_desc->dma_size;
+		DPRINTK(INFO, "done writing descripter\n");
+		producer =
+		    get_next_index(producer, rcv_desc->max_rx_desc_count);
+		index = get_next_index(index, rcv_desc->max_rx_desc_count);
+		buffer = &rcv_desc->rx_buf_arr[index];
+	}
+	/* if we did allocate buffers, then write the count to Phantom */
+	if (count) {
+		rcv_desc->begin_alloc = index;
+		rcv_desc->rcv_pending += count;
+		adapter->stats.lastposted = count;
+		adapter->stats.posted += count;
+		rcv_desc->producer = producer;
+		if (rcv_desc->rcv_free >= 32) {
+			rcv_desc->rcv_free = 0;
+			/* Window = 1 */
+			writel((producer - 1) &
+			       (rcv_desc->max_rx_desc_count - 1),
+			       NETXEN_CRB_NORMALIZE(adapter,
+						    recv_crb_registers[0].
+						    rcv_desc_crb[ringid].
+						    crb_rcv_producer_offset));
+			/*
+			 * Write a doorbell msg to tell phanmon of change in
+			 * receive ring producer
+			 */
+			netxen_set_msg_peg_id(msg, NETXEN_RCV_PEG_DB_ID);
+			netxen_set_msg_privid(msg);
+			netxen_set_msg_count(msg,
+					     ((producer -
+					       1) & (rcv_desc->
+						     max_rx_desc_count - 1)));
+			netxen_set_msg_ctxid(msg, 0);
+			netxen_set_msg_opcode(msg, NETXEN_RCV_PRODUCER(ringid));
+			writel(msg,
+			       DB_NORMALIZE(adapter,
+					    NETXEN_RCV_PRODUCER_OFFSET));
+		}
+	}
+}
+
+void netxen_post_rx_buffers_nodb(struct netxen_adapter *adapter, uint32_t ctx,
+				 uint32_t ringid)
+{
+	struct pci_dev *pdev = adapter->ahw.pdev;
+	struct sk_buff *skb;
+	struct netxen_recv_context *recv_ctx = &(adapter->recv_ctx[ctx]);
+	struct netxen_rcv_desc_ctx *rcv_desc = NULL;
+	u32 producer;
+	struct rcv_desc *pdesc;
+	struct netxen_rx_buffer *buffer;
+	int count = 0;
+	int index = 0;
+
+	adapter->stats.post_called++;
+	rcv_desc = &recv_ctx->rcv_desc[ringid];
+
+	producer = rcv_desc->producer;
+	index = rcv_desc->begin_alloc;
+	buffer = &rcv_desc->rx_buf_arr[index];
+	/* We can start writing rx descriptors into the phantom memory. */
+	while (buffer->state == NETXEN_BUFFER_FREE) {
+		skb = dev_alloc_skb(rcv_desc->skb_size);
+		if (unlikely(!skb)) {
+			/*
+			 * We need to schedule the posting of buffers to the pegs.
+			 */
+			rcv_desc->begin_alloc = index;
+			DPRINTK(ERR, "netxen_post_rx_buffers_nodb: "
+				" allocated only %d buffers\n", count);
+			break;
+		}
 		count++;	/* now there should be no failure */
 		pdesc = &rcv_desc->desc_head[producer];
-		skb_reserve(skb, NET_IP_ALIGN);
+		skb_reserve(skb, 2);
 		/* 
 		 * This will be setup when we receive the
 		 * buffer after it has been filled
@@ -1038,6 +1230,7 @@ void netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ctx, u32 ringid)
 		buffer->dma = pci_map_single(pdev, skb->data,
 					     rcv_desc->dma_size,
 					     PCI_DMA_FROMDEVICE);
+
 		/* make a rcv descriptor  */
 		pdesc->reference_handle = le16_to_cpu(buffer->ref_handle);
 		pdesc->buffer_length = le16_to_cpu(rcv_desc->dma_size);
@@ -1062,7 +1255,8 @@ void netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ctx, u32 ringid)
 			writel((producer - 1) &
 			       (rcv_desc->max_rx_desc_count - 1),
 			       NETXEN_CRB_NORMALIZE(adapter,
-						    rcv_desc_crb->
+						    recv_crb_registers[0].
+						    rcv_desc_crb[ringid].
 						    crb_rcv_producer_offset));
 			wmb();
 		}
@@ -1195,8 +1389,8 @@ netxen_nic_do_ioctl(struct netxen_adapter *adapter, void *u_data,
 
 	switch (data.cmd) {
 	case netxen_nic_cmd_pci_read:
-		if ((retval = netxen_nic_hw_read_wx(adapter, data.off,
-						    &(data.u), data.size)))
+		if ((retval = netxen_nic_hw_read_ioctl(adapter, data.off,
+						       &(data.u), data.size)))
 			goto error_out;
 		if (copy_to_user
 		    ((void __user *)&(up_data->u), &(data.u), data.size)) {
@@ -1209,8 +1403,35 @@ netxen_nic_do_ioctl(struct netxen_adapter *adapter, void *u_data,
 		break;
 
 	case netxen_nic_cmd_pci_write:
-		data.rv = netxen_nic_hw_write_wx(adapter, data.off, &(data.u),
-						 data.size);
+		if ((retval = netxen_nic_hw_write_ioctl(adapter, data.off,
+							&(data.u), data.size)))
+			goto error_out;
+		data.rv = 0;
+		break;
+
+	case netxen_nic_cmd_pci_mem_read:
+		if (netxen_nic_pci_mem_read_ioctl(adapter, data.off, &(data.u),
+						  data.size)) {
+			DPRINTK(ERR, "Failed to read the data.\n");
+			retval = -EFAULT;
+			goto error_out;
+		}
+		if (copy_to_user
+		    ((void __user *)&(up_data->u), &(data.u), data.size)) {
+			DPRINTK(ERR, "bad copy to userland: %d\n",
+				(int)sizeof(data));
+			retval = -EFAULT;
+			goto error_out;
+		}
+		data.rv = 0;
+		break;
+
+	case netxen_nic_cmd_pci_mem_write:
+		if ((retval = netxen_nic_pci_mem_write_ioctl(adapter, data.off,
+							     &(data.u),
+							     data.size)))
+			goto error_out;
+		data.rv = 0;
 		break;
 
 	case netxen_nic_cmd_pci_config_read:
@@ -1295,7 +1516,7 @@ netxen_nic_do_ioctl(struct netxen_adapter *adapter, void *u_data,
 		retval = -EOPNOTSUPP;
 		goto error_out;
 	}
-	put_user(data.rv, (u16 __user *) (&(up_data->rv)));
+	put_user(data.rv, (&(up_data->rv)));
 	DPRINTK(INFO, "done ioctl for %p well.\n", adapter);
 
       error_out:
diff --git a/drivers/net/netxen/netxen_nic_ioctl.h b/drivers/net/netxen/netxen_nic_ioctl.h
index 8eef139..1221fa5 100644
--- a/drivers/net/netxen/netxen_nic_ioctl.h
+++ b/drivers/net/netxen/netxen_nic_ioctl.h
@@ -36,7 +36,7 @@
 #define NETXEN_NIC_CMD		(NETXEN_CMD_START + 1)
 #define NETXEN_NIC_NAME		(NETXEN_CMD_START + 2)
 #define NETXEN_NIC_NAME_LEN	16
-#define NETXEN_NIC_NAME_RSP	"NETXEN"
+#define NETXEN_NIC_NAME_RSP	"NETXEN-UNM"
 
 typedef enum {
 	netxen_nic_cmd_none = 0,
diff --git a/drivers/net/netxen/netxen_nic_isr.c b/drivers/net/netxen/netxen_nic_isr.c
index 0f6e7b8..1b45f50 100644
--- a/drivers/net/netxen/netxen_nic_isr.c
+++ b/drivers/net/netxen/netxen_nic_isr.c
@@ -68,8 +68,7 @@ struct net_device_stats *netxen_nic_get_stats(struct net_device *netdev)
 void netxen_indicate_link_status(struct netxen_adapter *adapter, u32 portno,
 				 u32 link)
 {
-	struct netxen_port *pport = adapter->port[portno];
-	struct net_device *netdev = pport->netdev;
+	struct net_device *netdev = (adapter->port[portno])->netdev;
 
 	if (link)
 		netif_carrier_on(netdev);
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index 6dbdc8b..06c4778f 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/vmalloc.h>
+#include <linux/highmem.h>
 #include "netxen_nic_hw.h"
 
 #include "netxen_nic.h"
@@ -48,14 +49,21 @@ MODULE_DESCRIPTION("NetXen Multi port (1/10) Gigabit Network Driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(NETXEN_NIC_LINUX_VERSIONID);
 
-char netxen_nic_driver_name[] = "netxen";
+char netxen_nic_driver_name[] = "netxen-nic";
 static char netxen_nic_driver_string[] = "NetXen Network Driver version "
     NETXEN_NIC_LINUX_VERSIONID;
 
+struct netxen_adapter *g_adapter = NULL;
+
 #define NETXEN_NETDEV_WEIGHT 120
 #define NETXEN_ADAPTER_UP_MAGIC 777
 #define NETXEN_NIC_PEG_TUNE 0
 
+u8 nx_p2_id = NX_P2_C0;
+
+#define DMA_32BIT_MASK	0x00000000ffffffffULL
+#define DMA_35BIT_MASK	0x00000007ffffffffULL
+
 /* Local functions to NetXen NIC driver */
 static int __devinit netxen_nic_probe(struct pci_dev *pdev,
 				      const struct pci_device_id *ent);
@@ -87,6 +95,9 @@ static struct pci_device_id netxen_pci_tbl[] __devinitdata = {
 
 MODULE_DEVICE_TABLE(pci, netxen_pci_tbl);
 
+struct workqueue_struct *netxen_workq;
+static void netxen_watchdog(unsigned long);
+
 /*
  * netxen_nic_probe()
  *
@@ -105,20 +116,28 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	struct net_device *netdev = NULL;
 	struct netxen_adapter *adapter = NULL;
 	struct netxen_port *port = NULL;
-	u8 *mem_ptr0 = NULL;
-	u8 *mem_ptr1 = NULL;
-	u8 *mem_ptr2 = NULL;
+	void __iomem *mem_ptr0 = NULL;
+	void __iomem *mem_ptr1 = NULL;
+	void __iomem *mem_ptr2 = NULL;
 
-	unsigned long mem_base, mem_len;
+	u8 *db_ptr = NULL;
+	unsigned long mem_base, mem_len, db_base, db_len;
 	int pci_using_dac, i, err;
 	int ring;
 	struct netxen_recv_context *recv_ctx = NULL;
 	struct netxen_rcv_desc_ctx *rcv_desc = NULL;
 	struct netxen_cmd_buffer *cmd_buf_arr = NULL;
 	u64 mac_addr[FLASH_NUM_PORTS + 1];
-	int valid_mac;
+	int valid_mac = 0;
+	static int netxen_cards_found = 0;
 
 	printk(KERN_INFO "%s \n", netxen_nic_driver_string);
+	/* In current scheme, we use only PCI function 0 */
+	if (PCI_FUNC(pdev->devfn) != 0) {
+		DPRINTK(ERR, "NetXen function %d will not be enabled.\n",
+			PCI_FUNC(pdev->devfn));
+		return -ENODEV;
+	}
 	if ((err = pci_enable_device(pdev)))
 		return err;
 	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
@@ -130,10 +149,12 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_out_disable_pdev;
 
 	pci_set_master(pdev);
-	if ((pci_set_dma_mask(pdev, DMA_64BIT_MASK) == 0) &&
-	    (pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK) == 0))
+	pci_read_config_byte(pdev, PCI_REVISION_ID, &nx_p2_id);
+	if (nx_p2_id == NX_P2_C1 &&
+	    (pci_set_dma_mask(pdev, DMA_35BIT_MASK) == 0) &&
+	    (pci_set_consistent_dma_mask(pdev, DMA_35BIT_MASK) == 0)) {
 		pci_using_dac = 1;
-	else {
+	} else {
 		if ((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK)) ||
 		    (err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK)))
 			goto err_out_free_res;
@@ -153,21 +174,34 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	    ioremap(mem_base + THIRD_PAGE_GROUP_START, THIRD_PAGE_GROUP_SIZE);
 
 	if ((mem_ptr0 == 0UL) || (mem_ptr1 == 0UL) || (mem_ptr2 == 0UL)) {
-		DPRINTK(1, ERR,
+		DPRINTK(ERR,
 			"Cannot remap adapter memory aborting.:"
 			"0 -> %p, 1 -> %p, 2 -> %p\n",
 			mem_ptr0, mem_ptr1, mem_ptr2);
 
 		err = -EIO;
-		if (mem_ptr0)
-			iounmap(mem_ptr0);
-		if (mem_ptr1)
-			iounmap(mem_ptr1);
-		if (mem_ptr2)
-			iounmap(mem_ptr2);
-
-		goto err_out_free_res;
+		goto err_out_iounmap;
+	}
+	db_base = pci_resource_start(pdev, 4);	/* doorbell is on bar 4 */
+	db_len = pci_resource_len(pdev, 4);
+
+	if (db_len == 0) {
+		printk(KERN_ERR "%s: doorbell is disabled\n",
+		       netxen_nic_driver_name);
+		err = -EIO;
+		goto err_out_iounmap;
+	}
+	DPRINTK(INFO, "doorbell ioremap from %lx a size of %lx\n", db_base,
+		db_len);
+
+	db_ptr = ioremap(db_base, NETXEN_DB_MAPSIZE_BYTES);
+	if (db_ptr == 0UL) {
+		printk(KERN_ERR "%s: Failed to allocate doorbell map.",
+		       netxen_nic_driver_name);
+		err = -EIO;
+		goto err_out_iounmap;
 	}
+	DPRINTK(INFO, "doorbell ioremaped at %p\n", db_ptr);
 
 /*
  *      Allocate a adapter structure which will manage all the initialization
@@ -183,17 +217,24 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		       netxen_nic_driver_name,
 		       (int)sizeof(struct netxen_adapter));
 		err = -ENOMEM;
-		goto err_out_iounmap;
+		goto err_out_dbunmap;
 	}
 
+	if (netxen_cards_found == 0) {
+		g_adapter = adapter;
+	}
 	adapter->max_tx_desc_count = MAX_CMD_DESCRIPTORS;
 	adapter->max_rx_desc_count = MAX_RCV_DESCRIPTORS;
 	adapter->max_jumbo_rx_desc_count = MAX_JUMBO_RCV_DESCRIPTORS;
+	adapter->max_lro_rx_desc_count = MAX_LRO_RCV_DESCRIPTORS;
 
 	pci_set_drvdata(pdev, adapter);
 
 	cmd_buf_arr = (struct netxen_cmd_buffer *)vmalloc(TX_RINGSIZE);
 	if (cmd_buf_arr == NULL) {
+		printk(KERN_ERR
+		       "%s: Could not allocate cmd_buf_arr memory:%d\n",
+		       netxen_nic_driver_name, (int)TX_RINGSIZE);
 		err = -ENOMEM;
 		goto err_out_free_adapter;
 	}
@@ -220,11 +261,23 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 				rcv_desc->skb_size = MAX_RX_JUMBO_BUFFER_LENGTH;
 				break;
 
+			case RCV_RING_LRO:
+				rcv_desc->max_rx_desc_count =
+				    adapter->max_lro_rx_desc_count;
+				rcv_desc->flags = RCV_DESC_LRO;
+				rcv_desc->dma_size = RX_LRO_DMA_MAP_LEN;
+				rcv_desc->skb_size = MAX_RX_LRO_BUFFER_LENGTH;
+				break;
+
 			}
 			rcv_desc->rx_buf_arr = (struct netxen_rx_buffer *)
 			    vmalloc(RCV_BUFFSIZE);
 
 			if (rcv_desc->rx_buf_arr == NULL) {
+				printk(KERN_ERR "%s: Could not allocate"
+				       "rcv_desc->rx_buf_arr memory:%d\n",
+				       netxen_nic_driver_name,
+				       (int)RCV_BUFFSIZE);
 				err = -ENOMEM;
 				goto err_out_free_rx_buffer;
 			}
@@ -237,16 +290,17 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	adapter->ahw.pci_base0 = mem_ptr0;
 	adapter->ahw.pci_base1 = mem_ptr1;
 	adapter->ahw.pci_base2 = mem_ptr2;
+	adapter->ahw.db_base = db_ptr;
+	adapter->ahw.db_len = db_len;
 	spin_lock_init(&adapter->tx_lock);
 	spin_lock_init(&adapter->lock);
+	netxen_initialize_adapter_sw(adapter);	/* initialize the buffers in adapter */
 #ifdef CONFIG_IA64
 	netxen_pinit_from_rom(adapter, 0);
 	udelay(500);
 	netxen_load_firmware(adapter);
 #endif
 
-	/* initialize the buffers in adapter */
-	netxen_initialize_adapter_sw(adapter);
 	/*
 	 * Set the CRB window to invalid. If any register in window 0 is
 	 * accessed it should set the window to 0 and then reset it to 1.
@@ -268,7 +322,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		  (void (*)(void *))netxen_watchdog_task, adapter);
 	adapter->ahw.pdev = pdev;
 	adapter->proc_cmd_buf_counter = 0;
-	pci_read_config_byte(pdev, PCI_REVISION_ID, &adapter->ahw.revision_id);
+	adapter->ahw.revision_id = nx_p2_id;
 
 	if (pci_enable_msi(pdev)) {
 		adapter->flags &= ~NETXEN_NIC_MSI_ENABLED;
@@ -290,6 +344,12 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	writel(0, NETXEN_CRB_NORMALIZE(adapter, CRB_CMD_CONSUMER_OFFSET));
 	writel(0, NETXEN_CRB_NORMALIZE(adapter, CRB_HOST_CMD_ADDR_LO));
 
+	/* do this before waking up pegs so that we have valid dummy dma addr */
+	err = netxen_initialize_adapter_offload(adapter);
+	if (err) {
+		goto err_out_free_dev;
+	}
+
 	/* Unlock the HW, prompting the boot sequence */
 	writel(1,
 	       NETXEN_CRB_NORMALIZE(adapter, NETXEN_ROMUSB_GLB_PEGTUNE_DONE));
@@ -298,6 +358,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	netxen_phantom_init(adapter, NETXEN_NIC_PEG_TUNE);
 
 	/* initialize the all the ports */
+	adapter->active_ports = 0;
 
 	for (i = 0; i < adapter->ahw.max_ports; i++) {
 		netdev = alloc_etherdev(sizeof(struct netxen_port));
@@ -368,7 +429,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 							     netdev->dev_addr);
 			}
 		}
-		INIT_WORK(&adapter->tx_timeout_task,
+		INIT_WORK(adapter->tx_timeout_task + i,
 			  (void (*)(void *))netxen_tx_timeout_task, netdev);
 		netif_carrier_off(netdev);
 		netif_stop_queue(netdev);
@@ -381,7 +442,6 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			goto err_out_free_dev;
 		}
 		adapter->port_count++;
-		adapter->active_ports = 0;
 		adapter->port[i] = port;
 	}
 
@@ -402,6 +462,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		break;
 	}
 
+	adapter->number = netxen_cards_found;
 	adapter->driver_mismatch = 0;
 
 	return 0;
@@ -417,6 +478,8 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		}
 	}
 
+	netxen_free_adapter_offload(adapter);
+
       err_out_free_rx_buffer:
 	for (i = 0; i < MAX_RCV_CTX; ++i) {
 		recv_ctx = &adapter->recv_ctx[i];
@@ -428,19 +491,23 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			}
 		}
 	}
-
 	vfree(cmd_buf_arr);
 
-	kfree(adapter->port);
-
       err_out_free_adapter:
 	pci_set_drvdata(pdev, NULL);
 	kfree(adapter);
 
+      err_out_dbunmap:
+	if (db_ptr)
+		iounmap(db_ptr);
+
       err_out_iounmap:
-	iounmap(mem_ptr0);
-	iounmap(mem_ptr1);
-	iounmap(mem_ptr2);
+	if (mem_ptr0)
+		iounmap(mem_ptr0);
+	if (mem_ptr1)
+		iounmap(mem_ptr1);
+	if (mem_ptr2)
+		iounmap(mem_ptr2);
 
       err_out_free_res:
 	pci_release_regions(pdev);
@@ -465,12 +532,8 @@ static void __devexit netxen_nic_remove(struct pci_dev *pdev)
 
 	netxen_nic_stop_all_ports(adapter);
 	/* leave the hw in the same state as reboot */
-	netxen_pinit_from_rom(adapter, 0);
-	udelay(500);
 	netxen_load_firmware(adapter);
-
-	if ((adapter->flags & NETXEN_NIC_MSI_ENABLED))
-		netxen_nic_disable_int(adapter);
+	netxen_free_adapter_offload(adapter);
 
 	udelay(500);		/* Delay for a while to drain the DMA engines */
 	for (i = 0; i < adapter->port_count; i++) {
@@ -487,6 +550,7 @@ static void __devexit netxen_nic_remove(struct pci_dev *pdev)
 	if (adapter->is_up == NETXEN_ADAPTER_UP_MAGIC)
 		netxen_free_hw_resources(adapter);
 
+	iounmap(adapter->ahw.db_base);
 	iounmap(adapter->ahw.pci_base0);
 	iounmap(adapter->ahw.pci_base1);
 	iounmap(adapter->ahw.pci_base2);
@@ -534,6 +598,8 @@ static int netxen_nic_open(struct net_device *netdev)
 			return -EIO;
 		}
 		netxen_nic_flash_print(adapter);
+		if (adapter->init_niu)
+			adapter->init_niu(adapter);
 
 		/* setup all the resources for the Phantom... */
 		/* this include the descriptors for rcv, tx, and status */
@@ -551,25 +617,24 @@ static int netxen_nic_open(struct net_device *netdev)
 			netxen_free_hw_resources(adapter);
 			return -EIO;
 		}
-		if (adapter->init_niu)
-			adapter->init_niu(adapter);
 		for (ctx = 0; ctx < MAX_RCV_CTX; ++ctx) {
 			for (ring = 0; ring < NUM_RCV_DESC_RINGS; ring++)
 				netxen_post_rx_buffers(adapter, ctx, ring);
 		}
-		adapter->is_up = NETXEN_ADAPTER_UP_MAGIC;
-	}
-	adapter->active_ports++;
-	if (adapter->active_ports == 1) {
+		adapter->irq = adapter->ahw.pdev->irq;
 		err = request_irq(adapter->ahw.pdev->irq, &netxen_intr,
 				  SA_SHIRQ | SA_SAMPLE_RANDOM, netdev->name,
 				  adapter);
 		if (err) {
 			printk(KERN_ERR "request_irq failed with: %d\n", err);
-			adapter->active_ports--;
+			netxen_free_hw_resources(adapter);
 			return err;
 		}
-		adapter->irq = adapter->ahw.pdev->irq;
+
+		adapter->is_up = NETXEN_ADAPTER_UP_MAGIC;
+	}
+	adapter->active_ports++;
+	if (adapter->active_ports == 1) {
 		if (!adapter->driver_mismatch)
 			mod_timer(&adapter->watchdog_timer, jiffies);
 
@@ -583,6 +648,9 @@ static int netxen_nic_open(struct net_device *netdev)
 	netxen_nic_set_link_parameters(port);
 
 	netxen_nic_set_multi(netdev);
+	if (adapter->set_mtu)
+		adapter->set_mtu(port, netdev->mtu);
+
 	if (!adapter->driver_mismatch)
 		netif_start_queue(netdev);
 
@@ -635,6 +703,7 @@ static int netxen_nic_close(struct net_device *netdev)
 			}
 			cmd_buff++;
 		}
+		FLUSH_SCHEDULED_WORK();
 		del_timer_sync(&adapter->watchdog_timer);
 	}
 
@@ -655,7 +724,6 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	struct cmd_desc_type0 *hwdesc;
 	int k;
 	struct netxen_cmd_buffer *pbuf = NULL;
-	unsigned int tries = 0;
 	static int dropped_packet = 0;
 	int frag_count;
 	u32 local_producer = 0;
@@ -717,7 +785,7 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 			if (((skb->nh.iph)->ihl * sizeof(u32)) +
 			    ((skb->h.th)->doff * sizeof(u32)) +
 			    sizeof(struct ethhdr) >
-			    (sizeof(struct cmd_desc_type0) - NET_IP_ALIGN)) {
+			    (sizeof(struct cmd_desc_type0) - 2)) {
 				no_of_desc++;
 			}
 		}
@@ -728,27 +796,17 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	if ((k + no_of_desc) >=
 	    ((last_cmd_consumer <= k) ? last_cmd_consumer + max_tx_desc_count :
 	     last_cmd_consumer)) {
+		port->stats.nocmddescriptor++;
+		DPRINTK(ERR, "No command descriptors available,"
+			" producer = %d, consumer = %d count=%llu,"
+			" dropping packet\n", producer,
+			adapter->last_cmd_consumer,
+			port->stats.nocmddescriptor);
+
+		netif_stop_queue(netdev);
+		port->flags |= NETXEN_NETDEV_STATUS;
 		spin_unlock_bh(&adapter->tx_lock);
-		if (tries == 0) {
-			local_bh_disable();
-			netxen_process_cmd_ring((unsigned long)adapter);
-			local_bh_enable();
-			++tries;
-			goto retry_getting_window;
-		} else {
-			port->stats.nocmddescriptor++;
-			DPRINTK(ERR, "No command descriptors available,"
-				" producer = %d, consumer = %d count=%llu,"
-				" dropping packet\n", producer,
-				adapter->last_cmd_consumer,
-				port->stats.nocmddescriptor);
-
-			spin_lock_bh(&adapter->tx_lock);
-			netif_stop_queue(netdev);
-			port->flags |= NETXEN_NETDEV_STATUS;
-			spin_unlock_bh(&adapter->tx_lock);
-			return NETDEV_TX_BUSY;
-		}
+		return NETDEV_TX_BUSY;
 	}
 	k = get_index_range(k, max_tx_desc_count, no_of_desc);
 	adapter->cmd_producer = k;
@@ -770,7 +828,6 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 		pbuf->mss = 0;
 		hwdesc->mss = 0;
 	}
-	pbuf->no_of_descriptors = no_of_desc;
 	pbuf->total_length = skb->len;
 	pbuf->skb = skb;
 	pbuf->cmd = TX_ETHER_PKT;
@@ -780,11 +837,11 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	buffrag->dma = pci_map_single(port->pdev, skb->data, first_seg_len,
 				      PCI_DMA_TODEVICE);
 	buffrag->length = first_seg_len;
-	CMD_DESC_TOTAL_LENGTH_WRT(hwdesc, skb->len);
-	hwdesc->num_of_buffers = frag_count;
-	hwdesc->opcode = TX_ETHER_PKT;
+	netxen_set_cmd_desc_totallength(hwdesc, skb->len);
+	netxen_set_cmd_desc_num_of_buff(hwdesc, frag_count);
+	netxen_set_cmd_desc_opcode(hwdesc, TX_ETHER_PKT);
 
-	CMD_DESC_PORT_WRT(hwdesc, port->portnum);
+	netxen_set_cmd_desc_port(hwdesc, port->portnum);
 	hwdesc->buffer1_length = cpu_to_le16(first_seg_len);
 	hwdesc->addr_buffer1 = cpu_to_le64(buffrag->dma);
 
@@ -843,12 +900,12 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	/* For LSO, we need to copy the MAC/IP/TCP headers into
 	 * the descriptor ring
 	 */
-	if (hw->cmd_desc_head[saved_producer].opcode == TX_TCP_LSO) {
+	if (netxen_get_cmd_desc_opcode(&hw->cmd_desc_head[saved_producer])
+	    == TX_TCP_LSO) {
 		int hdr_len, first_hdr_len, more_hdr;
 		hdr_len = hw->cmd_desc_head[saved_producer].total_hdr_length;
-		if (hdr_len > (sizeof(struct cmd_desc_type0) - NET_IP_ALIGN)) {
-			first_hdr_len =
-			    sizeof(struct cmd_desc_type0) - NET_IP_ALIGN;
+		if (hdr_len > (sizeof(struct cmd_desc_type0) - 2)) {
+			first_hdr_len = sizeof(struct cmd_desc_type0) - 2;
 			more_hdr = 1;
 		} else {
 			first_hdr_len = hdr_len;
@@ -858,7 +915,7 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 		hwdesc = &hw->cmd_desc_head[producer];
 
 		/* copy the first 64 bytes */
-		memcpy(((void *)hwdesc) + NET_IP_ALIGN,
+		memcpy(((void *)hwdesc) + 2,
 		       (void *)(skb->data), first_hdr_len);
 		producer = get_next_index(producer, max_tx_desc_count);
 
@@ -874,7 +931,7 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	}
 	spin_lock_bh(&adapter->tx_lock);
 	port->stats.txbytes +=
-	    CMD_DESC_TOTAL_LENGTH(&hw->cmd_desc_head[saved_producer]);
+	    netxen_get_cmd_desc_totallength(&hw->cmd_desc_head[saved_producer]);
 	/* Code to update the adapter considering how many producer threads
 	   are currently working */
 	if ((--adapter->num_threads) == 0) {
@@ -884,20 +941,6 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 		       NETXEN_CRB_NORMALIZE(adapter, CRB_CMD_PRODUCER_OFFSET));
 		wmb();
 		adapter->total_threads = 0;
-	} else {
-		u32 crb_producer = 0;
-		crb_producer =
-		    readl(NETXEN_CRB_NORMALIZE
-			  (adapter, CRB_CMD_PRODUCER_OFFSET));
-		if (crb_producer == local_producer) {
-			crb_producer = get_index_range(crb_producer,
-						       max_tx_desc_count,
-						       no_of_desc);
-			writel(crb_producer,
-			       NETXEN_CRB_NORMALIZE(adapter,
-						    CRB_CMD_PRODUCER_OFFSET));
-			wmb();
-		}
 	}
 
 	port->stats.xmitfinished++;
@@ -914,15 +957,20 @@ static int netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 static void netxen_watchdog(unsigned long v)
 {
 	struct netxen_adapter *adapter = (struct netxen_adapter *)v;
-	schedule_work(&adapter->watchdog_task);
+	if (adapter != g_adapter) {
+		printk("%s: ***BUG*** adapter[%p] != g_adapter[%p]\n",
+		       __FUNCTION__, adapter, g_adapter);
+		return;
+	}
+
+	SCHEDULE_WORK(&adapter->watchdog_task);
 }
 
 static void netxen_tx_timeout(struct net_device *netdev)
 {
 	struct netxen_port *port = (struct netxen_port *)netdev_priv(netdev);
-	struct netxen_adapter *adapter = port->adapter;
 
-	schedule_work(&adapter->tx_timeout_task);
+	SCHEDULE_WORK(port->adapter->tx_timeout_task + port->portnum);
 }
 
 static void netxen_tx_timeout_task(struct net_device *netdev)
@@ -953,6 +1001,11 @@ netxen_handle_int(struct netxen_adapter *adapter, struct net_device *netdev)
 	if (!(adapter->flags & NETXEN_NIC_MSI_ENABLED)) {
 		int count = 0;
 		u32 mask;
+		mask = readl(pci_base_offset(adapter, ISR_INT_VECTOR));
+		if ((mask & 0x80) == 0) {
+			/* not our interrupt */
+			return ret;
+		}
 		netxen_nic_disable_int(adapter);
 		/* Window = 0 or 1 */
 		do {
@@ -1012,7 +1065,10 @@ irqreturn_t netxen_intr(int irq, void *data)
 		netdev = port->netdev;
 
 		/* process our status queue (for all 4 ports) */
-		netxen_handle_int(adapter, netdev);
+		if (netif_running(netdev)) {
+			netxen_handle_int(adapter, netdev);
+			break;
+		}
 	}
 
 	return IRQ_HANDLED;
@@ -1054,11 +1110,11 @@ static int netxen_nic_poll(struct net_device *netdev, int *budget)
 	netdev->quota -= work_done;
 	*budget -= work_done;
 
-	if (work_done >= work_to_do
-	    && netxen_nic_rx_has_work(adapter) != 0)
+	if (work_done >= work_to_do && netxen_nic_rx_has_work(adapter) != 0)
 		done = 0;
 
-	netxen_process_cmd_ring((unsigned long)adapter);
+	if (netxen_process_cmd_ring((unsigned long)adapter) == 0)
+		done = 0;
 
 	DPRINTK(INFO, "new work_done: %d work_to_do: %d\n",
 		work_done, work_to_do);
@@ -1104,8 +1160,9 @@ netxen_nic_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 		if (ifr->ifr_data) {
 			sprintf(dev_name, "%s-%d", NETXEN_NIC_NAME_RSP,
 				port->portnum);
-			nr_bytes = copy_to_user((char *)ifr->ifr_data, dev_name,
-						NETXEN_NIC_NAME_LEN);
+			nr_bytes =
+			    copy_to_user((char __user *)ifr->ifr_data, dev_name,
+					 NETXEN_NIC_NAME_LEN);
 			if (nr_bytes)
 				err = -EIO;
 
@@ -1132,6 +1189,9 @@ static struct pci_driver netxen_driver = {
 
 static int __init netxen_init_module(void)
 {
+	if ((netxen_workq = create_singlethread_workqueue("netxen")) == 0)
+		return -ENOMEM;
+
 	return pci_module_init(&netxen_driver);
 }
 
@@ -1142,7 +1202,7 @@ static void __exit netxen_exit_module(void)
 	/*
 	 * Wait for some time to allow the dma to drain, if any.
 	 */
-	mdelay(5);
+	destroy_workqueue(netxen_workq);
 	pci_unregister_driver(&netxen_driver);
 }
 
diff --git a/drivers/net/netxen/netxen_nic_niu.c b/drivers/net/netxen/netxen_nic_niu.c
index ff74f1e..4987dc7 100644
--- a/drivers/net/netxen/netxen_nic_niu.c
+++ b/drivers/net/netxen/netxen_nic_niu.c
@@ -40,13 +40,15 @@
 
 static long phy_lock_timeout = 100000000;
 
-static inline int phy_lock(void)
+static inline int phy_lock(struct netxen_adapter *adapter)
 {
 	int i;
 	int done = 0, timeout = 0;
 
 	while (!done) {
-		done = readl((void __iomem *)NETXEN_PCIE_REG(PCIE_SEM3_LOCK));
+		done =
+		    readl(pci_base_offset
+			  (adapter, NETXEN_PCIE_REG(PCIE_SEM3_LOCK)));
 		if (done == 1)
 			break;
 		if (timeout >= phy_lock_timeout) {
@@ -61,13 +63,15 @@ static inline int phy_lock(void)
 		}
 	}
 
-	writel(NETXEN_PHY_LOCK_ID, (void __iomem *)PHY_LOCK_DRIVER);
+	writel(PHY_LOCK_DRIVER,
+	       NETXEN_CRB_NORMALIZE(adapter, NETXEN_PHY_LOCK_ID));
 	return 0;
 }
 
-static inline int phy_unlock(void)
+static inline int phy_unlock(struct netxen_adapter *adapter)
 {
-	readl((void __iomem *)NETXEN_PCIE_REG(PCIE_SEM3_UNLOCK));
+	readl(pci_base_offset(adapter, NETXEN_PCIE_REG(PCIE_SEM3_UNLOCK)));
+
 	return 0;
 }
 
@@ -95,7 +99,7 @@ int netxen_niu_gbe_phy_read(struct netxen_adapter *adapter, long phy,
 	__le32 status;
 	__le32 mac_cfg0;
 
-	if (phy_lock() != 0) {
+	if (phy_lock(adapter) != 0) {
 		return -1;
 	}
 
@@ -162,7 +166,7 @@ int netxen_niu_gbe_phy_read(struct netxen_adapter *adapter, long phy,
 					   NETXEN_NIU_GB_MAC_CONFIG_0(0),
 					   &mac_cfg0, 4))
 			return -EIO;
-	phy_unlock();
+	phy_unlock(adapter);
 	return result;
 }
 
@@ -612,7 +616,7 @@ int netxen_niu_macaddr_set(struct netxen_port *port,
 	__le32 temp = 0;
 	struct netxen_adapter *adapter = port->adapter;
 	int phy = port->portnum;
-	unsigned char mac_addr[MAX_ADDR_LEN];
+	unsigned char mac_addr[6];
 	int i;
 
 	for (i = 0; i < 10; i++) {
@@ -631,7 +635,7 @@ int netxen_niu_macaddr_set(struct netxen_port *port,
 
 		netxen_niu_macaddr_get(adapter, phy,
 				       (netxen_ethernet_macaddr_t *) mac_addr);
-		if (memcmp(mac_addr, addr, MAX_ADDR_LEN == 0))
+		if (memcmp(mac_addr, addr, 6) == 0)
 			break;
 	}
 
diff --git a/drivers/net/netxen/netxen_nic_phan_reg.h b/drivers/net/netxen/netxen_nic_phan_reg.h
index 8181d43..7879f85 100644
--- a/drivers/net/netxen/netxen_nic_phan_reg.h
+++ b/drivers/net/netxen/netxen_nic_phan_reg.h
@@ -33,15 +33,74 @@
 /* 
  * CRB Registers or queue message done only at initialization time.
  */
+#define NIC_CRB_BASE               NETXEN_CAM_RAM(0x200)
+#define NETXEN_NIC_REG(X)             (NIC_CRB_BASE+(X))
 
-/*
- * The following 2 are the base adresses for the CRB registers and their
- * offsets will be added to get addresses for the index addresses.
- */
-#define NIC_CRB_BASE_PORT1	NETXEN_CAM_RAM(0x200)
-#define NIC_CRB_BASE_PORT2	NETXEN_CAM_RAM(0x250)
+#define CRB_PHAN_CNTRL_LO_OFFSET    NETXEN_NIC_REG(0x00)
+#define CRB_PHAN_CNTRL_HI_OFFSET    NETXEN_NIC_REG(0x04)
+#define CRB_CMD_PRODUCER_OFFSET     NETXEN_NIC_REG(0x08)
+#define CRB_CMD_CONSUMER_OFFSET     NETXEN_NIC_REG(0x0c)
+#define CRB_PAUSE_ADDR_LO           NETXEN_NIC_REG(0x10)	/* C0 EPG BUG  */
+#define CRB_PAUSE_ADDR_HI           NETXEN_NIC_REG(0x14)
+#define CRB_HOST_CMD_ADDR_HI        NETXEN_NIC_REG(0x18)	/* host add:cmd ring */
+#define CRB_HOST_CMD_ADDR_LO        NETXEN_NIC_REG(0x1c)
+#define CRB_CMD_INTR_LOOP           NETXEN_NIC_REG(0x20)	/* 4 regs for perf */
+#define CRB_CMD_DMA_LOOP            NETXEN_NIC_REG(0x24)
+#define CRB_RCV_INTR_LOOP           NETXEN_NIC_REG(0x28)
+#define CRB_RCV_DMA_LOOP            NETXEN_NIC_REG(0x2c)
+#define CRB_ENABLE_TX_INTR          NETXEN_NIC_REG(0x30)	/* phantom init status */
+#define CRB_MMAP_ADDR_3             NETXEN_NIC_REG(0x34)
+#define CRB_CMDPEG_CMDRING          NETXEN_NIC_REG(0x38)
+#define CRB_HOST_DUMMY_BUF_ADDR_HI  NETXEN_NIC_REG(0x3c)
+#define CRB_HOST_DUMMY_BUF_ADDR_LO  NETXEN_NIC_REG(0x40)
+#define CRB_MMAP_ADDR_0             NETXEN_NIC_REG(0x44)
+#define CRB_MMAP_ADDR_1             NETXEN_NIC_REG(0x48)
+#define CRB_MMAP_ADDR_2             NETXEN_NIC_REG(0x4c)
+#define CRB_CMDPEG_STATE            NETXEN_NIC_REG(0x50)
+#define CRB_MMAP_SIZE_0             NETXEN_NIC_REG(0x54)
+#define CRB_MMAP_SIZE_1             NETXEN_NIC_REG(0x58)
+#define CRB_MMAP_SIZE_2             NETXEN_NIC_REG(0x5c)
+#define CRB_MMAP_SIZE_3             NETXEN_NIC_REG(0x60)
+#define CRB_GLOBAL_INT_COAL         NETXEN_NIC_REG(0x64)	/* interrupt coalescing */
+#define CRB_INT_COAL_MODE           NETXEN_NIC_REG(0x68)
+#define CRB_MAX_RCV_BUFS            NETXEN_NIC_REG(0x6c)
+#define CRB_TX_INT_THRESHOLD        NETXEN_NIC_REG(0x70)
+#define CRB_RX_PKT_TIMER            NETXEN_NIC_REG(0x74)
+#define CRB_TX_PKT_TIMER            NETXEN_NIC_REG(0x78)
+#define CRB_RX_PKT_CNT              NETXEN_NIC_REG(0x7c)
+#define CRB_RX_TMR_CNT              NETXEN_NIC_REG(0x80)
+#define CRB_RX_LRO_TIMER            NETXEN_NIC_REG(0x84)
+#define CRB_RX_LRO_MID_TIMER        NETXEN_NIC_REG(0x88)
+#define CRB_DMA_MAX_RCV_BUFS        NETXEN_NIC_REG(0x8c)
+#define CRB_MAX_DMA_ENTRIES         NETXEN_NIC_REG(0x90)
+#define CRB_XG_STATE                NETXEN_NIC_REG(0x94)	/* XG Link status */
+#define CRB_AGENT_GO                NETXEN_NIC_REG(0x98)	/* NIC pkt gen agent */
+#define CRB_AGENT_TX_SIZE           NETXEN_NIC_REG(0x9c)
+#define CRB_AGENT_TX_TYPE           NETXEN_NIC_REG(0xa0)
+#define CRB_AGENT_TX_ADDR           NETXEN_NIC_REG(0xa4)
+#define CRB_AGENT_TX_MSS            NETXEN_NIC_REG(0xa8)
+#define CRB_TX_STATE                NETXEN_NIC_REG(0xac)	/* Debug -performance */
+#define CRB_TX_COUNT                NETXEN_NIC_REG(0xb0)
+#define CRB_RX_STATE                NETXEN_NIC_REG(0xb4)
+#define CRB_RX_PERF_DEBUG_1         NETXEN_NIC_REG(0xb8)
+#define CRB_RX_LRO_CONTROL          NETXEN_NIC_REG(0xbc)	/* LRO On/OFF */
+#define CRB_RX_LRO_START_NUM        NETXEN_NIC_REG(0xc0)
+#define CRB_MPORT_MODE              NETXEN_NIC_REG(0xc4)	/* Multiport Mode */
+#define CRB_CMD_RING_SIZE           NETXEN_NIC_REG(0xc8)
+#define CRB_INT_VECTOR              NETXEN_NIC_REG(0xd4)
+#define CRB_CTX_RESET               NETXEN_NIC_REG(0xd8)
+#define CRB_HOST_STS_PROD           NETXEN_NIC_REG(0xdc)
+#define CRB_HOST_STS_CONS           NETXEN_NIC_REG(0xe0)
+#define CRB_PEG_CMD_PROD            NETXEN_NIC_REG(0xe4)
+#define CRB_PEG_CMD_CONS            NETXEN_NIC_REG(0xe8)
+#define CRB_HOST_BUFFER_PROD        NETXEN_NIC_REG(0xec)
+#define CRB_HOST_BUFFER_CONS        NETXEN_NIC_REG(0xf0)
+#define CRB_JUMBO_BUFFER_PROD       NETXEN_NIC_REG(0xf4)
+#define CRB_JUMBO_BUFFER_CONS       NETXEN_NIC_REG(0xf8)
 
-#define NETXEN_NIC_REG(X)	(NIC_CRB_BASE_PORT1+(X))
+#define CRB_CMD_PRODUCER_OFFSET_1   NETXEN_NIC_REG(0x1ac)
+#define CRB_CMD_CONSUMER_OFFSET_1   NETXEN_NIC_REG(0x1b0)
+#define CRB_TEMP_STATE              NETXEN_NIC_REG(0x1b4)
 
 /*
  * CrbPortPhanCntrHi/Lo is used to pass the address of HostPhantomIndex address
@@ -51,74 +110,20 @@
  * on the Phantom.
  */
 
-#define CRB_PHAN_CNTRL_LO_OFFSET	NETXEN_NIC_REG(0x00)
-#define CRB_PHAN_CNTRL_HI_OFFSET	NETXEN_NIC_REG(0x04)
-
-/* point to the indexes */
-#define CRB_CMD_PRODUCER_OFFSET		NETXEN_NIC_REG(0x08)
-#define CRB_CMD_CONSUMER_OFFSET		NETXEN_NIC_REG(0x0c)
-
-#define CRB_PAUSE_ADDR_LO		NETXEN_NIC_REG(0x10)
-#define CRB_PAUSE_ADDR_HI		NETXEN_NIC_REG(0x14)
-
-/* address of command descriptors in the host memory */
-#define CRB_HOST_CMD_ADDR_HI		NETXEN_NIC_REG(0x30)
-#define CRB_HOST_CMD_ADDR_LO		NETXEN_NIC_REG(0x34)
-
-/* The following 4 CRB registers are for doing performance coal */
-#define CRB_CMD_INTR_LOOP		NETXEN_NIC_REG(0x38)
-#define CRB_CMD_DMA_LOOP		NETXEN_NIC_REG(0x3c)
-#define CRB_RCV_INTR_LOOP		NETXEN_NIC_REG(0x40)
-#define CRB_RCV_DMA_LOOP		NETXEN_NIC_REG(0x44)
-
-/* Needed by the host to find out the state of Phantom's initialization */
-#define CRB_ENABLE_TX_INTR		NETXEN_NIC_REG(0x4c)
-#define CRB_CMDPEG_STATE		NETXEN_NIC_REG(0x50)
-#define CRB_CMDPEG_CMDRING		NETXEN_NIC_REG(0x54)
-
-/* Interrupt coalescing parameters */
-#define CRB_GLOBAL_INT_COAL		NETXEN_NIC_REG(0x80)
-#define CRB_INT_COAL_MODE		NETXEN_NIC_REG(0x84)
-#define CRB_MAX_RCV_BUFS		NETXEN_NIC_REG(0x88)
-#define CRB_TX_INT_THRESHOLD		NETXEN_NIC_REG(0x8c)
-#define CRB_RX_PKT_TIMER		NETXEN_NIC_REG(0x90)
-#define CRB_TX_PKT_TIMER		NETXEN_NIC_REG(0x94)
-#define CRB_RX_PKT_CNT			NETXEN_NIC_REG(0x98)
-#define CRB_RX_TMR_CNT			NETXEN_NIC_REG(0x9c)
-#define CRB_INT_THRESH		 NETXEN_NIC_REG(0xa4)
-
-/* Register for communicating XG link status */
-#define CRB_XG_STATE			NETXEN_NIC_REG(0xa0)
-
-/* Register for communicating card temperature */
-/* Upper 16 bits are temperature value. Lower 16 bits are the state */
-#define CRB_TEMP_STATE		 NETXEN_NIC_REG(0xa8)
-#define nx_get_temp_val(x)	     ((x) >> 16)
-#define nx_get_temp_state(x)	   ((x) & 0xffff)
-#define nx_encode_temp(val, state)     (((val) << 16) | (state))
-
-/* Debug registers for controlling NIC pkt gen agent */
-#define CRB_AGENT_GO			NETXEN_NIC_REG(0xb0)
-#define CRB_AGENT_TX_SIZE		NETXEN_NIC_REG(0xb4)
-#define CRB_AGENT_TX_TYPE		NETXEN_NIC_REG(0xb8)
-#define CRB_AGENT_TX_ADDR		NETXEN_NIC_REG(0xbc)
-#define CRB_AGENT_TX_MSS		NETXEN_NIC_REG(0xc0)
-
-/* Debug registers for observing NIC performance */
-#define CRB_TX_STATE			NETXEN_NIC_REG(0xd0)
-#define CRB_TX_COUNT			NETXEN_NIC_REG(0xd4)
-#define CRB_RX_STATE			NETXEN_NIC_REG(0xd8)
+#define nx_get_temp_val(x)		((x) >> 16)
+#define nx_get_temp_state(x)		((x) & 0xffff)
+#define nx_encode_temp(val, state)	(((val) << 16) | (state))
 
 /* CRB registers per Rcv Descriptor ring */
 struct netxen_rcv_desc_crb {
 	u32 crb_rcv_producer_offset __attribute__ ((aligned(512)));
 	u32 crb_rcv_consumer_offset;
 	u32 crb_globalrcv_ring;
+	u32 crb_rcv_ring_size;
 };
 
 /*
- * CRB registers used by the receive peg logic. One instance of these
- * needs to be instantiated per instance of the receive peg.
+ * CRB registers used by the receive peg logic.
  */
 
 struct netxen_recv_crb {
@@ -127,6 +132,7 @@ struct netxen_recv_crb {
 	u32 crb_rcv_status_producer;
 	u32 crb_rcv_status_consumer;
 	u32 crb_rcvpeg_state;
+	u32 crb_status_ring_size;
 };
 
 #if defined(DEFINE_GLOBAL_RECV_CRB)
@@ -139,30 +145,48 @@ struct netxen_recv_crb recv_crb_registers[] = {
 	 {
 	  {
 	   /* crb_rcv_producer_offset: */
-	   NETXEN_NIC_REG(0x18),
+	   NETXEN_NIC_REG(0x100),
 	   /* crb_rcv_consumer_offset: */
-	   NETXEN_NIC_REG(0x1c),
+	   NETXEN_NIC_REG(0x104),
 	   /* crb_gloablrcv_ring: */
-	   NETXEN_NIC_REG(0x20),
+	   NETXEN_NIC_REG(0x108),
+	   /* crb_rcv_ring_size */
+	   NETXEN_NIC_REG(0x10c),
+
 	   },
 	  /* Jumbo frames */
 	  {
 	   /* crb_rcv_producer_offset: */
-	   NETXEN_NIC_REG(0x100),
+	   NETXEN_NIC_REG(0x110),
 	   /* crb_rcv_consumer_offset: */
-	   NETXEN_NIC_REG(0x104),
+	   NETXEN_NIC_REG(0x114),
 	   /* crb_gloablrcv_ring: */
-	   NETXEN_NIC_REG(0x108),
+	   NETXEN_NIC_REG(0x118),
+	   /* crb_rcv_ring_size */
+	   NETXEN_NIC_REG(0x11c),
+	   },
+	  /* LRO */
+	  {
+	   /* crb_rcv_producer_offset: */
+	   NETXEN_NIC_REG(0x120),
+	   /* crb_rcv_consumer_offset: */
+	   NETXEN_NIC_REG(0x124),
+	   /* crb_gloablrcv_ring: */
+	   NETXEN_NIC_REG(0x128),
+	   /* crb_rcv_ring_size */
+	   NETXEN_NIC_REG(0x12c),
 	   }
 	  },
 	 /* crb_rcvstatus_ring: */
-	 NETXEN_NIC_REG(0x24),
+	 NETXEN_NIC_REG(0x130),
 	 /* crb_rcv_status_producer: */
-	 NETXEN_NIC_REG(0x28),
+	 NETXEN_NIC_REG(0x134),
 	 /* crb_rcv_status_consumer: */
-	 NETXEN_NIC_REG(0x2c),
+	 NETXEN_NIC_REG(0x138),
 	 /* crb_rcvpeg_state: */
-	 NETXEN_NIC_REG(0x48),
+	 NETXEN_NIC_REG(0x13c),
+	 /* crb_status_ring_size */
+	 NETXEN_NIC_REG(0x140),
 
 	 },
 	/*
@@ -173,34 +197,66 @@ struct netxen_recv_crb recv_crb_registers[] = {
 	 {
 	  {
 	   /* crb_rcv_producer_offset: */
-	   NETXEN_NIC_REG(0x80),
+	   NETXEN_NIC_REG(0x144),
 	   /* crb_rcv_consumer_offset: */
-	   NETXEN_NIC_REG(0x84),
+	   NETXEN_NIC_REG(0x148),
 	   /* crb_globalrcv_ring: */
-	   NETXEN_NIC_REG(0x88),
+	   NETXEN_NIC_REG(0x14c),
+	   /* crb_rcv_ring_size */
+	   NETXEN_NIC_REG(0x150),
+
 	   },
 	  /* Jumbo frames */
 	  {
 	   /* crb_rcv_producer_offset: */
-	   NETXEN_NIC_REG(0x10C),
+	   NETXEN_NIC_REG(0x154),
 	   /* crb_rcv_consumer_offset: */
-	   NETXEN_NIC_REG(0x110),
+	   NETXEN_NIC_REG(0x158),
 	   /* crb_globalrcv_ring: */
-	   NETXEN_NIC_REG(0x114),
+	   NETXEN_NIC_REG(0x15c),
+	   /* crb_rcv_ring_size */
+	   NETXEN_NIC_REG(0x160),
+	   },
+	  /* LRO */
+	  {
+	   /* crb_rcv_producer_offset: */
+	   NETXEN_NIC_REG(0x164),
+	   /* crb_rcv_consumer_offset: */
+	   NETXEN_NIC_REG(0x168),
+	   /* crb_globalrcv_ring: */
+	   NETXEN_NIC_REG(0x16c),
+	   /* crb_rcv_ring_size */
+	   NETXEN_NIC_REG(0x170),
 	   }
+
 	  },
 	 /* crb_rcvstatus_ring: */
-	 NETXEN_NIC_REG(0x8c),
+	 NETXEN_NIC_REG(0x174),
 	 /* crb_rcv_status_producer: */
-	 NETXEN_NIC_REG(0x90),
+	 NETXEN_NIC_REG(0x178),
 	 /* crb_rcv_status_consumer: */
-	 NETXEN_NIC_REG(0x94),
+	 NETXEN_NIC_REG(0x17c),
 	 /* crb_rcvpeg_state: */
-	 NETXEN_NIC_REG(0x98),
+	 NETXEN_NIC_REG(0x180),
+	 /* crb_status_ring_size */
+	 NETXEN_NIC_REG(0x184),
+
 	 },
 };
+
+u64 ctx_addr_sig_regs[][3] = {
+	{NETXEN_NIC_REG(0x188), NETXEN_NIC_REG(0x18c), NETXEN_NIC_REG(0x1c0)},
+	{NETXEN_NIC_REG(0x190), NETXEN_NIC_REG(0x194), NETXEN_NIC_REG(0x1c4)},
+	{NETXEN_NIC_REG(0x198), NETXEN_NIC_REG(0x19c), NETXEN_NIC_REG(0x1c8)},
+	{NETXEN_NIC_REG(0x1a0), NETXEN_NIC_REG(0x1a4), NETXEN_NIC_REG(0x1cc)}
+};
+
 #else
 extern struct netxen_recv_crb recv_crb_registers[];
+extern u64 ctx_addr_sig_regs[][3];
+#define CRB_CTX_ADDR_REG_LO            (ctx_addr_sig_regs[0][0])
+#define CRB_CTX_ADDR_REG_HI            (ctx_addr_sig_regs[0][2])
+#define CRB_CTX_SIGNATURE_REG       (ctx_addr_sig_regs[0][1])
 #endif				/* DEFINE_GLOBAL_RECEIVE_CRB */
 
 /*
-- 
cgit v0.10.2


From 887bc5d00c02b32763845247024e8db5243ef857 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 5 Dec 2006 13:34:17 -0500
Subject: [GFS2] Fix indent in recovery.c

As per comments from Andrew Morton and Jan Engelhardt, this fixes the
indent and removes the "static" from a variable declaration since its
not needed in this case (now allocated on the stack of the function
in question).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Jan Engelhardt <jengelh@linux01.gwdg.de>
Cc: Andrew Morton <akpm@osdl.org>

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4acf238..d0c806b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -136,7 +136,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 {
 	struct buffer_head *bh;
 	struct gfs2_log_header_host lh;
-static const u32 nothing = 0;
+	const u32 nothing = 0;
 	u32 hash;
 	int error;
 
-- 
cgit v0.10.2


From 383956a9c59157db4c404d1c8bb9074b8dfe3ee0 Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@gentoo.org>
Date: Fri, 1 Dec 2006 00:56:50 +0000
Subject: [PATCH] zd1211rw: zd_mac_rx isn't always called in IRQ context

e.g.

usb 1-7: rx_urb_complete() *** first fragment ***
usb 1-7: rx_urb_complete() *** second fragment ***
drivers/net/wireless/zd1211rw/zd_mac.c:1063 ASSERT
(((current_thread_info()->preempt_count) & (((1UL << (12))-1) << ((0 +
8) + 8)))) VIOLATED!
 [<f0299448>] zd_mac_rx+0x3e7/0x47a [zd1211rw]
 [<f029badc>] rx_urb_complete+0x22d/0x24a [zd1211rw]
 [<b028a22f>] urb_destroy+0x0/0x5
 [<b01f0930>] kref_put+0x65/0x72
 [<b0288cdf>] usb_hcd_giveback_urb+0x28/0x57
 [<b02950c4>] qh_completions+0x296/0x2f6
 [<b0294b21>] ehci_urb_done+0x70/0x7a
 [<b0294ea1>] qh_completions+0x73/0x2f6
 [<b02951bc>] ehci_work+0x98/0x538

Remove the bogus assertion, and use dev_kfree_skb_any as pointed out by
Ulrich Kunitz.

Signed-off-by: Daniel Drake <dsd@gentoo.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index 2696f95..d7a86b1 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -1059,10 +1059,8 @@ int zd_mac_rx(struct zd_mac *mac, const u8 *buffer, unsigned int length)
 	memcpy(skb_put(skb, length), buffer, length);
 
 	r = ieee80211_rx(ieee, skb, &stats);
-	if (!r) {
-		ZD_ASSERT(in_irq());
-		dev_kfree_skb_irq(skb);
-	}
+	if (!r)
+		dev_kfree_skb_any(skb);
 	return 0;
 }
 
-- 
cgit v0.10.2


From ff9b99bcccee8449afd23ddc28f8b4b7aec996ea Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@gentoo.org>
Date: Fri, 1 Dec 2006 00:57:11 +0000
Subject: [PATCH] zd1211rw: Fill enc_capa in GIWRANGE handler

This is needed for NetworkManager users to connect to WPA networks.
Pointed out by Matthew Campbell.

Signed-off-by: Daniel Drake <dsd@gentoo.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index d7a86b1..bd1593e 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -615,6 +615,9 @@ int zd_mac_get_range(struct zd_mac *mac, struct iw_range *range)
 	range->we_version_compiled = WIRELESS_EXT;
 	range->we_version_source = 20;
 
+	range->enc_capa = IW_ENC_CAPA_WPA |  IW_ENC_CAPA_WPA2 |
+			  IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP;
+
 	ZD_ASSERT(!irqs_disabled());
 	spin_lock_irq(&mac->lock);
 	regdomain = mac->regdomain;
-- 
cgit v0.10.2


From 9cdac9657fda58ae39c2bbc8be396f5530ed8398 Mon Sep 17 00:00:00 2001
From: Ulrich Kunitz <kune@deine-taler.de>
Date: Fri, 1 Dec 2006 00:58:07 +0000
Subject: [PATCH] zd1211rw: Support for multicast addresses

Support for multicast adresses is implemented by supporting the
set_multicast_list() function of the network device. Address
filtering is supported by a group hash table in the device.

This is based on earlier work by Benoit Papillaut. Fixes multicast packet
reception and ipv6 connectivity:
http://bugzilla.kernel.org/show_bug.cgi?id=7424
http://bugzilla.kernel.org/show_bug.cgi?id=7425

Signed-off-by: Ulrich Kunitz <kune@deine-taler.de>
Signed-off-by: Daniel Drake <dsd@gentoo.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/zd1211rw/zd_chip.c b/drivers/net/wireless/zd1211rw/zd_chip.c
index 8be99eb..77e11dd 100644
--- a/drivers/net/wireless/zd1211rw/zd_chip.c
+++ b/drivers/net/wireless/zd1211rw/zd_chip.c
@@ -1673,3 +1673,16 @@ int zd_rfwritev_cr_locked(struct zd_chip *chip,
 
 	return 0;
 }
+
+int zd_chip_set_multicast_hash(struct zd_chip *chip,
+	                       struct zd_mc_hash *hash)
+{
+	struct zd_ioreq32 ioreqs[] = {
+		{ CR_GROUP_HASH_P1, hash->low },
+		{ CR_GROUP_HASH_P2, hash->high },
+	};
+
+	dev_dbg_f(zd_chip_dev(chip), "hash l 0x%08x h 0x%08x\n",
+		ioreqs[0].value, ioreqs[1].value);
+	return zd_iowrite32a(chip, ioreqs, ARRAY_SIZE(ioreqs));
+}
diff --git a/drivers/net/wireless/zd1211rw/zd_chip.h b/drivers/net/wireless/zd1211rw/zd_chip.h
index ca892b9..a4e3cee 100644
--- a/drivers/net/wireless/zd1211rw/zd_chip.h
+++ b/drivers/net/wireless/zd1211rw/zd_chip.h
@@ -390,10 +390,19 @@
 #define CR_BSSID_P1			CTL_REG(0x0618)
 #define CR_BSSID_P2			CTL_REG(0x061C)
 #define CR_BCN_PLCP_CFG			CTL_REG(0x0620)
+
+/* Group hash table for filtering incoming packets.
+ *
+ * The group hash table is 64 bit large and split over two parts. The first
+ * part is the lower part. The upper 6 bits of the last byte of the target
+ * address are used as index. Packets are received if the hash table bit is
+ * set. This is used for multicast handling, but for broadcasts (address
+ * ff:ff:ff:ff:ff:ff) the highest bit in the second table must also be set.
+ */
 #define CR_GROUP_HASH_P1		CTL_REG(0x0624)
 #define CR_GROUP_HASH_P2		CTL_REG(0x0628)
-#define CR_RX_TIMEOUT			CTL_REG(0x062C)
 
+#define CR_RX_TIMEOUT			CTL_REG(0x062C)
 /* Basic rates supported by the BSS. When producing ACK or CTS messages, the
  * device will use a rate in this table that is less than or equal to the rate
  * of the incoming frame which prompted the response */
@@ -864,4 +873,36 @@ u8 zd_rx_strength_percent(u8 rssi);
 
 u16 zd_rx_rate(const void *rx_frame, const struct rx_status *status);
 
+struct zd_mc_hash {
+	u32 low;
+	u32 high;
+};
+
+static inline void zd_mc_clear(struct zd_mc_hash *hash)
+{
+	hash->low = 0;
+	/* The interfaces must always received broadcasts.
+	 * The hash of the broadcast address ff:ff:ff:ff:ff:ff is 63.
+	 */
+	hash->high = 0x80000000;
+}
+
+static inline void zd_mc_add_all(struct zd_mc_hash *hash)
+{
+	hash->low = hash->high = 0xffffffff;
+}
+
+static inline void zd_mc_add_addr(struct zd_mc_hash *hash, u8 *addr)
+{
+	unsigned int i = addr[5] >> 2;
+	if (i < 32) {
+		hash->low |= 1 << i;
+	} else {
+		hash->high |= 1 << (i-32);
+	}
+}
+
+int zd_chip_set_multicast_hash(struct zd_chip *chip,
+	                       struct zd_mc_hash *hash);
+
 #endif /* _ZD_CHIP_H */
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index bd1593e..1dd3f76 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -39,6 +39,8 @@ static void housekeeping_init(struct zd_mac *mac);
 static void housekeeping_enable(struct zd_mac *mac);
 static void housekeeping_disable(struct zd_mac *mac);
 
+static void set_multicast_hash_handler(void *mac_ptr);
+
 int zd_mac_init(struct zd_mac *mac,
 	        struct net_device *netdev,
 	        struct usb_interface *intf)
@@ -55,6 +57,8 @@ int zd_mac_init(struct zd_mac *mac,
 	softmac_init(ieee80211_priv(netdev));
 	zd_chip_init(&mac->chip, netdev, intf);
 	housekeeping_init(mac);
+	INIT_WORK(&mac->set_multicast_hash_work, set_multicast_hash_handler,
+		  mac);
 	return 0;
 }
 
@@ -136,6 +140,7 @@ out:
 
 void zd_mac_clear(struct zd_mac *mac)
 {
+	flush_workqueue(zd_workqueue);
 	zd_chip_clear(&mac->chip);
 	ZD_ASSERT(!spin_is_locked(&mac->lock));
 	ZD_MEMCLEAR(mac, sizeof(struct zd_mac));
@@ -256,6 +261,42 @@ int zd_mac_set_mac_address(struct net_device *netdev, void *p)
 	return 0;
 }
 
+static void set_multicast_hash_handler(void *mac_ptr)
+{
+	struct zd_mac *mac = mac_ptr;
+	struct zd_mc_hash hash;
+
+	spin_lock_irq(&mac->lock);
+	hash = mac->multicast_hash;
+	spin_unlock_irq(&mac->lock);
+
+	zd_chip_set_multicast_hash(&mac->chip, &hash);
+}
+
+void zd_mac_set_multicast_list(struct net_device *dev)
+{
+	struct zd_mc_hash hash;
+	struct zd_mac *mac = zd_netdev_mac(dev);
+	struct dev_mc_list *mc;
+	unsigned long flags;
+
+	if (dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) {
+		zd_mc_add_all(&hash);
+	} else {
+		zd_mc_clear(&hash);
+		for (mc = dev->mc_list; mc; mc = mc->next) {
+			dev_dbg_f(zd_mac_dev(mac), "mc addr " MAC_FMT "\n",
+				  MAC_ARG(mc->dmi_addr));
+			zd_mc_add_addr(&hash, mc->dmi_addr);
+		}
+	}
+
+	spin_lock_irqsave(&mac->lock, flags);
+	mac->multicast_hash = hash;
+	spin_unlock_irqrestore(&mac->lock, flags);
+	queue_work(zd_workqueue, &mac->set_multicast_hash_work);
+}
+
 int zd_mac_set_regdomain(struct zd_mac *mac, u8 regdomain)
 {
 	int r;
@@ -930,7 +971,8 @@ static int is_data_packet_for_us(struct ieee80211_device *ieee,
 	}
 
 	return memcmp(hdr->addr1, netdev->dev_addr, ETH_ALEN) == 0 ||
-	       is_multicast_ether_addr(hdr->addr1) ||
+	       (is_multicast_ether_addr(hdr->addr1) &&
+		memcmp(hdr->addr3, netdev->dev_addr, ETH_ALEN) != 0) ||
 	       (netdev->flags & IFF_PROMISC);
 }
 
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.h b/drivers/net/wireless/zd1211rw/zd_mac.h
index 5dcfb25..77f1268 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.h
+++ b/drivers/net/wireless/zd1211rw/zd_mac.h
@@ -133,6 +133,8 @@ struct zd_mac {
 	struct iw_statistics iw_stats;
 
 	struct housekeeping housekeeping;
+	struct work_struct set_multicast_hash_work;
+	struct zd_mc_hash multicast_hash;
 	struct work_struct set_rts_cts_work;
 	struct work_struct set_basic_rates_work;
 
@@ -189,6 +191,7 @@ int zd_mac_init_hw(struct zd_mac *mac, u8 device_type);
 int zd_mac_open(struct net_device *netdev);
 int zd_mac_stop(struct net_device *netdev);
 int zd_mac_set_mac_address(struct net_device *dev, void *p);
+void zd_mac_set_multicast_list(struct net_device *netdev);
 
 int zd_mac_rx(struct zd_mac *mac, const u8 *buffer, unsigned int length);
 
diff --git a/drivers/net/wireless/zd1211rw/zd_netdev.c b/drivers/net/wireless/zd1211rw/zd_netdev.c
index 60f1b0f..8bda48d 100644
--- a/drivers/net/wireless/zd1211rw/zd_netdev.c
+++ b/drivers/net/wireless/zd1211rw/zd_netdev.c
@@ -242,7 +242,7 @@ struct net_device *zd_netdev_alloc(struct usb_interface *intf)
 	netdev->open = zd_mac_open;
 	netdev->stop = zd_mac_stop;
 	/* netdev->get_stats = */
-	/* netdev->set_multicast_list = */
+	netdev->set_multicast_list = zd_mac_set_multicast_list;
 	netdev->set_mac_address = zd_mac_set_mac_address;
 	netdev->wireless_handlers = &iw_handler_def;
 	/* netdev->ethtool_ops = */
-- 
cgit v0.10.2


From b0471bb7b779f5deea109e5bfdfe8c18d4a06241 Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Sat, 2 Dec 2006 13:33:40 +0200
Subject: [PATCH] hostap: replace kmalloc+memset with kzalloc

Replace kmalloc+memset with kzalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/hostap/hostap_ap.c b/drivers/net/wireless/hostap/hostap_ap.c
index ba13125..798a855 100644
--- a/drivers/net/wireless/hostap/hostap_ap.c
+++ b/drivers/net/wireless/hostap/hostap_ap.c
@@ -1099,15 +1099,13 @@ static struct sta_info * ap_add_sta(struct ap_data *ap, u8 *addr)
 {
 	struct sta_info *sta;
 
-	sta = (struct sta_info *)
-		kmalloc(sizeof(struct sta_info), GFP_ATOMIC);
+	sta = kzalloc(sizeof(struct sta_info), GFP_ATOMIC);
 	if (sta == NULL) {
 		PDEBUG(DEBUG_AP, "AP: kmalloc failed\n");
 		return NULL;
 	}
 
 	/* initialize STA info data */
-	memset(sta, 0, sizeof(struct sta_info));
 	sta->local = ap->local;
 	skb_queue_head_init(&sta->tx_buf);
 	memcpy(sta->addr, addr, ETH_ALEN);
diff --git a/drivers/net/wireless/hostap/hostap_cs.c b/drivers/net/wireless/hostap/hostap_cs.c
index f63909e..ef470e6 100644
--- a/drivers/net/wireless/hostap/hostap_cs.c
+++ b/drivers/net/wireless/hostap/hostap_cs.c
@@ -566,12 +566,11 @@ static int prism2_config(struct pcmcia_device *link)
 	PDEBUG(DEBUG_FLOW, "prism2_config()\n");
 
 	parse = kmalloc(sizeof(cisparse_t), GFP_KERNEL);
-	hw_priv = kmalloc(sizeof(*hw_priv), GFP_KERNEL);
+	hw_priv = kzalloc(sizeof(*hw_priv), GFP_KERNEL);
 	if (parse == NULL || hw_priv == NULL) {
 		ret = -ENOMEM;
 		goto failed;
 	}
-	memset(hw_priv, 0, sizeof(*hw_priv));
 
 	tuple.DesiredTuple = CISTPL_CONFIG;
 	tuple.Attributes = 0;
diff --git a/drivers/net/wireless/hostap/hostap_download.c b/drivers/net/wireless/hostap/hostap_download.c
index ab26b52..24fc387 100644
--- a/drivers/net/wireless/hostap/hostap_download.c
+++ b/drivers/net/wireless/hostap/hostap_download.c
@@ -685,14 +685,12 @@ static int prism2_download(local_info_t *local,
 		goto out;
 	}
 
-	dl = kmalloc(sizeof(*dl) + param->num_areas *
+	dl = kzalloc(sizeof(*dl) + param->num_areas *
 		     sizeof(struct prism2_download_data_area), GFP_KERNEL);
 	if (dl == NULL) {
 		ret = -ENOMEM;
 		goto out;
 	}
-	memset(dl, 0, sizeof(*dl) + param->num_areas *
-	       sizeof(struct prism2_download_data_area));
 	dl->dl_cmd = param->dl_cmd;
 	dl->start_addr = param->start_addr;
 	dl->num_areas = param->num_areas;
diff --git a/drivers/net/wireless/hostap/hostap_hw.c b/drivers/net/wireless/hostap/hostap_hw.c
index ed00ebb..9c50336 100644
--- a/drivers/net/wireless/hostap/hostap_hw.c
+++ b/drivers/net/wireless/hostap/hostap_hw.c
@@ -347,14 +347,12 @@ static int hfa384x_cmd(struct net_device *dev, u16 cmd, u16 param0,
 	if (signal_pending(current))
 		return -EINTR;
 
-	entry = (struct hostap_cmd_queue *)
-		kmalloc(sizeof(*entry), GFP_ATOMIC);
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (entry == NULL) {
 		printk(KERN_DEBUG "%s: hfa384x_cmd - kmalloc failed\n",
 		       dev->name);
 		return -ENOMEM;
 	}
-	memset(entry, 0, sizeof(*entry));
 	atomic_set(&entry->usecnt, 1);
 	entry->type = CMD_SLEEP;
 	entry->cmd = cmd;
@@ -517,14 +515,12 @@ static int hfa384x_cmd_callback(struct net_device *dev, u16 cmd, u16 param0,
 		return -1;
 	}
 
-	entry = (struct hostap_cmd_queue *)
-		kmalloc(sizeof(*entry), GFP_ATOMIC);
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (entry == NULL) {
 		printk(KERN_DEBUG "%s: hfa384x_cmd_callback - kmalloc "
 		       "failed\n", dev->name);
 		return -ENOMEM;
 	}
-	memset(entry, 0, sizeof(*entry));
 	atomic_set(&entry->usecnt, 1);
 	entry->type = CMD_CALLBACK;
 	entry->cmd = cmd;
@@ -3015,14 +3011,12 @@ static int prism2_set_tim(struct net_device *dev, int aid, int set)
 	iface = netdev_priv(dev);
 	local = iface->local;
 
-	new_entry = (struct set_tim_data *)
-		kmalloc(sizeof(*new_entry), GFP_ATOMIC);
+	new_entry = kzalloc(sizeof(*new_entry), GFP_ATOMIC);
 	if (new_entry == NULL) {
 		printk(KERN_DEBUG "%s: prism2_set_tim: kmalloc failed\n",
 		       local->dev->name);
 		return -ENOMEM;
 	}
-	memset(new_entry, 0, sizeof(*new_entry));
 	new_entry->aid = aid;
 	new_entry->set = set;
 
diff --git a/drivers/net/wireless/hostap/hostap_info.c b/drivers/net/wireless/hostap/hostap_info.c
index 50f72d8..00ed638 100644
--- a/drivers/net/wireless/hostap/hostap_info.c
+++ b/drivers/net/wireless/hostap/hostap_info.c
@@ -327,11 +327,10 @@ static void prism2_info_hostscanresults(local_info_t *local,
 	ptr = (u8 *) pos;
 
 	new_count = left / result_size;
-	results = kmalloc(new_count * sizeof(struct hfa384x_hostscan_result),
+	results = kcalloc(new_count, sizeof(struct hfa384x_hostscan_result),
 			  GFP_ATOMIC);
 	if (results == NULL)
 		return;
-	memset(results, 0, new_count * sizeof(struct hfa384x_hostscan_result));
 
 	for (i = 0; i < new_count; i++) {
 		memcpy(&results[i], ptr, copy_len);
diff --git a/drivers/net/wireless/hostap/hostap_ioctl.c b/drivers/net/wireless/hostap/hostap_ioctl.c
index d061fb3..3b7b806 100644
--- a/drivers/net/wireless/hostap/hostap_ioctl.c
+++ b/drivers/net/wireless/hostap/hostap_ioctl.c
@@ -181,12 +181,10 @@ static int prism2_ioctl_siwencode(struct net_device *dev,
 		struct ieee80211_crypt_data *new_crypt;
 
 		/* take WEP into use */
-		new_crypt = (struct ieee80211_crypt_data *)
-			kmalloc(sizeof(struct ieee80211_crypt_data),
+		new_crypt = kzalloc(sizeof(struct ieee80211_crypt_data),
 				GFP_KERNEL);
 		if (new_crypt == NULL)
 			return -ENOMEM;
-		memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data));
 		new_crypt->ops = ieee80211_get_crypto_ops("WEP");
 		if (!new_crypt->ops) {
 			request_module("ieee80211_crypt_wep");
@@ -3320,14 +3318,12 @@ static int prism2_ioctl_siwencodeext(struct net_device *dev,
 
 		prism2_crypt_delayed_deinit(local, crypt);
 
-		new_crypt = (struct ieee80211_crypt_data *)
-			kmalloc(sizeof(struct ieee80211_crypt_data),
+		new_crypt = kzalloc(sizeof(struct ieee80211_crypt_data),
 				GFP_KERNEL);
 		if (new_crypt == NULL) {
 			ret = -ENOMEM;
 			goto done;
 		}
-		memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data));
 		new_crypt->ops = ops;
 		new_crypt->priv = new_crypt->ops->init(i);
 		if (new_crypt->priv == NULL) {
@@ -3538,14 +3534,12 @@ static int prism2_ioctl_set_encryption(local_info_t *local,
 
 		prism2_crypt_delayed_deinit(local, crypt);
 
-		new_crypt = (struct ieee80211_crypt_data *)
-			kmalloc(sizeof(struct ieee80211_crypt_data),
+		new_crypt = kzalloc(sizeof(struct ieee80211_crypt_data),
 				GFP_KERNEL);
 		if (new_crypt == NULL) {
 			ret = -ENOMEM;
 			goto done;
 		}
-		memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data));
 		new_crypt->ops = ops;
 		new_crypt->priv = new_crypt->ops->init(param->u.crypt.idx);
 		if (new_crypt->priv == NULL) {
diff --git a/drivers/net/wireless/hostap/hostap_pci.c b/drivers/net/wireless/hostap/hostap_pci.c
index d1de976..c4f6020 100644
--- a/drivers/net/wireless/hostap/hostap_pci.c
+++ b/drivers/net/wireless/hostap/hostap_pci.c
@@ -300,10 +300,9 @@ static int prism2_pci_probe(struct pci_dev *pdev,
 	struct hostap_interface *iface;
 	struct hostap_pci_priv *hw_priv;
 
-	hw_priv = kmalloc(sizeof(*hw_priv), GFP_KERNEL);
+	hw_priv = kzalloc(sizeof(*hw_priv), GFP_KERNEL);
 	if (hw_priv == NULL)
 		return -ENOMEM;
-	memset(hw_priv, 0, sizeof(*hw_priv));
 
 	if (pci_enable_device(pdev))
 		goto err_out_free;
diff --git a/drivers/net/wireless/hostap/hostap_plx.c b/drivers/net/wireless/hostap/hostap_plx.c
index bc81b13..e235e06 100644
--- a/drivers/net/wireless/hostap/hostap_plx.c
+++ b/drivers/net/wireless/hostap/hostap_plx.c
@@ -447,10 +447,9 @@ static int prism2_plx_probe(struct pci_dev *pdev,
 	int tmd7160;
 	struct hostap_plx_priv *hw_priv;
 
-	hw_priv = kmalloc(sizeof(*hw_priv), GFP_KERNEL);
+	hw_priv = kzalloc(sizeof(*hw_priv), GFP_KERNEL);
 	if (hw_priv == NULL)
 		return -ENOMEM;
-	memset(hw_priv, 0, sizeof(*hw_priv));
 
 	if (pci_enable_device(pdev))
 		goto err_out_free;
-- 
cgit v0.10.2


From b950e83b69a69f3db5ae64ab70b336886855517f Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Sat, 2 Dec 2006 13:35:20 +0200
Subject: [PATCH] prism54: replace kmalloc+memset with kzalloc

Replace kmalloc+memset with kzalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/prism54/isl_ioctl.c b/drivers/net/wireless/prism54/isl_ioctl.c
index 4a20e45..a48edd1 100644
--- a/drivers/net/wireless/prism54/isl_ioctl.c
+++ b/drivers/net/wireless/prism54/isl_ioctl.c
@@ -2140,11 +2140,9 @@ prism54_wpa_bss_ie_add(islpci_private *priv, u8 *bssid,
 					 struct islpci_bss_wpa_ie, list);
 			list_del(&bss->list);
 		} else {
-			bss = kmalloc(sizeof (*bss), GFP_ATOMIC);
-			if (bss != NULL) {
+			bss = kzalloc(sizeof (*bss), GFP_ATOMIC);
+			if (bss != NULL)
 				priv->num_bss_wpa++;
-				memset(bss, 0, sizeof (*bss));
-			}
 		}
 		if (bss != NULL) {
 			memcpy(bss->bssid, bssid, ETH_ALEN);
@@ -2684,11 +2682,10 @@ prism2_ioctl_set_generic_element(struct net_device *ndev,
                return -EINVAL;
 
        alen = sizeof(*attach) + len;
-       attach = kmalloc(alen, GFP_KERNEL);
+       attach = kzalloc(alen, GFP_KERNEL);
        if (attach == NULL)
                return -ENOMEM;
 
-       memset(attach, 0, alen);
 #define WLAN_FC_TYPE_MGMT 0
 #define WLAN_FC_STYPE_ASSOC_REQ 0
 #define WLAN_FC_STYPE_REASSOC_REQ 2
diff --git a/drivers/net/wireless/prism54/oid_mgt.c b/drivers/net/wireless/prism54/oid_mgt.c
index fbc52b6..e6cf9df 100644
--- a/drivers/net/wireless/prism54/oid_mgt.c
+++ b/drivers/net/wireless/prism54/oid_mgt.c
@@ -235,12 +235,10 @@ mgt_init(islpci_private *priv)
 {
 	int i;
 
-	priv->mib = kmalloc(OID_NUM_LAST * sizeof (void *), GFP_KERNEL);
+	priv->mib = kcalloc(OID_NUM_LAST, sizeof (void *), GFP_KERNEL);
 	if (!priv->mib)
 		return -ENOMEM;
 
-	memset(priv->mib, 0, OID_NUM_LAST * sizeof (void *));
-
 	/* Alloc the cache */
 	for (i = 0; i < OID_NUM_LAST; i++) {
 		if (isl_oid[i].flags & OID_FLAG_CACHED) {
-- 
cgit v0.10.2


From e6e3f12ad713fb878baa8e8b5456874a7ac714d3 Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Sat, 2 Dec 2006 13:38:14 +0200
Subject: [PATCH] ipw2200: replace kmalloc+memset with kcalloc

Replace kmalloc+memset with kcalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index c692d01..b2e144b 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -11129,14 +11129,13 @@ static int ipw_up(struct ipw_priv *priv)
 		return -EIO;
 
 	if (cmdlog && !priv->cmdlog) {
-		priv->cmdlog = kmalloc(sizeof(*priv->cmdlog) * cmdlog,
+		priv->cmdlog = kcalloc(cmdlog, sizeof(*priv->cmdlog),
 				       GFP_KERNEL);
 		if (priv->cmdlog == NULL) {
 			IPW_ERROR("Error allocating %d command log entries.\n",
 				  cmdlog);
 			return -ENOMEM;
 		} else {
-			memset(priv->cmdlog, 0, sizeof(*priv->cmdlog) * cmdlog);
 			priv->cmdlog_len = cmdlog;
 		}
 	}
-- 
cgit v0.10.2


From 2b50c24554d31c2db2f93b1151b5991e62f96594 Mon Sep 17 00:00:00 2001
From: Ulrich Kunitz <kune@deine-taler.de>
Date: Sun, 3 Dec 2006 16:32:00 +0100
Subject: [PATCH] softmac: Fixed handling of deassociation from AP

In 2.6.19 a deauthentication from the AP doesn't start a
reassociation by the softmac code. It appears that
mac->associnfo.associating must be set and the
ieee80211softmac_assoc_work function must be scheduled. This patch
fixes that.

Signed-off-by: Ulrich Kunitz <kune@deine-taler.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/net/ieee80211/softmac/ieee80211softmac_assoc.c b/net/ieee80211/softmac/ieee80211softmac_assoc.c
index cf51c87..614aa8d 100644
--- a/net/ieee80211/softmac/ieee80211softmac_assoc.c
+++ b/net/ieee80211/softmac/ieee80211softmac_assoc.c
@@ -427,6 +427,17 @@ ieee80211softmac_handle_assoc_response(struct net_device * dev,
 	return 0;
 }
 
+void
+ieee80211softmac_try_reassoc(struct ieee80211softmac_device *mac)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mac->lock, flags);
+	mac->associnfo.associating = 1;
+	schedule_work(&mac->associnfo.work);
+	spin_unlock_irqrestore(&mac->lock, flags);
+}
+
 int
 ieee80211softmac_handle_disassoc(struct net_device * dev,
 				 struct ieee80211_disassoc *disassoc)
@@ -445,8 +456,7 @@ ieee80211softmac_handle_disassoc(struct net_device * dev,
 	dprintk(KERN_INFO PFX "got disassoc frame\n");
 	ieee80211softmac_disassoc(mac);
 
-	/* try to reassociate */
-	schedule_work(&mac->associnfo.work);
+	ieee80211softmac_try_reassoc(mac);
 
 	return 0;
 }
diff --git a/net/ieee80211/softmac/ieee80211softmac_auth.c b/net/ieee80211/softmac/ieee80211softmac_auth.c
index 0612015..ec44900 100644
--- a/net/ieee80211/softmac/ieee80211softmac_auth.c
+++ b/net/ieee80211/softmac/ieee80211softmac_auth.c
@@ -334,6 +334,8 @@ ieee80211softmac_deauth_from_net(struct ieee80211softmac_device *mac,
 	/* can't transmit data right now... */
 	netif_carrier_off(mac->dev);
 	spin_unlock_irqrestore(&mac->lock, flags);
+
+	ieee80211softmac_try_reassoc(mac);
 }
 
 /* 
diff --git a/net/ieee80211/softmac/ieee80211softmac_priv.h b/net/ieee80211/softmac/ieee80211softmac_priv.h
index 0642e09..3ae894f 100644
--- a/net/ieee80211/softmac/ieee80211softmac_priv.h
+++ b/net/ieee80211/softmac/ieee80211softmac_priv.h
@@ -238,4 +238,6 @@ void ieee80211softmac_call_events_locked(struct ieee80211softmac_device *mac, in
 int ieee80211softmac_notify_internal(struct ieee80211softmac_device *mac,
 	int event, void *event_context, notify_function_ptr fun, void *context, gfp_t gfp_mask);
 
+void ieee80211softmac_try_reassoc(struct ieee80211softmac_device *mac);
+
 #endif /* IEEE80211SOFTMAC_PRIV_H_ */
-- 
cgit v0.10.2


From cc8ce997d2a4e524b1acea44beaf5bcfefdb1bfe Mon Sep 17 00:00:00 2001
From: Maxime Austruy <maxime@tralhalla.org>
Date: Sun, 3 Dec 2006 10:40:01 -0600
Subject: [PATCH] softmac: fix unbalanced mutex_lock/unlock in
 ieee80211softmac_wx_set_mlme

Routine ieee80211softmac_wx_set_mlme has one return that fails
to release a mutex acquired at entry.

Signed-off-by: Maxime Austruy <maxime@tralhalla.org>
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/net/ieee80211/softmac/ieee80211softmac_wx.c b/net/ieee80211/softmac/ieee80211softmac_wx.c
index 23068a8..5b7b5b4 100644
--- a/net/ieee80211/softmac/ieee80211softmac_wx.c
+++ b/net/ieee80211/softmac/ieee80211softmac_wx.c
@@ -495,7 +495,8 @@ ieee80211softmac_wx_set_mlme(struct net_device *dev,
 			printk(KERN_DEBUG PFX "wx_set_mlme: we should know the net here...\n");
 			goto out;
 		}
-		return ieee80211softmac_deauth_req(mac, net, reason);
+		err =  ieee80211softmac_deauth_req(mac, net, reason);
+		goto out;
 	case IW_MLME_DISASSOC:
 		ieee80211softmac_send_disassoc_req(mac, reason);
 		mac->associnfo.associated = 0;
-- 
cgit v0.10.2


From 4b1f8a99a2f5c6c25f04fc93272e5b9c18e82b99 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu@intel.com>
Date: Tue, 5 Dec 2006 14:42:14 +0800
Subject: [PATCH] ipw2200: Add IEEE80211_RADIOTAP_TSFT for promiscuous mode

The ipw2200 BSS firmware passes on the TSF information within ipw_rx_frame,
but monitor firmware doesn't. I add back the IEEE80211_RADIOTAP_TSFT flags
so that we can get the MAC timestamp if we use the rtap interface. We will
see the MAC timestamp equals to zero if we capture the packets with a
monitor mode interface. But this is the expected behaviour.

Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index b2e144b..26998b5 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -7656,7 +7656,8 @@ static void ipw_handle_data_packet_monitor(struct ipw_priv *priv,
 
 	/* Big bitfield of all the fields we provide in radiotap */
 	ipw_rt->rt_hdr.it_present =
-	    ((1 << IEEE80211_RADIOTAP_FLAGS) |
+	    ((1 << IEEE80211_RADIOTAP_TSFT) |
+	     (1 << IEEE80211_RADIOTAP_FLAGS) |
 	     (1 << IEEE80211_RADIOTAP_RATE) |
 	     (1 << IEEE80211_RADIOTAP_CHANNEL) |
 	     (1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL) |
@@ -7665,10 +7666,14 @@ static void ipw_handle_data_packet_monitor(struct ipw_priv *priv,
 
 	/* Zero the flags, we'll add to them as we go */
 	ipw_rt->rt_flags = 0;
-	ipw_rt->rt_tsf = 0ULL;
+	ipw_rt->rt_tsf = (u64)(frame->parent_tsf[3] << 24 |
+			       frame->parent_tsf[2] << 16 |
+			       frame->parent_tsf[1] << 8  |
+			       frame->parent_tsf[0]);
 
 	/* Convert signal to DBM */
 	ipw_rt->rt_dbmsignal = antsignal;
+	ipw_rt->rt_dbmnoise = frame->noise;
 
 	/* Convert the channel data and set the flags */
 	ipw_rt->rt_channel = cpu_to_le16(ieee80211chan2mhz(received_channel));
@@ -7868,7 +7873,8 @@ static void ipw_handle_promiscuous_rx(struct ipw_priv *priv,
 
 	/* Big bitfield of all the fields we provide in radiotap */
 	ipw_rt->rt_hdr.it_present =
-	    ((1 << IEEE80211_RADIOTAP_FLAGS) |
+	    ((1 << IEEE80211_RADIOTAP_TSFT) |
+	     (1 << IEEE80211_RADIOTAP_FLAGS) |
 	     (1 << IEEE80211_RADIOTAP_RATE) |
 	     (1 << IEEE80211_RADIOTAP_CHANNEL) |
 	     (1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL) |
@@ -7877,7 +7883,10 @@ static void ipw_handle_promiscuous_rx(struct ipw_priv *priv,
 
 	/* Zero the flags, we'll add to them as we go */
 	ipw_rt->rt_flags = 0;
-	ipw_rt->rt_tsf = 0ULL;
+	ipw_rt->rt_tsf = (u64)(frame->parent_tsf[3] << 24 |
+			       frame->parent_tsf[2] << 16 |
+			       frame->parent_tsf[1] << 8  |
+			       frame->parent_tsf[0]);
 
 	/* Convert to DBM */
 	ipw_rt->rt_dbmsignal = signal;
-- 
cgit v0.10.2


From aac40ceb8ff72fff1d6d5ad2607d7b01045d59a6 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu@intel.com>
Date: Tue, 5 Dec 2006 14:41:24 +0800
Subject: [PATCH] ipw2200: Update version stamp to 1.2.0

Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index 26998b5..cce55f6 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -70,7 +70,7 @@
 #define VQ
 #endif
 
-#define IPW2200_VERSION "1.1.4" VK VD VM VP VR VQ
+#define IPW2200_VERSION "1.2.0" VK VD VM VP VR VQ
 #define DRV_DESCRIPTION	"Intel(R) PRO/Wireless 2200/2915 Network Driver"
 #define DRV_COPYRIGHT	"Copyright(c) 2003-2006 Intel Corporation"
 #define DRV_VERSION     IPW2200_VERSION
-- 
cgit v0.10.2


From 90c009ac30318e607d4f17ba1afb9cbac7fa2954 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu@intel.com>
Date: Tue, 5 Dec 2006 14:41:32 +0800
Subject: [PATCH] ipw2200: Fix a typo

Signed-off-by: Pascal Terjan <pterjan@gmail.com>
Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c
index 79607b8..060018e 100644
--- a/drivers/net/wireless/ipw2100.c
+++ b/drivers/net/wireless/ipw2100.c
@@ -6215,7 +6215,7 @@ static int ipw2100_pci_init_one(struct pci_dev *pci_dev,
 	/* Allocate and initialize the Tx/Rx queues and lists */
 	if (ipw2100_queues_allocate(priv)) {
 		printk(KERN_WARNING DRV_NAME
-		       "Error calilng ipw2100_queues_allocate.\n");
+		       "Error calling ipw2100_queues_allocate.\n");
 		err = -ENOMEM;
 		goto fail;
 	}
-- 
cgit v0.10.2


From 720eeb4332e5871c97d390b2fb55a5a74fb18ae6 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu@intel.com>
Date: Tue, 5 Dec 2006 14:41:40 +0800
Subject: [PATCH] ipw2200: Fix debug output endian issue

Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index cce55f6..d29f427 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -8285,7 +8285,7 @@ static void ipw_rx(struct ipw_priv *priv)
 				    ("Notification: subtype=%02X flags=%02X size=%d\n",
 				     pkt->u.notification.subtype,
 				     pkt->u.notification.flags,
-				     pkt->u.notification.size);
+				     le16_to_cpu(pkt->u.notification.size));
 				ipw_rx_notification(priv, &pkt->u.notification);
 				break;
 			}
-- 
cgit v0.10.2


From 8f820e976057261e249367514e9920cf20048c76 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] x86-64: Update defconfig

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 0f5d44e..96f226c 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.19-rc2-git4
-# Sat Oct 21 03:38:52 2006
+# Linux kernel version: 2.6.19-git7
+# Wed Dec  6 23:50:47 2006
 #
 CONFIG_X86_64=y
 CONFIG_64BIT=y
@@ -47,13 +47,14 @@ CONFIG_POSIX_MQUEUE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 # CONFIG_CPUSETS is not set
+CONFIG_SYSFS_DEPRECATED=y
 # CONFIG_RELAY is not set
 CONFIG_INITRAMFS_SOURCE=""
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
-# CONFIG_SYSCTL_SYSCALL is not set
+CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -87,9 +88,7 @@ CONFIG_STOP_MACHINE=y
 # Block layer
 #
 CONFIG_BLOCK=y
-CONFIG_LBD=y
 # CONFIG_BLK_DEV_IO_TRACE is not set
-# CONFIG_LSF is not set
 
 #
 # IO Schedulers
@@ -111,10 +110,11 @@ CONFIG_X86_PC=y
 # CONFIG_X86_VSMP is not set
 # CONFIG_MK8 is not set
 # CONFIG_MPSC is not set
-CONFIG_GENERIC_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_L1_CACHE_SHIFT=7
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_MCORE2=y
+# CONFIG_GENERIC_CPU is not set
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_L1_CACHE_SHIFT=6
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_TSC=y
 CONFIG_X86_GOOD_APIC=y
 # CONFIG_MICROCODE is not set
@@ -322,6 +322,7 @@ CONFIG_INET_TCP_DIAG=y
 # CONFIG_TCP_CONG_ADVANCED is not set
 CONFIG_TCP_CONG_CUBIC=y
 CONFIG_DEFAULT_TCP_CONG="cubic"
+# CONFIG_TCP_MD5SIG is not set
 CONFIG_IPV6=y
 # CONFIG_IPV6_PRIVACY is not set
 # CONFIG_IPV6_ROUTER_PREF is not set
@@ -624,6 +625,7 @@ CONFIG_SATA_INTEL_COMBINED=y
 # CONFIG_PATA_IT821X is not set
 # CONFIG_PATA_JMICRON is not set
 # CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MARVELL is not set
 # CONFIG_PATA_MPIIX is not set
 # CONFIG_PATA_OLDPIIX is not set
 # CONFIG_PATA_NETCELL is not set
@@ -795,6 +797,7 @@ CONFIG_BNX2=y
 CONFIG_S2IO=m
 # CONFIG_S2IO_NAPI is not set
 # CONFIG_MYRI10GE is not set
+# CONFIG_NETXEN_NIC is not set
 
 #
 # Token Ring devices
@@ -927,10 +930,6 @@ CONFIG_RTC=y
 # CONFIG_DTLK is not set
 # CONFIG_R3964 is not set
 # CONFIG_APPLICOM is not set
-
-#
-# Ftape, the floppy tape device driver
-#
 CONFIG_AGP=y
 CONFIG_AGP_AMD64=y
 CONFIG_AGP_INTEL=y
@@ -1135,6 +1134,7 @@ CONFIG_USB_DEVICEFS=y
 # CONFIG_USB_BANDWIDTH is not set
 # CONFIG_USB_DYNAMIC_MINORS is not set
 # CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_MULTITHREAD_PROBE is not set
 # CONFIG_USB_OTG is not set
 
 #
@@ -1212,6 +1212,7 @@ CONFIG_USB_HIDINPUT=y
 # CONFIG_USB_KAWETH is not set
 # CONFIG_USB_PEGASUS is not set
 # CONFIG_USB_RTL8150 is not set
+# CONFIG_USB_USBNET_MII is not set
 # CONFIG_USB_USBNET is not set
 CONFIG_USB_MON=y
 
-- 
cgit v0.10.2


From dc3d1742543fffc79dc4d680ab64d2059e97d809 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] i386: Update defconfig

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index 97aacd6..65891f1 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.19-rc2-git4
-# Sat Oct 21 03:38:56 2006
+# Linux kernel version: 2.6.19-git7
+# Wed Dec  6 23:50:49 2006
 #
 CONFIG_X86_32=y
 CONFIG_GENERIC_TIME=y
@@ -40,13 +40,14 @@ CONFIG_POSIX_MQUEUE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 # CONFIG_CPUSETS is not set
+CONFIG_SYSFS_DEPRECATED=y
 # CONFIG_RELAY is not set
 CONFIG_INITRAMFS_SOURCE=""
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
-# CONFIG_SYSCTL_SYSCALL is not set
+CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -110,6 +111,7 @@ CONFIG_SMP=y
 # CONFIG_X86_VISWS is not set
 CONFIG_X86_GENERICARCH=y
 # CONFIG_X86_ES7000 is not set
+# CONFIG_PARAVIRT is not set
 CONFIG_X86_CYCLONE_TIMER=y
 # CONFIG_M386 is not set
 # CONFIG_M486 is not set
@@ -120,6 +122,7 @@ CONFIG_X86_CYCLONE_TIMER=y
 # CONFIG_MPENTIUMII is not set
 CONFIG_MPENTIUMIII=y
 # CONFIG_MPENTIUMM is not set
+# CONFIG_MCORE2 is not set
 # CONFIG_MPENTIUM4 is not set
 # CONFIG_MK6 is not set
 # CONFIG_MK7 is not set
@@ -197,7 +200,6 @@ CONFIG_RESOURCES_64BIT=y
 CONFIG_MTRR=y
 # CONFIG_EFI is not set
 # CONFIG_IRQBALANCE is not set
-CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 # CONFIG_HZ_100 is not set
 CONFIG_HZ_250=y
@@ -205,7 +207,8 @@ CONFIG_HZ_250=y
 CONFIG_HZ=250
 # CONFIG_KEXEC is not set
 # CONFIG_CRASH_DUMP is not set
-CONFIG_PHYSICAL_START=0x100000
+# CONFIG_RELOCATABLE is not set
+CONFIG_PHYSICAL_ALIGN=0x100000
 # CONFIG_HOTPLUG_CPU is not set
 CONFIG_COMPAT_VDSO=y
 CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
@@ -367,6 +370,7 @@ CONFIG_INET_TCP_DIAG=y
 # CONFIG_TCP_CONG_ADVANCED is not set
 CONFIG_TCP_CONG_CUBIC=y
 CONFIG_DEFAULT_TCP_CONG="cubic"
+# CONFIG_TCP_MD5SIG is not set
 CONFIG_IPV6=y
 # CONFIG_IPV6_PRIVACY is not set
 # CONFIG_IPV6_ROUTER_PREF is not set
@@ -677,6 +681,7 @@ CONFIG_SATA_INTEL_COMBINED=y
 # CONFIG_PATA_IT821X is not set
 # CONFIG_PATA_JMICRON is not set
 # CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MARVELL is not set
 # CONFIG_PATA_MPIIX is not set
 # CONFIG_PATA_OLDPIIX is not set
 # CONFIG_PATA_NETCELL is not set
@@ -850,6 +855,7 @@ CONFIG_BNX2=y
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
 # CONFIG_MYRI10GE is not set
+# CONFIG_NETXEN_NIC is not set
 
 #
 # Token Ring devices
@@ -984,10 +990,6 @@ CONFIG_RTC=y
 # CONFIG_R3964 is not set
 # CONFIG_APPLICOM is not set
 # CONFIG_SONYPI is not set
-
-#
-# Ftape, the floppy tape device driver
-#
 CONFIG_AGP=y
 # CONFIG_AGP_ALI is not set
 # CONFIG_AGP_ATI is not set
@@ -1108,6 +1110,7 @@ CONFIG_USB_DEVICEFS=y
 # CONFIG_USB_BANDWIDTH is not set
 # CONFIG_USB_DYNAMIC_MINORS is not set
 # CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_MULTITHREAD_PROBE is not set
 # CONFIG_USB_OTG is not set
 
 #
@@ -1185,6 +1188,7 @@ CONFIG_USB_HIDINPUT=y
 # CONFIG_USB_KAWETH is not set
 # CONFIG_USB_PEGASUS is not set
 # CONFIG_USB_RTL8150 is not set
+# CONFIG_USB_USBNET_MII is not set
 # CONFIG_USB_USBNET is not set
 CONFIG_USB_MON=y
 
-- 
cgit v0.10.2


From 9b483417527f2e47985856867c5716df013227c7 Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@rhlx01.fht-esslingen.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] i386: fix buggy MTRR address checks

Fix checks that failed to realize that values are 4-kB-unit-sized (note the
format strings in this same diff context which *do* realize the unit size,
via appended "000"!).  Also fix an incorrect below-1MB area check (as
gathered from Jan Beulich's unapplied patch at
http://www.ussg.iu.edu/hypermail/linux/kernel/0411.1/1378.html ) Update
mtrr_add_page() docu to make 4-kB-sized calculation more obvious.

Given several further items mentioned in Jan's patch mail, all in all MTRR
code seems surprisingly buggy, for a surprisingly long period of time (many
years).  Further work/investigation would be useful.

TBD Note that my patch is pretty much UNTESTED, since I can only verify that it
TBD successfully boots my machine, but I cannot test against actual buggy
TBD hardware which would require these (formerly broken) checks.  Long -mm
TBD simmering would make sense, especially since these now-working checks might
TBD turn out to have adverse effects on unaffected hardware.

Signed-off-by: Andreas Mohr <andi@lisas.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 0b61eed..ee8dc67 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -366,7 +366,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
 			printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
 			return -EINVAL;
 		}
-		if (!(base + size < 0x70000000 || base > 0x7003FFFF) &&
+		if (!(base + size < 0x70000 || base > 0x7003F) &&
 		    (type == MTRR_TYPE_WRCOMB
 		     || type == MTRR_TYPE_WRBACK)) {
 			printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
@@ -374,7 +374,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
 		}
 	}
 
-	if (base + size < 0x100) {
+	if (base < 0x100) {
 		printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
 		       base, size);
 		return -EINVAL;
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index fff90bd..2b8b0b3 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -263,8 +263,8 @@ static void set_mtrr(unsigned int reg, unsigned long base,
 
 /**
  *	mtrr_add_page - Add a memory type region
- *	@base: Physical base address of region in pages (4 KB)
- *	@size: Physical size of region in pages (4 KB)
+ *	@base: Physical base address of region in pages (in units of 4 kB!)
+ *	@size: Physical size of region in pages (4 kB)
  *	@type: Type of MTRR desired
  *	@increment: If this is true do usage counting on the region
  *
-- 
cgit v0.10.2


From b615ebdac97c648a2ae7d23c5a0bbb3972adf928 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] x86: shorten lines in unwinder to be <= 80 characters

Andrew complained about > 80 character lines in the new unwinder.
Fix that.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe9c5e8..fe81d89 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -173,6 +173,8 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
 	return n;
 }
 
+#define MSG(msg) ops->warning(data, msg)
+
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	        unsigned long *stack,
 		struct stacktrace_ops *ops, void *data)
@@ -191,29 +193,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 			if (unwind_init_frame_info(&info, task, regs) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		} else if (task == current)
-			unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+			unw_ret = unwind_init_running(&info, dump_trace_unwind,
+						      &oad);
 		else {
 			if (unwind_init_blocked(&info, task) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		}
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
-				ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+				ops->warning_symbol(data,
+					     "DWARF2 unwinder stuck at %s\n",
 					     UNW_PC(&info));
 				if (UNW_SP(&info) >= PAGE_OFFSET) {
-					ops->warning(data, "Leftover inexact backtrace:\n");
+					MSG("Leftover inexact backtrace:\n");
 					stack = (void *)UNW_SP(&info);
 					if (!stack)
 						return;
 					ebp = UNW_FP(&info);
 				} else
-					ops->warning(data, "Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:\n");
 			} else if (call_trace >= 1)
 				return;
 			else
-				ops->warning(data, "Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:\n");
 		} else
-			ops->warning(data, "Inexact backtrace:\n");
+			MSG("Inexact backtrace:\n");
 	}
 	if (!stack) {
 		unsigned long dummy;
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 0d65b22..d3f43c9 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -235,6 +235,8 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
 	return n;
 }
 
+#define MSG(txt) ops->warning(data, txt)
+
 /*
  * x86-64 can have upto three kernel stacks: 
  * process stack
@@ -248,11 +250,12 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
         return p > t && p < t + THREAD_SIZE - 3;
 }
 
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+		unsigned long *stack,
 		struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = smp_processor_id();
-	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+	unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
 
@@ -268,28 +271,30 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 			if (unwind_init_frame_info(&info, tsk, regs) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		} else if (tsk == current)
-			unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+			unw_ret = unwind_init_running(&info, dump_trace_unwind,
+						      &oad);
 		else {
 			if (unwind_init_blocked(&info, tsk) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		}
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
-				ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+				ops->warning_symbol(data,
+					     "DWARF2 unwinder stuck at %s\n",
 					     UNW_PC(&info));
 				if ((long)UNW_SP(&info) < 0) {
-					ops->warning(data, "Leftover inexact backtrace:\n");
+					MSG("Leftover inexact backtrace:");
 					stack = (unsigned long *)UNW_SP(&info);
 					if (!stack)
 						return;
 				} else
-					ops->warning(data, "Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:\n");
 			} else if (call_trace >= 1)
 				return;
 			else
-				ops->warning(data, "Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:\n");
 		} else
-			ops->warning(data, "Inexact backtrace:\n");
+			MSG("Inexact backtrace:\n");
 	}
 	if (!stack) {
 		unsigned long dummy;
-- 
cgit v0.10.2


From dd315df1767cf56bd4fb8d730fdff4a3d7e15d84 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] x86: Compress stack unwinder output

The unwinder has some extra newlines, which eat up loads of screen
space when it spews. (See https://bugzilla.redhat.com/bugzilla/attachment.cgi?id=137900
for a nasty example).

warning_symbol-> and warning-> already printk a newline, so don't add one
in the strings passed to them.

[AK: redone for new code]

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe81d89..396041a4 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -202,22 +202,22 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
 				ops->warning_symbol(data,
-					     "DWARF2 unwinder stuck at %s\n",
+					     "DWARF2 unwinder stuck at %s",
 					     UNW_PC(&info));
 				if (UNW_SP(&info) >= PAGE_OFFSET) {
-					MSG("Leftover inexact backtrace:\n");
+					MSG("Leftover inexact backtrace:");
 					stack = (void *)UNW_SP(&info);
 					if (!stack)
 						return;
 					ebp = UNW_FP(&info);
 				} else
-					MSG("Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:");
 			} else if (call_trace >= 1)
 				return;
 			else
-				MSG("Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:");
 		} else
-			MSG("Inexact backtrace:\n");
+			MSG("Inexact backtrace:");
 	}
 	if (!stack) {
 		unsigned long dummy;
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index d3f43c9..eedd4e7 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -280,7 +280,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
 				ops->warning_symbol(data,
-					     "DWARF2 unwinder stuck at %s\n",
+					     "DWARF2 unwinder stuck at %s",
 					     UNW_PC(&info));
 				if ((long)UNW_SP(&info) < 0) {
 					MSG("Leftover inexact backtrace:");
@@ -288,13 +288,13 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 					if (!stack)
 						return;
 				} else
-					MSG("Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:");
 			} else if (call_trace >= 1)
 				return;
 			else
-				MSG("Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:");
 		} else
-			MSG("Inexact backtrace:\n");
+			MSG("Inexact backtrace:");
 	}
 	if (!stack) {
 		unsigned long dummy;
-- 
cgit v0.10.2


From d606f88fa5e529e9dc72be97e79db1e36a6261cb Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] i386: fix must_checks

Fix __must_check warnings in i386/math-emu.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/math-emu/fpu_emu.h b/arch/i386/math-emu/fpu_emu.h
index d62b20a..65120f5 100644
--- a/arch/i386/math-emu/fpu_emu.h
+++ b/arch/i386/math-emu/fpu_emu.h
@@ -57,6 +57,7 @@
 #define TAG_Special	Const(2)	/* De-normal, + or - infinity,
 					   or Not a Number */
 #define TAG_Empty	Const(3)	/* empty */
+#define TAG_Error	Const(0x80)	/* probably need to abort */
 
 #define LOADED_DATA	Const(10101)	/* Special st() number to identify
 					   loaded data (not on stack). */
diff --git a/arch/i386/math-emu/fpu_entry.c b/arch/i386/math-emu/fpu_entry.c
index d93f16e..ddf8fa3 100644
--- a/arch/i386/math-emu/fpu_entry.c
+++ b/arch/i386/math-emu/fpu_entry.c
@@ -742,7 +742,8 @@ int save_i387_soft(void *s387, struct _fpstate __user * buf)
   S387->fcs &= ~0xf8000000;
   S387->fos |= 0xffff0000;
 #endif /* PECULIAR_486 */
-  __copy_to_user(d, &S387->cwd, 7*4);
+  if (__copy_to_user(d, &S387->cwd, 7*4))
+    return -1;
   RE_ENTRANT_CHECK_ON;
 
   d += 7*4;
diff --git a/arch/i386/math-emu/fpu_system.h b/arch/i386/math-emu/fpu_system.h
index bf26341..a3ae28c 100644
--- a/arch/i386/math-emu/fpu_system.h
+++ b/arch/i386/math-emu/fpu_system.h
@@ -68,6 +68,7 @@
 
 #define FPU_access_ok(x,y,z)	if ( !access_ok(x,y,z) ) \
 				math_abort(FPU_info,SIGSEGV)
+#define FPU_abort		math_abort(FPU_info, SIGSEGV)
 
 #undef FPU_IGNORE_CODE_SEGV
 #ifdef FPU_IGNORE_CODE_SEGV
diff --git a/arch/i386/math-emu/load_store.c b/arch/i386/math-emu/load_store.c
index 85314be..eebd6fb 100644
--- a/arch/i386/math-emu/load_store.c
+++ b/arch/i386/math-emu/load_store.c
@@ -227,6 +227,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
     case 027:      /* fild m64int */
       clear_C1();
       loaded_tag = FPU_load_int64((long long __user *)data_address);
+      if (loaded_tag == TAG_Error)
+	return 0;
       FPU_settag0(loaded_tag);
       break;
     case 030:     /* fstenv  m14/28byte */
diff --git a/arch/i386/math-emu/reg_ld_str.c b/arch/i386/math-emu/reg_ld_str.c
index f06ed41..e976cae 100644
--- a/arch/i386/math-emu/reg_ld_str.c
+++ b/arch/i386/math-emu/reg_ld_str.c
@@ -244,7 +244,8 @@ int FPU_load_int64(long long __user *_s)
 
   RE_ENTRANT_CHECK_OFF;
   FPU_access_ok(VERIFY_READ, _s, 8);
-  copy_from_user(&s,_s,8);
+  if (copy_from_user(&s,_s,8))
+    FPU_abort;
   RE_ENTRANT_CHECK_ON;
 
   if (s == 0)
@@ -907,7 +908,8 @@ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
 
   RE_ENTRANT_CHECK_OFF;
   FPU_access_ok(VERIFY_WRITE,d,8);
-  copy_to_user(d, &tll, 8);
+  if (copy_to_user(d, &tll, 8))
+    FPU_abort;
   RE_ENTRANT_CHECK_ON;
 
   return 1;
@@ -1336,7 +1338,8 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
       I387.soft.fcs &= ~0xf8000000;
       I387.soft.fos |= 0xffff0000;
 #endif /* PECULIAR_486 */
-      __copy_to_user(d, &control_word, 7*4);
+      if (__copy_to_user(d, &control_word, 7*4))
+	FPU_abort;
       RE_ENTRANT_CHECK_ON;
       d += 0x1c;
     }
@@ -1359,9 +1362,11 @@ void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
   FPU_access_ok(VERIFY_WRITE,d,80);
 
   /* Copy all registers in stack order. */
-  __copy_to_user(d, register_base+offset, other);
+  if (__copy_to_user(d, register_base+offset, other))
+    FPU_abort;
   if ( offset )
-    __copy_to_user(d+other, register_base, offset);
+    if (__copy_to_user(d+other, register_base, offset))
+      FPU_abort;
   RE_ENTRANT_CHECK_ON;
 
   finit();
-- 
cgit v0.10.2


From a63954b5cad5765e52870bb649992bf636f32a6b Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] i386: remove pointless printk from i386 oops output

This just got removed on x86-64, do the same on 32bit.
It always annoyed me when this ate a line of oops output pushing
interesting stuff off the screen.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 396041a4..48ebfab6 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -777,7 +777,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
 	printk(" on CPU%d, eip %08lx, registers:\n",
 		smp_processor_id(), regs->eip);
 	show_registers(regs);
-	printk(KERN_EMERG "console shuts up ...\n");
 	console_silent();
 	spin_unlock(&nmi_print_lock);
 	bust_spinlocks(0);
-- 
cgit v0.10.2


From 87e1652c7863b9ae406ff37f33c7ec2bb494d7b1 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] x86-64: Don't keep interrupts disabled while spinning in
 spinlocks

Follows i386.

Based on patch from some folks at Google (MikeW, Edward G.?), but
completely redone by AK.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index 05ef097..88bf981 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -36,7 +36,34 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock)
 		"2:\t" : "=m" (lock->slock) : : "memory");
 }
 
-#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+/*
+ * Same as __raw_spin_lock, but reenable interrupts during spinning.
+ */
+#ifndef CONFIG_PROVE_LOCKING
+static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+{
+	asm volatile(
+		"\n1:\t"
+		LOCK_PREFIX " ; decl %0\n\t"
+		"jns 5f\n"
+		"testl $0x200, %1\n\t"	/* interrupts were disabled? */
+		"jz 4f\n\t"
+	        "sti\n"
+		"3:\t"
+		"rep;nop\n\t"
+		"cmpl $0, %0\n\t"
+		"jle 3b\n\t"
+		"cli\n\t"
+		"jmp 1b\n"
+		"4:\t"
+		"rep;nop\n\t"
+		"cmpl $0, %0\n\t"
+		"jg 1b\n\t"
+		"jmp 4b\n"
+		"5:\n\t"
+		: "+m" (lock->slock) : "r" ((unsigned)flags) : "memory");
+}
+#endif
 
 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
-- 
cgit v0.10.2


From bd1d599518bf11992cc6d5b0df08da4a2b7b0db5 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] x86-64: x86_64 rename X86_FEATURE_DTES to X86_FEATURE_DS

Here is a patch (used by perfmon2) that renamed X86_FEATURE_DTES
to X86_FEATURE_DS to match Intel's documentation for the Debug Store
save area. The patch also adds cpu_has_ds.

changelog:
	- rename X86_FEATURE_DTES to X86_FEATURE_DS to match documentation
	- adds cpu_has_ds to test for X86_FEATURE_DS

Signed-off-by: stephane eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index ee792fa..65eb39e 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -29,7 +29,7 @@
 #define X86_FEATURE_PSE36	(0*32+17) /* 36-bit PSEs */
 #define X86_FEATURE_PN		(0*32+18) /* Processor serial number */
 #define X86_FEATURE_CLFLSH	(0*32+19) /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DTES	(0*32+21) /* Debug Trace Store */
+#define X86_FEATURE_DS		(0*32+21) /* Debug Store */
 #define X86_FEATURE_ACPI	(0*32+22) /* ACPI via MSR */
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
 #define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
@@ -112,5 +112,6 @@
 #define cpu_has_cyrix_arr      0
 #define cpu_has_centaur_mcr    0
 #define cpu_has_clflush	       boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_ds 	       boot_cpu_has(X86_FEATURE_DS)
 
 #endif /* __ASM_X8664_CPUFEATURE_H */
-- 
cgit v0.10.2


From 36b2a8d5aff4cb3ee83d5e40447a8f073bcfe2fb Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86-64: add X86_FEATURE_PEBS and detection

Here is a patch (used by perfmon2) to detect the presence of the
Precise Event Based Sampling (PEBS) feature for Intel 64-bit processors.
The patch also adds the cpu_has_pebs macro.

changelog:
	- adds X86_FEATURE_PEBS
	- adds cpu_has_pebs to test for X86_FEATURE_PEBS

Signed-off-by: stephane eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index fc944b5..619af2e 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -835,6 +835,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
 	}
 
+	if (cpu_has_ds) {
+		unsigned int l1, l2;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<12)))
+			set_bit(X86_FEATURE_PEBS, c->x86_capability);
+	}
+
 	n = c->extended_cpuid_level;
 	if (n >= 0x80000008) {
 		unsigned eax = cpuid_eax(0x80000008);
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index 65eb39e..d280384 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -68,6 +68,7 @@
 #define X86_FEATURE_FXSAVE_LEAK (3*32+7)  /* FIP/FOP/FDP leaks through FXSAVE */
 #define X86_FEATURE_UP		(3*32+8) /* SMP kernel running on UP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+9) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS	(3*32+10) /* Precise-Event Based Sampling */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -113,5 +114,6 @@
 #define cpu_has_centaur_mcr    0
 #define cpu_has_clflush	       boot_cpu_has(X86_FEATURE_CLFLSH)
 #define cpu_has_ds 	       boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs 	       boot_cpu_has(X86_FEATURE_PEBS)
 
 #endif /* __ASM_X8664_CPUFEATURE_H */
-- 
cgit v0.10.2


From d8cebe65ea5179e3293c38427d71f4d73c795d39 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86-64: remove duplicated cpu_mask_to_apicid in x86_64 smp.h

inline function cpu_mask_to_apicid in smp.h is duplicated with macro
in mach_apic.h.

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index d6b7c05..7ae7e7d 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -82,11 +82,6 @@ extern u8 x86_cpu_to_apicid[NR_CPUS];	/* physical ID */
 extern u8 x86_cpu_to_log_apicid[NR_CPUS];
 extern u8 bios_cpu_apicid[];
 
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
-	return cpus_addr(cpumask)[0];
-}
-
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < NR_CPUS)
-- 
cgit v0.10.2


From d7731c0ff69dc3f18ea020257e627dae4d214fdb Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: i386 rename X86_FEATURE_DTES to X86_FEATURE_DS

Here is a patch (used by perfmon2) that renames X86_FEATURE_DTES to
X86_FEATURE_DS to match Intel's documentation for the Debug Store save area on
i386.  The patch also adds cpu_has_ds.

- rename X86_FEATURE_DTES to X86_FEATURE_DS to match documentation

- adds cpu_has_ds to test for X86_FEATURE_DS

Signed-off-by: stephane eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index d314ebb..69ce350 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -31,7 +31,7 @@
 #define X86_FEATURE_PSE36	(0*32+17) /* 36-bit PSEs */
 #define X86_FEATURE_PN		(0*32+18) /* Processor serial number */
 #define X86_FEATURE_CLFLSH	(0*32+19) /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DTES	(0*32+21) /* Debug Trace Store */
+#define X86_FEATURE_DS		(0*32+21) /* Debug Store */
 #define X86_FEATURE_ACPI	(0*32+22) /* ACPI via MSR */
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
 #define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
@@ -134,6 +134,7 @@
 #define cpu_has_phe_enabled	boot_cpu_has(X86_FEATURE_PHE_EN)
 #define cpu_has_pmm		boot_cpu_has(X86_FEATURE_PMM)
 #define cpu_has_pmm_enabled	boot_cpu_has(X86_FEATURE_PMM_EN)
+#define cpu_has_ds		boot_cpu_has(X86_FEATURE_DS)
 
 #endif /* __ASM_I386_CPUFEATURE_H */
 
-- 
cgit v0.10.2


From 42ed458aa51337357d7632c64aed4528f923e829 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: i386 add X86_FEATURE_PEBS and detection

Here is a patch (used by perfmon2) to detect the presence of the Precise Event
Based Sampling (PEBS) feature for i386.  The patch also adds the cpu_has_pebs
macro.

- adds X86_FEATURE_PEBS

- adds cpu_has_pebs to test for X86_FEATURE_PEBS

Signed-off-by: stephane eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa..798c2f6 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -195,8 +195,14 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 		(c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-}
 
+	if (cpu_has_ds) {
+		unsigned int l1;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<12)))
+			set_bit(X86_FEATURE_PEBS, c->x86_capability);
+	}
+}
 
 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
 {
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 69ce350..2316725 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -73,6 +73,7 @@
 #define X86_FEATURE_UP		(3*32+ 9) /* smp kernel running on up */
 #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS	(3*32+12)  /* Precise-Event Based Sampling */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -135,6 +136,7 @@
 #define cpu_has_pmm		boot_cpu_has(X86_FEATURE_PMM)
 #define cpu_has_pmm_enabled	boot_cpu_has(X86_FEATURE_PMM_EN)
 #define cpu_has_ds		boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs 		boot_cpu_has(X86_FEATURE_PEBS)
 
 #endif /* __ASM_I386_CPUFEATURE_H */
 
-- 
cgit v0.10.2


From e2764a1e306c986053a52b33748c33463cf888de Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86-64: use BUILD_BUG_ON in FPU code

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index 3aa1e9b..1d58c13 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -82,11 +82,8 @@ int save_i387(struct _fpstate __user *buf)
 	struct task_struct *tsk = current;
 	int err = 0;
 
-	{ 
-		extern void bad_user_i387_struct(void); 
-		if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
-			bad_user_i387_struct();
-	} 
+	BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
+			sizeof(tsk->thread.i387.fxsave));
 
 	if ((unsigned long)buf % 16) 
 		printk("save_i387: bad fpstate %p\n",buf); 
-- 
cgit v0.10.2


From e5e3a0428968dcc1f9318ce1c941a918e99f8b84 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: remove default_ldt, and simplify ldt-setting.

This patch removes the default_ldt[] array, as it has been unused since
iBCS stopped being supported.  This means it is now possible to actually
set an empty LDT segment.

In order to deal with this, the set_ldt_desc/load_LDT pair has been
replaced with a single set_ldt() operation which is responsible for both
setting up the LDT descriptor in the GDT, and reloading the LDT register.
If there are no LDT entries, the LDT register is loaded with a NULL
descriptor.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Acked-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211e..b410e5f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
 {
 	int err;
 	unsigned long size;
-	void *address;
 
 	err = 0;
-	address = &default_ldt[0];
 	size = 5*sizeof(struct desc_struct);
 	if (size > bytecount)
 		size = bytecount;
 
 	err = size;
-	if (copy_to_user(ptr, address, size))
+	if (clear_user(ptr, size))
 		err = -EFAULT;
 
 	return err;
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 48ebfab6..56655ea 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -61,9 +61,6 @@ int panic_on_unrecovered_nmi;
 
 asmlinkage int system_call(void);
 
-struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
-		{ 0, 0 }, { 0, 0 } };
-
 /* Do we ignore FPU interrupts ? */
 char ignore_fpu_irq = 0;
 
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index 5874ef1..a0398f7 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -33,11 +33,6 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
 }
 
-/*
- * This is the ldt that every process will get unless we need
- * something other than this.
- */
-extern struct desc_struct default_ldt[];
 extern struct desc_struct idt_table[];
 extern void set_intr_gate(unsigned int irq, void * addr);
 
@@ -65,7 +60,6 @@ static inline void pack_gate(__u32 *a, __u32 *b,
 #define DESCTYPE_S	0x10	/* !system */
 
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
 
 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
 #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
@@ -115,13 +109,20 @@ static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const vo
 	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
 }
 
-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
+static inline void set_ldt(void *addr, unsigned int entries)
 {
-	__u32 a, b;
-	pack_descriptor(&a, &b, (unsigned long)addr,
-			entries * sizeof(struct desc_struct) - 1,
-			DESCTYPE_LDT, 0);
-	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
+	if (likely(entries == 0))
+		__asm__ __volatile__("lldt %w0"::"q" (0));
+	else {
+		unsigned cpu = smp_processor_id();
+		__u32 a, b;
+
+		pack_descriptor(&a, &b, (unsigned long)addr,
+				entries * sizeof(struct desc_struct) - 1,
+				DESCTYPE_LDT, 0);
+		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
+		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+	}
 }
 
 #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
@@ -153,35 +154,22 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entri
 
 static inline void clear_LDT(void)
 {
-	int cpu = get_cpu();
-
-	set_ldt_desc(cpu, &default_ldt[0], 5);
-	load_LDT_desc();
-	put_cpu();
+	set_ldt(NULL, 0);
 }
 
 /*
  * load one particular LDT into the current CPU
  */
-static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
+static inline void load_LDT_nolock(mm_context_t *pc)
 {
-	void *segments = pc->ldt;
-	int count = pc->size;
-
-	if (likely(!count)) {
-		segments = &default_ldt[0];
-		count = 5;
-	}
-		
-	set_ldt_desc(cpu, segments, count);
-	load_LDT_desc();
+	set_ldt(pc->ldt, pc->size);
 }
 
 static inline void load_LDT(mm_context_t *pc)
 {
-	int cpu = get_cpu();
-	load_LDT_nolock(pc, cpu);
-	put_cpu();
+	preempt_disable();
+	load_LDT_nolock(pc);
+	preempt_enable();
 }
 
 static inline unsigned long get_desc_base(unsigned long *desc)
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 62b7bf1..1b14953 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -44,7 +44,7 @@ static inline void switch_mm(struct mm_struct *prev,
 		 * load the LDT, if the LDT is different:
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
-			load_LDT_nolock(&next->context, cpu);
+			load_LDT_nolock(&next->context);
 	}
 #ifdef CONFIG_SMP
 	else {
@@ -56,7 +56,7 @@ static inline void switch_mm(struct mm_struct *prev,
 			 * tlb flush IPI delivery. We must reload %cr3.
 			 */
 			load_cr3(next->pgd);
-			load_LDT_nolock(&next->context, cpu);
+			load_LDT_nolock(&next->context);
 		}
 	}
 #endif
-- 
cgit v0.10.2


From bb81a09e55eaf7e5f798468ab971469b6f66a259 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86: all cpu backtrace

When a spinlock lockup occurs, arrange for the NMI code to emit an all-cpu
backtrace, so we get to see which CPU is holding the lock, and where.

Cc: Andi Kleen <ak@muc.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index eaafe23..171194c 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
 #include <linux/percpu.h>
 #include <linux/dmi.h>
 #include <linux/kprobes.h>
+#include <linux/cpumask.h>
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
 
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
  */
@@ -907,6 +910,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		touched = 1;
 	}
 
+	if (cpu_isset(cpu, backtrace_mask)) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk("NMI backtrace for cpu %d\n", cpu);
+		dump_stack();
+		spin_unlock(&lock);
+		cpu_clear(cpu, backtrace_mask);
+	}
+
 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
 
 	/* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1046,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 
 #endif
 
+void __trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	backtrace_mask = cpu_online_map;
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpus_empty(backtrace_mask))
+			break;
+		mdelay(1);
+	}
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 7af9cb3..27e95e7 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -12,14 +12,15 @@
  *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
  */
 
+#include <linux/nmi.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
-#include <linux/nmi.h>
 #include <linux/sysctl.h>
 #include <linux/kprobes.h>
+#include <linux/cpumask.h>
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
@@ -41,6 +42,8 @@ int panic_on_unrecovered_nmi;
 static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
 static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
 
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
  */
@@ -782,6 +785,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 {
 	int sum;
 	int touched = 0;
+	int cpu = smp_processor_id();
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 	u64 dummy;
 	int rc=0;
@@ -799,6 +803,16 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		touched = 1;
 	}
 
+	if (cpu_isset(cpu, backtrace_mask)) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk("NMI backtrace for cpu %d\n", cpu);
+		dump_stack();
+		spin_unlock(&lock);
+		cpu_clear(cpu, backtrace_mask);
+	}
+
 #ifdef CONFIG_X86_MCE
 	/* Could check oops_in_progress here too, but it's safer
 	   not too */
@@ -931,6 +945,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 
 #endif
 
+void __trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	backtrace_mask = cpu_online_map;
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpus_empty(backtrace_mask))
+			break;
+		mdelay(1);
+	}
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
index 269d315..b04333e 100644
--- a/include/asm-i386/nmi.h
+++ b/include/asm-i386/nmi.h
@@ -5,6 +5,9 @@
 #define ASM_NMI_H
 
 #include <linux/pm.h>
+#include <asm/irq.h>
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
 
 /**
  * do_nmi_callback
@@ -42,4 +45,9 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
 
+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
+#endif
+
 #endif /* ASM_NMI_H */
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index f367d40..72375e7 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -77,4 +77,7 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
 
 extern int unknown_nmi_panic;
 
+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
 #endif /* ASM_NMI_H */
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e16904e..acb4ed1 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -15,9 +15,14 @@
  * disables interrupts for a long time. This call is stateless.
  */
 #ifdef ARCH_HAS_NMI_WATCHDOG
+#include <asm/nmi.h>
 extern void touch_nmi_watchdog(void);
 #else
 # define touch_nmi_watchdog() touch_softlockup_watchdog()
 #endif
 
+#ifndef trigger_all_cpu_backtrace
+#define trigger_all_cpu_backtrace() do { } while (0)
+#endif
+
 #endif
diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c
index b6c4f89..479fd46 100644
--- a/lib/spinlock_debug.c
+++ b/lib/spinlock_debug.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/nmi.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
 #include <linux/delay.h>
@@ -117,6 +118,9 @@ static void __spin_lock_debug(spinlock_t *lock)
 				raw_smp_processor_id(), current->comm,
 				current->pid, lock);
 			dump_stack();
+#ifdef CONFIG_SMP
+			trigger_all_cpu_backtrace();
+#endif
 		}
 	}
 }
-- 
cgit v0.10.2


From be44d2aabce2d62f72d5751d1871b6212bf7a1c7 Mon Sep 17 00:00:00 2001
From: Stas Sergeev <stsp@aknet.ru>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: espfix cleanup

Clean up the espfix code:

- Introduced PER_CPU() macro to be used from asm
- Introduced GET_DESC_BASE() macro to be used from asm
- Rewrote the fixup code in asm, as calling a C code with the altered %ss
  appeared to be unsafe
- No longer altering the stack from a .fixup section
- 16bit per-cpu stack is no longer used, instead the stack segment base
  is patched the way so that the high word of the kernel and user %esp
  are the same.
- Added the limit-patching for the espfix segment. (Chuck Ebbert)

[jeremy@goop.org: use the x86 scaling addressing mode rather than shifting]
Signed-off-by: Stas Sergeev <stsp@aknet.ru>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Zachary Amsden <zach@vmware.com>
Acked-by: Chuck Ebbert <76306.1226@compuserve.com>
Acked-by: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f..e94d910 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -58,6 +58,11 @@ void foo(void)
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
 	BLANK();
 
+	OFFSET(GDS_size, Xgt_desc_struct, size);
+	OFFSET(GDS_address, Xgt_desc_struct, address);
+	OFFSET(GDS_pad, Xgt_desc_struct, pad);
+	BLANK();
+
 	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
 	OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
 	BLANK();
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c..5532fc4 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -24,9 +24,6 @@
 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
-DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
-EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
-
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_fxsr __cpuinitdata;
 static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -603,7 +600,6 @@ void __cpuinit cpu_init(void)
 	struct tss_struct * t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &current->thread;
 	struct desc_struct *gdt;
-	__u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
 	if (cpu_test_and_set(cpu, cpu_initialized)) {
@@ -651,13 +647,6 @@ old_gdt:
 	 * and set up the GDT descriptor:
 	 */
  	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
-
-	/* Set up GDT entry for 16bit stack */
- 	*(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
-		((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
-		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
-		(CPU_16BIT_STACK_SIZE - 1);
-
 	cpu_gdt_descr->size = GDT_SIZE - 1;
  	cpu_gdt_descr->address = (unsigned long)gdt;
 
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6f..c38d801 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -48,6 +48,7 @@
 #include <asm/smp.h>
 #include <asm/page.h>
 #include <asm/desc.h>
+#include <asm/percpu.h>
 #include <asm/dwarf2.h>
 #include "irq_vectors.h"
 
@@ -418,23 +419,18 @@ ldt_ss:
 	 * This is an "official" bug of all the x86-compatible
 	 * CPUs, which we can try to work around to make
 	 * dosemu and wine happy. */
-	subl $8, %esp		# reserve space for switch16 pointer
-	CFI_ADJUST_CFA_OFFSET 8
+	movl OLDESP(%esp), %eax
+	movl %esp, %edx
+	call patch_espfix_desc
+	pushl $__ESPFIX_SS
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
 	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
-	movl %esp, %eax
-	/* Set up the 16bit stack frame with switch32 pointer on top,
-	 * and a switch16 pointer on top of the current frame. */
-	call setup_x86_bogus_stack
-	CFI_ADJUST_CFA_OFFSET -8	# frame has moved
-	TRACE_IRQS_IRET
-	RESTORE_REGS
-	lss 20+4(%esp), %esp	# switch to 16bit stack
-1:	INTERRUPT_RETURN
-.section __ex_table,"a"
-	.align 4
-	.long 1b,iret_exc
-.previous
+	lss (%esp), %esp
+	CFI_ADJUST_CFA_OFFSET -8
+	jmp restore_nocheck
 	CFI_ENDPROC
 
 	# perform work that needs to be done immediately before resumption
@@ -524,30 +520,30 @@ syscall_badsys:
 	CFI_ENDPROC
 
 #define FIXUP_ESPFIX_STACK \
-	movl %esp, %eax; \
-	/* switch to 32bit stack using the pointer on top of 16bit stack */ \
-	lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
-	/* copy data from 16bit stack to 32bit stack */ \
-	call fixup_x86_bogus_stack; \
-	/* put ESP to the proper location */ \
-	movl %eax, %esp;
-#define UNWIND_ESPFIX_STACK \
+	/* since we are on a wrong stack, we cant make it a C code :( */ \
+	GET_THREAD_INFO(%ebp); \
+	movl TI_cpu(%ebp), %ebx; \
+	PER_CPU(cpu_gdt_descr, %ebx); \
+	movl GDS_address(%ebx), %ebx; \
+	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
+	addl %esp, %eax; \
+	pushl $__KERNEL_DS; \
+	CFI_ADJUST_CFA_OFFSET 4; \
 	pushl %eax; \
 	CFI_ADJUST_CFA_OFFSET 4; \
+	lss (%esp), %esp; \
+	CFI_ADJUST_CFA_OFFSET -8;
+#define UNWIND_ESPFIX_STACK \
 	movl %ss, %eax; \
-	/* see if on 16bit stack */ \
+	/* see if on espfix stack */ \
 	cmpw $__ESPFIX_SS, %ax; \
-	je 28f; \
-27:	popl %eax; \
-	CFI_ADJUST_CFA_OFFSET -4; \
-.section .fixup,"ax"; \
-28:	movl $__KERNEL_DS, %eax; \
+	jne 27f; \
+	movl $__KERNEL_DS, %eax; \
 	movl %eax, %ds; \
 	movl %eax, %es; \
-	/* switch to 32bit stack */ \
+	/* switch to normal stack */ \
 	FIXUP_ESPFIX_STACK; \
-	jmp 27b; \
-.previous
+27:;
 
 /*
  * Build the entry stubs and pointer table with
@@ -614,7 +610,6 @@ error_code:
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET eax, 0
-	xorl %eax, %eax
 	pushl %ebp
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ebp, 0
@@ -627,7 +622,6 @@ error_code:
 	pushl %edx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET edx, 0
-	decl %eax			# eax = -1
 	pushl %ecx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ecx, 0
@@ -644,7 +638,7 @@ error_code:
 	/*CFI_REGISTER es, ecx*/
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
-	movl %eax, ORIG_EAX(%esp)
+	movl $-1, ORIG_EAX(%esp)
 	movl %ecx, ES(%esp)
 	/*CFI_REL_OFFSET es, ES*/
 	movl $(__USER_DS), %ecx
@@ -754,7 +748,7 @@ KPROBE_ENTRY(nmi)
 	cmpw $__ESPFIX_SS, %ax
 	popl %eax
 	CFI_ADJUST_CFA_OFFSET -4
-	je nmi_16bit_stack
+	je nmi_espfix_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
 	pushl %eax
@@ -797,7 +791,7 @@ nmi_debug_stack_check:
 	FIX_STACK(24,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
 
-nmi_16bit_stack:
+nmi_espfix_stack:
 	/* We have a RING0_INT_FRAME here.
 	 *
 	 * create the pointer to lss back
@@ -806,7 +800,6 @@ nmi_16bit_stack:
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %esp
 	CFI_ADJUST_CFA_OFFSET 4
-	movzwl %sp, %esp
 	addw $4, (%esp)
 	/* copy the iret frame of 12 bytes */
 	.rept 3
@@ -817,11 +810,11 @@ nmi_16bit_stack:
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	FIXUP_ESPFIX_STACK		# %eax == %esp
-	CFI_ADJUST_CFA_OFFSET -20	# the frame has now moved
 	xorl %edx,%edx			# zero error code
 	call do_nmi
 	RESTORE_REGS
-	lss 12+4(%esp), %esp		# back to 16bit stack
+	lss 12+4(%esp), %esp		# back to espfix stack
+	CFI_ADJUST_CFA_OFFSET -24
 1:	INTERRUPT_RETURN
 	CFI_ENDPROC
 .section __ex_table,"a"
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index ca31f18..b1f1df1 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -584,7 +584,7 @@ ENTRY(cpu_gdt_table)
 	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
 	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
 
-	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
+	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
 	.quad 0x0000000000000000	/* 0xd8 - unused */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 56655ea..f9bb1f8 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1088,49 +1088,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
 #endif
 }
 
-fastcall void setup_x86_bogus_stack(unsigned char * stk)
+fastcall unsigned long patch_espfix_desc(unsigned long uesp,
+					  unsigned long kesp)
 {
-	unsigned long *switch16_ptr, *switch32_ptr;
-	struct pt_regs *regs;
-	unsigned long stack_top, stack_bot;
-	unsigned short iret_frame16_off;
 	int cpu = smp_processor_id();
-	/* reserve the space on 32bit stack for the magic switch16 pointer */
-	memmove(stk, stk + 8, sizeof(struct pt_regs));
-	switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
-	regs = (struct pt_regs *)stk;
-	/* now the switch32 on 16bit stack */
-	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
-	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
-	switch32_ptr = (unsigned long *)(stack_top - 8);
-	iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
-	/* copy iret frame on 16bit stack */
-	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
-	/* fill in the switch pointers */
-	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
-	switch16_ptr[1] = __ESPFIX_SS;
-	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
-		8 - CPU_16BIT_STACK_SIZE;
-	switch32_ptr[1] = __KERNEL_DS;
-}
-
-fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
-{
-	unsigned long *switch32_ptr;
-	unsigned char *stack16, *stack32;
-	unsigned long stack_top, stack_bot;
-	int len;
-	int cpu = smp_processor_id();
-	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
-	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
-	switch32_ptr = (unsigned long *)(stack_top - 8);
-	/* copy the data from 16bit stack to 32bit stack */
-	len = CPU_16BIT_STACK_SIZE - 8 - sp;
-	stack16 = (unsigned char *)(stack_bot + sp);
-	stack32 = (unsigned char *)
-		(switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
-	memcpy(stack32, stack16, len);
-	return stack32;
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	unsigned long base = (kesp - uesp) & -THREAD_SIZE;
+	unsigned long new_kesp = kesp - base;
+	unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
+	__u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+	/* Set up base for espfix segment */
+ 	desc &= 0x00f0ff0000000000ULL;
+ 	desc |=	((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+		((((__u64)base) << 32) & 0xff00000000000000ULL) |
+		((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
+		(lim_pages & 0xffff);
+	*(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+	return new_kesp;
 }
 
 /*
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index a0398f7..6cf2ac2 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -4,8 +4,6 @@
 #include <asm/ldt.h>
 #include <asm/segment.h>
 
-#define CPU_16BIT_STACK_SIZE 1024
-
 #ifndef __ASSEMBLY__
 
 #include <linux/preempt.h>
@@ -16,8 +14,6 @@
 
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
 
-DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
-
 struct Xgt_desc_struct {
 	unsigned short size;
 	unsigned long address __attribute__((packed));
@@ -181,6 +177,29 @@ static inline unsigned long get_desc_base(unsigned long *desc)
 	return base;
 }
 
+#else /* __ASSEMBLY__ */
+
+/*
+ * GET_DESC_BASE reads the descriptor base of the specified segment.
+ *
+ * Args:
+ *    idx - descriptor index
+ *    gdt - GDT pointer
+ *    base - 32bit register to which the base will be written
+ *    lo_w - lo word of the "base" register
+ *    lo_b - lo byte of the "base" register
+ *    hi_b - hi byte of the low word of the "base" register
+ *
+ * Example:
+ *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+	movb idx*8+4(gdt), lo_b; \
+	movb idx*8+7(gdt), hi_b; \
+	shll $16, base; \
+	movw idx*8+2(gdt), lo_w;
+
 #endif /* !__ASSEMBLY__ */
 
 #endif
diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h
index 5764afa..510ae1d 100644
--- a/include/asm-i386/percpu.h
+++ b/include/asm-i386/percpu.h
@@ -1,6 +1,31 @@
 #ifndef __ARCH_I386_PERCPU__
 #define __ARCH_I386_PERCPU__
 
+#ifndef __ASSEMBLY__
 #include <asm-generic/percpu.h>
+#else
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ *    var - variable name
+ *    cpu - 32bit register containing the current CPU number
+ *
+ * The resulting address is stored in the "cpu" argument.
+ *
+ * Example:
+ *    PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+#define PER_CPU(var, cpu) \
+	movl __per_cpu_offset(,cpu,4), cpu;	\
+	addl $per_cpu__/**/var, cpu;
+#else /* ! SMP */
+#define PER_CPU(var, cpu) \
+	movl $per_cpu__/**/var, cpu;
+#endif	/* SMP */
+
+#endif /* !__ASSEMBLY__ */
 
 #endif /* __ARCH_I386_PERCPU__ */
-- 
cgit v0.10.2


From acc207616a91a413a50fdd8847a747c4a7324167 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: add sleazy FPU optimization

i386 port of the sLeAZY-fpu feature.  Chuck reports that this gives him a +/-
0.4% improvement on his simple benchmark

x86_64 description follows:

Right now the kernel on x86-64 has a 100% lazy fpu behavior: after *every*
context switch a trap is taken for the first FPU use to restore the FPU
context lazily.  This is of course great for applications that have very
sporadic or no FPU use (since then you avoid doing the expensive save/restore
all the time).  However for very frequent FPU users...  you take an extra trap
every context switch.

The patch below adds a simple heuristic to this code: After 5 consecutive
context switches of FPU use, the lazy behavior is disabled and the context
gets restored every context switch.  If the app indeed uses the FPU, the trap
is avoided.  (the chance of the 6th time slice using FPU after the previous 5
having done so are quite high obviously).

After 256 switches, this is reset and lazy behavior is returned (until there
are 5 consecutive ones again).  The reason for this is to give apps that do
longer bursts of FPU use still the lazy behavior back after some time.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dd53c58..ae924c4 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -648,6 +648,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 
 	__unlazy_fpu(prev_p);
 
+
+	/* we're going to use this soon, after a few expensive things */
+	if (next_p->fpu_counter > 5)
+		prefetch(&next->i387.fxsave);
+
 	/*
 	 * Reload esp0.
 	 */
@@ -697,6 +702,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 
 	disable_tsc(prev_p, next_p);
 
+	/* If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	if (next_p->fpu_counter > 5)
+		math_state_restore();
+
 	return prev_p;
 }
 
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f9bb1f8..4a6fa28 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1118,7 +1118,7 @@ fastcall unsigned long patch_espfix_desc(unsigned long uesp,
  * Must be called with kernel preemption disabled (in this case,
  * local interrupts are disabled at the call-site in entry.S).
  */
-asmlinkage void math_state_restore(struct pt_regs regs)
+asmlinkage void math_state_restore(void)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
@@ -1128,6 +1128,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
 		init_fpu(tsk);
 	restore_fpu(tsk);
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	tsk->fpu_counter++;
 }
 
 #ifndef CONFIG_MATH_EMULATION
diff --git a/include/asm-i386/i387.h b/include/asm-i386/i387.h
index bc1d6ed..434936c7 100644
--- a/include/asm-i386/i387.h
+++ b/include/asm-i386/i387.h
@@ -76,7 +76,9 @@ static inline void __save_init_fpu( struct task_struct *tsk )
 
 #define __unlazy_fpu( tsk ) do { \
 	if (task_thread_info(tsk)->status & TS_USEDFPU) \
-		save_init_fpu( tsk ); \
+		save_init_fpu( tsk ); 			\
+	else						\
+		tsk->fpu_counter = 0;			\
 } while (0)
 
 #define __clear_fpu( tsk )					\
@@ -118,6 +120,7 @@ static inline void save_init_fpu( struct task_struct *tsk )
 extern unsigned short get_fpu_cwd( struct task_struct *tsk );
 extern unsigned short get_fpu_swd( struct task_struct *tsk );
 extern unsigned short get_fpu_mxcsr( struct task_struct *tsk );
+extern asmlinkage void math_state_restore(void);
 
 /*
  * Signal frame handlers...
-- 
cgit v0.10.2


From 399287229c775a8962a852a761d65dc9475dec7c Mon Sep 17 00:00:00 2001
From: Aaron Durbin <adurbin@google.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86-64: Insert Local and IO APIC(s) into resource map

Insert the Local APIC and IO APIC(s) into the resource tree.  It allows the
APIC resources to be visible within /proc/iomem.  The patch also takes into
account IO APIC(s) mapped in the PCI space by deferring the insertion until
after PCI has allocated its necessary resources.

Signed-off-by: Aaron Durbin <adurbin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@muc.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 4d9d5ed..5c46897 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -25,6 +25,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/module.h>
+#include <linux/ioport.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -45,6 +46,12 @@ int apic_calibrate_pmtmr __initdata;
 
 int disable_apic_timer __initdata;
 
+static struct resource *ioapic_resources;
+static struct resource lapic_resource = {
+	.name = "Local APIC",
+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
 /*
  * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
  * IPIs in place of local APIC timers
@@ -585,6 +592,64 @@ static int __init detect_init_APIC (void)
 	return 0;
 }
 
+#ifdef CONFIG_X86_IO_APIC
+static struct resource * __init ioapic_setup_resources(void)
+{
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+	unsigned long n;
+	struct resource *res;
+	char *mem;
+	int i;
+
+	if (nr_ioapics <= 0)
+		return NULL;
+
+	n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+	n *= nr_ioapics;
+
+	mem = alloc_bootmem(n);
+	res = (void *)mem;
+
+	if (mem != NULL) {
+		memset(mem, 0, n);
+		mem += sizeof(struct resource) * nr_ioapics;
+
+		for (i = 0; i < nr_ioapics; i++) {
+			res[i].name = mem;
+			res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+			sprintf(mem,  "IOAPIC %u", i);
+			mem += IOAPIC_RESOURCE_NAME_SIZE;
+		}
+	}
+
+	ioapic_resources = res;
+
+	return res;
+}
+
+static int __init ioapic_insert_resources(void)
+{
+	int i;
+	struct resource *r = ioapic_resources;
+
+	if (!r) {
+		printk("IO APIC resources could be not be allocated.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_ioapics; i++) {
+		insert_resource(&iomem_resource, r);
+		r++;
+	}
+
+	return 0;
+}
+
+/* Insert the IO APIC resources after PCI initialization has occured to handle
+ * IO APICS that are mapped in on a BAR in PCI space. */
+late_initcall(ioapic_insert_resources);
+#endif
+
 void __init init_apic_mappings(void)
 {
 	unsigned long apic_phys;
@@ -604,6 +669,11 @@ void __init init_apic_mappings(void)
 	apic_mapped = 1;
 	apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
 
+	/* Put local APIC into the resource map. */
+	lapic_resource.start = apic_phys;
+	lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+	insert_resource(&iomem_resource, &lapic_resource);
+
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
@@ -613,7 +683,9 @@ void __init init_apic_mappings(void)
 	{
 		unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 		int i;
+		struct resource *ioapic_res;
 
+		ioapic_res = ioapic_setup_resources();
 		for (i = 0; i < nr_ioapics; i++) {
 			if (smp_found_config) {
 				ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -625,6 +697,12 @@ void __init init_apic_mappings(void)
 			apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
 					__fix_to_virt(idx), ioapic_phys);
 			idx++;
+
+			if (ioapic_res != NULL) {
+				ioapic_res->start = ioapic_phys;
+				ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+				ioapic_res++;
+			}
 		}
 	}
 }
-- 
cgit v0.10.2


From c0e84b9901c0924e2503c0aab3772a4469ba4aef Mon Sep 17 00:00:00 2001
From: Amol Lad <amol@verismonetworks.com>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Add iounmap in error paths in hpet code

Signed-off-by: Amol Lad <amol@verismonetworks.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 1a2a979..1e4702d 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -132,14 +132,20 @@ int __init hpet_enable(void)
 	 * the single HPET timer for system time.
 	 */
 #ifdef CONFIG_HPET_EMULATE_RTC
-	if (!(id & HPET_ID_NUMBER))
+	if (!(id & HPET_ID_NUMBER)) {
+		iounmap(hpet_virt_address);
+		hpet_virt_address = NULL;
 		return -1;
+	}
 #endif
 
 
 	hpet_period = hpet_readl(HPET_PERIOD);
-	if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD))
+	if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
+		iounmap(hpet_virt_address);
+		hpet_virt_address = NULL;
 		return -1;
+	}
 
 	/*
 	 * 64 bit math
@@ -156,8 +162,11 @@ int __init hpet_enable(void)
 
 	hpet_use_timer = id & HPET_ID_LEGSUP;
 
-	if (hpet_timer_stop_set_go(hpet_tick))
+	if (hpet_timer_stop_set_go(hpet_tick)) {
+		iounmap(hpet_virt_address);
+		hpet_virt_address = NULL;
 		return -1;
+	}
 
 	use_hpet = 1;
 
-- 
cgit v0.10.2


From fa5cecd111d235819a1d807d43216ae459a0dd6f Mon Sep 17 00:00:00 2001
From: Amol Lad <amol@verismonetworks.com>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: add missing iounmap in i386 hpet clocksource code

ioremap must be balanced by an iounmap and failing to do so can result
in a memory leak.

Tested (compilation only):
- using allmodconfig
- making sure the files are compiling without any warning/error due to
new changes

Signed-off-by: Amol Lad <amol@verismonetworks.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17647a5..45a8685 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void)
 	unsigned long hpet_period;
 	void __iomem* hpet_base;
 	u64 tmp;
+	int err;
 
 	if (!is_hpet_enabled())
 		return -ENODEV;
@@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void)
 	do_div(tmp, FSEC_PER_NSEC);
 	clocksource_hpet.mult = (u32)tmp;
 
-	return clocksource_register(&clocksource_hpet);
+	err = clocksource_register(&clocksource_hpet);
+	if (err)
+		iounmap(hpet_base);
+
+	return err;
 }
 
 module_init(init_hpet_clocksource);
-- 
cgit v0.10.2


From 86efef50cfff9905c4e4ec64f3d3d3b299226674 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] x86-64: x86-64 add Intel Core related PMU MSRs definitions

Add o the x86-64 tree a bunch of MSRs related to performance
monitoring for the processors based on Intel Core microarchitecture.
It also adds some architectural MSRs for PEBS. A similar patch for i386 will
follow.

changelog:
	- add Intel Precise-Event Based sampling (PEBS) related MSR
	- add Intel Data Save (DS) Area related MSR
	- add Intel Core microarchitecure performance counter MSRs

Signed-off-by: stephane eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index 37e1941..a745c50 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -210,6 +210,10 @@ static inline unsigned int cpuid_edx(unsigned int op)
 #define MSR_IA32_LASTINTFROMIP     0x1dd
 #define MSR_IA32_LASTINTTOIP       0x1de
 
+#define MSR_IA32_PEBS_ENABLE		0x3f1
+#define MSR_IA32_DS_AREA		0x600
+#define MSR_IA32_PERF_CAPABILITIES	0x345
+
 #define MSR_MTRRfix64K_00000	0x250
 #define MSR_MTRRfix16K_80000	0x258
 #define MSR_MTRRfix16K_A0000	0x259
@@ -407,4 +411,13 @@ static inline unsigned int cpuid_edx(unsigned int op)
 #define MSR_P4_U2L_ESCR0 		0x3b0
 #define MSR_P4_U2L_ESCR1 		0x3b1
 
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0	0x309
+#define MSR_CORE_PERF_FIXED_CTR1	0x30a
+#define MSR_CORE_PERF_FIXED_CTR2	0x30b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL	0x38d
+#define MSR_CORE_PERF_GLOBAL_STATUS	0x38e
+#define MSR_CORE_PERF_GLOBAL_CTRL	0x38f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL	0x390
+
 #endif
-- 
cgit v0.10.2


From bb0d977ed42c79ed709c79dbab4ff2159941eb2a Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: add Intel Core related PMU MSRs

- add Intel Precise-Event Based sampling (PEBS) related MSR
- add Intel Data Save (DS) Area related MSR
- add Intel Core microarchitecure performance counter MSRs

Signed-off-by: stephane eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 62b76cd..1820d9d 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -141,6 +141,10 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
 #define MSR_IA32_MC0_ADDR		0x402
 #define MSR_IA32_MC0_MISC		0x403
 
+#define MSR_IA32_PEBS_ENABLE		0x3f1
+#define MSR_IA32_DS_AREA		0x600
+#define MSR_IA32_PERF_CAPABILITIES	0x345
+
 /* Pentium IV performance counter MSRs */
 #define MSR_P4_BPU_PERFCTR0 		0x300
 #define MSR_P4_BPU_PERFCTR1 		0x301
@@ -284,4 +288,13 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
 #define MSR_TMTA_LRTI_READOUT		0x80868018
 #define MSR_TMTA_LRTI_VOLT_MHZ		0x8086801a
 
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0	0x309
+#define MSR_CORE_PERF_FIXED_CTR1	0x30a
+#define MSR_CORE_PERF_FIXED_CTR2	0x30b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL	0x38d
+#define MSR_CORE_PERF_GLOBAL_STATUS	0x38e
+#define MSR_CORE_PERF_GLOBAL_CTRL	0x38f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL	0x390
+
 #endif /* __ASM_MSR_H */
-- 
cgit v0.10.2


From da68933e0a999fb13636653c710cca701b457ad2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] x86-64: dump_trace() atomicity fix

Fix

BUG: using smp_processor_id() in preemptible [00000001] code:

in backtracer on preemptible debug kernels.

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index eedd4e7..e37b4d7 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -254,7 +254,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		unsigned long *stack,
 		struct stacktrace_ops *ops, void *data)
 {
-	const unsigned cpu = smp_processor_id();
+	const unsigned cpu = get_cpu();
 	unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
@@ -286,11 +286,11 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 					MSG("Leftover inexact backtrace:");
 					stack = (unsigned long *)UNW_SP(&info);
 					if (!stack)
-						return;
+						goto out;
 				} else
 					MSG("Full inexact backtrace again:");
 			} else if (call_trace >= 1)
-				return;
+				goto out;
 			else
 				MSG("Full inexact backtrace again:");
 		} else
@@ -385,6 +385,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 	tinfo = current_thread_info();
 	HANDLE_STACK (valid_stack_ptr(tinfo, stack));
 #undef HANDLE_STACK
+out:
+	put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
 
-- 
cgit v0.10.2


From bcddc0155f351ab3f06c6ede6d91fd399ef9e18f Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] x86-64: miscellaneous entry.S adjustments

This patch:
- makes ret_from_sys_call no longer global (all external users were
  previously switched to use int_ret_from_sys_call)
- adjusts placement of a CFI_{REMEMBER,RESTORE}_STATE pair to better
  fit logic flow
- eliminates an unnecessary pair of CFI_{REMEMBER,RESTORE}_STATE
- glues together function- and unwinder-wise the previously separate
  system_call and int_ret_from_sys_call function fragments

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7d401b0..601d332 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -230,7 +230,6 @@ ENTRY(system_call)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
-	CFI_REMEMBER_STATE
 	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
@@ -241,7 +240,6 @@ ENTRY(system_call)
  * Syscall return path ending with SYSRET (fast path)
  * Has incomplete stack frame and undefined top of stack. 
  */		
-	.globl ret_from_sys_call
 ret_from_sys_call:
 	movl $_TIF_ALLWORK_MASK,%edi
 	/* edi:	flagmask */
@@ -251,8 +249,8 @@ sysret_check:
 	TRACE_IRQS_OFF
 	movl threadinfo_flags(%rcx),%edx
 	andl %edi,%edx
-	CFI_REMEMBER_STATE
 	jnz  sysret_careful 
+	CFI_REMEMBER_STATE
 	/*
 	 * sysretq will re-enable interrupts:
 	 */
@@ -265,10 +263,10 @@ sysret_check:
 	swapgs
 	sysretq
 
+	CFI_RESTORE_STATE
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */	
 sysret_careful:
-	CFI_RESTORE_STATE
 	bt $TIF_NEED_RESCHED,%edx
 	jnc sysret_signal
 	TRACE_IRQS_ON
@@ -306,7 +304,6 @@ badsys:
 
 	/* Do syscall tracing */
 tracesys:			 
-	CFI_RESTORE_STATE
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp)
 	FIXUP_TOP_OF_STACK %rdi
@@ -322,32 +319,13 @@ tracesys:
 	call *sys_call_table(,%rax,8)
 1:	movq %rax,RAX-ARGOFFSET(%rsp)
 	/* Use IRET because user could have changed frame */
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
-END(system_call)
 		
 /* 
  * Syscall return path ending with IRET.
  * Has correct top of stack, but partial stack frame.
- */ 	
-ENTRY(int_ret_from_sys_call)
-	CFI_STARTPROC	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
-	/*CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
-	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
-	/*CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
-	/*CFI_REL_OFFSET	cs,CS-ARGOFFSET*/
-	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
-	CFI_REL_OFFSET	rdx,RDX-ARGOFFSET
-	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
-	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
-	CFI_REL_OFFSET	rdi,RDI-ARGOFFSET
-	CFI_REL_OFFSET	rsi,RSI-ARGOFFSET
-	CFI_REL_OFFSET	r8,R8-ARGOFFSET
-	CFI_REL_OFFSET	r9,R9-ARGOFFSET
-	CFI_REL_OFFSET	r10,R10-ARGOFFSET
-	CFI_REL_OFFSET	r11,R11-ARGOFFSET
+ */
+	.globl int_ret_from_sys_call
+int_ret_from_sys_call:
 	cli
 	TRACE_IRQS_OFF
 	testl $3,CS-ARGOFFSET(%rsp)
@@ -394,8 +372,6 @@ int_very_careful:
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET -8
 	andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
-	cli
-	TRACE_IRQS_OFF
 	jmp int_restore_rest
 	
 int_signal:
@@ -411,7 +387,7 @@ int_restore_rest:
 	TRACE_IRQS_OFF
 	jmp int_with_check
 	CFI_ENDPROC
-END(int_ret_from_sys_call)
+END(system_call)
 		
 /* 
  * Certain special system calls that need to save a complete full stack frame.
-- 
cgit v0.10.2


From eb5b7b9d86f46b45ba1f986302fdf7df84fb8297 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Use asm-offsets for the offsets of registers into the
 pt_regs struct

Use asm-offsets for the offsets of registers into the pt_regs struct, rather
than having hard-coded constants

I left the constants in the comments of entry.S because they're useful for
reference; the code in entry.S is very dependent on the layout of pt_regs,
even when using asm-offsets.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Keith Owens <kaos@ocs.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index e94d910..70b19807 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -63,6 +63,23 @@ void foo(void)
 	OFFSET(GDS_pad, Xgt_desc_struct, pad);
 	BLANK();
 
+	OFFSET(PT_EBX, pt_regs, ebx);
+	OFFSET(PT_ECX, pt_regs, ecx);
+	OFFSET(PT_EDX, pt_regs, edx);
+	OFFSET(PT_ESI, pt_regs, esi);
+	OFFSET(PT_EDI, pt_regs, edi);
+	OFFSET(PT_EBP, pt_regs, ebp);
+	OFFSET(PT_EAX, pt_regs, eax);
+	OFFSET(PT_DS,  pt_regs, xds);
+	OFFSET(PT_ES,  pt_regs, xes);
+	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
+	OFFSET(PT_EIP, pt_regs, eip);
+	OFFSET(PT_CS,  pt_regs, xcs);
+	OFFSET(PT_EFLAGS, pt_regs, eflags);
+	OFFSET(PT_OLDESP, pt_regs, esp);
+	OFFSET(PT_OLDSS,  pt_regs, xss);
+	BLANK();
+
 	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
 	OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
 	BLANK();
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index c38d801..0069bf0 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -54,22 +54,6 @@
 
 #define nr_syscalls ((syscall_table_size)/4)
 
-EBX		= 0x00
-ECX		= 0x04
-EDX		= 0x08
-ESI		= 0x0C
-EDI		= 0x10
-EBP		= 0x14
-EAX		= 0x18
-DS		= 0x1C
-ES		= 0x20
-ORIG_EAX	= 0x24
-EIP		= 0x28
-CS		= 0x2C
-EFLAGS		= 0x30
-OLDESP		= 0x34
-OLDSS		= 0x38
-
 CF_MASK		= 0x00000001
 TF_MASK		= 0x00000100
 IF_MASK		= 0x00000200
@@ -93,7 +77,7 @@ VM_MASK		= 0x00020000
 
 .macro TRACE_IRQS_IRET
 #ifdef CONFIG_TRACE_IRQFLAGS
-	testl $IF_MASK,EFLAGS(%esp)     # interrupts off?
+	testl $IF_MASK,PT_EFLAGS(%esp)     # interrupts off?
 	jz 1f
 	TRACE_IRQS_ON
 1:
@@ -199,18 +183,18 @@ VM_MASK		= 0x00020000
 #define RING0_PTREGS_FRAME \
 	CFI_STARTPROC simple;\
 	CFI_SIGNAL_FRAME;\
-	CFI_DEF_CFA esp, OLDESP-EBX;\
-	/*CFI_OFFSET cs, CS-OLDESP;*/\
-	CFI_OFFSET eip, EIP-OLDESP;\
-	/*CFI_OFFSET es, ES-OLDESP;*/\
-	/*CFI_OFFSET ds, DS-OLDESP;*/\
-	CFI_OFFSET eax, EAX-OLDESP;\
-	CFI_OFFSET ebp, EBP-OLDESP;\
-	CFI_OFFSET edi, EDI-OLDESP;\
-	CFI_OFFSET esi, ESI-OLDESP;\
-	CFI_OFFSET edx, EDX-OLDESP;\
-	CFI_OFFSET ecx, ECX-OLDESP;\
-	CFI_OFFSET ebx, EBX-OLDESP
+	CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
+	/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
+	CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
+	/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
+	/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
+	CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
+	CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
+	CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
+	CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
+	CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
+	CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+	CFI_OFFSET ebx, PT_EBX-PT_OLDESP
 
 ENTRY(ret_from_fork)
 	CFI_STARTPROC
@@ -242,8 +226,8 @@ ret_from_exception:
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 check_userspace:
-	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
-	movb CS(%esp), %al
+	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
+	movb PT_CS(%esp), %al
 	andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
 	cmpl $USER_RPL, %eax
 	jb resume_kernel		# not returning to v8086 or userspace
@@ -266,7 +250,7 @@ need_resched:
 	movl TI_flags(%ebp), %ecx	# need_resched set ?
 	testb $_TIF_NEED_RESCHED, %cl
 	jz restore_all
-	testl $IF_MASK,EFLAGS(%esp)     # interrupts off (exception path) ?
+	testl $IF_MASK,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
 	jz restore_all
 	call preempt_schedule_irq
 	jmp need_resched
@@ -332,15 +316,15 @@ sysenter_past_esp:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
-	movl %eax,EAX(%esp)
+	movl %eax,PT_EAX(%esp)
 	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
 	jne syscall_exit_work
 /* if something modifies registers it must also disable sysexit */
-	movl EIP(%esp), %edx
-	movl OLDESP(%esp), %ecx
+	movl PT_EIP(%esp), %edx
+	movl PT_OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS_SYSEXIT
@@ -354,7 +338,7 @@ ENTRY(system_call)
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-	testl $TF_MASK,EFLAGS(%esp)
+	testl $TF_MASK,PT_EFLAGS(%esp)
 	jz no_singlestep
 	orl $_TIF_SINGLESTEP,TI_flags(%ebp)
 no_singlestep:
@@ -366,7 +350,7 @@ no_singlestep:
 	jae syscall_badsys
 syscall_call:
 	call *sys_call_table(,%eax,4)
-	movl %eax,EAX(%esp)		# store the return value
+	movl %eax,PT_EAX(%esp)		# store the return value
 syscall_exit:
 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -377,12 +361,12 @@ syscall_exit:
 	jne syscall_exit_work
 
 restore_all:
-	movl EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
-	# Warning: OLDSS(%esp) contains the wrong/random values if we
+	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
+	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
 	# are returning to the kernel.
 	# See comments in process.c:copy_thread() for details.
-	movb OLDSS(%esp), %ah
-	movb CS(%esp), %al
+	movb PT_OLDSS(%esp), %ah
+	movb PT_CS(%esp), %al
 	andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
 	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 	CFI_REMEMBER_STATE
@@ -409,7 +393,7 @@ iret_exc:
 
 	CFI_RESTORE_STATE
 ldt_ss:
-	larl OLDSS(%esp), %eax
+	larl PT_OLDSS(%esp), %eax
 	jnz restore_nocheck
 	testl $0x00400000, %eax		# returning to 32bit stack?
 	jnz restore_nocheck		# allright, normal return
@@ -419,7 +403,7 @@ ldt_ss:
 	 * This is an "official" bug of all the x86-compatible
 	 * CPUs, which we can try to work around to make
 	 * dosemu and wine happy. */
-	movl OLDESP(%esp), %eax
+	movl PT_OLDESP(%esp), %eax
 	movl %esp, %edx
 	call patch_espfix_desc
 	pushl $__ESPFIX_SS
@@ -454,7 +438,7 @@ work_resched:
 
 work_notifysig:				# deal with pending signals and
 					# notify-resume requests
-	testl $VM_MASK, EFLAGS(%esp)
+	testl $VM_MASK, PT_EFLAGS(%esp)
 	movl %esp, %eax
 	jne work_notifysig_v86		# returning to kernel-space or
 					# vm86-space
@@ -479,14 +463,14 @@ work_notifysig_v86:
 	# perform syscall exit tracing
 	ALIGN
 syscall_trace_entry:
-	movl $-ENOSYS,EAX(%esp)
+	movl $-ENOSYS,PT_EAX(%esp)
 	movl %esp, %eax
 	xorl %edx,%edx
 	call do_syscall_trace
 	cmpl $0, %eax
 	jne resume_userspace		# ret != 0 -> running under PTRACE_SYSEMU,
 					# so must skip actual syscall
-	movl ORIG_EAX(%esp), %eax
+	movl PT_ORIG_EAX(%esp), %eax
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
@@ -511,11 +495,11 @@ syscall_fault:
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-	movl $-EFAULT,EAX(%esp)
+	movl $-EFAULT,PT_EAX(%esp)
 	jmp resume_userspace
 
 syscall_badsys:
-	movl $-ENOSYS,EAX(%esp)
+	movl $-ENOSYS,PT_EAX(%esp)
 	jmp resume_userspace
 	CFI_ENDPROC
 
@@ -636,10 +620,10 @@ error_code:
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	/*CFI_REGISTER es, ecx*/
-	movl ES(%esp), %edi		# get the function address
-	movl ORIG_EAX(%esp), %edx	# get the error code
-	movl $-1, ORIG_EAX(%esp)
-	movl %ecx, ES(%esp)
+	movl PT_ES(%esp), %edi		# get the function address
+	movl PT_ORIG_EAX(%esp), %edx	# get the error code
+	movl $-1, PT_ORIG_EAX(%esp)
+	movl %ecx, PT_ES(%esp)
 	/*CFI_REL_OFFSET es, ES*/
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
@@ -942,26 +926,26 @@ ENTRY(arch_unwind_init_running)
 	movl	4(%esp), %edx
 	movl	(%esp), %ecx
 	leal	4(%esp), %eax
-	movl	%ebx, EBX(%edx)
+	movl	%ebx, PT_EBX(%edx)
 	xorl	%ebx, %ebx
-	movl	%ebx, ECX(%edx)
-	movl	%ebx, EDX(%edx)
-	movl	%esi, ESI(%edx)
-	movl	%edi, EDI(%edx)
-	movl	%ebp, EBP(%edx)
-	movl	%ebx, EAX(%edx)
-	movl	$__USER_DS, DS(%edx)
-	movl	$__USER_DS, ES(%edx)
-	movl	%ebx, ORIG_EAX(%edx)
-	movl	%ecx, EIP(%edx)
+	movl	%ebx, PT_ECX(%edx)
+	movl	%ebx, PT_EDX(%edx)
+	movl	%esi, PT_ESI(%edx)
+	movl	%edi, PT_EDI(%edx)
+	movl	%ebp, PT_EBP(%edx)
+	movl	%ebx, PT_EAX(%edx)
+	movl	$__USER_DS, PT_DS(%edx)
+	movl	$__USER_DS, PT_ES(%edx)
+	movl	%ebx, PT_ORIG_EAX(%edx)
+	movl	%ecx, PT_EIP(%edx)
 	movl	12(%esp), %ecx
-	movl	$__KERNEL_CS, CS(%edx)
-	movl	%ebx, EFLAGS(%edx)
-	movl	%eax, OLDESP(%edx)
+	movl	$__KERNEL_CS, PT_CS(%edx)
+	movl	%ebx, PT_EFLAGS(%edx)
+	movl	%eax, PT_OLDESP(%edx)
 	movl	8(%esp), %eax
 	movl	%ecx, 8(%esp)
-	movl	EBX(%edx), %ebx
-	movl	$__KERNEL_DS, OLDSS(%edx)
+	movl	PT_EBX(%edx), %ebx
+	movl	$__KERNEL_DS, PT_OLDSS(%edx)
 	jmpl	*%eax
 	CFI_ENDPROC
 ENDPROC(arch_unwind_init_running)
-- 
cgit v0.10.2


From 9ca36101a8d74704d78f10910f89d62de96f9dc8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Basic definitions for i386-pda

This patch has the basic definitions of struct i386_pda, and the segment
selector in the GDT.

asm-i386/pda.h is more or less a direct copy of asm-x86_64/pda.h.  The most
interesting difference is the use of _proxy_pda, which is used to give gcc a
model for the actual memory operations on the real pda structure.  No actual
reference is ever made to _proxy_pda, so it is never defined.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index b1f1df1..4a83384 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -585,7 +585,7 @@ ENTRY(cpu_gdt_table)
 	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
 
 	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
-	.quad 0x0000000000000000	/* 0xd8 - unused */
+	.quad 0x0000000000000000	/* 0xd8 - PDA */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
 	.quad 0x0000000000000000	/* 0xf0 - unused */
diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h
new file mode 100644
index 0000000..4c39ccb
--- /dev/null
+++ b/include/asm-i386/pda.h
@@ -0,0 +1,95 @@
+/*
+   Per-processor Data Areas
+   Jeremy Fitzhardinge <jeremy@goop.org> 2006
+   Based on asm-x86_64/pda.h by Andi Kleen.
+ */
+#ifndef _I386_PDA_H
+#define _I386_PDA_H
+
+#include <linux/stddef.h>
+
+struct i386_pda
+{
+	struct i386_pda *_pda;		/* pointer to self */
+};
+
+extern struct i386_pda *_cpu_pda[];
+
+#define cpu_pda(i)	(_cpu_pda[i])
+
+#define pda_offset(field) offsetof(struct i386_pda, field)
+
+extern void __bad_pda_field(void);
+
+/* This variable is never instantiated.  It is only used as a stand-in
+   for the real per-cpu PDA memory, so that gcc can understand what
+   memory operations the inline asms() below are performing.  This
+   eliminates the need to make the asms volatile or have memory
+   clobbers, so gcc can readily analyse them. */
+extern struct i386_pda _proxy_pda;
+
+#define pda_to_op(op,field,val)						\
+	do {								\
+		typedef typeof(_proxy_pda.field) T__;			\
+		if (0) { T__ tmp__; tmp__ = (val); }			\
+		switch (sizeof(_proxy_pda.field)) {			\
+		case 1:							\
+			asm(op "b %1,%%gs:%c2"				\
+			    : "+m" (_proxy_pda.field)			\
+			    :"ri" ((T__)val),				\
+			     "i"(pda_offset(field)));			\
+			break;						\
+		case 2:							\
+			asm(op "w %1,%%gs:%c2"				\
+			    : "+m" (_proxy_pda.field)			\
+			    :"ri" ((T__)val),				\
+			     "i"(pda_offset(field)));			\
+			break;						\
+		case 4:							\
+			asm(op "l %1,%%gs:%c2"				\
+			    : "+m" (_proxy_pda.field)			\
+			    :"ri" ((T__)val),				\
+			     "i"(pda_offset(field)));			\
+			break;						\
+		default: __bad_pda_field();				\
+		}							\
+	} while (0)
+
+#define pda_from_op(op,field)						\
+	({								\
+		typeof(_proxy_pda.field) ret__;				\
+		switch (sizeof(_proxy_pda.field)) {			\
+		case 1:							\
+			asm(op "b %%gs:%c1,%0"				\
+			    : "=r" (ret__)				\
+			    : "i" (pda_offset(field)),			\
+			      "m" (_proxy_pda.field));			\
+			break;						\
+		case 2:							\
+			asm(op "w %%gs:%c1,%0"				\
+			    : "=r" (ret__)				\
+			    : "i" (pda_offset(field)),			\
+			      "m" (_proxy_pda.field));			\
+			break;						\
+		case 4:							\
+			asm(op "l %%gs:%c1,%0"				\
+			    : "=r" (ret__)				\
+			    : "i" (pda_offset(field)),			\
+			      "m" (_proxy_pda.field));			\
+			break;						\
+		default: __bad_pda_field();				\
+		}							\
+		ret__; })
+
+/* Return a pointer to a pda field */
+#define pda_addr(field)							\
+	((typeof(_proxy_pda.field) *)((unsigned char *)read_pda(_pda) + \
+				      pda_offset(field)))
+
+#define read_pda(field) pda_from_op("mov",field)
+#define write_pda(field,val) pda_to_op("mov",field,val)
+#define add_pda(field,val) pda_to_op("add",field,val)
+#define sub_pda(field,val) pda_to_op("sub",field,val)
+#define or_pda(field,val) pda_to_op("or",field,val)
+
+#endif	/* _I386_PDA_H */
diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h
index b7ab596..5bdda79 100644
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -39,7 +39,7 @@
  *  25 - APM BIOS support 
  *
  *  26 - ESPFIX small SS
- *  27 - unused
+ *  27 - PDA				[ per-cpu private data area ]
  *  28 - unused
  *  29 - unused
  *  30 - unused
@@ -74,6 +74,9 @@
 #define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE + 14)
 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
 
+#define GDT_ENTRY_PDA			(GDT_ENTRY_KERNEL_BASE + 15)
+#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
+
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
 /*
-- 
cgit v0.10.2


From 62111195800d80c66cdc69063ea3145878c99fbf Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Initialize the per-CPU data area

When a CPU is brought up, a PDA and GDT are allocated for it.  The GDT's
__KERNEL_PDA entry is pointed to the allocated PDA memory, so that all
references using this segment descriptor will refer to the PDA.

This patch rearranges CPU initialization a bit, so that the GDT/PDA are set up
as early as possible in cpu_init().  Also for secondary CPUs, GDT+PDA are
preallocated and initialized so all the secondary CPU needs to do is set up
the ldt and load %gs.  This will be important once smp_processor_id() and
current use the PDA.

In all cases, the PDA is set up in head.S, before a CPU starts running C code,
so the PDA is always available.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Cc: James Bottomley <James.Bottomley@SteelEye.com>
Cc: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 5532fc4..2534e25 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,12 +18,16 @@
 #include <asm/apic.h>
 #include <mach_apic.h>
 #endif
+#include <asm/pda.h>
 
 #include "cpu.h"
 
 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
+struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_fxsr __cpuinitdata;
 static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -588,41 +592,16 @@ void __init early_cpu_init(void)
 	disable_pse = 1;
 #endif
 }
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- */
-void __cpuinit cpu_init(void)
+
+__cpuinit int alloc_gdt(int cpu)
 {
-	int cpu = smp_processor_id();
-	struct tss_struct * t = &per_cpu(init_tss, cpu);
-	struct thread_struct *thread = &current->thread;
-	struct desc_struct *gdt;
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt;
+	struct i386_pda *pda;
 
-	if (cpu_test_and_set(cpu, cpu_initialized)) {
-		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-		for (;;) local_irq_enable();
-	}
-	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+	gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	pda = cpu_pda(cpu);
 
-	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
-		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-	if (tsc_disable && cpu_has_tsc) {
-		printk(KERN_NOTICE "Disabling TSC...\n");
-		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
-		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
-		set_in_cr4(X86_CR4_TSD);
-	}
-
-	/* The CPU hotplug case */
-	if (cpu_gdt_descr->address) {
-		gdt = (struct desc_struct *)cpu_gdt_descr->address;
-		memset(gdt, 0, PAGE_SIZE);
-		goto old_gdt;
-	}
 	/*
 	 * This is a horrible hack to allocate the GDT.  The problem
 	 * is that cpu_init() is called really early for the boot CPU
@@ -630,36 +609,117 @@ void __cpuinit cpu_init(void)
 	 * CPUs, when bootmem will have gone away
 	 */
 	if (NODE_DATA(0)->bdata->node_bootmem_map) {
-		gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
-		/* alloc_bootmem_pages panics on failure, so no check */
+		BUG_ON(gdt != NULL || pda != NULL);
+
+		gdt = alloc_bootmem_pages(PAGE_SIZE);
+		pda = alloc_bootmem(sizeof(*pda));
+		/* alloc_bootmem(_pages) panics on failure, so no check */
+
 		memset(gdt, 0, PAGE_SIZE);
+		memset(pda, 0, sizeof(*pda));
 	} else {
-		gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
-		if (unlikely(!gdt)) {
-			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-			for (;;)
-				local_irq_enable();
+		/* GDT and PDA might already have been allocated if
+		   this is a CPU hotplug re-insertion. */
+		if (gdt == NULL)
+			gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
+
+		if (pda == NULL)
+			pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
+
+		if (unlikely(!gdt || !pda)) {
+			free_pages((unsigned long)gdt, 0);
+			kfree(pda);
+			return 0;
 		}
 	}
-old_gdt:
+
+ 	cpu_gdt_descr->address = (unsigned long)gdt;
+	cpu_pda(cpu) = pda;
+
+	return 1;
+}
+
+/* Initial PDA used by boot CPU */
+struct i386_pda boot_pda = {
+	._pda = &boot_pda,
+};
+
+/* Initialize the CPU's GDT and PDA.  The boot CPU does this for
+   itself, but secondaries find this done for them. */
+__cpuinit int init_gdt(int cpu, struct task_struct *idle)
+{
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt;
+	struct i386_pda *pda;
+
+	/* For non-boot CPUs, the GDT and PDA should already have been
+	   allocated. */
+	if (!alloc_gdt(cpu)) {
+		printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
+		return 0;
+	}
+
+	gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	pda = cpu_pda(cpu);
+
+	BUG_ON(gdt == NULL || pda == NULL);
+
 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
 	 * and set up the GDT descriptor:
 	 */
  	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
 	cpu_gdt_descr->size = GDT_SIZE - 1;
- 	cpu_gdt_descr->address = (unsigned long)gdt;
 
+	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
+			(u32 *)&gdt[GDT_ENTRY_PDA].b,
+			(unsigned long)pda, sizeof(*pda) - 1,
+			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
+
+	memset(pda, 0, sizeof(*pda));
+	pda->_pda = pda;
+
+	return 1;
+}
+
+/* Common CPU init for both boot and secondary CPUs */
+static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
+{
+	struct tss_struct * t = &per_cpu(init_tss, cpu);
+	struct thread_struct *thread = &curr->thread;
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+
+	/* Reinit these anyway, even if they've already been done (on
+	   the boot CPU, this will transition from the boot gdt+pda to
+	   the real ones). */
 	load_gdt(cpu_gdt_descr);
+
+	if (cpu_test_and_set(cpu, cpu_initialized)) {
+		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
+		for (;;) local_irq_enable();
+	}
+
+	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
+		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+	if (tsc_disable && cpu_has_tsc) {
+		printk(KERN_NOTICE "Disabling TSC...\n");
+		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
+		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+		set_in_cr4(X86_CR4_TSD);
+	}
+
 	load_idt(&idt_descr);
 
 	/*
 	 * Set up and load the per-CPU TSS and LDT
 	 */
 	atomic_inc(&init_mm.mm_count);
-	current->active_mm = &init_mm;
-	BUG_ON(current->mm);
-	enter_lazy_tlb(&init_mm, current);
+	curr->active_mm = &init_mm;
+	if (curr->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, curr);
 
 	load_esp0(t, thread);
 	set_tss_desc(cpu,t);
@@ -690,6 +750,37 @@ old_gdt:
 	mxcsr_feature_mask_init();
 }
 
+/* Entrypoint to initialize secondary CPU */
+void __cpuinit secondary_cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
+
+	_cpu_init(cpu, curr);
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
+
+	/* Set up the real GDT and PDA, so we can transition from the
+	   boot versions. */
+	if (!init_gdt(cpu, curr)) {
+		/* failed to allocate something; not much we can do... */
+		for (;;)
+			local_irq_enable();
+	}
+
+	_cpu_init(cpu, curr);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 void __cpuinit cpu_uninit(void)
 {
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4bb8b77..0956366 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -52,6 +52,7 @@
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
 #include <asm/nmi.h>
+#include <asm/pda.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
@@ -536,11 +537,11 @@ set_cpu_sibling_map(int cpu)
 static void __devinit start_secondary(void *unused)
 {
 	/*
-	 * Dont put anything before smp_callin(), SMP
+	 * Don't put *anything* before secondary_cpu_init(), SMP
 	 * booting is too fragile that we want to limit the
 	 * things done here to the most necessary things.
 	 */
-	cpu_init();
+	secondary_cpu_init();
 	preempt_disable();
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +600,16 @@ void __devinit initialize_secondary(void)
 		"movl %0,%%esp\n\t"
 		"jmp *%1"
 		:
-		:"r" (current->thread.esp),"r" (current->thread.eip));
+		:"m" (current->thread.esp),"m" (current->thread.eip));
 }
 
+/* Static state in head.S used to set up a CPU */
 extern struct {
 	void * esp;
 	unsigned short ss;
 } stack_start;
+extern struct i386_pda *start_pda;
+extern struct Xgt_desc_struct cpu_gdt_descr;
 
 #ifdef CONFIG_NUMA
 
@@ -936,9 +940,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;
 
-	++cpucount;
-	alternatives_smp_switch(1);
-
 	/*
 	 * We can't use kernel_thread since we must avoid to
 	 * reschedule the child.
@@ -946,15 +947,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
 	idle = alloc_idle_task(cpu);
 	if (IS_ERR(idle))
 		panic("failed fork for CPU %d", cpu);
+
+	/* Pre-allocate and initialize the CPU's GDT and PDA so it
+	   doesn't have to do any memory allocation during the
+	   delicate CPU-bringup phase. */
+	if (!init_gdt(cpu, idle)) {
+		printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
+		return -1;	/* ? */
+	}
+
 	idle->thread.eip = (unsigned long) start_secondary;
 	/* start_eip had better be page-aligned! */
 	start_eip = setup_trampoline();
 
+	++cpucount;
+	alternatives_smp_switch(1);
+
 	/* So we see what's up   */
 	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	stack_start.esp = (void *) idle->thread.esp;
 
+	start_pda = cpu_pda(cpu);
+	cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
+
 	irq_ctx_init(cpu);
 
 	x86_cpu_to_apicid[cpu] = apicid;
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index f3fea2a..55428e6 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -28,6 +28,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/arch_hooks.h>
+#include <asm/pda.h>
 
 /* TLB state -- visible externally, indexed physically */
 DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 };
@@ -422,6 +423,7 @@ find_smp_config(void)
 	     VOYAGER_SUS_IN_CONTROL_PORT);
 
 	current_thread_info()->cpu = boot_cpu_id;
+	write_pda(cpu_number, boot_cpu_id);
 }
 
 /*
@@ -458,7 +460,7 @@ start_secondary(void *unused)
 	/* external functions not defined in the headers */
 	extern void calibrate_delay(void);
 
-	cpu_init();
+	secondary_cpu_init();
 
 	/* OK, we're in the routine */
 	ack_CPI(VIC_CPU_BOOT_CPI);
@@ -578,6 +580,15 @@ do_boot_cpu(__u8 cpu)
 	/* init_tasks (in sched.c) is indexed logically */
 	stack_start.esp = (void *) idle->thread.esp;
 
+	/* Pre-allocate and initialize the CPU's GDT and PDA so it
+	   doesn't have to do any memory allocation during the
+	   delicate CPU-bringup phase. */
+	if (!init_gdt(cpu, idle)) {
+		printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
+		cpucount--;
+		return;
+	}
+
 	irq_ctx_init(cpu);
 
 	/* Note: Don't modify initial ss override */
@@ -1963,4 +1974,5 @@ void __init
 smp_setup_processor_id(void)
 {
 	current_thread_info()->cpu = hard_smp_processor_id();
+	write_pda(cpu_number, hard_smp_processor_id());
 }
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index e0ddca9..a9f2041 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -727,4 +727,7 @@ extern unsigned long boot_option_idle_override;
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
+extern int init_gdt(int cpu, struct task_struct *idle);
+extern void secondary_cpu_init(void);
+
 #endif /* __ASM_I386_PROCESSOR_H */
-- 
cgit v0.10.2


From f95d47caae5302a63d92be9a0292abc90e2a14e1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Use %gs as the PDA base-segment in the kernel

This patch is the meat of the PDA change.  This patch makes several related
changes:

1: Most significantly, %gs is now used in the kernel.  This means that on
   entry, the old value of %gs is saved away, and it is reloaded with
   __KERNEL_PDA.

2: entry.S constructs the stack in the shape of struct pt_regs, and this
   is passed around the kernel so that the process's saved register
   state can be accessed.

   Unfortunately struct pt_regs doesn't currently have space for %gs
   (or %fs). This patch extends pt_regs to add space for gs (no space
   is allocated for %fs, since it won't be used, and it would just
   complicate the code in entry.S to work around the space).

3: Because %gs is now saved on the stack like %ds, %es and the integer
   registers, there are a number of places where it no longer needs to
   be handled specially; namely context switch, and saving/restoring the
   register state in a signal context.

4: And since kernel threads run in kernel space and call normal kernel
   code, they need to be created with their %gs == __KERNEL_PDA.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 70b19807..9620872 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -72,6 +72,7 @@ void foo(void)
 	OFFSET(PT_EAX, pt_regs, eax);
 	OFFSET(PT_DS,  pt_regs, xds);
 	OFFSET(PT_ES,  pt_regs, xes);
+	OFFSET(PT_GS,  pt_regs, xgs);
 	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
 	OFFSET(PT_EIP, pt_regs, eip);
 	OFFSET(PT_CS,  pt_regs, xcs);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 2534e25..4e63d8c 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -593,6 +593,14 @@ void __init early_cpu_init(void)
 #endif
 }
 
+/* Make sure %gs is initialized properly in idle threads */
+struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+	regs->xgs = __KERNEL_PDA;
+	return regs;
+}
+
 __cpuinit int alloc_gdt(int cpu)
 {
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
@@ -644,6 +652,14 @@ struct i386_pda boot_pda = {
 	._pda = &boot_pda,
 };
 
+static inline void set_kernel_gs(void)
+{
+	/* Set %gs for this CPU's PDA.  Memory clobber is to create a
+	   barrier with respect to any PDA operations, so the compiler
+	   doesn't move any before here. */
+	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+}
+
 /* Initialize the CPU's GDT and PDA.  The boot CPU does this for
    itself, but secondaries find this done for them. */
 __cpuinit int init_gdt(int cpu, struct task_struct *idle)
@@ -693,6 +709,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	   the boot CPU, this will transition from the boot gdt+pda to
 	   the real ones). */
 	load_gdt(cpu_gdt_descr);
+	set_kernel_gs();
 
 	if (cpu_test_and_set(cpu, cpu_initialized)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -731,8 +748,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-	/* Clear %fs and %gs. */
-	asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
+	/* Clear %fs. */
+	asm volatile ("mov %0, %%fs" : : "r" (0));
 
 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 0069bf0..b99d4a1 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
  *	18(%esp) - %eax
  *	1C(%esp) - %ds
  *	20(%esp) - %es
- *	24(%esp) - orig_eax
- *	28(%esp) - %eip
- *	2C(%esp) - %cs
- *	30(%esp) - %eflags
- *	34(%esp) - %oldesp
- *	38(%esp) - %oldss
+ *	24(%esp) - %gs
+ *	28(%esp) - orig_eax
+ *	2C(%esp) - %eip
+ *	30(%esp) - %cs
+ *	34(%esp) - %eflags
+ *	38(%esp) - %oldesp
+ *	3C(%esp) - %oldss
  *
  * "current" is in register %ebx during any slow entries.
  */
@@ -92,6 +93,9 @@ VM_MASK		= 0x00020000
 
 #define SAVE_ALL \
 	cld; \
+	pushl %gs; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	/*CFI_REL_OFFSET gs, 0;*/\
 	pushl %es; \
 	CFI_ADJUST_CFA_OFFSET 4;\
 	/*CFI_REL_OFFSET es, 0;*/\
@@ -121,7 +125,9 @@ VM_MASK		= 0x00020000
 	CFI_REL_OFFSET ebx, 0;\
 	movl $(__USER_DS), %edx; \
 	movl %edx, %ds; \
-	movl %edx, %es;
+	movl %edx, %es; \
+	movl $(__KERNEL_PDA), %edx; \
+	movl %edx, %gs
 
 #define RESTORE_INT_REGS \
 	popl %ebx;	\
@@ -154,17 +160,22 @@ VM_MASK		= 0x00020000
 2:	popl %es;	\
 	CFI_ADJUST_CFA_OFFSET -4;\
 	/*CFI_RESTORE es;*/\
-.section .fixup,"ax";	\
-3:	movl $0,(%esp);	\
-	jmp 1b;		\
+3:	popl %gs;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	/*CFI_RESTORE gs;*/\
+.pushsection .fixup,"ax";	\
 4:	movl $0,(%esp);	\
+	jmp 1b;		\
+5:	movl $0,(%esp);	\
 	jmp 2b;		\
-.previous;		\
+6:	movl $0,(%esp);	\
+	jmp 3b;		\
 .section __ex_table,"a";\
 	.align 4;	\
-	.long 1b,3b;	\
-	.long 2b,4b;	\
-.previous
+	.long 1b,4b;	\
+	.long 2b,5b;	\
+	.long 3b,6b;	\
+.popsection
 
 #define RING0_INT_FRAME \
 	CFI_STARTPROC simple;\
@@ -231,6 +242,7 @@ check_userspace:
 	andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
 	cmpl $USER_RPL, %eax
 	jb resume_kernel		# not returning to v8086 or userspace
+
 ENTRY(resume_userspace)
  	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -327,9 +339,16 @@ sysenter_past_esp:
 	movl PT_OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
+1:	mov  PT_GS(%esp), %gs
 	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
-
+.pushsection .fixup,"ax"
+2:	movl $0,PT_GS(%esp)
+	jmp 1b
+.section __ex_table,"a"
+	.align 4
+	.long 1b,2b
+.popsection
 
 	# system call handler stub
 ENTRY(system_call)
@@ -375,7 +394,7 @@ restore_nocheck:
 	TRACE_IRQS_IRET
 restore_nocheck_notrace:
 	RESTORE_REGS
-	addl $4, %esp
+	addl $4, %esp			# skip orig_eax/error_code
 	CFI_ADJUST_CFA_OFFSET -4
 1:	INTERRUPT_RETURN
 .section .fixup,"ax"
@@ -588,6 +607,10 @@ KPROBE_ENTRY(page_fault)
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
+	/* the function address is in %gs's slot on the stack */
+	pushl %es
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET es, 0*/
 	pushl %ds
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ds, 0*/
@@ -613,18 +636,20 @@ error_code:
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ebx, 0
 	cld
-	pushl %es
+	pushl %gs
 	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET es, 0*/
+	/*CFI_REL_OFFSET gs, 0*/
+	movl $(__KERNEL_PDA), %ecx
+	movl %ecx, %gs
 	UNWIND_ESPFIX_STACK
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	/*CFI_REGISTER es, ecx*/
-	movl PT_ES(%esp), %edi		# get the function address
+	movl PT_GS(%esp), %edi		# get the function address
 	movl PT_ORIG_EAX(%esp), %edx	# get the error code
-	movl $-1, PT_ORIG_EAX(%esp)
-	movl %ecx, PT_ES(%esp)
-	/*CFI_REL_OFFSET es, ES*/
+	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
+	mov  %ecx, PT_GS(%esp)
+	/*CFI_REL_OFFSET gs, ES*/
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
 	movl %ecx, %es
@@ -936,6 +961,7 @@ ENTRY(arch_unwind_init_running)
 	movl	%ebx, PT_EAX(%edx)
 	movl	$__USER_DS, PT_DS(%edx)
 	movl	$__USER_DS, PT_ES(%edx)
+	movl	$0, PT_GS(%edx)
 	movl	%ebx, PT_ORIG_EAX(%edx)
 	movl	%ecx, PT_EIP(%edx)
 	movl	12(%esp), %ecx
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 4a83384..5b14e95 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -302,6 +302,7 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%cr0
 
 	call check_x87
+	call setup_pda
 	lgdt cpu_gdt_descr
 	lidt idt_descr
 	ljmp $(__KERNEL_CS),$1f
@@ -312,10 +313,13 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%ds
 	movl %eax,%es
 
-	xorl %eax,%eax			# Clear FS/GS and LDT
+	xorl %eax,%eax			# Clear FS and LDT
 	movl %eax,%fs
-	movl %eax,%gs
 	lldt %ax
+
+	movl $(__KERNEL_PDA),%eax
+	mov  %eax,%gs
+
 	cld			# gcc2 wants the direction flag cleared at all times
 	pushl $0		# fake return address for unwinder
 #ifdef CONFIG_SMP
@@ -346,6 +350,23 @@ check_x87:
 	ret
 
 /*
+ * Point the GDT at this CPU's PDA.  On boot this will be
+ * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
+ * that CPU's GDT and PDA.
+ */
+setup_pda:
+	/* get the PDA pointer */
+	movl start_pda, %eax
+
+	/* slot the PDA address into the GDT */
+	mov cpu_gdt_descr+2, %ecx
+	mov %ax, (__KERNEL_PDA+0+2)(%ecx)		/* base & 0x0000ffff */
+	shr $16, %eax
+	mov %al, (__KERNEL_PDA+4+0)(%ecx)		/* base & 0x00ff0000 */
+	mov %ah, (__KERNEL_PDA+4+3)(%ecx)		/* base & 0xff000000 */
+	ret
+
+/*
  *  setup_idt
  *
  *  sets up a idt with 256 entries pointing to
@@ -484,6 +505,8 @@ ENTRY(empty_zero_page)
  * This starts the data section.
  */
 .data
+ENTRY(start_pda)
+	.long boot_pda
 
 ENTRY(stack_start)
 	.long init_thread_union+THREAD_SIZE
@@ -525,7 +548,7 @@ idt_descr:
 
 # boot GDT descriptor (later on used by CPU#0):
 	.word 0				# 32 bit align gdt_desc.address
-cpu_gdt_descr:
+ENTRY(cpu_gdt_descr)
 	.word GDT_ENTRIES*8-1
 	.long cpu_gdt_table
 
@@ -585,7 +608,7 @@ ENTRY(cpu_gdt_table)
 	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
 
 	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
-	.quad 0x0000000000000000	/* 0xd8 - PDA */
+	.quad 0x00cf92000000ffff	/* 0xd8 - PDA */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
 	.quad 0x0000000000000000	/* 0xf0 - unused */
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index ae924c4..905364d 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@
 
 #include <asm/tlbflush.h>
 #include <asm/cpu.h>
+#include <asm/pda.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -346,6 +347,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 
 	regs.xds = __USER_DS;
 	regs.xes = __USER_DS;
+	regs.xgs = __KERNEL_PDA;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
 	regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -431,7 +433,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
 	p->thread.eip = (unsigned long) ret_from_fork;
 
 	savesegment(fs,p->thread.fs);
-	savesegment(gs,p->thread.gs);
 
 	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -659,16 +660,16 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	load_esp0(tss, next);
 
 	/*
-	 * Save away %fs and %gs. No need to save %es and %ds, as
-	 * those are always kernel segments while inside the kernel.
-	 * Doing this before setting the new TLS descriptors avoids
-	 * the situation where we temporarily have non-reloadable
-	 * segments in %fs and %gs.  This could be an issue if the
-	 * NMI handler ever used %fs or %gs (it does not today), or
-	 * if the kernel is running inside of a hypervisor layer.
+	 * Save away %fs. No need to save %gs, as it was saved on the
+	 * stack on entry.  No need to save %es and %ds, as those are
+	 * always kernel segments while inside the kernel.  Doing this
+	 * before setting the new TLS descriptors avoids the situation
+	 * where we temporarily have non-reloadable segments in %fs
+	 * and %gs.  This could be an issue if the NMI handler ever
+	 * used %fs or %gs (it does not today), or if the kernel is
+	 * running inside of a hypervisor layer.
 	 */
 	savesegment(fs, prev->fs);
-	savesegment(gs, prev->gs);
 
 	/*
 	 * Load the per-thread Thread-Local Storage descriptor.
@@ -676,16 +677,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	load_TLS(next, cpu);
 
 	/*
-	 * Restore %fs and %gs if needed.
+	 * Restore %fs if needed.
 	 *
-	 * Glibc normally makes %fs be zero, and %gs is one of
-	 * the TLS segments.
+	 * Glibc normally makes %fs be zero.
 	 */
 	if (unlikely(prev->fs | next->fs))
 		loadsegment(fs, next->fs);
 
-	if (prev->gs | next->gs)
-		loadsegment(gs, next->gs);
 
 	/*
 	 * Restore IOPL if needed.
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cf..65d7620 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
 			 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
 			 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
 
-	GET_SEG(gs);
+	COPY_SEG(gs);
 	GET_SEG(fs);
 	COPY_SEG(es);
 	COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 {
 	int tmp, err = 0;
 
-	tmp = 0;
-	savesegment(gs, tmp);
-	err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+	err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
 	savesegment(fs, tmp);
 	err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
 
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 1b14953..68ff102 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -62,8 +62,8 @@ static inline void switch_mm(struct mm_struct *prev,
 #endif
 }
 
-#define deactivate_mm(tsk, mm) \
-	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
+#define deactivate_mm(tsk, mm)			\
+	asm("movl %0,%%fs": :"r" (0));
 
 #define activate_mm(prev, next) \
 	switch_mm((prev),(next),NULL)
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index a9f2041..f73cf83 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -473,6 +473,7 @@ struct thread_struct {
 	.vm86_info = NULL,						\
 	.sysenter_cs = __KERNEL_CS,					\
 	.io_bitmap_ptr = NULL,						\
+	.gs = __KERNEL_PDA,						\
 }
 
 /*
@@ -500,7 +501,8 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
 }
 
 #define start_thread(regs, new_eip, new_esp) do {		\
-	__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));	\
+	__asm__("movl %0,%%fs": :"r" (0));			\
+	regs->xgs = 0;						\
 	set_fs(USER_DS);					\
 	regs->xds = __USER_DS;					\
 	regs->xes = __USER_DS;					\
diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index d505f50..bdbc894 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -16,6 +16,8 @@ struct pt_regs {
 	long eax;
 	int  xds;
 	int  xes;
+	/* int  xfs; */
+	int  xgs;
 	long orig_eax;
 	long eip;
 	int  xcs;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e7..fd22245 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1303,7 +1303,7 @@ fork_out:
 	return ERR_PTR(retval);
 }
 
-struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
 	return regs;
-- 
cgit v0.10.2


From 66e10a44d724f1464b5e8b5a3eae1e2cbbc2cca6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Fix places where using %gs changes the usermode ABI

There are a few places where the change in struct pt_regs and the use of %gs
affect the userspace ABI.  These are primarily debugging interfaces where
thread state can be inspected or extracted.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 905364d..dc42725 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -315,8 +315,8 @@ void show_regs(struct pt_regs * regs)
 		regs->eax,regs->ebx,regs->ecx,regs->edx);
 	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
 		regs->esi, regs->edi, regs->ebp);
-	printk(" DS: %04x ES: %04x\n",
-		0xffff & regs->xds,0xffff & regs->xes);
+	printk(" DS: %04x ES: %04x GS: %04x\n",
+	       0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
@@ -509,7 +509,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
 	dump->regs.ds = regs->xds;
 	dump->regs.es = regs->xes;
 	savesegment(fs,dump->regs.fs);
-	savesegment(gs,dump->regs.gs);
+	dump->regs.gs = regs->xgs;
 	dump->regs.orig_eax = regs->orig_eax;
 	dump->regs.eip = regs->eip;
 	dump->regs.cs = regs->xcs;
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 775f50e..f3f94ac 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -94,13 +94,9 @@ static int putreg(struct task_struct *child,
 				return -EIO;
 			child->thread.fs = value;
 			return 0;
-		case GS:
-			if (value && (value & 3) != 3)
-				return -EIO;
-			child->thread.gs = value;
-			return 0;
 		case DS:
 		case ES:
+		case GS:
 			if (value && (value & 3) != 3)
 				return -EIO;
 			value &= 0xffff;
@@ -116,8 +112,8 @@ static int putreg(struct task_struct *child,
 			value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
 			break;
 	}
-	if (regno > GS*4)
-		regno -= 2*4;
+	if (regno > ES*4)
+		regno -= 1*4;
 	put_stack_long(child, regno - sizeof(struct pt_regs), value);
 	return 0;
 }
@@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child,
 		case FS:
 			retval = child->thread.fs;
 			break;
-		case GS:
-			retval = child->thread.gs;
-			break;
 		case DS:
 		case ES:
+		case GS:
 		case SS:
 		case CS:
 			retval = 0xffff;
 			/* fall through */
 		default:
-			if (regno > GS*4)
-				regno -= 2*4;
+			if (regno > ES*4)
+				regno -= 1*4;
 			regno = regno - sizeof(struct pt_regs);
 			retval &= get_stack_long(child, regno);
 	}
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 3a05436..45d21a0 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -91,7 +91,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
 	pr_reg[7] = regs->xds;				\
 	pr_reg[8] = regs->xes;				\
 	savesegment(fs,pr_reg[9]);			\
-	savesegment(gs,pr_reg[10]);			\
+	pr_reg[10] = regs->xgs;				\
 	pr_reg[11] = regs->orig_eax;			\
 	pr_reg[12] = regs->eip;				\
 	pr_reg[13] = regs->xcs;				\
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index 5031d69..601fc67 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -71,6 +71,7 @@ static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
 	info->regs.xss = __KERNEL_DS;
 	info->regs.xds = __USER_DS;
 	info->regs.xes = __USER_DS;
+	info->regs.xgs = __KERNEL_PDA;
 }
 
 extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *,
-- 
cgit v0.10.2


From 49d26b6eaa8e970c8cf6e299e6ccba2474191bf5 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: Update sys_vm86 to cope with changed pt_regs and %gs
 usage

sys_vm86 uses a struct kernel_vm86_regs, which is identical to pt_regs, but
adds an extra space for all the segment registers.  Previously this structure
was completely independent, so changes in pt_regs had to be reflected in
kernel_vm86_regs.  This changes just embeds pt_regs in kernel_vm86_regs, and
makes the appropriate changes to vm86.c to deal with the new naming.

Also, since %gs is dealt with differently in the kernel, this change adjusts
vm86.c to reflect this.

While making these changes, I also cleaned up some frankly bizarre code which
was added when auditing was added to sys_vm86.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index cbcd61d..be2f96e 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
 #include <linux/audit.h>
+#include <linux/stddef.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -72,10 +73,10 @@
 /*
  * 8- and 16-bit register defines..
  */
-#define AL(regs)	(((unsigned char *)&((regs)->eax))[0])
-#define AH(regs)	(((unsigned char *)&((regs)->eax))[1])
-#define IP(regs)	(*(unsigned short *)&((regs)->eip))
-#define SP(regs)	(*(unsigned short *)&((regs)->esp))
+#define AL(regs)	(((unsigned char *)&((regs)->pt.eax))[0])
+#define AH(regs)	(((unsigned char *)&((regs)->pt.eax))[1])
+#define IP(regs)	(*(unsigned short *)&((regs)->pt.eip))
+#define SP(regs)	(*(unsigned short *)&((regs)->pt.esp))
 
 /*
  * virtual flags (16 and 32-bit versions)
@@ -89,10 +90,37 @@
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)
 
-#define VM86_REGS_PART2 orig_eax
-#define VM86_REGS_SIZE1 \
-        ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) )
-#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1)
+/* convert kernel_vm86_regs to vm86_regs */
+static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
+				  const struct kernel_vm86_regs *regs)
+{
+	int ret = 0;
+
+	/* kernel_vm86_regs is missing xfs, so copy everything up to
+	   (but not including) xgs, and then rest after xgs. */
+	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
+	ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
+			    sizeof(struct kernel_vm86_regs) -
+			    offsetof(struct kernel_vm86_regs, pt.xgs));
+
+	return ret;
+}
+
+/* convert vm86_regs to kernel_vm86_regs */
+static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
+				    const struct vm86_regs __user *user,
+				    unsigned extra)
+{
+	int ret = 0;
+
+	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
+	ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
+			      sizeof(struct kernel_vm86_regs) -
+			      offsetof(struct kernel_vm86_regs, pt.xgs) +
+			      extra);
+
+	return ret;
+}
 
 struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
 struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
@@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 		printk("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
-	tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1);
-	tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
-		&regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
+	set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
+	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
 	tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
 	if (tmp) {
 		printk("vm86: could not access userspace vm86_info\n");
@@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 	current->thread.saved_esp0 = 0;
 	put_cpu();
 
-	loadsegment(fs, current->thread.saved_fs);
-	loadsegment(gs, current->thread.saved_gs);
 	ret = KVM86->regs32;
+
+	loadsegment(fs, current->thread.saved_fs);
+	ret->xgs = current->thread.saved_gs;
+
 	return ret;
 }
 
@@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
 	tsk = current;
 	if (tsk->thread.saved_esp0)
 		goto out;
-	tmp  = copy_from_user(&info, v86, VM86_REGS_SIZE1);
-	tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
-		(long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2);
+	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+				       offsetof(struct kernel_vm86_struct, vm86plus) -
+				       sizeof(info.regs));
 	ret = -EFAULT;
 	if (tmp)
 		goto out;
@@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
 	if (tsk->thread.saved_esp0)
 		goto out;
 	v86 = (struct vm86plus_struct __user *)regs.ecx;
-	tmp  = copy_from_user(&info, v86, VM86_REGS_SIZE1);
-	tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
-		(long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2);
+	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+				       offsetof(struct kernel_vm86_struct, regs32) -
+				       sizeof(info.regs));
 	ret = -EFAULT;
 	if (tmp)
 		goto out;
@@ -252,15 +280,15 @@ out:
 static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
 {
 	struct tss_struct *tss;
-	long eax;
 /*
  * make sure the vm86() system call doesn't try to do anything silly
  */
-	info->regs.__null_ds = 0;
-	info->regs.__null_es = 0;
+	info->regs.pt.xds = 0;
+	info->regs.pt.xes = 0;
+	info->regs.pt.xgs = 0;
 
-/* we are clearing fs,gs later just before "jmp resume_userspace",
- * because starting with Linux 2.1.x they aren't no longer saved/restored
+/* we are clearing fs later just before "jmp resume_userspace",
+ * because it is not saved/restored.
  */
 
 /*
@@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
  * has set it up safely, so this makes sure interrupt etc flags are
  * inherited from protected mode.
  */
- 	VEFLAGS = info->regs.eflags;
-	info->regs.eflags &= SAFE_MASK;
-	info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK;
-	info->regs.eflags |= VM_MASK;
+ 	VEFLAGS = info->regs.pt.eflags;
+	info->regs.pt.eflags &= SAFE_MASK;
+	info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
+	info->regs.pt.eflags |= VM_MASK;
 
 	switch (info->cpu_type) {
 		case CPU_286:
@@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	info->regs32->eax = 0;
 	tsk->thread.saved_esp0 = tsk->thread.esp0;
 	savesegment(fs, tsk->thread.saved_fs);
-	savesegment(gs, tsk->thread.saved_gs);
+	tsk->thread.saved_gs = info->regs32->xgs;
 
 	tss = &per_cpu(init_tss, get_cpu());
 	tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	tsk->thread.screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
-	__asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
-	__asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
 
 	/*call audit_syscall_exit since we do not exit via the normal paths */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(eax), eax);
+		audit_syscall_exit(AUDITSC_RESULT(0), 0);
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
 		"movl %1,%%ebp\n\t"
+		"mov  %2, %%fs\n\t"
 		"jmp resume_userspace"
 		: /* no outputs */
-		:"r" (&info->regs), "r" (task_thread_info(tsk)));
+		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
 	/* we never return here */
 }
 
@@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
 
 static inline void clear_TF(struct kernel_vm86_regs * regs)
 {
-	regs->eflags &= ~TF_MASK;
+	regs->pt.eflags &= ~TF_MASK;
 }
 
 static inline void clear_AC(struct kernel_vm86_regs * regs)
 {
-	regs->eflags &= ~AC_MASK;
+	regs->pt.eflags &= ~AC_MASK;
 }
 
 /* It is correct to call set_IF(regs) from the set_vflags_*
@@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
 static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
 {
 	set_flags(VEFLAGS, eflags, current->thread.v86mask);
-	set_flags(regs->eflags, eflags, SAFE_MASK);
+	set_flags(regs->pt.eflags, eflags, SAFE_MASK);
 	if (eflags & IF_MASK)
 		set_IF(regs);
 	else
@@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
 {
 	set_flags(VFLAGS, flags, current->thread.v86mask);
-	set_flags(regs->eflags, flags, SAFE_MASK);
+	set_flags(regs->pt.eflags, flags, SAFE_MASK);
 	if (flags & IF_MASK)
 		set_IF(regs);
 	else
@@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
 
 static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
 {
-	unsigned long flags = regs->eflags & RETURN_MASK;
+	unsigned long flags = regs->pt.eflags & RETURN_MASK;
 
 	if (VEFLAGS & VIF_MASK)
 		flags |= IF_MASK;
@@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
 
-	if (regs->cs == BIOSSEG)
+	if (regs->pt.xcs == BIOSSEG)
 		goto cannot_handle;
 	if (is_revectored(i, &KVM86->int_revectored))
 		goto cannot_handle;
@@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	if ((segoffs >> 16) == BIOSSEG)
 		goto cannot_handle;
 	pushw(ssp, sp, get_vflags(regs), cannot_handle);
-	pushw(ssp, sp, regs->cs, cannot_handle);
+	pushw(ssp, sp, regs->pt.xcs, cannot_handle);
 	pushw(ssp, sp, IP(regs), cannot_handle);
-	regs->cs = segoffs >> 16;
+	regs->pt.xcs = segoffs >> 16;
 	SP(regs) -= 6;
 	IP(regs) = segoffs & 0xffff;
 	clear_TF(regs);
@@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
 	if (VMPI.is_vm86pus) {
 		if ( (trapno==3) || (trapno==1) )
 			return_to_32bit(regs, VM86_TRAP + (trapno << 8));
-		do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs));
+		do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
 		return 0;
 	}
 	if (trapno !=1)
@@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
 		handle_vm86_trap(regs, 0, 1); \
 	return; } while (0)
 
-	orig_flags = *(unsigned short *)&regs->eflags;
+	orig_flags = *(unsigned short *)&regs->pt.eflags;
 
-	csp = (unsigned char __user *) (regs->cs << 4);
-	ssp = (unsigned char __user *) (regs->ss << 4);
+	csp = (unsigned char __user *) (regs->pt.xcs << 4);
+	ssp = (unsigned char __user *) (regs->pt.xss << 4);
 	sp = SP(regs);
 	ip = IP(regs);
 
@@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
 			SP(regs) += 6;
 		}
 		IP(regs) = newip;
-		regs->cs = newcs;
+		regs->pt.xcs = newcs;
 		CHECK_IF_IN_TRAP;
 		if (data32) {
 			set_vflags_long(newflags, regs);
diff --git a/include/asm-i386/vm86.h b/include/asm-i386/vm86.h
index 952fd69..a5edf51 100644
--- a/include/asm-i386/vm86.h
+++ b/include/asm-i386/vm86.h
@@ -145,26 +145,13 @@ struct vm86plus_struct {
  * at the end of the structure. Look at ptrace.h to see the "normal"
  * setup. For user space layout see 'struct vm86_regs' above.
  */
+#include <asm/ptrace.h>
 
 struct kernel_vm86_regs {
 /*
  * normal regs, with special meaning for the segment descriptors..
  */
-	long ebx;
-	long ecx;
-	long edx;
-	long esi;
-	long edi;
-	long ebp;
-	long eax;
-	long __null_ds;
-	long __null_es;
-	long orig_eax;
-	long eip;
-	unsigned short cs, __csh;
-	long eflags;
-	long esp;
-	unsigned short ss, __ssh;
+	struct pt_regs pt;
 /*
  * these are specific to v86 mode:
  */
-- 
cgit v0.10.2


From b2938f880890ebfcccad356275e0000193153623 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: Implement smp_processor_id() with the PDA

Use the cpu_number in the PDA to implement raw_smp_processor_id.  This is a
little simpler than using thread_info, though the cpu field in thread_info
cannot be removed since it is used for things other than getting the current
CPU in common code.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 9620872..85f1b03 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -51,7 +51,6 @@ void foo(void)
 	OFFSET(TI_exec_domain, thread_info, exec_domain);
 	OFFSET(TI_flags, thread_info, flags);
 	OFFSET(TI_status, thread_info, status);
-	OFFSET(TI_cpu, thread_info, cpu);
 	OFFSET(TI_preempt_count, thread_info, preempt_count);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
@@ -97,4 +96,7 @@ void foo(void)
 	DEFINE(VDSO_PRELINK, VDSO_PRELINK);
 
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+	BLANK();
+ 	OFFSET(PDA_cpu, i386_pda, cpu_number);
 }
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 4e63d8c..e476202 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -650,6 +650,7 @@ __cpuinit int alloc_gdt(int cpu)
 /* Initial PDA used by boot CPU */
 struct i386_pda boot_pda = {
 	._pda = &boot_pda,
+	.cpu_number = 0,
 };
 
 static inline void set_kernel_gs(void)
@@ -694,6 +695,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle)
 
 	memset(pda, 0, sizeof(*pda));
 	pda->_pda = pda;
+	pda->cpu_number = cpu;
 
 	return 1;
 }
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index b99d4a1..d7423ef 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -524,8 +524,7 @@ syscall_badsys:
 
 #define FIXUP_ESPFIX_STACK \
 	/* since we are on a wrong stack, we cant make it a C code :( */ \
-	GET_THREAD_INFO(%ebp); \
-	movl TI_cpu(%ebp), %ebx; \
+	movl %gs:PDA_cpu, %ebx; \
 	PER_CPU(cpu_gdt_descr, %ebx); \
 	movl GDS_address(%ebx), %ebx; \
 	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h
index 4c39ccb..f90fde2 100644
--- a/include/asm-i386/pda.h
+++ b/include/asm-i386/pda.h
@@ -11,6 +11,8 @@
 struct i386_pda
 {
 	struct i386_pda *_pda;		/* pointer to self */
+
+	int cpu_number;
 };
 
 extern struct i386_pda *_cpu_pda[];
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index bd59c15..64fe624 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
+#include <asm/pda.h>
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -56,7 +57,7 @@ extern void cpu_uninit(void);
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (read_pda(cpu_number))
 
 extern cpumask_t cpu_callout_map;
 extern cpumask_t cpu_callin_map;
-- 
cgit v0.10.2


From ec7fcaabbfb3c5bd5189f857b6ac7bb9745ef291 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: Implement "current" with the PDA

Use the pcurrent field in the PDA to implement the "current" macro.  This ends
up compiling down to a single instruction to get the current task.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 85f1b03..0666eb0e 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/elf.h>
+#include <asm/pda.h>
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -99,4 +100,5 @@ void foo(void)
 
 	BLANK();
  	OFFSET(PDA_cpu, i386_pda, cpu_number);
+	OFFSET(PDA_pcurrent, i386_pda, pcurrent);
 }
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index e476202..6958ae5 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -651,6 +651,7 @@ __cpuinit int alloc_gdt(int cpu)
 struct i386_pda boot_pda = {
 	._pda = &boot_pda,
 	.cpu_number = 0,
+	.pcurrent = &init_task,
 };
 
 static inline void set_kernel_gs(void)
@@ -696,6 +697,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle)
 	memset(pda, 0, sizeof(*pda));
 	pda->_pda = pda;
 	pda->cpu_number = cpu;
+	pda->pcurrent = idle;
 
 	return 1;
 }
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dc42725..8749b10 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -684,6 +684,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	if (unlikely(prev->fs | next->fs))
 		loadsegment(fs, next->fs);
 
+	write_pda(pcurrent, next_p);
 
 	/*
 	 * Restore IOPL if needed.
diff --git a/include/asm-i386/current.h b/include/asm-i386/current.h
index 3cbbecd..5252ee0 100644
--- a/include/asm-i386/current.h
+++ b/include/asm-i386/current.h
@@ -1,13 +1,14 @@
 #ifndef _I386_CURRENT_H
 #define _I386_CURRENT_H
 
-#include <linux/thread_info.h>
+#include <asm/pda.h>
+#include <linux/compiler.h>
 
 struct task_struct;
 
-static __always_inline struct task_struct * get_current(void)
+static __always_inline struct task_struct *get_current(void)
 {
-	return current_thread_info()->task;
+	return read_pda(pcurrent);
 }
  
 #define current get_current()
diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h
index f90fde2..08a35c4 100644
--- a/include/asm-i386/pda.h
+++ b/include/asm-i386/pda.h
@@ -7,12 +7,14 @@
 #define _I386_PDA_H
 
 #include <linux/stddef.h>
+#include <linux/types.h>
 
 struct i386_pda
 {
 	struct i386_pda *_pda;		/* pointer to self */
 
 	int cpu_number;
+	struct task_struct *pcurrent;	/* current process */
 };
 
 extern struct i386_pda *_cpu_pda[];
-- 
cgit v0.10.2


From 70463daca852db396ce17f179d2404b257ba0f66 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: Store the interrupt regs pointer in the PDA

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-i386/irq_regs.h b/include/asm-i386/irq_regs.h
index 3dd9c0b..a1b3f7f 100644
--- a/include/asm-i386/irq_regs.h
+++ b/include/asm-i386/irq_regs.h
@@ -1 +1,27 @@
-#include <asm-generic/irq_regs.h>
+/*
+ * Per-cpu current frame pointer - the location of the last exception frame on
+ * the stack, stored in the PDA.
+ *
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+#ifndef _ASM_I386_IRQ_REGS_H
+#define _ASM_I386_IRQ_REGS_H
+
+#include <asm/pda.h>
+
+static inline struct pt_regs *get_irq_regs(void)
+{
+	return read_pda(irq_regs);
+}
+
+static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
+{
+	struct pt_regs *old_regs;
+
+	old_regs = read_pda(irq_regs);
+	write_pda(irq_regs, new_regs);
+
+	return old_regs;
+}
+
+#endif /* _ASM_I386_IRQ_REGS_H */
diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h
index 08a35c4..2ba2736 100644
--- a/include/asm-i386/pda.h
+++ b/include/asm-i386/pda.h
@@ -15,6 +15,7 @@ struct i386_pda
 
 	int cpu_number;
 	struct task_struct *pcurrent;	/* current process */
+	struct pt_regs *irq_regs;
 };
 
 extern struct i386_pda *_cpu_pda[];
-- 
cgit v0.10.2


From 63cb683c6ed56a420ba60df6a5b206a44e3f85fe Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: PDA: Fix math emulator for new pt_regs

This patch fixes the math emulator, which had not been adjusted
to match the changed struct pt_regs.

AK: extracted from larger patch by Jeremy.
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-i386/math_emu.h b/include/asm-i386/math_emu.h
index 697673b..a4b0aa3 100644
--- a/include/asm-i386/math_emu.h
+++ b/include/asm-i386/math_emu.h
@@ -21,6 +21,7 @@ struct info {
 	long ___eax;
 	long ___ds;
 	long ___es;
+	long ___fs;
 	long ___orig_eax;
 	long ___eip;
 	long ___cs;
-- 
cgit v0.10.2


From 72690a21188586022a9e65cb6f1cc8845167555a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] x86: Don't use nested idle loops

Currently the idle loop has two nested loops -- one high level
in cpu_idle and in some low level idle functions another one.

Looping in the low level idle functions breaks the idle notifiers
because interrupts waking up sleep states need to execute
exit_idle() which is only in cpu_idle().

So don't do that, only loop in cpu_idle(). This only removes
code.

In some cases e.g. poll_idle the idle loop is a little longer
now because cpu_idle checks more things. I hope that isn't a problem
ACPI idle doesn't change behaviour because it never looped anyways.

Cc: len.brown@intel.com
Cc: eranian@hpl.hp.com
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 8749b10..8f42659 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -100,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt);
  */
 void default_idle(void)
 {
-	local_irq_enable();
-
 	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
 		current_thread_info()->status &= ~TS_POLLING;
 		smp_mb__after_clear_bit();
-		while (!need_resched()) {
-			local_irq_disable();
-			if (!need_resched())
-				safe_halt();
-			else
-				local_irq_enable();
-		}
+		local_irq_disable();
+		if (!need_resched())
+			safe_halt();	/* enables interrupts racelessly */
+		else
+			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
 	} else {
-		while (!need_resched())
-			cpu_relax();
+		/* loop is done by the caller */
+		cpu_relax();
 	}
 }
 #ifdef CONFIG_APM_MODULE
@@ -129,14 +125,7 @@ EXPORT_SYMBOL(default_idle);
  */
 static void poll_idle (void)
 {
-	local_irq_enable();
-
-	asm volatile(
-		"2:"
-		"testl %0, %1;"
-		"rep; nop;"
-		"je 2b;"
-		: : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
+	cpu_relax();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -257,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 static void mwait_idle(void)
 {
 	local_irq_enable();
-	while (!need_resched())
-		mwait_idle_with_hints(0, 0);
+	mwait_idle_with_hints(0, 0);
 }
 
 void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7451a4c..0b7b4ca 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -108,17 +108,15 @@ void exit_idle(void)
  */
 static void default_idle(void)
 {
-	local_irq_enable();
-
 	current_thread_info()->status &= ~TS_POLLING;
 	smp_mb__after_clear_bit();
-	while (!need_resched()) {
-		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
-		else
-			local_irq_enable();
-	}
+	local_irq_disable();
+	if (!need_resched()) {
+		/* Enables interrupts one instruction before HLT.
+		   x86 special cases this so there is no race. */
+		safe_halt();
+	} else
+		local_irq_enable();
 	current_thread_info()->status |= TS_POLLING;
 }
 
@@ -129,16 +127,7 @@ static void default_idle(void)
  */
 static void poll_idle (void)
 {
-	local_irq_enable();
-
-	asm volatile(
-		"2:"
-		"testl %0,%1;"
-		"rep; nop;"
-		"je 2b;"
-		: :
-		"i" (_TIF_NEED_RESCHED),
-		"m" (current_thread_info()->flags));
+	cpu_relax();
 }
 
 void cpu_idle_wait(void)
@@ -257,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 static void mwait_idle(void)
 {
 	local_irq_enable();
-	while (!need_resched())
-		mwait_idle_with_hints(0,0);
+	mwait_idle_with_hints(0,0);
 }
 
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-- 
cgit v0.10.2


From 7cd8b6861eb586aabe4c725cc0c259ce2e653695 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] x86: remove last two pci_find offenders in the core code

Resending as I believe the discussion about them established they were
correct.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index e65551c..f2cb942 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -764,7 +764,7 @@ static void __init pirq_find_router(struct irq_router *r)
 	DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
 	    rt->rtr_vendor, rt->rtr_device);
 
-	pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
+	pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
 	if (!pirq_router_dev) {
 		DBG(KERN_DEBUG "PCI: Interrupt router not found at "
 			"%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
@@ -784,6 +784,8 @@ static void __init pirq_find_router(struct irq_router *r)
 		pirq_router_dev->vendor,
 		pirq_router_dev->device,
 		pci_name(pirq_router_dev));
+
+	/* The device remains referenced for the kernel lifetime */
 }
 
 static struct irq_info *pirq_get_info(struct pci_dev *dev)
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 37a7708..d2ea87a 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -921,7 +921,7 @@ static int __init calgary_init(void)
 
 error:
 	do {
-		dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
+		dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
 					      PCI_DEVICE_ID_IBM_CALGARY,
 					      dev);
 		if (!dev)
-- 
cgit v0.10.2


From 9c5f8be4625e73f17e28fea89399ed871a30e064 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] x86: Mention PCI instead of RAM in NMI parity error message

On modern systems RAM errors don't cause NMIs, but it's usually
caused by PCI SERR. Mention PCI instead of RAM in the printk.

Reported by r_hayashi@ctc-g.co.jp (Ryutaro Hayashi)

Cc:  r_hayashi@ctc-g.co.jp

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 4a6fa28..237f488 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -708,8 +708,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
 		"CPU %d.\n", reason, smp_processor_id());
-	printk(KERN_EMERG "You probably have a hardware problem with your RAM "
-			"chips\n");
+	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
 	if (panic_on_unrecovered_nmi)
                 panic("NMI: Not continuing");
 
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index e37b4d7..70bfaab 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -793,8 +793,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
 		reason);
-	printk(KERN_EMERG "You probably have a hardware problem with your "
-		"RAM chips\n");
+	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
 
 	if (panic_on_unrecovered_nmi)
 		panic("NMI: Not continuing");
-- 
cgit v0.10.2


From 67fd44fea274a5033ceb90284683bc44df61df54 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] x86-64: Implement compat code for SIOCSIFHWBROADCAST

This network ioctl wasn't handled before.

Reported by Alexandra.Kossovsky@oktetlabs.ru (Alexandra Kossovsky)
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a91f262..47bb78d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2397,6 +2397,7 @@ HANDLE_IOCTL(SIOCGIFMAP, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
 HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
+HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
 
 /* ioctls used by appletalk ddp.c */
 HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
-- 
cgit v0.10.2


From 6569580de7ae367def89b7671029cb97c1965574 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: Distinguish absolute symbols

Ld knows about 2 kinds of symbols,  absolute and section
relative.  Section relative symbols symbols change value
when a section is moved and absolute symbols do not.

Currently in the linker script we have several labels
marking the beginning and ending of sections that
are outside of sections, making them absolute symbols.
Having a mixture of absolute and section relative
symbols refereing to the same data is currently harmless
but it is confusing.

This must be done carefully as newer revs of ld do not place
symbols that appear in sections without data and instead
ld makes those symbols global :(

My ultimate goal is to build a relocatable kernel.  The
safest and least intrusive technique is to generate
relocation entries so the kernel can be relocated at load
time.  The only penalty would be an increase in the size
of the kernel binary.  The problem is that if absolute and
relocatable symbols are not properly specified absolute symbols
will be relocated or section relative symbols won't be, which
is fatal.

The practical motivation is that when generating kernels that
will run from a reserved area for analyzing what caused
a kernel panic, it is simpler if you don't need to hard code
the physical memory location they will run at, especially
for the distributions.

[AK: and merged:]

o Also put a message so that in future people can be aware of it and
  avoid introducing absolute symbols.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index c6f84a0..cbd24860 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -1,5 +1,11 @@
 /* ld script to make i386 Linux kernel
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
  */
 
 #define LOAD_OFFSET __PAGE_OFFSET
@@ -24,31 +30,32 @@ SECTIONS
   . = __KERNEL_START;
   phys_startup_32 = startup_32 - LOAD_OFFSET;
   /* read-only */
-  _text = .;			/* Text and read-only data */
   .text : AT(ADDR(.text) - LOAD_OFFSET) {
+  	_text = .;			/* Text and read-only data */
 	*(.text)
 	SCHED_TEXT
 	LOCK_TEXT
 	KPROBES_TEXT
 	*(.fixup)
 	*(.gnu.warning)
-	} :text = 0x9090
-
-  _etext = .;			/* End of text section */
+  	_etext = .;			/* End of text section */
+  } :text = 0x9090
 
   . = ALIGN(16);		/* Exception table */
-  __start___ex_table = .;
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
-  __stop___ex_table = .;
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+  	__start___ex_table = .;
+	 *(__ex_table)
+  	__stop___ex_table = .;
+  }
 
   RODATA
 
   . = ALIGN(4);
-  __tracedata_start = .;
   .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+  	__tracedata_start = .;
 	*(.tracedata)
+  	__tracedata_end = .;
   }
-  __tracedata_end = .;
 
   /* writeable */
   . = ALIGN(4096);
@@ -58,10 +65,12 @@ SECTIONS
 	} :data
 
   . = ALIGN(4096);
-  __nosave_begin = .;
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
-  . = ALIGN(4096);
-  __nosave_end = .;
+  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+  	__nosave_begin = .;
+	*(.data.nosave)
+  	. = ALIGN(4096);
+  	__nosave_end = .;
+  }
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -75,8 +84,10 @@ SECTIONS
 
   /* rarely changed data like cpu maps */
   . = ALIGN(32);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }
-  _edata = .;			/* End of data section */
+  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+	*(.data.read_mostly)
+	_edata = .;		/* End of data section */
+  }
 
 #ifdef CONFIG_STACK_UNWIND
   . = ALIGN(4);
@@ -94,54 +105,56 @@ SECTIONS
 
   /* might get freed after init */
   . = ALIGN(4096);
-  __smp_alt_begin = .;
-  __smp_alt_instructions = .;
   .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
+	__smp_alt_begin = .;
+	__smp_alt_instructions = .;
 	*(.smp_altinstructions)
+	__smp_alt_instructions_end = .;
   }
-  __smp_alt_instructions_end = .;
   . = ALIGN(4);
-  __smp_locks = .;
   .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+  	__smp_locks = .;
 	*(.smp_locks)
+	__smp_locks_end = .;
   }
-  __smp_locks_end = .;
   .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
 	*(.smp_altinstr_replacement)
+  	__smp_alt_end = .;
   }
   . = ALIGN(4096);
-  __smp_alt_end = .;
 
   /* will be freed after init */
   . = ALIGN(4096);		/* Init code and data */
-  __init_begin = .;
   .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+  	__init_begin = .;
 	_sinittext = .;
 	*(.init.text)
 	_einittext = .;
   }
   .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
   . = ALIGN(16);
-  __setup_start = .;
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
-  __setup_end = .;
-  __initcall_start = .;
+  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+  	__setup_start = .;
+	*(.init.setup)
+  	__setup_end = .;
+   }
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+  	__initcall_start = .;
 	INITCALLS
+  	__initcall_end = .;
   }
-  __initcall_end = .;
-  __con_initcall_start = .;
   .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+  	__con_initcall_start = .;
 	*(.con_initcall.init)
+  	__con_initcall_end = .;
   }
-  __con_initcall_end = .;
   SECURITY_INIT
   . = ALIGN(4);
-  __alt_instructions = .;
   .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+  	__alt_instructions = .;
 	*(.altinstructions)
+	__alt_instructions_end = .;
   }
-  __alt_instructions_end = .; 
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
@@ -150,32 +163,32 @@ SECTIONS
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
   .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
   . = ALIGN(4096);
-  __initramfs_start = .;
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
-  __initramfs_end = .;
+  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+	__initramfs_start = .;
+	*(.init.ramfs)
+	__initramfs_end = .;
+  }
   . = ALIGN(L1_CACHE_BYTES);
-  __per_cpu_start = .;
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
-  __per_cpu_end = .;
+  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
+	__per_cpu_start = .;
+	*(.data.percpu)
+	__per_cpu_end = .;
+  }
   . = ALIGN(4096);
-  __init_end = .;
   /* freed after init ends here */
 	
-  __bss_start = .;		/* BSS */
-  .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
-	*(.bss.page_aligned)
-  }
   .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+	__init_end = .;
+	__bss_start = .;		/* BSS */
+	*(.bss.page_aligned)
 	*(.bss)
+	. = ALIGN(4);
+	__bss_stop = .;
+  	_end = . ;
+	/* This is where the kernel creates the early boot page tables */
+	. = ALIGN(4096);
+	pg0 = . ;
   }
-  . = ALIGN(4);
-  __bss_stop = .; 
-
-  _end = . ;
-
-  /* This is where the kernel creates the early boot page tables */
-  . = ALIGN(4096);
-  pg0 = .;
 
   /* Sections to be discarded */
   /DISCARD/ : {
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index e60d6f2..9f47477 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -11,8 +11,8 @@
 
 #define RODATA								\
 	. = ALIGN(4096);						\
-	__start_rodata = .;						\
 	.rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {		\
+		VMLINUX_SYMBOL(__start_rodata) = .;			\
 		*(.rodata) *(.rodata.*)					\
 		*(__vermagic)		/* Kernel version magic */	\
 	}								\
@@ -119,17 +119,17 @@
 		*(__ksymtab_strings)					\
 	}								\
 									\
+	/* Unwind data binary search table */				\
+	EH_FRAME_HDR							\
+									\
 	/* Built-in module parameters. */				\
 	__param : AT(ADDR(__param) - LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__start___param) = .;			\
 		*(__param)						\
 		VMLINUX_SYMBOL(__stop___param) = .;			\
+		VMLINUX_SYMBOL(__end_rodata) = .;			\
 	}								\
 									\
-	/* Unwind data binary search table */				\
-	EH_FRAME_HDR							\
-									\
-	__end_rodata = .;						\
 	. = ALIGN(4096);
 
 #define SECURITY_INIT							\
-- 
cgit v0.10.2


From 6ed018845f1172cdc94f8a20ad807df901c6b7eb Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: Add comment for align to vmlinux.lds

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index cbd24860..c217e18 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -121,6 +121,12 @@ SECTIONS
 	*(.smp_altinstr_replacement)
   	__smp_alt_end = .;
   }
+  /* will be freed after init
+   * Following ALIGN() is required to make sure no other data falls on the
+   * same page where __smp_alt_end is pointing as that page might be freed
+   * after boot. Always make sure that ALIGN() directive is present after
+   * the section which contains __smp_alt_end.
+   */
   . = ALIGN(4096);
 
   /* will be freed after init */
-- 
cgit v0.10.2


From 9f45accf17efc050ba26bf77cc4f166c950b284e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: define __pa_symbol()

On x86_64 we have to be careful with calculating the physical
address of kernel symbols.  Both because of compiler odditities
and because the symbols live in a different range of the virtual
address space.

Having a defintition of __pa_symbol that works on both x86_64 and
i386 simplifies writing code that works for both x86_64 and
i386 that has these kinds of dependencies.

So this patch adds the trivial i386 __pa_symbol definition.

Added assembly magic similar to RELOC_HIDE as suggested by Andi Kleen.
Just picked it up from x86_64.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index f5bf544..5a70501 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -124,6 +124,9 @@ extern int page_is_ram(unsigned long pagenr);
 #define VMALLOC_RESERVE		((unsigned long)__VMALLOC_RESERVE)
 #define MAXMEM			(-__PAGE_OFFSET-__VMALLOC_RESERVE)
 #define __pa(x)			((unsigned long)(x)-PAGE_OFFSET)
+/* __pa_symbol should be used for C visible symbols.
+   This seems to be the official gcc blessed way to do such arithmetic. */
+#define __pa_symbol(x)          __pa(RELOC_HIDE((unsigned long)(x),0))
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
 #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
 #ifdef CONFIG_FLATMEM
-- 
cgit v0.10.2


From 8621b81c744ff8880a1efe095a4dcd09763ddb5a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] i386: Reserve kernel memory starting from _text

Currently when we are reserving the memory the kernel text
resides in we start at __PHYSICAL_START which happens to be
correct but not very obvious.  In addition when we start relocating
the kernel __PHYSICAL_START is the wrong value, as it is an
absolute symbol that does not get relocated.

By starting the reservation at __pa_symbol(_text)
the code is clearer and will be correct when relocated.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 141041d..61539af 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1118,8 +1118,8 @@ void __init setup_bootmem_allocator(void)
 	 * the (very unlikely) case of us accidentally initializing the
 	 * bootmem allocator with an invalid RAM area.
 	 */
-	reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
-			 bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
+	reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
+			 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
 
 	/*
 	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-- 
cgit v0.10.2


From 2a43f3ede48ea3d5790b863b719a1e21c90a3697 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] i386: CONFIG_PHYSICAL_START cleanup

Defining __PHYSICAL_START and __KERNEL_START in asm-i386/page.h works but
it triggers a full kernel rebuild for the silliest of reasons.  This
modifies the users to directly use CONFIG_PHYSICAL_START and linux/config.h
which prevents the full rebuild problem, which makes the code much
more maintainer and hopefully user friendly.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S
index b5893e4..40a8de8 100644
--- a/arch/i386/boot/compressed/head.S
+++ b/arch/i386/boot/compressed/head.S
@@ -25,7 +25,6 @@
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
-#include <asm/page.h>
 
 	.globl startup_32
 	
@@ -75,7 +74,7 @@ startup_32:
 	popl %esi	# discard address
 	popl %esi	# real mode pointer
 	xorl %ebx,%ebx
-	ljmp $(__BOOT_CS), $__PHYSICAL_START
+	ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START
 
 /*
  * We come here, if we were loaded high.
@@ -100,7 +99,7 @@ startup_32:
 	popl %ecx	# lcount
 	popl %edx	# high_buffer_start
 	popl %eax	# hcount
-	movl $__PHYSICAL_START,%edi
+	movl $CONFIG_PHYSICAL_START,%edi
 	cli		# make sure we don't get interrupted
 	ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
 
@@ -125,5 +124,5 @@ move_routine_start:
 	movsl
 	movl %ebx,%esi	# Restore setup pointer
 	xorl %ebx,%ebx
-	ljmp $(__BOOT_CS), $__PHYSICAL_START
+	ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START
 move_routine_end:
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index b2ccd54..20970ff 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -13,7 +13,6 @@
 #include <linux/vmalloc.h>
 #include <linux/screen_info.h>
 #include <asm/io.h>
-#include <asm/page.h>
 
 /*
  * gzip declarations
@@ -303,7 +302,7 @@ static void setup_normal_output_buffer(void)
 #else
 	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
 #endif
-	output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
+	output_data = (unsigned char *)CONFIG_PHYSICAL_START; /* Normally Points to 1M */
 	free_mem_end_ptr = (long)real_mode;
 }
 
@@ -326,8 +325,8 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
 	low_buffer_size = low_buffer_end - LOW_BUFFER_START;
 	high_loaded = 1;
 	free_mem_end_ptr = (long)high_buffer_start;
-	if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
-		high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
+	if ( (CONFIG_PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
+		high_buffer_start = (uch *)(CONFIG_PHYSICAL_START + low_buffer_size);
 		mv->hcount = 0; /* say: we need not to move high_buffer */
 	}
 	else mv->hcount = -1;
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index c217e18..f8d61ec 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -27,7 +27,7 @@ PHDRS {
 }
 SECTIONS
 {
-  . = __KERNEL_START;
+  . = LOAD_OFFSET + CONFIG_PHYSICAL_START;
   phys_startup_32 = startup_32 - LOAD_OFFSET;
   /* read-only */
   .text : AT(ADDR(.text) - LOAD_OFFSET) {
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 5a70501..2b69686 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -112,12 +112,9 @@ extern int page_is_ram(unsigned long pagenr);
 
 #ifdef __ASSEMBLY__
 #define __PAGE_OFFSET		CONFIG_PAGE_OFFSET
-#define __PHYSICAL_START	CONFIG_PHYSICAL_START
 #else
 #define __PAGE_OFFSET		((unsigned long)CONFIG_PAGE_OFFSET)
-#define __PHYSICAL_START	((unsigned long)CONFIG_PHYSICAL_START)
 #endif
-#define __KERNEL_START		(__PAGE_OFFSET + __PHYSICAL_START)
 
 
 #define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
-- 
cgit v0.10.2


From fd593d12770d4a0d1ff095d44b96436c18479ee8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] relocatable kernel: Kallsyms generate relocatable symbols

Print the addresses of non-absolute symbols relative to _text
so that ld will generate relocations.  Allowing a relocatable
kernel to relocate them.  We can't actually use the symbol names
because kallsyms includes static symbols that are not exported
from their object files.

Add the _text symbol definitions to the architectures which don't
define it otherwise linker will fail.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 756325d..f05288b 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -70,6 +70,7 @@ SECTIONS
 #endif
         .text :
 	{
+	_text = .;
 #if defined(CONFIG_ROMKERNEL)
 	*(.int_redirect)
 #endif
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index 58afa8b..2b2a10d 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -60,6 +60,7 @@ SECTIONS {
 #endif
 
 	.text : {
+		_text = .;
 		_stext = . ;
         	*(.text)
 		SCHED_TEXT
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index e8342d8..04b9867 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -33,6 +33,7 @@ SECTIONS
 
 	/* Text and gots */
 	.text : {
+		_text = .;
 		*(.text .text.*)
 		SCHED_TEXT
 		LOCK_TEXT
diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S
index 16e8661..6192126 100644
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -31,6 +31,7 @@ SECTIONS
   .plt : { *(.plt) }
   .text      :
   {
+    _text = .;
     *(.text)
     SCHED_TEXT
     LOCK_TEXT
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 5cc5ff7..b73e6b9 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -11,6 +11,7 @@ SECTIONS
   . = 0x10000 + SIZEOF_HEADERS;
   .text 0xf0004000 :
   {
+    _text = .;
     *(.text)
     SCHED_TEXT
     LOCK_TEXT
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index bd9de8c..4a6063f 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -13,6 +13,7 @@ SECTIONS
   . = 0x4000;
   .text 0x0000000000404000 :
   {
+    _text = .;
     *(.text)
     SCHED_TEXT
     LOCK_TEXT
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 88d087f..3a5fd07 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -90,6 +90,7 @@
 
 /* Kernel text segment, and some constant data areas.  */
 #define TEXT_CONTENTS							      \
+		_text = .;						      \
 		__stext = . ;						      \
         	*(.text)						      \
 		SCHED_TEXT						      \
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 22d281c..4c1ad0a 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -43,7 +43,7 @@ struct sym_entry {
 
 static struct sym_entry *table;
 static unsigned int table_size, table_cnt;
-static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext;
+static unsigned long long _text, _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext;
 static int all_symbols = 0;
 static char symbol_prefix_char = '\0';
 
@@ -91,7 +91,9 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 		sym++;
 
 	/* Ignore most absolute/undefined (?) symbols. */
-	if (strcmp(sym, "_stext") == 0)
+	if (strcmp(sym, "_text") == 0)
+		_text = s->addr;
+	else if (strcmp(sym, "_stext") == 0)
 		_stext = s->addr;
 	else if (strcmp(sym, "_etext") == 0)
 		_etext = s->addr;
@@ -265,9 +267,21 @@ static void write_src(void)
 
 	printf(".data\n");
 
+	/* Provide proper symbols relocatability by their '_text'
+	 * relativeness.  The symbol names cannot be used to construct
+	 * normal symbol references as the list of symbols contains
+	 * symbols that are declared static and are private to their
+	 * .o files.  This prevents .tmp_kallsyms.o or any other
+	 * object from referencing them.
+	 */
 	output_label("kallsyms_addresses");
 	for (i = 0; i < table_cnt; i++) {
-		printf("\tPTR\t%#llx\n", table[i].addr);
+		if (toupper(table[i].sym[0]) != 'A') {
+			printf("\tPTR\t_text + %#llx\n",
+				table[i].addr - _text);
+		} else {
+			printf("\tPTR\t%#llx\n", table[i].addr);
+		}
 	}
 	printf("\n");
 
-- 
cgit v0.10.2


From 968de4f02621db35b8ae5239c8cfc6664fb872d8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] i386: Relocatable kernel support

This patch modifies the i386 kernel so that if CONFIG_RELOCATABLE is
selected it will be able to be loaded at any 4K aligned address below
1G.  The technique used is to compile the decompressor with -fPIC and
modify it so the decompressor is fully relocatable.  For the main
kernel relocations are generated.  Resulting in a kernel that is relocatable
with no runtime overhead and no need to modify the source code.

A reserved 32bit word in the parameters has been assigned
to serve as a stack so we figure out where are running.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8ff1c6f..d588ca8 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -773,6 +773,18 @@ config CRASH_DUMP
           PHYSICAL_START.
 	  For more details see Documentation/kdump/kdump.txt
 
+config RELOCATABLE
+	bool "Build a relocatable kernel"
+	help
+	  This build a kernel image that retains relocation information
+          so it can be loaded someplace besides the default 1MB.
+	  The relocations tend to the kernel binary about 10% larger,
+          but are discarded at runtime.
+
+	  One use is for the kexec on panic case where the recovery kernel
+          must live at a different physical address than the primary
+          kernel.
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
 
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 0677908..d1aca52 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -26,7 +26,9 @@ endif
 
 LDFLAGS		:= -m elf_i386
 OBJCOPYFLAGS	:= -O binary -R .note -R .comment -S
-LDFLAGS_vmlinux :=
+ifdef CONFIG_RELOCATABLE
+LDFLAGS_vmlinux := --emit-relocs
+endif
 CHECKFLAGS	+= -D__i386__
 
 CFLAGS += -pipe -msoft-float
diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile
index 258ea95..cc28da3 100644
--- a/arch/i386/boot/compressed/Makefile
+++ b/arch/i386/boot/compressed/Makefile
@@ -4,22 +4,42 @@
 # create a compressed vmlinux image from the original vmlinux
 #
 
-targets		:= vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
+targets		:= vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \
+			vmlinux.bin.all vmlinux.relocs
 EXTRA_AFLAGS	:= -traditional
 
-LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32
+LDFLAGS_vmlinux := -T
+CFLAGS_misc.o += -fPIC
+hostprogs-y	:= relocs
 
-$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
+$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
 	$(call if_changed,objcopy)
 
+quiet_cmd_relocs = RELOCS  $@
+      cmd_relocs = $(obj)/relocs $< > $@
+$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
+	$(call if_changed,relocs)
+
+vmlinux.bin.all-y := $(obj)/vmlinux.bin
+vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
+quiet_cmd_relocbin = BUILD   $@
+      cmd_relocbin = cat $(filter-out FORCE,$^) > $@
+$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
+	$(call if_changed,relocbin)
+
+ifdef CONFIG_RELOCATABLE
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
+	$(call if_changed,gzip)
+else
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,gzip)
+endif
 
 LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
 
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
+$(obj)/piggy.o: $(src)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
 	$(call if_changed,ld)
diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S
index 40a8de8..e4dd7a6 100644
--- a/arch/i386/boot/compressed/head.S
+++ b/arch/i386/boot/compressed/head.S
@@ -25,9 +25,11 @@
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
+#include <asm/page.h>
 
+.section ".text.head"
 	.globl startup_32
-	
+
 startup_32:
 	cld
 	cli
@@ -36,93 +38,141 @@ startup_32:
 	movl %eax,%es
 	movl %eax,%fs
 	movl %eax,%gs
+	movl %eax,%ss
 
-	lss stack_start,%esp
-	xorl %eax,%eax
-1:	incl %eax		# check that A20 really IS enabled
-	movl %eax,0x000000	# loop forever if it isn't
-	cmpl %eax,0x100000
-	je 1b
+/* Calculate the delta between where we were compiled to run
+ * at and where we were actually loaded at.  This can only be done
+ * with a short local call on x86.  Nothing  else will tell us what
+ * address we are running at.  The reserved chunk of the real-mode
+ * data at 0x34-0x3f are used as the stack for this calculation.
+ * Only 4 bytes are needed.
+ */
+	leal 0x40(%esi), %esp
+	call 1f
+1:	popl %ebp
+	subl $1b, %ebp
+
+/* Compute the delta between where we were compiled to run at
+ * and where the code will actually run at.
+ */
+	/* Start with the delta to where the kernel will run at.  If we are
+	 * a relocatable kernel this is the delta to our load address otherwise
+	 * this is the delta to CONFIG_PHYSICAL start.
+	 */
+#ifdef CONFIG_RELOCATABLE
+	movl %ebp, %ebx
+#else
+	movl $(CONFIG_PHYSICAL_START - startup_32), %ebx
+#endif
+
+	/* Replace the compressed data size with the uncompressed size */
+	subl input_len(%ebp), %ebx
+	movl output_len(%ebp), %eax
+	addl %eax, %ebx
+	/* Add 8 bytes for every 32K input block */
+	shrl $12, %eax
+	addl %eax, %ebx
+	/* Add 32K + 18 bytes of extra slack */
+	addl $(32768 + 18), %ebx
+	/* Align on a 4K boundary */
+	addl $4095, %ebx
+	andl $~4095, %ebx
+
+/* Copy the compressed kernel to the end of our buffer
+ * where decompression in place becomes safe.
+ */
+	pushl %esi
+	leal _end(%ebp), %esi
+	leal _end(%ebx), %edi
+	movl $(_end - startup_32), %ecx
+	std
+	rep
+	movsb
+	cld
+	popl %esi
+
+/* Compute the kernel start address.
+ */
+#ifdef CONFIG_RELOCATABLE
+	leal	startup_32(%ebp), %ebp
+#else
+	movl	$CONFIG_PHYSICAL_START, %ebp
+#endif
 
 /*
- * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
+ * Jump to the relocated address.
  */
-	pushl $0
-	popfl
+	leal relocated(%ebx), %eax
+	jmp *%eax
+.section ".text"
+relocated:
+
 /*
  * Clear BSS
  */
 	xorl %eax,%eax
-	movl $_edata,%edi
-	movl $_end,%ecx
+	leal _edata(%ebx),%edi
+	leal _end(%ebx), %ecx
 	subl %edi,%ecx
 	cld
 	rep
 	stosb
+
+/*
+ * Setup the stack for the decompressor
+ */
+	leal stack_end(%ebx), %esp
+
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	subl $16,%esp	# place for structure on the stack
-	movl %esp,%eax
+	movl output_len(%ebx), %eax
+	pushl %eax
+	pushl %ebp	# output address
+	movl input_len(%ebx), %eax
+	pushl %eax	# input_len
+	leal input_data(%ebx), %eax
+	pushl %eax	# input_data
+	leal _end(%ebx), %eax
+	pushl %eax	# end of the image as third argument
 	pushl %esi	# real mode pointer as second arg
-	pushl %eax	# address of structure as first arg
 	call decompress_kernel
-	orl  %eax,%eax 
-	jnz  3f
-	popl %esi	# discard address
-	popl %esi	# real mode pointer
-	xorl %ebx,%ebx
-	ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START
+	addl $20, %esp
+	popl %ecx
+
+#if CONFIG_RELOCATABLE
+/* Find the address of the relocations.
+ */
+	movl %ebp, %edi
+	addl %ecx, %edi
+
+/* Calculate the delta between where vmlinux was compiled to run
+ * and where it was actually loaded.
+ */
+	movl %ebp, %ebx
+	subl $CONFIG_PHYSICAL_START, %ebx
 
 /*
- * We come here, if we were loaded high.
- * We need to move the move-in-place routine down to 0x1000
- * and then start it with the buffer addresses in registers,
- * which we got from the stack.
+ * Process relocations.
  */
-3:
-	movl $move_routine_start,%esi
-	movl $0x1000,%edi
-	movl $move_routine_end,%ecx
-	subl %esi,%ecx
-	addl $3,%ecx
-	shrl $2,%ecx
-	cld
-	rep
-	movsl
-
-	popl %esi	# discard the address
-	popl %ebx	# real mode pointer
-	popl %esi	# low_buffer_start
-	popl %ecx	# lcount
-	popl %edx	# high_buffer_start
-	popl %eax	# hcount
-	movl $CONFIG_PHYSICAL_START,%edi
-	cli		# make sure we don't get interrupted
-	ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
+
+1:	subl $4, %edi
+	movl 0(%edi), %ecx
+	testl %ecx, %ecx
+	jz 2f
+	addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
+	jmp 1b
+2:
+#endif
 
 /*
- * Routine (template) for moving the decompressed kernel in place,
- * if we were high loaded. This _must_ PIC-code !
+ * Jump to the decompressed kernel.
  */
-move_routine_start:
-	movl %ecx,%ebp
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebp,%ecx
-	andl $3,%ecx
-	rep
-	movsb
-	movl %edx,%esi
-	movl %eax,%ecx	# NOTE: rep movsb won't move if %ecx == 0
-	addl $3,%ecx
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebx,%esi	# Restore setup pointer
 	xorl %ebx,%ebx
-	ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START
-move_routine_end:
+	jmp *%ebp
+
+.bss
+.balign 4
+stack:
+	.fill 4096, 1, 0
+stack_end:
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index 20970ff..4eac24e 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -13,6 +13,88 @@
 #include <linux/vmalloc.h>
 #include <linux/screen_info.h>
 #include <asm/io.h>
+#include <asm/page.h>
+
+/* WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically
+ * at run time, but no relocation processing is performed.
+ * This means that it is not safe to place pointers in static structures.
+ */
+
+/*
+ * Getting to provable safe in place decompression is hard.
+ * Worst case behaviours need to be analized.
+ * Background information:
+ *
+ * The file layout is:
+ *    magic[2]
+ *    method[1]
+ *    flags[1]
+ *    timestamp[4]
+ *    extraflags[1]
+ *    os[1]
+ *    compressed data blocks[N]
+ *    crc[4] orig_len[4]
+ *
+ * resulting in 18 bytes of non compressed data overhead.
+ *
+ * Files divided into blocks
+ * 1 bit (last block flag)
+ * 2 bits (block type)
+ *
+ * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved.
+ * The smallest block type encoding is always used.
+ *
+ * stored:
+ *    32 bits length in bytes.
+ *
+ * fixed:
+ *    magic fixed tree.
+ *    symbols.
+ *
+ * dynamic:
+ *    dynamic tree encoding.
+ *    symbols.
+ *
+ *
+ * The buffer for decompression in place is the length of the
+ * uncompressed data, plus a small amount extra to keep the algorithm safe.
+ * The compressed data is placed at the end of the buffer.  The output
+ * pointer is placed at the start of the buffer and the input pointer
+ * is placed where the compressed data starts.  Problems will occur
+ * when the output pointer overruns the input pointer.
+ *
+ * The output pointer can only overrun the input pointer if the input
+ * pointer is moving faster than the output pointer.  A condition only
+ * triggered by data whose compressed form is larger than the uncompressed
+ * form.
+ *
+ * The worst case at the block level is a growth of the compressed data
+ * of 5 bytes per 32767 bytes.
+ *
+ * The worst case internal to a compressed block is very hard to figure.
+ * The worst case can at least be boundined by having one bit that represents
+ * 32764 bytes and then all of the rest of the bytes representing the very
+ * very last byte.
+ *
+ * All of which is enough to compute an amount of extra data that is required
+ * to be safe.  To avoid problems at the block level allocating 5 extra bytes
+ * per 32767 bytes of data is sufficient.  To avoind problems internal to a block
+ * adding an extra 32767 bytes (the worst case uncompressed block size) is
+ * sufficient, to ensure that in the worst case the decompressed data for
+ * block will stop the byte before the compressed data for a block begins.
+ * To avoid problems with the compressed data's meta information an extra 18
+ * bytes are needed.  Leading to the formula:
+ *
+ * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
+ *
+ * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
+ * Adding 32768 instead of 32767 just makes for round numbers.
+ * Adding the decompressor_size is necessary as it musht live after all
+ * of the data as well.  Last I measured the decompressor is about 14K.
+ * 10K of actuall data and 4K of bss.
+ *
+ */
 
 /*
  * gzip declarations
@@ -29,15 +111,20 @@ typedef unsigned char  uch;
 typedef unsigned short ush;
 typedef unsigned long  ulg;
 
-#define WSIZE 0x8000		/* Window size must be at least 32k, */
-				/* and a power of two */
+#define WSIZE 0x80000000	/* Window size must be at least 32k,
+				 * and a power of two
+				 * We don't actually have a window just
+				 * a huge output buffer so I report
+				 * a 2G windows size, as that should
+				 * always be larger than our output buffer.
+				 */
 
-static uch *inbuf;	     /* input buffer */
-static uch window[WSIZE];    /* Sliding window buffer */
+static uch *inbuf;	/* input buffer */
+static uch *window;	/* Sliding window buffer, (and final output buffer) */
 
-static unsigned insize = 0;  /* valid bytes in inbuf */
-static unsigned inptr = 0;   /* index of next byte to be processed in inbuf */
-static unsigned outcnt = 0;  /* bytes in output buffer */
+static unsigned insize;  /* valid bytes in inbuf */
+static unsigned inptr;   /* index of next byte to be processed in inbuf */
+static unsigned outcnt;  /* bytes in output buffer */
 
 /* gzip flag byte */
 #define ASCII_FLAG   0x01 /* bit 0 set: file probably ASCII text */
@@ -88,8 +175,6 @@ extern unsigned char input_data[];
 extern int input_len;
 
 static long bytes_out = 0;
-static uch *output_data;
-static unsigned long output_ptr = 0;
 
 static void *malloc(int size);
 static void free(void *where);
@@ -99,17 +184,10 @@ static void *memcpy(void *dest, const void *src, unsigned n);
 
 static void putstr(const char *);
 
-extern int end;
-static long free_mem_ptr = (long)&end;
-static long free_mem_end_ptr;
+static unsigned long free_mem_ptr;
+static unsigned long free_mem_end_ptr;
 
-#define INPLACE_MOVE_ROUTINE  0x1000
-#define LOW_BUFFER_START      0x2000
-#define LOW_BUFFER_MAX       0x90000
 #define HEAP_SIZE             0x3000
-static unsigned int low_buffer_end, low_buffer_size;
-static int high_loaded =0;
-static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
 
 static char *vidmem = (char *)0xb8000;
 static int vidport;
@@ -150,7 +228,7 @@ static void gzip_mark(void **ptr)
 
 static void gzip_release(void **ptr)
 {
-	free_mem_ptr = (long) *ptr;
+	free_mem_ptr = (unsigned long) *ptr;
 }
  
 static void scroll(void)
@@ -178,7 +256,7 @@ static void putstr(const char *s)
 				y--;
 			}
 		} else {
-			vidmem [ ( x + cols * y ) * 2 ] = c; 
+			vidmem [ ( x + cols * y ) * 2 ] = c;
 			if ( ++x >= cols ) {
 				x = 0;
 				if ( ++y >= lines ) {
@@ -223,58 +301,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
  */
 static int fill_inbuf(void)
 {
-	if (insize != 0) {
-		error("ran out of input data");
-	}
-
-	inbuf = input_data;
-	insize = input_len;
-	inptr = 1;
-	return inbuf[0];
+	error("ran out of input data");
+	return 0;
 }
 
 /* ===========================================================================
  * Write the output window window[0..outcnt-1] and update crc and bytes_out.
  * (Used for the decompressed data only.)
  */
-static void flush_window_low(void)
-{
-    ulg c = crc;         /* temporary variable */
-    unsigned n;
-    uch *in, *out, ch;
-    
-    in = window;
-    out = &output_data[output_ptr]; 
-    for (n = 0; n < outcnt; n++) {
-	    ch = *out++ = *in++;
-	    c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-    }
-    crc = c;
-    bytes_out += (ulg)outcnt;
-    output_ptr += (ulg)outcnt;
-    outcnt = 0;
-}
-
-static void flush_window_high(void)
-{
-    ulg c = crc;         /* temporary variable */
-    unsigned n;
-    uch *in,  ch;
-    in = window;
-    for (n = 0; n < outcnt; n++) {
-	ch = *output_data++ = *in++;
-	if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
-	c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-    }
-    crc = c;
-    bytes_out += (ulg)outcnt;
-    outcnt = 0;
-}
-
 static void flush_window(void)
 {
-	if (high_loaded) flush_window_high();
-	else flush_window_low();
+	/* With my window equal to my output buffer
+	 * I only need to compute the crc here.
+	 */
+	ulg c = crc;         /* temporary variable */
+	unsigned n;
+	uch *in, ch;
+
+	in = window;
+	for (n = 0; n < outcnt; n++) {
+		ch = *in++;
+		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+	}
+	crc = c;
+	bytes_out += (ulg)outcnt;
+	outcnt = 0;
 }
 
 static void error(char *x)
@@ -286,66 +337,8 @@ static void error(char *x)
 	while(1);	/* Halt */
 }
 
-#define STACK_SIZE (4096)
-
-long user_stack [STACK_SIZE];
-
-struct {
-	long * a;
-	short b;
-	} stack_start = { & user_stack [STACK_SIZE] , __BOOT_DS };
-
-static void setup_normal_output_buffer(void)
-{
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
-#endif
-	output_data = (unsigned char *)CONFIG_PHYSICAL_START; /* Normally Points to 1M */
-	free_mem_end_ptr = (long)real_mode;
-}
-
-struct moveparams {
-	uch *low_buffer_start;  int lcount;
-	uch *high_buffer_start; int hcount;
-};
-
-static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
-#endif	
-	mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
-	low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
-	  ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
-	low_buffer_size = low_buffer_end - LOW_BUFFER_START;
-	high_loaded = 1;
-	free_mem_end_ptr = (long)high_buffer_start;
-	if ( (CONFIG_PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
-		high_buffer_start = (uch *)(CONFIG_PHYSICAL_START + low_buffer_size);
-		mv->hcount = 0; /* say: we need not to move high_buffer */
-	}
-	else mv->hcount = -1;
-	mv->high_buffer_start = high_buffer_start;
-}
-
-static void close_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	if (bytes_out > low_buffer_size) {
-		mv->lcount = low_buffer_size;
-		if (mv->hcount)
-			mv->hcount = bytes_out - low_buffer_size;
-	} else {
-		mv->lcount = bytes_out;
-		mv->hcount = 0;
-	}
-}
-
-asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
+asmlinkage void decompress_kernel(void *rmode, unsigned long end,
+			uch *input_data, unsigned long input_len, uch *output)
 {
 	real_mode = rmode;
 
@@ -360,13 +353,25 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
 	lines = RM_SCREEN_INFO.orig_video_lines;
 	cols = RM_SCREEN_INFO.orig_video_cols;
 
-	if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
-	else setup_output_buffer_if_we_run_high(mv);
+	window = output;  	/* Output buffer (Normally at 1M) */
+	free_mem_ptr     = end;	/* Heap  */
+	free_mem_end_ptr = end + HEAP_SIZE;
+	inbuf  = input_data;	/* Input buffer */
+	insize = input_len;
+	inptr  = 0;
+
+	if (((u32)output - CONFIG_PHYSICAL_START) & 0x3fffff)
+		error("Destination address not 4M aligned");
+	if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
+		error("Destination address too large");
+#ifndef CONFIG_RELOCATABLE
+	if ((u32)output != CONFIG_PHYSICAL_START)
+		error("Wrong destination address");
+#endif
 
 	makecrc();
 	putstr("Uncompressing Linux... ");
 	gunzip();
 	putstr("Ok, booting the kernel.\n");
-	if (high_loaded) close_output_buffer_if_we_run_high(mv);
-	return high_loaded;
+	return;
 }
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
new file mode 100644
index 0000000..0551ceb
--- /dev/null
+++ b/arch/i386/boot/compressed/relocs.c
@@ -0,0 +1,563 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <elf.h>
+#include <byteswap.h>
+#define USE_BSD
+#include <endian.h>
+
+#define MAX_SHDRS 100
+static Elf32_Ehdr ehdr;
+static Elf32_Shdr shdr[MAX_SHDRS];
+static Elf32_Sym  *symtab[MAX_SHDRS];
+static Elf32_Rel  *reltab[MAX_SHDRS];
+static char *strtab[MAX_SHDRS];
+static unsigned long reloc_count, reloc_idx;
+static unsigned long *relocs;
+
+static void die(char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	exit(1);
+}
+
+static const char *sym_type(unsigned type)
+{
+	static const char *type_name[] = {
+#define SYM_TYPE(X) [X] = #X
+		SYM_TYPE(STT_NOTYPE),
+		SYM_TYPE(STT_OBJECT),
+		SYM_TYPE(STT_FUNC),
+		SYM_TYPE(STT_SECTION),
+		SYM_TYPE(STT_FILE),
+		SYM_TYPE(STT_COMMON),
+		SYM_TYPE(STT_TLS),
+#undef SYM_TYPE
+	};
+	const char *name = "unknown sym type name";
+	if (type < sizeof(type_name)/sizeof(type_name[0])) {
+		name = type_name[type];
+	}
+	return name;
+}
+
+static const char *sym_bind(unsigned bind)
+{
+	static const char *bind_name[] = {
+#define SYM_BIND(X) [X] = #X
+		SYM_BIND(STB_LOCAL),
+		SYM_BIND(STB_GLOBAL),
+		SYM_BIND(STB_WEAK),
+#undef SYM_BIND
+	};
+	const char *name = "unknown sym bind name";
+	if (bind < sizeof(bind_name)/sizeof(bind_name[0])) {
+		name = bind_name[bind];
+	}
+	return name;
+}
+
+static const char *sym_visibility(unsigned visibility)
+{
+	static const char *visibility_name[] = {
+#define SYM_VISIBILITY(X) [X] = #X
+		SYM_VISIBILITY(STV_DEFAULT),
+		SYM_VISIBILITY(STV_INTERNAL),
+		SYM_VISIBILITY(STV_HIDDEN),
+		SYM_VISIBILITY(STV_PROTECTED),
+#undef SYM_VISIBILITY
+	};
+	const char *name = "unknown sym visibility name";
+	if (visibility < sizeof(visibility_name)/sizeof(visibility_name[0])) {
+		name = visibility_name[visibility];
+	}
+	return name;
+}
+
+static const char *rel_type(unsigned type)
+{
+	static const char *type_name[] = {
+#define REL_TYPE(X) [X] = #X
+		REL_TYPE(R_386_NONE),
+		REL_TYPE(R_386_32),
+		REL_TYPE(R_386_PC32),
+		REL_TYPE(R_386_GOT32),
+		REL_TYPE(R_386_PLT32),
+		REL_TYPE(R_386_COPY),
+		REL_TYPE(R_386_GLOB_DAT),
+		REL_TYPE(R_386_JMP_SLOT),
+		REL_TYPE(R_386_RELATIVE),
+		REL_TYPE(R_386_GOTOFF),
+		REL_TYPE(R_386_GOTPC),
+#undef REL_TYPE
+	};
+	const char *name = "unknown type rel type name";
+	if (type < sizeof(type_name)/sizeof(type_name[0])) {
+		name = type_name[type];
+	}
+	return name;
+}
+
+static const char *sec_name(unsigned shndx)
+{
+	const char *sec_strtab;
+	const char *name;
+	sec_strtab = strtab[ehdr.e_shstrndx];
+	name = "<noname>";
+	if (shndx < ehdr.e_shnum) {
+		name = sec_strtab + shdr[shndx].sh_name;
+	}
+	else if (shndx == SHN_ABS) {
+		name = "ABSOLUTE";
+	}
+	else if (shndx == SHN_COMMON) {
+		name = "COMMON";
+	}
+	return name;
+}
+
+static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
+{
+	const char *name;
+	name = "<noname>";
+	if (sym->st_name) {
+		name = sym_strtab + sym->st_name;
+	}
+	else {
+		name = sec_name(shdr[sym->st_shndx].sh_name);
+	}
+	return name;
+}
+
+
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define le16_to_cpu(val) (val)
+#define le32_to_cpu(val) (val)
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+#define le16_to_cpu(val) bswap_16(val)
+#define le32_to_cpu(val) bswap_32(val)
+#endif
+
+static uint16_t elf16_to_cpu(uint16_t val)
+{
+	return le16_to_cpu(val);
+}
+
+static uint32_t elf32_to_cpu(uint32_t val)
+{
+	return le32_to_cpu(val);
+}
+
+static void read_ehdr(FILE *fp)
+{
+	if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
+		die("Cannot read ELF header: %s\n",
+			strerror(errno));
+	}
+	if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
+		die("No ELF magic\n");
+	}
+	if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
+		die("Not a 32 bit executable\n");
+	}
+	if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
+		die("Not a LSB ELF executable\n");
+	}
+	if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+		die("Unknown ELF version\n");
+	}
+	/* Convert the fields to native endian */
+	ehdr.e_type      = elf16_to_cpu(ehdr.e_type);
+	ehdr.e_machine   = elf16_to_cpu(ehdr.e_machine);
+	ehdr.e_version   = elf32_to_cpu(ehdr.e_version);
+	ehdr.e_entry     = elf32_to_cpu(ehdr.e_entry);
+	ehdr.e_phoff     = elf32_to_cpu(ehdr.e_phoff);
+	ehdr.e_shoff     = elf32_to_cpu(ehdr.e_shoff);
+	ehdr.e_flags     = elf32_to_cpu(ehdr.e_flags);
+	ehdr.e_ehsize    = elf16_to_cpu(ehdr.e_ehsize);
+	ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
+	ehdr.e_phnum     = elf16_to_cpu(ehdr.e_phnum);
+	ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
+	ehdr.e_shnum     = elf16_to_cpu(ehdr.e_shnum);
+	ehdr.e_shstrndx  = elf16_to_cpu(ehdr.e_shstrndx);
+
+	if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
+		die("Unsupported ELF header type\n");
+	}
+	if (ehdr.e_machine != EM_386) {
+		die("Not for x86\n");
+	}
+	if (ehdr.e_version != EV_CURRENT) {
+		die("Unknown ELF version\n");
+	}
+	if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
+		die("Bad Elf header size\n");
+	}
+	if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
+		die("Bad program header entry\n");
+	}
+	if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
+		die("Bad section header entry\n");
+	}
+	if (ehdr.e_shstrndx >= ehdr.e_shnum) {
+		die("String table index out of bounds\n");
+	}
+}
+
+static void read_shdrs(FILE *fp)
+{
+	int i;
+	if (ehdr.e_shnum > MAX_SHDRS) {
+		die("%d section headers supported: %d\n",
+			ehdr.e_shnum, MAX_SHDRS);
+	}
+	if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
+		die("Seek to %d failed: %s\n",
+			ehdr.e_shoff, strerror(errno));
+	}
+	if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) {
+		die("Cannot read ELF section headers: %s\n",
+			strerror(errno));
+	}
+	for(i = 0; i < ehdr.e_shnum; i++) {
+		shdr[i].sh_name      = elf32_to_cpu(shdr[i].sh_name);
+		shdr[i].sh_type      = elf32_to_cpu(shdr[i].sh_type);
+		shdr[i].sh_flags     = elf32_to_cpu(shdr[i].sh_flags);
+		shdr[i].sh_addr      = elf32_to_cpu(shdr[i].sh_addr);
+		shdr[i].sh_offset    = elf32_to_cpu(shdr[i].sh_offset);
+		shdr[i].sh_size      = elf32_to_cpu(shdr[i].sh_size);
+		shdr[i].sh_link      = elf32_to_cpu(shdr[i].sh_link);
+		shdr[i].sh_info      = elf32_to_cpu(shdr[i].sh_info);
+		shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign);
+		shdr[i].sh_entsize   = elf32_to_cpu(shdr[i].sh_entsize);
+	}
+
+}
+
+static void read_strtabs(FILE *fp)
+{
+	int i;
+	for(i = 0; i < ehdr.e_shnum; i++) {
+		if (shdr[i].sh_type != SHT_STRTAB) {
+			continue;
+		}
+		strtab[i] = malloc(shdr[i].sh_size);
+		if (!strtab[i]) {
+			die("malloc of %d bytes for strtab failed\n",
+				shdr[i].sh_size);
+		}
+		if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				shdr[i].sh_offset, strerror(errno));
+		}
+		if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+	}
+}
+
+static void read_symtabs(FILE *fp)
+{
+	int i,j;
+	for(i = 0; i < ehdr.e_shnum; i++) {
+		if (shdr[i].sh_type != SHT_SYMTAB) {
+			continue;
+		}
+		symtab[i] = malloc(shdr[i].sh_size);
+		if (!symtab[i]) {
+			die("malloc of %d bytes for symtab failed\n",
+				shdr[i].sh_size);
+		}
+		if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				shdr[i].sh_offset, strerror(errno));
+		}
+		if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+		for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) {
+			symtab[i][j].st_name  = elf32_to_cpu(symtab[i][j].st_name);
+			symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value);
+			symtab[i][j].st_size  = elf32_to_cpu(symtab[i][j].st_size);
+			symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx);
+		}
+	}
+}
+
+
+static void read_relocs(FILE *fp)
+{
+	int i,j;
+	for(i = 0; i < ehdr.e_shnum; i++) {
+		if (shdr[i].sh_type != SHT_REL) {
+			continue;
+		}
+		reltab[i] = malloc(shdr[i].sh_size);
+		if (!reltab[i]) {
+			die("malloc of %d bytes for relocs failed\n",
+				shdr[i].sh_size);
+		}
+		if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				shdr[i].sh_offset, strerror(errno));
+		}
+		if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+		for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
+			reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset);
+			reltab[i][j].r_info   = elf32_to_cpu(reltab[i][j].r_info);
+		}
+	}
+}
+
+
+static void print_absolute_symbols(void)
+{
+	int i;
+	printf("Absolute symbols\n");
+	printf(" Num:    Value Size  Type       Bind        Visibility  Name\n");
+	for(i = 0; i < ehdr.e_shnum; i++) {
+		char *sym_strtab;
+		Elf32_Sym *sh_symtab;
+		int j;
+		if (shdr[i].sh_type != SHT_SYMTAB) {
+			continue;
+		}
+		sh_symtab = symtab[i];
+		sym_strtab = strtab[shdr[i].sh_link];
+		for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) {
+			Elf32_Sym *sym;
+			const char *name;
+			sym = &symtab[i][j];
+			name = sym_name(sym_strtab, sym);
+			if (sym->st_shndx != SHN_ABS) {
+				continue;
+			}
+			printf("%5d %08x %5d %10s %10s %12s %s\n",
+				j, sym->st_value, sym->st_size,
+				sym_type(ELF32_ST_TYPE(sym->st_info)),
+				sym_bind(ELF32_ST_BIND(sym->st_info)),
+				sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
+				name);
+		}
+	}
+	printf("\n");
+}
+
+static void print_absolute_relocs(void)
+{
+	int i;
+	printf("Absolute relocations\n");
+	printf("Offset     Info     Type     Sym.Value Sym.Name\n");
+	for(i = 0; i < ehdr.e_shnum; i++) {
+		char *sym_strtab;
+		Elf32_Sym *sh_symtab;
+		unsigned sec_applies, sec_symtab;
+		int j;
+		if (shdr[i].sh_type != SHT_REL) {
+			continue;
+		}
+		sec_symtab  = shdr[i].sh_link;
+		sec_applies = shdr[i].sh_info;
+		if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
+			continue;
+		}
+		sh_symtab = symtab[sec_symtab];
+		sym_strtab = strtab[shdr[sec_symtab].sh_link];
+		for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
+			Elf32_Rel *rel;
+			Elf32_Sym *sym;
+			const char *name;
+			rel = &reltab[i][j];
+			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
+			name = sym_name(sym_strtab, sym);
+			if (sym->st_shndx != SHN_ABS) {
+				continue;
+			}
+			printf("%08x %08x %10s %08x  %s\n",
+				rel->r_offset,
+				rel->r_info,
+				rel_type(ELF32_R_TYPE(rel->r_info)),
+				sym->st_value,
+				name);
+		}
+	}
+	printf("\n");
+}
+
+static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
+{
+	int i;
+	/* Walk through the relocations */
+	for(i = 0; i < ehdr.e_shnum; i++) {
+		char *sym_strtab;
+		Elf32_Sym *sh_symtab;
+		unsigned sec_applies, sec_symtab;
+		int j;
+		if (shdr[i].sh_type != SHT_REL) {
+			continue;
+		}
+		sec_symtab  = shdr[i].sh_link;
+		sec_applies = shdr[i].sh_info;
+		if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
+			continue;
+		}
+		sh_symtab = symtab[sec_symtab];
+		sym_strtab = strtab[shdr[sec_symtab].sh_link];
+		for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
+			Elf32_Rel *rel;
+			Elf32_Sym *sym;
+			unsigned r_type;
+			rel = &reltab[i][j];
+			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
+			r_type = ELF32_R_TYPE(rel->r_info);
+			/* Don't visit relocations to absolute symbols */
+			if (sym->st_shndx == SHN_ABS) {
+				continue;
+			}
+			if (r_type == R_386_PC32) {
+				/* PC relative relocations don't need to be adjusted */
+			}
+			else if (r_type == R_386_32) {
+				/* Visit relocations that need to be adjusted */
+				visit(rel, sym);
+			}
+			else {
+				die("Unsupported relocation type: %d\n", r_type);
+			}
+		}
+	}
+}
+
+static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
+{
+	reloc_count += 1;
+}
+
+static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
+{
+	/* Remember the address that needs to be adjusted. */
+	relocs[reloc_idx++] = rel->r_offset;
+}
+
+static int cmp_relocs(const void *va, const void *vb)
+{
+	const unsigned long *a, *b;
+	a = va; b = vb;
+	return (*a == *b)? 0 : (*a > *b)? 1 : -1;
+}
+
+static void emit_relocs(int as_text)
+{
+	int i;
+	/* Count how many relocations I have and allocate space for them. */
+	reloc_count = 0;
+	walk_relocs(count_reloc);
+	relocs = malloc(reloc_count * sizeof(relocs[0]));
+	if (!relocs) {
+		die("malloc of %d entries for relocs failed\n",
+			reloc_count);
+	}
+	/* Collect up the relocations */
+	reloc_idx = 0;
+	walk_relocs(collect_reloc);
+
+	/* Order the relocations for more efficient processing */
+	qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
+
+	/* Print the relocations */
+	if (as_text) {
+		/* Print the relocations in a form suitable that
+		 * gas will like.
+		 */
+		printf(".section \".data.reloc\",\"a\"\n");
+		printf(".balign 4\n");
+		for(i = 0; i < reloc_count; i++) {
+			printf("\t .long 0x%08lx\n", relocs[i]);
+		}
+		printf("\n");
+	}
+	else {
+		unsigned char buf[4];
+		buf[0] = buf[1] = buf[2] = buf[3] = 0;
+		/* Print a stop */
+		printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
+		/* Now print each relocation */
+		for(i = 0; i < reloc_count; i++) {
+			buf[0] = (relocs[i] >>  0) & 0xff;
+			buf[1] = (relocs[i] >>  8) & 0xff;
+			buf[2] = (relocs[i] >> 16) & 0xff;
+			buf[3] = (relocs[i] >> 24) & 0xff;
+			printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
+		}
+	}
+}
+
+static void usage(void)
+{
+	die("i386_reloc [--abs | --text] vmlinux\n");
+}
+
+int main(int argc, char **argv)
+{
+	int show_absolute;
+	int as_text;
+	const char *fname;
+	FILE *fp;
+	int i;
+
+	show_absolute = 0;
+	as_text = 0;
+	fname = NULL;
+	for(i = 1; i < argc; i++) {
+		char *arg = argv[i];
+		if (*arg == '-') {
+			if (strcmp(argv[1], "--abs") == 0) {
+				show_absolute = 1;
+				continue;
+			}
+			else if (strcmp(argv[1], "--text") == 0) {
+				as_text = 1;
+				continue;
+			}
+		}
+		else if (!fname) {
+			fname = arg;
+			continue;
+		}
+		usage();
+	}
+	if (!fname) {
+		usage();
+	}
+	fp = fopen(fname, "r");
+	if (!fp) {
+		die("Cannot open %s: %s\n",
+			fname, strerror(errno));
+	}
+	read_ehdr(fp);
+	read_shdrs(fp);
+	read_strtabs(fp);
+	read_symtabs(fp);
+	read_relocs(fp);
+	if (show_absolute) {
+		print_absolute_symbols();
+		print_absolute_relocs();
+		return 0;
+	}
+	emit_relocs(as_text);
+	return 0;
+}
diff --git a/arch/i386/boot/compressed/vmlinux.lds b/arch/i386/boot/compressed/vmlinux.lds
new file mode 100644
index 0000000..cc4854f
--- /dev/null
+++ b/arch/i386/boot/compressed/vmlinux.lds
@@ -0,0 +1,43 @@
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+SECTIONS
+{
+        /* Be careful parts of head.S assume startup_32 is at
+         * address 0.
+	 */
+	. =  0 	;
+	.text.head : {
+		_head = . ;
+		*(.text.head)
+		_ehead = . ;
+	}
+	.data.compressed : {
+		*(.data.compressed)
+	}
+	.text :	{
+		_text = .; 	/* Text */
+		*(.text)
+		*(.text.*)
+		_etext = . ;
+	}
+	.rodata : {
+		_rodata = . ;
+		*(.rodata)	 /* read-only data */
+		*(.rodata.*)
+		_erodata = . ;
+	}
+	.data :	{
+		_data = . ;
+		*(.data)
+		*(.data.*)
+		_edata = . ;
+	}
+	.bss : {
+		_bss = . ;
+		*(.bss)
+		*(.bss.*)
+		*(COMMON)
+		_end = . ;
+	}
+}
diff --git a/arch/i386/boot/compressed/vmlinux.scr b/arch/i386/boot/compressed/vmlinux.scr
index 1ed9d79..707a88f 100644
--- a/arch/i386/boot/compressed/vmlinux.scr
+++ b/arch/i386/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
 SECTIONS
 {
-  .data : { 
+  .data.compressed : {
 	input_len = .;
 	LONG(input_data_end - input_data) input_data = .; 
 	*(.data) 
+	output_len = . - 4;
 	input_data_end = .; 
 	}
 }
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
index 3aec4538..9aa8b05 100644
--- a/arch/i386/boot/setup.S
+++ b/arch/i386/boot/setup.S
@@ -588,11 +588,6 @@ rmodeswtch_normal:
 	call	default_switch
 
 rmodeswtch_end:
-# we get the code32 start address and modify the below 'jmpi'
-# (loader may have changed it)
-	movl	%cs:code32_start, %eax
-	movl	%eax, %cs:code32
-
 # Now we move the system to its rightful place ... but we check if we have a
 # big-kernel. In that case we *must* not move it ...
 	testb	$LOADED_HIGH, %cs:loadflags
@@ -788,11 +783,12 @@ a20_err_msg:
 a20_done:
 
 #endif /* CONFIG_X86_VOYAGER */
-# set up gdt and idt
+# set up gdt and idt and 32bit start address
 	lidt	idt_48				# load idt with 0,0
 	xorl	%eax, %eax			# Compute gdt_base
 	movw	%ds, %ax			# (Convert %ds:gdt to a linear ptr)
 	shll	$4, %eax
+	addl	%eax, code32
 	addl	$gdt, %eax
 	movl	%eax, (gdt_48+2)
 	lgdt	gdt_48				# load gdt with whatever is
@@ -851,9 +847,26 @@ flush_instr:
 #	Manual, Mixing 16-bit and 32-bit code, page 16-6)
 
 	.byte 0x66, 0xea			# prefix + jmpi-opcode
-code32:	.long	0x1000				# will be set to 0x100000
-						# for big kernels
+code32:	.long	startup_32			# will be set to %cs+startup_32
 	.word	__BOOT_CS
+.code32
+startup_32:
+	movl $(__BOOT_DS), %eax
+	movl %eax, %ds
+	movl %eax, %es
+	movl %eax, %fs
+	movl %eax, %gs
+	movl %eax, %ss
+
+	xorl %eax, %eax
+1:	incl %eax				# check that A20 really IS enabled
+	movl %eax, 0x00000000			# loop forever if it isn't
+	cmpl %eax, 0x00100000
+	je 1b
+
+	# Jump to the 32bit entry point
+	jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi)
+.code16
 
 # Here's a bunch of information about your current kernel..
 kernel_version:	.ascii	UTS_RELEASE
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
index 2925e66..b02308e 100644
--- a/include/linux/screen_info.h
+++ b/include/linux/screen_info.h
@@ -42,7 +42,8 @@ struct screen_info {
 	u16 pages;		/* 0x32 */
 	u16 vesa_attributes;	/* 0x34 */
 	u32 capabilities;       /* 0x36 */
-				/* 0x3a -- 0x3f reserved for future expansion */
+				/* 0x3a -- 0x3b reserved for future expansion */
+				/* 0x3c -- 0x3f micro stack for relocatable kernels */
 };
 
 extern struct screen_info screen_info;
-- 
cgit v0.10.2


From 6a044b3a0a1829ef19bb29548ffe553f48e8d80c Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] i386: Warn upon absolute relocations being present

o Relocations generated w.r.t absolute symbols are not processed as by
  definition, absolute symbols are not to be relocated. Explicitly warn
  user about absolutions relocations present at compile time.

o These relocations get introduced either due to linker optimizations or
  some programming oversights.

o Also create a list of symbols which have been audited to be safe and
  don't emit warnings for these.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile
index cc28da3..a661217 100644
--- a/arch/i386/boot/compressed/Makefile
+++ b/arch/i386/boot/compressed/Makefile
@@ -20,7 +20,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 	$(call if_changed,objcopy)
 
 quiet_cmd_relocs = RELOCS  $@
-      cmd_relocs = $(obj)/relocs $< > $@
+      cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
 $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
 	$(call if_changed,relocs)
 
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index 0551ceb..468da89 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -19,6 +19,33 @@ static char *strtab[MAX_SHDRS];
 static unsigned long reloc_count, reloc_idx;
 static unsigned long *relocs;
 
+/*
+ * Following symbols have been audited. There values are constant and do
+ * not change if bzImage is loaded at a different physical address than
+ * the address for which it has been compiled. Don't warn user about
+ * absolute relocations present w.r.t these symbols.
+ */
+static const char* safe_abs_relocs[] = {
+		"__kernel_vsyscall",
+		"__kernel_rt_sigreturn",
+		"__kernel_sigreturn",
+		"SYSENTER_RETURN",
+};
+
+static int is_safe_abs_reloc(const char* sym_name)
+{
+	int i, array_size;
+
+	array_size = sizeof(safe_abs_relocs)/sizeof(char*);
+
+	for(i = 0; i < array_size; i++) {
+		if (!strcmp(sym_name, safe_abs_relocs[i]))
+			/* Match found */
+			return 1;
+	}
+	return 0;
+}
+
 static void die(char *fmt, ...)
 {
 	va_list ap;
@@ -359,9 +386,8 @@ static void print_absolute_symbols(void)
 
 static void print_absolute_relocs(void)
 {
-	int i;
-	printf("Absolute relocations\n");
-	printf("Offset     Info     Type     Sym.Value Sym.Name\n");
+	int i, printed = 0;
+
 	for(i = 0; i < ehdr.e_shnum; i++) {
 		char *sym_strtab;
 		Elf32_Sym *sh_symtab;
@@ -387,6 +413,31 @@ static void print_absolute_relocs(void)
 			if (sym->st_shndx != SHN_ABS) {
 				continue;
 			}
+
+			/* Absolute symbols are not relocated if bzImage is
+			 * loaded at a non-compiled address. Display a warning
+			 * to user at compile time about the absolute
+			 * relocations present.
+			 *
+			 * User need to audit the code to make sure
+			 * some symbols which should have been section
+			 * relative have not become absolute because of some
+			 * linker optimization or wrong programming usage.
+			 *
+			 * Before warning check if this absolute symbol
+			 * relocation is harmless.
+			 */
+			if (is_safe_abs_reloc(name))
+				continue;
+
+			if (!printed) {
+				printf("WARNING: Absolute relocations"
+					" present\n");
+				printf("Offset     Info     Type     Sym.Value "
+					"Sym.Name\n");
+				printed = 1;
+			}
+
 			printf("%08x %08x %10s %08x  %s\n",
 				rel->r_offset,
 				rel->r_info,
@@ -395,7 +446,9 @@ static void print_absolute_relocs(void)
 				name);
 		}
 	}
-	printf("\n");
+
+	if (printed)
+		printf("\n");
 }
 
 static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
@@ -508,25 +561,31 @@ static void emit_relocs(int as_text)
 
 static void usage(void)
 {
-	die("i386_reloc [--abs | --text] vmlinux\n");
+	die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
 }
 
 int main(int argc, char **argv)
 {
-	int show_absolute;
+	int show_absolute_syms, show_absolute_relocs;
 	int as_text;
 	const char *fname;
 	FILE *fp;
 	int i;
 
-	show_absolute = 0;
+	show_absolute_syms = 0;
+	show_absolute_relocs = 0;
 	as_text = 0;
 	fname = NULL;
 	for(i = 1; i < argc; i++) {
 		char *arg = argv[i];
 		if (*arg == '-') {
-			if (strcmp(argv[1], "--abs") == 0) {
-				show_absolute = 1;
+			if (strcmp(argv[1], "--abs-syms") == 0) {
+				show_absolute_syms = 1;
+				continue;
+			}
+
+			if (strcmp(argv[1], "--abs-relocs") == 0) {
+				show_absolute_relocs = 1;
 				continue;
 			}
 			else if (strcmp(argv[1], "--text") == 0) {
@@ -553,8 +612,11 @@ int main(int argc, char **argv)
 	read_strtabs(fp);
 	read_symtabs(fp);
 	read_relocs(fp);
-	if (show_absolute) {
+	if (show_absolute_syms) {
 		print_absolute_symbols();
+		return 0;
+	}
+	if (show_absolute_relocs) {
 		print_absolute_relocs();
 		return 0;
 	}
-- 
cgit v0.10.2


From e69f202d0a1419219198566e1c22218a5c71a9a6 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] i386: Implement CONFIG_PHYSICAL_ALIGN

o Now CONFIG_PHYSICAL_START is being replaced with CONFIG_PHYSICAL_ALIGN.
  Hardcoding the kernel physical start value creates a problem in relocatable
  kernel context due to boot loader limitations. For ex, if somebody
  compiles a relocatable kernel to be run from address 4MB, but this kernel
  will run from location 1MB as grub loads the kernel at physical address
  1MB. Kernel thinks that I am a relocatable kernel and I should run from
  the address I have been loaded at. So somebody wanting to run kernel
  from 4MB alignment location (for improved performance regions) can't do
  that.

o Hence, Eric proposed that probably CONFIG_PHYSICAL_ALIGN will make
  more sense in relocatable kernel context. At run time kernel will move
  itself to a physical addr location which meets user specified alignment
  restrictions.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index d588ca8..fd2fa7a 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -785,23 +785,26 @@ config RELOCATABLE
           must live at a different physical address than the primary
           kernel.
 
-config PHYSICAL_START
-	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-
-	default "0x1000000" if CRASH_DUMP
+config PHYSICAL_ALIGN
+	hex "Alignment value to which kernel should be aligned"
 	default "0x100000"
+	range 0x2000 0x400000
 	help
-	  This gives the physical address where the kernel is loaded. Normally
-	  for regular kernels this value is 0x100000 (1MB). But in the case
-	  of kexec on panic the fail safe kernel needs to run at a different
-	  address than the panic-ed kernel. This option is used to set the load
-	  address for kernels used to capture crash dump on being kexec'ed
-	  after panic. The default value for crash dump kernels is
-	  0x1000000 (16MB). This can also be set based on the "X" value as
-	  specified in the "crashkernel=YM@XM" command line boot parameter
-	  passed to the panic-ed kernel. Typically this parameter is set as
-	  crashkernel=64M@16M. Please take a look at
-	  Documentation/kdump/kdump.txt for more details about crash dumps.
+	  This value puts the alignment restrictions on physical address
+ 	  where kernel is loaded and run from. Kernel is compiled for an
+ 	  address which meets above alignment restriction.
+
+ 	  If bootloader loads the kernel at a non-aligned address and
+ 	  CONFIG_RELOCATABLE is set, kernel will move itself to nearest
+ 	  address aligned to above value and run from there.
+
+ 	  If bootloader loads the kernel at a non-aligned address and
+ 	  CONFIG_RELOCATABLE is not set, kernel will ignore the run time
+ 	  load address and decompress itself to the address it has been
+ 	  compiled for and run from there. The address for which kernel is
+ 	  compiled already meets above alignment restrictions. Hence the
+ 	  end result is that kernel runs from a physical address meeting
+	  above alignment restrictions.
 
 	  Don't change this unless you know what you are doing.
 
diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S
index e4dd7a6..f395a4b 100644
--- a/arch/i386/boot/compressed/head.S
+++ b/arch/i386/boot/compressed/head.S
@@ -26,6 +26,7 @@
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/boot.h>
 
 .section ".text.head"
 	.globl startup_32
@@ -52,17 +53,17 @@ startup_32:
 1:	popl %ebp
 	subl $1b, %ebp
 
-/* Compute the delta between where we were compiled to run at
- * and where the code will actually run at.
+/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ * contains the address where we should move the kernel image temporarily
+ * for safe in-place decompression.
  */
-	/* Start with the delta to where the kernel will run at.  If we are
-	 * a relocatable kernel this is the delta to our load address otherwise
-	 * this is the delta to CONFIG_PHYSICAL start.
-	 */
+
 #ifdef CONFIG_RELOCATABLE
-	movl %ebp, %ebx
+	movl 	%ebp, %ebx
+	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
+	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
 #else
-	movl $(CONFIG_PHYSICAL_START - startup_32), %ebx
+	movl $LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
 	/* Replace the compressed data size with the uncompressed size */
@@ -94,9 +95,10 @@ startup_32:
 /* Compute the kernel start address.
  */
 #ifdef CONFIG_RELOCATABLE
-	leal	startup_32(%ebp), %ebp
+	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
+	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
 #else
-	movl	$CONFIG_PHYSICAL_START, %ebp
+	movl	$LOAD_PHYSICAL_ADDR, %ebp
 #endif
 
 /*
@@ -150,8 +152,8 @@ relocated:
  * and where it was actually loaded.
  */
 	movl %ebp, %ebx
-	subl $CONFIG_PHYSICAL_START, %ebx
-
+	subl $LOAD_PHYSICAL_ADDR, %ebx
+	jz   2f		/* Nothing to be done if loaded at compiled addr. */
 /*
  * Process relocations.
  */
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index 4eac24e..dc15389 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -14,6 +14,7 @@
 #include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
+#include <asm/boot.h>
 
 /* WARNING!!
  * This code is compiled with -fPIC and it is relocated dynamically
@@ -360,12 +361,12 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end,
 	insize = input_len;
 	inptr  = 0;
 
-	if (((u32)output - CONFIG_PHYSICAL_START) & 0x3fffff)
-		error("Destination address not 4M aligned");
+	if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
+		error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
 	if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
 		error("Destination address too large");
 #ifndef CONFIG_RELOCATABLE
-	if ((u32)output != CONFIG_PHYSICAL_START)
+	if ((u32)output != LOAD_PHYSICAL_ADDR)
 		error("Wrong destination address");
 #endif
 
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index f8d61ec..6860f20 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -14,6 +14,7 @@
 #include <asm/thread_info.h>
 #include <asm/page.h>
 #include <asm/cache.h>
+#include <asm/boot.h>
 
 OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
 OUTPUT_ARCH(i386)
@@ -27,7 +28,7 @@ PHDRS {
 }
 SECTIONS
 {
-  . = LOAD_OFFSET + CONFIG_PHYSICAL_START;
+  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
   phys_startup_32 = startup_32 - LOAD_OFFSET;
   /* read-only */
   .text : AT(ADDR(.text) - LOAD_OFFSET) {
diff --git a/include/asm-i386/boot.h b/include/asm-i386/boot.h
index 96b228e..8ce79a6 100644
--- a/include/asm-i386/boot.h
+++ b/include/asm-i386/boot.h
@@ -12,4 +12,8 @@
 #define EXTENDED_VGA	0xfffe		/* 80x50 mode */
 #define ASK_VGA		0xfffd		/* ask for it at bootup */
 
-#endif
+/* Physical address where kenrel should be loaded. */
+#define LOAD_PHYSICAL_ADDR ((0x100000 + CONFIG_PHYSICAL_ALIGN - 1) \
+				& ~(CONFIG_PHYSICAL_ALIGN - 1))
+
+#endif /* _LINUX_BOOT_H */
-- 
cgit v0.10.2


From be274eeaf20b4c7155242645d5e2c48b023e609b Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] i386: extend bzImage protocol for relocatable protected mode
 kernel

Extend bzImage protocol to enable bootloaders to load a completely relocatable
bzImage.  Now protected mode component of kernel is also relocatable and a
boot-loader can load the protected mode component at a differnt physical
address than 1MB.  (If kernel was built with CONFIG_RELOCATABLE)

Kexec can make use of it to load this kernel at a different physical address
to capture kernel crash dumps.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt
index c51314b..cb28254 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/i386/boot.txt
@@ -35,6 +35,8 @@ Protocol 2.03:	(Kernel 2.4.18-pre1) Explicitly makes the highest possible
 		initrd address available to the bootloader.
 
 Protocol 2.04:	(Kernel 2.6.14) Extend the syssize field to four bytes.
+Protocol 2.05:	(Kernel 2.6.20) Make protected mode kernel relocatable.
+		Introduce relocatable_kernel and kernel_alignment fields.
 
 
 **** MEMORY LAYOUT
@@ -129,6 +131,8 @@ Offset	Proto	Name		Meaning
 0226/2	N/A	pad1		Unused
 0228/4	2.02+	cmd_line_ptr	32-bit pointer to the kernel command line
 022C/4	2.03+	initrd_addr_max	Highest legal initrd address
+0230/4	2.04+	kernel_alignment Physical addr alignment required for kernel
+0234/1	2.04+	relocatable_kernel Whether kernel is relocatable or not
 
 (1) For backwards compatibility, if the setup_sects field contains 0, the
     real value is 4.
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
index 9aa8b05..06edf1c 100644
--- a/arch/i386/boot/setup.S
+++ b/arch/i386/boot/setup.S
@@ -81,7 +81,7 @@ start:
 # This is the setup header, and it must start at %cs:2 (old 0x9020:2)
 
 		.ascii	"HdrS"		# header signature
-		.word	0x0204		# header version number (>= 0x0105)
+		.word	0x0205		# header version number (>= 0x0105)
 					# or else old loadlin-1.5 will fail)
 realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
 start_sys_seg:	.word	SYSSEG
@@ -160,6 +160,17 @@ ramdisk_max:	.long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
 					# The highest safe address for
 					# the contents of an initrd
 
+kernel_alignment:  .long CONFIG_PHYSICAL_ALIGN 	#physical addr alignment
+						#required for protected mode
+						#kernel
+#ifdef CONFIG_RELOCATABLE
+relocatable_kernel:    .byte 1
+#else
+relocatable_kernel:    .byte 0
+#endif
+pad2:			.byte 0
+pad3:			.word 0
+
 trampoline:	call	start_of_setup
 		.align 16
 					# The offset at this point is 0x240
-- 
cgit v0.10.2


From 4c7aa6c3b25ef96bc1b723238041195e3d8bf047 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] i386: Mark CONFIG_RELOCATABLE EXPERIMENTAL

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index fd2fa7a..1f0f7b6 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -774,7 +774,8 @@ config CRASH_DUMP
 	  For more details see Documentation/kdump/kdump.txt
 
 config RELOCATABLE
-	bool "Build a relocatable kernel"
+	bool "Build a relocatable kernel(EXPERIMENTAL)"
+	depends on EXPERIMENTAL
 	help
 	  This build a kernel image that retains relocation information
           so it can be loaded someplace besides the default 1MB.
-- 
cgit v0.10.2


From 249e83fe839a13b5dc94285daeeaf70f6e54d930 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] x86-64: Extract segment descriptor definitions for use
 outside

Code that wants to use struct desc_struct cannot do so on i386 because
desc.h contains other code that will only compile on x86_64.

So extract the structure definitions into a asm-x86_64/desc_defs.h.

Signed-off-by: Avi Kivity <avi@qumranet.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 include/asm-x86_64/desc.h      |   53 -------------------------------
 include/asm-x86_64/desc_defs.h |   69 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 52 deletions(-)

diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h
index eb7723a..913d6ac 100644
--- a/include/asm-x86_64/desc.h
+++ b/include/asm-x86_64/desc.h
@@ -9,64 +9,13 @@
 
 #include <linux/string.h>
 #include <linux/smp.h>
+#include <asm/desc_defs.h>
 
 #include <asm/segment.h>
 #include <asm/mmu.h>
 
-// 8 byte segment descriptor
-struct desc_struct { 
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
-	unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
-} __attribute__((packed)); 
-
-struct n_desc_struct { 
-	unsigned int a,b;
-}; 	
-
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
 
-enum { 
-	GATE_INTERRUPT = 0xE, 
-	GATE_TRAP = 0xF, 	
-	GATE_CALL = 0xC,
-}; 	
-
-// 16byte gate
-struct gate_struct {          
-	u16 offset_low;
-	u16 segment; 
-	unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
-	u16 offset_middle;
-	u32 offset_high;
-	u32 zero1; 
-} __attribute__((packed));
-
-#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) 
-#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
-
-enum { 
-	DESC_TSS = 0x9,
-	DESC_LDT = 0x2,
-}; 
-
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct ldttss_desc { 
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
-	u32 base3;
-	u32 zero1; 
-} __attribute__((packed)); 
-
-struct desc_ptr {
-	unsigned short size;
-	unsigned long address;
-} __attribute__((packed)) ;
-
 #define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
 #define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
 #define clear_LDT()  asm volatile("lldt %w0"::"r" (0))
diff --git a/include/asm-x86_64/desc_defs.h b/include/asm-x86_64/desc_defs.h
new file mode 100644
index 0000000..0890040
--- /dev/null
+++ b/include/asm-x86_64/desc_defs.h
@@ -0,0 +1,69 @@
+/* Written 2000 by Andi Kleen */
+#ifndef __ARCH_DESC_DEFS_H
+#define __ARCH_DESC_DEFS_H
+
+/*
+ * Segment descriptor structure definitions, usable from both x86_64 and i386
+ * archs.
+ */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+// 8 byte segment descriptor
+struct desc_struct {
+	u16 limit0;
+	u16 base0;
+	unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
+	unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
+} __attribute__((packed));
+
+struct n_desc_struct {
+	unsigned int a,b;
+};
+
+enum {
+	GATE_INTERRUPT = 0xE,
+	GATE_TRAP = 0xF,
+	GATE_CALL = 0xC,
+};
+
+// 16byte gate
+struct gate_struct {
+	u16 offset_low;
+	u16 segment;
+	unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
+	u16 offset_middle;
+	u32 offset_high;
+	u32 zero1;
+} __attribute__((packed));
+
+#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
+#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
+#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
+
+enum {
+	DESC_TSS = 0x9,
+	DESC_LDT = 0x2,
+};
+
+// LDT or TSS descriptor in the GDT. 16 bytes.
+struct ldttss_desc {
+	u16 limit0;
+	u16 base0;
+	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
+	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+	u32 base3;
+	u32 zero1;
+} __attribute__((packed));
+
+struct desc_ptr {
+	unsigned short size;
+	unsigned long address;
+} __attribute__((packed)) ;
+
+
+#endif /* !__ASSEMBLY__ */
+
+#endif
-- 
cgit v0.10.2


From 74b47a7844501445d41d704fe7c626f4b1819508 Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Thu, 7 Dec 2006 02:14:04 +0100
Subject: [PATCH] i386: Fix entry.S code with !CONFIG_VM86

The entry.S code at work_notifysig is surely wrong.  It drops into unrelated
code if the branch to work_notifysig_v86 is taken, and CONFIG_VM86=n.

	[PATCH] Make vm86 support optional
	tree 9b5daef5280800a0006343a17f63072658d91a1d
	pushed to git Jan 8, 2006, and first appears in 2.6.16

The 'fix' here is to also compile out the vm86 test & branch when
CONFIG_VM86=n.

Signed-off-by: Joe Korty <joe.korty@ccur.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index d7423ef..0220bc8 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -457,6 +457,7 @@ work_resched:
 
 work_notifysig:				# deal with pending signals and
 					# notify-resume requests
+#ifdef CONFIG_VM86
 	testl $VM_MASK, PT_EFLAGS(%esp)
 	movl %esp, %eax
 	jne work_notifysig_v86		# returning to kernel-space or
@@ -467,17 +468,18 @@ work_notifysig:				# deal with pending signals and
 
 	ALIGN
 work_notifysig_v86:
-#ifdef CONFIG_VM86
 	pushl %ecx			# save ti_flags for do_notify_resume
 	CFI_ADJUST_CFA_OFFSET 4
 	call save_v86_state		# %eax contains pt_regs pointer
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	movl %eax, %esp
+#else
+	movl %esp, %eax
+#endif
 	xorl %edx, %edx
 	call do_notify_resume
 	jmp resume_userspace_sig
-#endif
 
 	# perform syscall exit tracing
 	ALIGN
-- 
cgit v0.10.2


From ea7322decb974a4a3e804f96a0201e893ff88ce3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:05 +0100
Subject: [PATCH] x86-64: Speed and clean up cache flushing in change_page_attr

CLFLUSH is a lot faster than WBINVD so avoid the later if at all
possible.

Always pass the complete list of pages to other CPUs to cut down
the number of IPIs.

Minor other cleanup and sync with i386 version.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 3e231d76..ccb91dd 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -61,34 +61,40 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	return base;
 } 
 
-
-static void flush_kernel_map(void *address) 
+static void cache_flush_page(void *adr)
 {
-	if (0 && address && cpu_has_clflush) {
-		/* is this worth it? */ 
-		int i;
-		for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) 
-			asm volatile("clflush (%0)" :: "r" (address + i)); 
-	} else
-		asm volatile("wbinvd":::"memory"); 
-	if (address)
-		__flush_tlb_one(address);
-	else
-		__flush_tlb_all();
+	int i;
+	for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
+		asm volatile("clflush (%0)" :: "r" (adr + i));
 }
 
+static void flush_kernel_map(void *arg)
+{
+	struct list_head *l = (struct list_head *)arg;
+	struct page *pg;
+
+	/* When clflush is available always use it because it is
+	   much cheaper than WBINVD */
+	if (!cpu_has_clflush)
+		asm volatile("wbinvd" ::: "memory");
+	list_for_each_entry(pg, l, lru) {
+		void *adr = page_address(pg);
+		if (cpu_has_clflush)
+			cache_flush_page(adr);
+		__flush_tlb_one(adr);
+	}
+}
 
-static inline void flush_map(unsigned long address)
+static inline void flush_map(struct list_head *l)
 {	
-	on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
+	on_each_cpu(flush_kernel_map, l, 1, 1);
 }
 
-static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
+static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
 
 static inline void save_page(struct page *fpage)
 {
-	fpage->lru.next = (struct list_head *)deferred_pages;
-	deferred_pages = fpage;
+	list_add(&fpage->lru, &deferred_pages);
 }
 
 /* 
@@ -207,18 +213,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot)
 
 void global_flush_tlb(void)
 { 
-	struct page *dpage;
+	struct page *pg, *next;
+	struct list_head l;
 
 	down_read(&init_mm.mmap_sem);
-	dpage = xchg(&deferred_pages, NULL);
+	list_replace_init(&deferred_pages, &l);
 	up_read(&init_mm.mmap_sem);
 
-	flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
-	while (dpage) {
-		struct page *tmp = dpage;
-		dpage = (struct page *)dpage->lru.next;
-		ClearPagePrivate(tmp);
-		__free_page(tmp);
+	flush_map(&l);
+
+	list_for_each_entry_safe(pg, next, &l, lru) {
+		ClearPagePrivate(pg);
+		__free_page(pg);
 	} 
 } 
 
-- 
cgit v0.10.2


From 770d132f03ac15b12919f1bac481f4beda13e094 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:05 +0100
Subject: [PATCH] i386: Retrieve CLFLUSH size from CPUID

Also report it in /proc/cpuinfo similar to x86-64.

Needed for followon patch

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 6958ae5..cda41ae 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -309,6 +309,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 #else
 			c->apicid = (ebx >> 24) & 0xFF;
 #endif
+			if (c->x86_capability[0] & (1<<19))
+				c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
 		} else {
 			/* Have CPUID level 0 only - unheard of */
 			c->x86 = 4;
@@ -373,6 +375,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_vendor_id[0] = '\0'; /* Unset */
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_max_cores = 1;
+	c->x86_clflush_size = 32;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
 	if (!have_cpuid_p()) {
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 76aac08..6624d85 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 				seq_printf(m, " [%d]", i);
 		}
 
-	seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n",
+	seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
 		     c->loops_per_jiffy/(500000/HZ),
 		     (c->loops_per_jiffy/(5000/HZ)) % 100);
+	seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
 
 	return 0;
 }
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index f73cf83..98fa73b 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -72,6 +72,7 @@ struct cpuinfo_x86 {
 #endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
 	unsigned char apicid;
+	unsigned short x86_clflush_size;
 #ifdef CONFIG_SMP
 	unsigned char booted_cores;	/* number of cores as seen by OS */
 	__u8 phys_proc_id; 		/* Physical processor id. */
-- 
cgit v0.10.2


From 3760dd6efa75c98e223643da2eb7040406433053 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:05 +0100
Subject: [PATCH] i386: Use CLFLUSH instead of WBINVD in change_page_attr

CLFLUSH is a lot faster than WBINVD so try to use that.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 8564b6a..ad91528 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -67,11 +67,17 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	return base;
 } 
 
-static void flush_kernel_map(void *dummy) 
+static void flush_kernel_map(void *arg)
 { 
-	/* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */
-	if (boot_cpu_data.x86_model >= 4) 
+	unsigned long adr = (unsigned long)arg;
+
+	if (adr && cpu_has_clflush) {
+		int i;
+		for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
+			asm volatile("clflush (%0)" :: "r" (adr + i));
+	} else if (boot_cpu_data.x86_model >= 4)
 		wbinvd();
+
 	/* Flush all to work around Errata in early athlons regarding 
 	 * large page flushing. 
 	 */
@@ -173,9 +179,9 @@ __change_page_attr(struct page *page, pgprot_t prot)
 	return 0;
 } 
 
-static inline void flush_map(void)
+static inline void flush_map(void *adr)
 {
-	on_each_cpu(flush_kernel_map, NULL, 1, 1);
+	on_each_cpu(flush_kernel_map, adr, 1, 1);
 }
 
 /*
@@ -217,9 +223,13 @@ void global_flush_tlb(void)
 	spin_lock_irq(&cpa_lock);
 	list_replace_init(&df_list, &l);
 	spin_unlock_irq(&cpa_lock);
-	flush_map();
-	list_for_each_entry_safe(pg, next, &l, lru)
+	if (!cpu_has_clflush)
+		flush_map(0);
+	list_for_each_entry_safe(pg, next, &l, lru) {
+		if (cpu_has_clflush)
+			flush_map(page_address(pg));
 		__free_page(pg);
+	}
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 2316725..4c83e05 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -137,6 +137,7 @@
 #define cpu_has_pmm_enabled	boot_cpu_has(X86_FEATURE_PMM_EN)
 #define cpu_has_ds		boot_cpu_has(X86_FEATURE_DS)
 #define cpu_has_pebs 		boot_cpu_has(X86_FEATURE_PEBS)
+#define cpu_has_clflush		boot_cpu_has(X86_FEATURE_CLFLSH)
 
 #endif /* __ASM_I386_CPUFEATURE_H */
 
-- 
cgit v0.10.2


From 5df0287ecc4f53e68bbd188fa8258b555e6b734f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:05 +0100
Subject: [PATCH] x86-64: Extend clear_irq_vector

Clear the irq releated entries in irq_vector, irq_domain and vector_irq
instead of clearing irq_vector only. So when new irq is created, it
could reuse that vector. (actually is the second loop scanning from
FIRST_DEVICE_VECTOR+8). This could avoid the vectors are used up
with enough module inserting and removing

Cc: Eric W. Biedierman <ebiederm@xmission.com>
Cc: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-By: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c80081a..f71461b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -750,6 +750,22 @@ static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 	return vector;
 }
 
+static void __clear_irq_vector(int irq)
+{
+	cpumask_t mask;
+	int cpu, vector;
+
+	BUG_ON(!irq_vector[irq]);
+
+	vector = irq_vector[irq];
+	cpus_and(mask, irq_domain[irq], cpu_online_map);
+	for_each_cpu_mask(cpu, mask)
+		per_cpu(vector_irq, cpu)[vector] = -1;
+
+	irq_vector[irq] = 0;
+	irq_domain[irq] = CPU_MASK_NONE;
+}
+
 void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
@@ -1837,7 +1853,7 @@ void destroy_irq(unsigned int irq)
 	dynamic_irq_cleanup(irq);
 
 	spin_lock_irqsave(&vector_lock, flags);
-	irq_vector[irq] = 0;
+	__clear_irq_vector(irq);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
-- 
cgit v0.10.2


From 2fff0a48416af891dce38fd425246e337831e0bb Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:05 +0100
Subject: [PATCH] Generic: Move __user cast into probe_kernel_address

Caller of probe_kernel_address shouldn't need to know that
pka is internally implemented with __get_user. So move the
__user cast into pka.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index a48d7f1..65a68da 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -36,7 +36,7 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 		long ret;				\
 							\
 		inc_preempt_count();			\
-		ret = __get_user(retval, addr);		\
+		ret = __get_user(retval, (__force typeof(*addr) __user *)addr);\
 		dec_preempt_count();			\
 		ret;					\
 	})
-- 
cgit v0.10.2


From ab2bf0c1c689905b628dca94d0acd9c50e152468 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] x86-64: Use probe_kernel_address in arch/x86_64/*

Instead of open coded __get_user

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 70bfaab..264db33 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -30,9 +30,9 @@
 #include <linux/kprobes.h>
 #include <linux/kexec.h>
 #include <linux/unwind.h>
+#include <linux/uaccess.h>
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
 #include <asm/debugreg.h>
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 3751b47..a65fc6f 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -23,9 +23,9 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/smp.h>
 #include <asm/tlbflush.h>
@@ -96,7 +96,7 @@ void bust_spinlocks(int yes)
 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 				unsigned long error_code)
 { 
-	unsigned char __user *instr;
+	unsigned char *instr;
 	int scan_more = 1;
 	int prefetch = 0; 
 	unsigned char *max_instr;
@@ -116,7 +116,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 		unsigned char instr_hi;
 		unsigned char instr_lo;
 
-		if (__get_user(opcode, (char __user *)instr))
+		if (probe_kernel_address(instr, opcode))
 			break; 
 
 		instr_hi = opcode & 0xf0; 
@@ -154,7 +154,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 		case 0x00:
 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
 			scan_more = 0;
-			if (__get_user(opcode, (char __user *)instr))
+			if (probe_kernel_address(instr, opcode))
 				break;
 			prefetch = (instr_lo == 0xF) &&
 				(opcode == 0x0D || opcode == 0x18);
@@ -170,7 +170,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 static int bad_address(void *p) 
 { 
 	unsigned long dummy;
-	return __get_user(dummy, (unsigned long __user *)p);
+	return probe_kernel_address((unsigned long *)p, dummy);
 } 
 
 void dump_pagetable(unsigned long address)
-- 
cgit v0.10.2


From 11a4180c0b03e2ee0c948fd8430ee092dc1625b3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] i386: Use probe_kernel_address instead of __get_user in fault
 paths

Makes the intention of the code cleaner to read and avoids
a potential deadlock on mmap_sem. Also change the types of
the arguments to not include __user because they're really
not user addresses.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 237f488..7b2f9f0 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -380,7 +380,7 @@ void show_registers(struct pt_regs *regs)
 	 * time of the fault..
 	 */
 	if (in_kernel) {
-		u8 __user *eip;
+		u8 *eip;
 		int code_bytes = 64;
 		unsigned char c;
 
@@ -389,18 +389,20 @@ void show_registers(struct pt_regs *regs)
 
 		printk(KERN_EMERG "Code: ");
 
-		eip = (u8 __user *)regs->eip - 43;
-		if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+		eip = (u8 *)regs->eip - 43;
+		if (eip < (u8 *)PAGE_OFFSET ||
+			probe_kernel_address(eip, c)) {
 			/* try starting at EIP */
-			eip = (u8 __user *)regs->eip;
+			eip = (u8 *)regs->eip;
 			code_bytes = 32;
 		}
 		for (i = 0; i < code_bytes; i++, eip++) {
-			if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+			if (eip < (u8 *)PAGE_OFFSET ||
+				probe_kernel_address(eip, c)) {
 				printk(" Bad EIP value.");
 				break;
 			}
-			if (eip == (u8 __user *)regs->eip)
+			if (eip == (u8 *)regs->eip)
 				printk("<%02x> ", c);
 			else
 				printk("%02x ", c);
@@ -416,7 +418,7 @@ static void handle_BUG(struct pt_regs *regs)
 
 	if (eip < PAGE_OFFSET)
 		return;
-	if (probe_kernel_address((unsigned short __user *)eip, ud2))
+	if (probe_kernel_address((unsigned short *)eip, ud2))
 		return;
 	if (ud2 != 0x0b0f)
 		return;
@@ -429,11 +431,11 @@ static void handle_BUG(struct pt_regs *regs)
 		char *file;
 		char c;
 
-		if (probe_kernel_address((unsigned short __user *)(eip + 2),
-					line))
+		if (probe_kernel_address((unsigned short *)(eip + 2), line))
 			break;
-		if (__get_user(file, (char * __user *)(eip + 4)) ||
-		    (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
+		if (probe_kernel_address((char **)(eip + 4), file) ||
+		    (unsigned long)file < PAGE_OFFSET ||
+			probe_kernel_address(file, c))
 			file = "<bad filename>";
 
 		printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 2581575..aaaa4d2 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -22,9 +22,9 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/kdebug.h>
 #include <asm/segment.h>
@@ -167,7 +167,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
 static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
 { 
 	unsigned long limit;
-	unsigned long instr = get_segment_eip (regs, &limit);
+	unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
 	int scan_more = 1;
 	int prefetch = 0; 
 	int i;
@@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
 		unsigned char instr_hi;
 		unsigned char instr_lo;
 
-		if (instr > limit)
+		if (instr > (unsigned char *)limit)
 			break;
-		if (__get_user(opcode, (unsigned char __user *) instr))
+		if (probe_kernel_address(instr, opcode))
 			break; 
 
 		instr_hi = opcode & 0xf0; 
@@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
 		case 0x00:
 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
 			scan_more = 0;
-			if (instr > limit)
+			if (instr > (unsigned char *)limit)
 				break;
-			if (__get_user(opcode, (unsigned char __user *) instr))
+			if (probe_kernel_address(instr, opcode))
 				break;
 			prefetch = (instr_lo == 0xF) &&
 				(opcode == 0x0D || opcode == 0x18);
-- 
cgit v0.10.2


From b026872601976f666bae77b609dc490d1834bf77 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] x86-64: Try multiple timer variants in check_timer

Instead of adding all kinds of more quirks try various timer
routing variants in check_timer.

In particular this tries to handle quirks from:
- Nvidia NF2-4 reference BIOS: wrong timer override
- Asus: Wrong timer override but no HPET table
- ATI: require timer disabled in 8259
- Some boards: require timer enabled in 8259

We just try many of the the known variants in the hopefully right order
in check_timer.

Trying pin 0/2 on Nvidia suggested by Tim Hockin.

TBD Experimental. Needs a lot of testing

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index f3c57f4..ab9b1c0 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -52,10 +52,6 @@ APICs
 		 apicmaintimer. Useful when your PIT timer is totally
 		 broken.
 
-   disable_8254_timer / enable_8254_timer
-		 Enable interrupt 0 timer routing over the 8254 in addition to over
-	         the IO-APIC. The kernel tries to set a sensible default.
-
 Early Console
 
    syntax: earlyprintk=vga
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index 68273bf..fb0c6da 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -69,11 +69,6 @@ static void nvidia_bugs(void)
 
 static void ati_bugs(void)
 {
-	if (timer_over_8254 == 1) {
-		timer_over_8254 = 0;
-		printk(KERN_INFO
-	 	"ATI board detected. Disabling timer routing over 8254.\n");
-	}
 }
 
 struct chipset {
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index f71461b..88fcc4e 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -55,10 +55,6 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
 
 static int no_timer_check;
 
-static int disable_timer_pin_1 __initdata;
-
-int timer_over_8254 __initdata = 1;
-
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
@@ -348,29 +344,6 @@ static int __init disable_ioapic_setup(char *str)
 }
 early_param("noapic", disable_ioapic_setup);
 
-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
-static int __init disable_timer_pin_setup(char *arg)
-{
-	disable_timer_pin_1 = 1;
-	return 1;
-}
-__setup("disable_timer_pin_1", disable_timer_pin_setup);
-
-static int __init setup_disable_8254_timer(char *s)
-{
-	timer_over_8254 = -1;
-	return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-	timer_over_8254 = 2;
-	return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
-
 /*
  * Find the IRQ entry number of a certain pin.
  */
@@ -1579,10 +1552,33 @@ static inline void unlock_ExtINT_logic(void)
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
- *
- * FIXME: really need to revamp this for modern platforms only.
  */
-static inline void check_timer(void)
+
+static int try_apic_pin(int apic, int pin, char *msg)
+{
+	apic_printk(APIC_VERBOSE, KERN_INFO
+		    "..TIMER: trying IO-APIC=%d PIN=%d %s",
+		    apic, pin, msg);
+
+	/*
+	 * Ok, does IRQ0 through the IOAPIC work?
+	 */
+	if (!no_timer_check && timer_irq_works()) {
+		nmi_watchdog_default();
+		if (nmi_watchdog == NMI_IO_APIC) {
+			disable_8259A_irq(0);
+			setup_nmi();
+			enable_8259A_irq(0);
+		}
+		return 1;
+	}
+	clear_IO_APIC_pin(apic, pin);
+	apic_printk(APIC_QUIET, KERN_ERR " .. failed\n");
+	return 0;
+}
+
+/* The function from hell */
+static void check_timer(void)
 {
 	int apic1, pin1, apic2, pin2;
 	int vector;
@@ -1603,61 +1599,43 @@ static inline void check_timer(void)
 	 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	if (timer_over_8254 > 0)
-		enable_8259A_irq(0);
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		vector, apic1, pin1, apic2, pin2);
+	/* Do this first, otherwise we get double interrupts on ATI boards */
+	if ((pin1 != -1) && try_apic_pin(apic1, pin1,"with 8259 IRQ0 disabled"))
+		return;
 
-	if (pin1 != -1) {
-		/*
-		 * Ok, does IRQ0 through the IOAPIC work?
-		 */
-		unmask_IO_APIC_irq(0);
-		if (!no_timer_check && timer_irq_works()) {
-			nmi_watchdog_default();
-			if (nmi_watchdog == NMI_IO_APIC) {
-				disable_8259A_irq(0);
-				setup_nmi();
-				enable_8259A_irq(0);
-			}
-			if (disable_timer_pin_1 > 0)
-				clear_IO_APIC_pin(0, pin1);
-			return;
-		}
-		clear_IO_APIC_pin(apic1, pin1);
-		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
-				"connected to IO-APIC\n");
-	}
+	/* Now try again with IRQ0 8259A enabled.
+	   Assumes timer is on IO-APIC 0 ?!? */
+	enable_8259A_irq(0);
+	unmask_IO_APIC_irq(0);
+	if (try_apic_pin(apic1, pin1, "with 8259 IRQ0 enabled"))
+		return;
+	disable_8259A_irq(0);
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
-				"through the 8259A ... ");
+	/* Always try pin0 and pin2 on APIC 0 to handle buggy timer overrides
+	   on Nvidia boards */
+	if (!(apic1 == 0 && pin1 == 0) &&
+	    try_apic_pin(0, 0, "fallback with 8259 IRQ0 disabled"))
+		return;
+	if (!(apic1 == 0 && pin1 == 2) &&
+	    try_apic_pin(0, 2, "fallback with 8259 IRQ0 disabled"))
+		return;
+
+	/* Then try pure 8259A routing on the 8259 as reported by BIOS*/
+	enable_8259A_irq(0);
 	if (pin2 != -1) {
-		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-			apic2, pin2);
-		/*
-		 * legacy devices should be connected to IO APIC #0
-		 */
 		setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
-		if (timer_irq_works()) {
-			apic_printk(APIC_VERBOSE," works.\n");
-			nmi_watchdog_default();
-			if (nmi_watchdog == NMI_IO_APIC) {
-				setup_nmi();
-			}
+		if (try_apic_pin(apic2,pin2,"8259A broadcast ExtINT from BIOS"))
 			return;
-		}
-		/*
-		 * Cleanup, just in case ...
-		 */
-		clear_IO_APIC_pin(apic2, pin2);
 	}
-	apic_printk(APIC_VERBOSE," failed.\n");
+
+	/* Tried all possibilities to go through the IO-APIC. Now come the
+	   really cheesy fallbacks. */
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-- 
cgit v0.10.2


From 8e3de538eec95b57a5b86038988451c38ba83f7e Mon Sep 17 00:00:00 2001
From: Albert Cahalan <acahalan@gmail.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] x86-64: Support -mregparm arguments for signals with
 SA_SIGINFO in compat mode

The recent change to make x86_64 support i386 binaries compiled
with -mregparm=3 only covered signal handlers without SA_SIGINFO.
(the 3-arg "real-time" ones)

To be compatible with i386, both types should be supported.

Signed-off-by: Albert Cahalan <acahalan@gmail.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 0e0a266..ff499ef 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -584,6 +584,11 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	regs->rdx = (unsigned long) &frame->info;
 	regs->rcx = (unsigned long) &frame->uc;
 
+	/* Make -mregparm=3 work */
+	regs->rax = sig;
+	regs->rdx = (unsigned long) &frame->info;
+	regs->rcx = (unsigned long) &frame->uc;
+
 	asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 
 	asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 
 	
-- 
cgit v0.10.2


From 269c2d81ed66af7c09a1619ffe165f03e7470a5b Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] i386: i386 create e820.c to handle standard io/mem resources

This patch creates new file named e820.c to hanle standard io/mem
resources, moving request_standard_resources function from setup.c
to e820.c. Also this patch modifies Makfile to compile file e820.c.

Signed-off-by: bibo,mao <bibo.mao@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 Makefile |    2
 arch/i386/kernel/Makefile |    2
 arch/i386/kernel/e820.c   |  289 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c  |  276 -------------------------------------------
 3 files changed, 293 insertions(+), 274 deletions(-)

diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1a884b6..f614854 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds
 
 obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
-		pci-dma.o i386_ksyms.o i387.o bootflag.o \
+		pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
 		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
new file mode 100644
index 0000000..cce7060
--- /dev/null
+++ b/arch/i386/kernel/e820.c
@@ -0,0 +1,289 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+
+#ifdef CONFIG_EFI
+int efi_enabled = 0;
+EXPORT_SYMBOL(efi_enabled);
+#endif
+
+struct e820map e820;
+struct resource data_resource = {
+	.name	= "Kernel data",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+struct resource code_resource = {
+	.name	= "Kernel code",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource system_rom_resource = {
+	.name	= "System ROM",
+	.start	= 0xf0000,
+	.end	= 0xfffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+	.name	= "Extension ROM",
+	.start	= 0xe0000,
+	.end	= 0xeffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+	.name 	= "Adapter ROM",
+	.start	= 0xc8000,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+	.name 	= "Video ROM",
+	.start	= 0xc0000,
+	.end	= 0xc7fff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource video_ram_resource = {
+	.name	= "Video RAM area",
+	.start	= 0xa0000,
+	.end	= 0xbffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+	.name	= "dma1",
+	.start	= 0x0000,
+	.end	= 0x001f,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "pic1",
+	.start	= 0x0020,
+	.end	= 0x0021,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name   = "timer0",
+	.start	= 0x0040,
+	.end    = 0x0043,
+	.flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name   = "timer1",
+	.start  = 0x0050,
+	.end    = 0x0053,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "keyboard",
+	.start	= 0x0060,
+	.end	= 0x006f,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "dma page reg",
+	.start	= 0x0080,
+	.end	= 0x008f,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "pic2",
+	.start	= 0x00a0,
+	.end	= 0x00a1,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "dma2",
+	.start	= 0x00c0,
+	.end	= 0x00df,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "fpu",
+	.start	= 0x00f0,
+	.end	= 0x00ff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+} };
+
+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
+
+static int __init romchecksum(unsigned char *rom, unsigned long length)
+{
+	unsigned char *p, sum = 0;
+
+	for (p = rom; p < rom + length; p++)
+		sum += *p;
+	return sum == 0;
+}
+
+static void __init probe_roms(void)
+{
+	unsigned long start, length, upper;
+	unsigned char *rom;
+	int	      i;
+
+	/* video rom */
+	upper = adapter_rom_resources[0].start;
+	for (start = video_rom_resource.start; start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		video_rom_resource.start = start;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = rom[2] * 512;
+
+		/* if checksum okay, trust length byte */
+		if (length && romchecksum(rom, length))
+			video_rom_resource.end = start + length - 1;
+
+		request_resource(&iomem_resource, &video_rom_resource);
+		break;
+	}
+
+	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+	if (start < upper)
+		start = upper;
+
+	/* system rom */
+	request_resource(&iomem_resource, &system_rom_resource);
+	upper = system_rom_resource.start;
+
+	/* check for extension rom (ignore length byte!) */
+	rom = isa_bus_to_virt(extension_rom_resource.start);
+	if (romsignature(rom)) {
+		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		if (romchecksum(rom, length)) {
+			request_resource(&iomem_resource, &extension_rom_resource);
+			upper = extension_rom_resource.start;
+		}
+	}
+
+	/* check for adapter roms on 2k boundaries */
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = rom[2] * 512;
+
+		/* but accept any length that fits if checksum okay */
+		if (!length || start + length > upper || !romchecksum(rom, length))
+			continue;
+
+		adapter_rom_resources[i].start = start;
+		adapter_rom_resources[i].end = start + length - 1;
+		request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+		start = adapter_rom_resources[i++].end & ~2047UL;
+	}
+}
+
+/*
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ */
+static void __init
+legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
+{
+	int i;
+
+	probe_roms();
+	for (i = 0; i < e820.nr_map; i++) {
+		struct resource *res;
+#ifndef CONFIG_RESOURCES_64BIT
+		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+			continue;
+#endif
+		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+		switch (e820.map[i].type) {
+		case E820_RAM:	res->name = "System RAM"; break;
+		case E820_ACPI:	res->name = "ACPI Tables"; break;
+		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
+		default:	res->name = "reserved";
+		}
+		res->start = e820.map[i].addr;
+		res->end = res->start + e820.map[i].size - 1;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		if (request_resource(&iomem_resource, res)) {
+			kfree(res);
+			continue;
+		}
+		if (e820.map[i].type == E820_RAM) {
+			/*
+			 *  We don't know which RAM region contains kernel data,
+			 *  so we try it repeatedly and let the resource manager
+			 *  test it.
+			 */
+			request_resource(res, code_resource);
+			request_resource(res, data_resource);
+#ifdef CONFIG_KEXEC
+			request_resource(res, &crashk_res);
+#endif
+		}
+	}
+}
+
+/*
+ * Request address space for all standard resources
+ *
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
+ */
+static int __init request_standard_resources(void)
+{
+	int i;
+
+	printk("Setting up standard PCI resources\n");
+	if (efi_enabled)
+		efi_initialize_iomem_resources(&code_resource, &data_resource);
+	else
+		legacy_init_iomem_resources(&code_resource, &data_resource);
+
+	/* EFI systems may still have VGA */
+	request_resource(&iomem_resource, &video_ram_resource);
+
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+	return 0;
+}
+
+subsys_initcall(request_standard_resources);
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 61539af..acd2d93 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -76,11 +76,9 @@ int disable_pse __devinitdata = 0;
 /*
  * Machine setup..
  */
-
-#ifdef CONFIG_EFI
-int efi_enabled = 0;
-EXPORT_SYMBOL(efi_enabled);
-#endif
+extern struct e820map e820;
+extern struct resource code_resource;
+extern struct resource data_resource;
 
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -134,7 +132,6 @@ struct ist_info ist_info;
 	defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
 EXPORT_SYMBOL(ist_info);
 #endif
-struct e820map e820;
 
 extern void early_cpu_init(void);
 extern int root_mountflags;
@@ -149,203 +146,6 @@ static char command_line[COMMAND_LINE_SIZE];
 
 unsigned char __initdata boot_params[PARAM_SIZE];
 
-static struct resource data_resource = {
-	.name	= "Kernel data",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource code_resource = {
-	.name	= "Kernel code",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource system_rom_resource = {
-	.name	= "System ROM",
-	.start	= 0xf0000,
-	.end	= 0xfffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
-	.name	= "Extension ROM",
-	.start	= 0xe0000,
-	.end	= 0xeffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
-	.name 	= "Adapter ROM",
-	.start	= 0xc8000,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
-	.name 	= "Video ROM",
-	.start	= 0xc0000,
-	.end	= 0xc7fff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource video_ram_resource = {
-	.name	= "Video RAM area",
-	.start	= 0xa0000,
-	.end	= 0xbffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
-	.name	= "dma1",
-	.start	= 0x0000,
-	.end	= 0x001f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "pic1",
-	.start	= 0x0020,
-	.end	= 0x0021,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name   = "timer0",
-	.start	= 0x0040,
-	.end    = 0x0043,
-	.flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name   = "timer1",
-	.start  = 0x0050,
-	.end    = 0x0053,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "keyboard",
-	.start	= 0x0060,
-	.end	= 0x006f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "dma page reg",
-	.start	= 0x0080,
-	.end	= 0x008f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "pic2",
-	.start	= 0x00a0,
-	.end	= 0x00a1,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "dma2",
-	.start	= 0x00c0,
-	.end	= 0x00df,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "fpu",
-	.start	= 0x00f0,
-	.end	= 0x00ff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
-
-static int __init romchecksum(unsigned char *rom, unsigned long length)
-{
-	unsigned char *p, sum = 0;
-
-	for (p = rom; p < rom + length; p++)
-		sum += *p;
-	return sum == 0;
-}
-
-static void __init probe_roms(void)
-{
-	unsigned long start, length, upper;
-	unsigned char *rom;
-	int	      i;
-
-	/* video rom */
-	upper = adapter_rom_resources[0].start;
-	for (start = video_rom_resource.start; start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		video_rom_resource.start = start;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = rom[2] * 512;
-
-		/* if checksum okay, trust length byte */
-		if (length && romchecksum(rom, length))
-			video_rom_resource.end = start + length - 1;
-
-		request_resource(&iomem_resource, &video_rom_resource);
-		break;
-	}
-
-	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-	if (start < upper)
-		start = upper;
-
-	/* system rom */
-	request_resource(&iomem_resource, &system_rom_resource);
-	upper = system_rom_resource.start;
-
-	/* check for extension rom (ignore length byte!) */
-	rom = isa_bus_to_virt(extension_rom_resource.start);
-	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
-		if (romchecksum(rom, length)) {
-			request_resource(&iomem_resource, &extension_rom_resource);
-			upper = extension_rom_resource.start;
-		}
-	}
-
-	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = rom[2] * 512;
-
-		/* but accept any length that fits if checksum okay */
-		if (!length || start + length > upper || !romchecksum(rom, length))
-			continue;
-
-		adapter_rom_resources[i].start = start;
-		adapter_rom_resources[i].end = start + length - 1;
-		request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
-		start = adapter_rom_resources[i++].end & ~2047UL;
-	}
-}
-
 static void __init limit_regions(unsigned long long size)
 {
 	unsigned long long current_addr = 0;
@@ -1200,77 +1000,7 @@ void __init remapped_pgdat_init(void)
 	}
 }
 
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-static void __init
-legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
-{
-	int i;
-
-	probe_roms();
-	for (i = 0; i < e820.nr_map; i++) {
-		struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
-		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
-			continue;
-#endif
-		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
-		switch (e820.map[i].type) {
-		case E820_RAM:	res->name = "System RAM"; break;
-		case E820_ACPI:	res->name = "ACPI Tables"; break;
-		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
-		default:	res->name = "reserved";
-		}
-		res->start = e820.map[i].addr;
-		res->end = res->start + e820.map[i].size - 1;
-		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		if (request_resource(&iomem_resource, res)) {
-			kfree(res);
-			continue;
-		}
-		if (e820.map[i].type == E820_RAM) {
-			/*
-			 *  We don't know which RAM region contains kernel data,
-			 *  so we try it repeatedly and let the resource manager
-			 *  test it.
-			 */
-			request_resource(res, code_resource);
-			request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
-#endif
-		}
-	}
-}
-
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
-	int i;
-
-	printk("Setting up standard PCI resources\n");
-	if (efi_enabled)
-		efi_initialize_iomem_resources(&code_resource, &data_resource);
-	else
-		legacy_init_iomem_resources(&code_resource, &data_resource);
-
-	/* EFI systems may still have VGA */
-	request_resource(&iomem_resource, &video_ram_resource);
-
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-	return 0;
-}
 
-subsys_initcall(request_standard_resources);
 
 static void __init register_memory(void)
 {
-- 
cgit v0.10.2


From 8e3342f736dd1c19ce7c28625dedd7d8730fc7ad Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] i386: create e820.c for e820 map sanitize and copy function

This patch moves bios e820 map sanitize and copy function from
setup.c to e820.c

Signed-off-by: bibo,mao <bibo.mao@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/i386/kernel/e820.c  |  252 +++++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c |  240 --------------------------------------------
 2 files changed, 252 insertions(+), 240 deletions(-)

diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index cce7060..0db9576 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -19,6 +19,14 @@ EXPORT_SYMBOL(efi_enabled);
 #endif
 
 struct e820map e820;
+struct change_member {
+	struct e820entry *pbios; /* pointer to original bios entry */
+	unsigned long long addr; /* address for this change point */
+};
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;
 struct resource data_resource = {
 	.name	= "Kernel data",
 	.start	= 0,
@@ -287,3 +295,247 @@ static int __init request_standard_resources(void)
 }
 
 subsys_initcall(request_standard_resources);
+
+void __init add_memory_region(unsigned long long start,
+			      unsigned long long size, int type)
+{
+	int x;
+
+	if (!efi_enabled) {
+       		x = e820.nr_map;
+
+		if (x == E820MAX) {
+		    printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		    return;
+		}
+
+		e820.map[x].addr = start;
+		e820.map[x].size = size;
+		e820.map[x].type = type;
+		e820.nr_map++;
+	}
+} /* add_memory_region */
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries.  The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+	struct change_member *change_tmp;
+	unsigned long current_type, last_type;
+	unsigned long long last_addr;
+	int chgidx, still_changing;
+	int overlap_entries;
+	int new_bios_entry;
+	int old_nr, new_nr, chg_nr;
+	int i;
+
+	/*
+		Visually we're performing the following (1,2,3,4 = memory types)...
+
+		Sample memory map (w/overlaps):
+		   ____22__________________
+		   ______________________4_
+		   ____1111________________
+		   _44_____________________
+		   11111111________________
+		   ____________________33__
+		   ___________44___________
+		   __________33333_________
+		   ______________22________
+		   ___________________2222_
+		   _________111111111______
+		   _____________________11_
+		   _________________4______
+
+		Sanitized equivalent (no overlap):
+		   1_______________________
+		   _44_____________________
+		   ___1____________________
+		   ____22__________________
+		   ______11________________
+		   _________1______________
+		   __________3_____________
+		   ___________44___________
+		   _____________33_________
+		   _______________2________
+		   ________________1_______
+		   _________________4______
+		   ___________________2____
+		   ____________________33__
+		   ______________________4_
+	*/
+	printk("sanitize start\n");
+	/* if there's only one memory region, don't bother */
+	if (*pnr_map < 2) {
+		printk("sanitize bail 0\n");
+		return -1;
+	}
+
+	old_nr = *pnr_map;
+
+	/* bail out if we find any unreasonable addresses in bios map */
+	for (i=0; i<old_nr; i++)
+		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
+			printk("sanitize bail 1\n");
+			return -1;
+		}
+
+	/* create pointers for initial change-point information (for sorting) */
+	for (i=0; i < 2*old_nr; i++)
+		change_point[i] = &change_point_list[i];
+
+	/* record all known change-points (starting and ending addresses),
+	   omitting those that are for empty memory regions */
+	chgidx = 0;
+	for (i=0; i < old_nr; i++)	{
+		if (biosmap[i].size != 0) {
+			change_point[chgidx]->addr = biosmap[i].addr;
+			change_point[chgidx++]->pbios = &biosmap[i];
+			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+			change_point[chgidx++]->pbios = &biosmap[i];
+		}
+	}
+	chg_nr = chgidx;    	/* true number of change-points */
+
+	/* sort change-point list by memory addresses (low -> high) */
+	still_changing = 1;
+	while (still_changing)	{
+		still_changing = 0;
+		for (i=1; i < chg_nr; i++)  {
+			/* if <current_addr> > <last_addr>, swap */
+			/* or, if current=<start_addr> & last=<end_addr>, swap */
+			if ((change_point[i]->addr < change_point[i-1]->addr) ||
+				((change_point[i]->addr == change_point[i-1]->addr) &&
+				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
+				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+			   )
+			{
+				change_tmp = change_point[i];
+				change_point[i] = change_point[i-1];
+				change_point[i-1] = change_tmp;
+				still_changing=1;
+			}
+		}
+	}
+
+	/* create a new bios memory map, removing overlaps */
+	overlap_entries=0;	 /* number of entries in the overlap table */
+	new_bios_entry=0;	 /* index for creating new bios map entries */
+	last_type = 0;		 /* start with undefined memory type */
+	last_addr = 0;		 /* start with 0 as last starting address */
+	/* loop through change-points, determining affect on the new bios map */
+	for (chgidx=0; chgidx < chg_nr; chgidx++)
+	{
+		/* keep track of all overlapping bios entries */
+		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+		{
+			/* add map entry to overlap list (> 1 entry implies an overlap) */
+			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+		}
+		else
+		{
+			/* remove entry from list (order independent, so swap with last) */
+			for (i=0; i<overlap_entries; i++)
+			{
+				if (overlap_list[i] == change_point[chgidx]->pbios)
+					overlap_list[i] = overlap_list[overlap_entries-1];
+			}
+			overlap_entries--;
+		}
+		/* if there are overlapping entries, decide which "type" to use */
+		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+		current_type = 0;
+		for (i=0; i<overlap_entries; i++)
+			if (overlap_list[i]->type > current_type)
+				current_type = overlap_list[i]->type;
+		/* continue building up new bios map based on this information */
+		if (current_type != last_type)	{
+			if (last_type != 0)	 {
+				new_bios[new_bios_entry].size =
+					change_point[chgidx]->addr - last_addr;
+				/* move forward only if the new size was non-zero */
+				if (new_bios[new_bios_entry].size != 0)
+					if (++new_bios_entry >= E820MAX)
+						break; 	/* no more space left for new bios entries */
+			}
+			if (current_type != 0)	{
+				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+				new_bios[new_bios_entry].type = current_type;
+				last_addr=change_point[chgidx]->addr;
+			}
+			last_type = current_type;
+		}
+	}
+	new_nr = new_bios_entry;   /* retain count for new bios entries */
+
+	/* copy new bios mapping into original location */
+	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+	*pnr_map = new_nr;
+
+	printk("sanitize end\n");
+	return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory.  If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and most every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+	/* Only one memory region (or negative)? Ignore it */
+	if (nr_map < 2)
+		return -1;
+
+	do {
+		unsigned long long start = biosmap->addr;
+		unsigned long long size = biosmap->size;
+		unsigned long long end = start + size;
+		unsigned long type = biosmap->type;
+		printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
+
+		/* Overflow in 64 bits? Ignore the memory map. */
+		if (start > end)
+			return -1;
+
+		/*
+		 * Some BIOSes claim RAM in the 640k - 1M region.
+		 * Not right. Fix it up.
+		 */
+		if (type == E820_RAM) {
+			printk("copy_e820_map() type is E820_RAM\n");
+			if (start < 0x100000ULL && end > 0xA0000ULL) {
+				printk("copy_e820_map() lies in range...\n");
+				if (start < 0xA0000ULL) {
+					printk("copy_e820_map() start < 0xA0000ULL\n");
+					add_memory_region(start, 0xA0000ULL-start, type);
+				}
+				if (end <= 0x100000ULL) {
+					printk("copy_e820_map() end <= 0x100000ULL\n");
+					continue;
+				}
+				start = 0x100000ULL;
+				size = end - start;
+			}
+		}
+		add_memory_region(start, size, type);
+	} while (biosmap++,--nr_map);
+	return 0;
+}
+
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index acd2d93..b7509ae 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -191,26 +191,6 @@ static void __init limit_regions(unsigned long long size)
 	}
 }
 
-void __init add_memory_region(unsigned long long start,
-			      unsigned long long size, int type)
-{
-	int x;
-
-	if (!efi_enabled) {
-       		x = e820.nr_map;
-
-		if (x == E820MAX) {
-		    printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-		    return;
-		}
-
-		e820.map[x].addr = start;
-		e820.map[x].size = size;
-		e820.map[x].type = type;
-		e820.nr_map++;
-	}
-} /* add_memory_region */
-
 #define E820_DEBUG	1
 
 static void __init print_memory_map(char *who)
@@ -239,226 +219,6 @@ static void __init print_memory_map(char *who)
 	}
 }
 
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries.  The following 
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-struct change_member {
-	struct e820entry *pbios; /* pointer to original bios entry */
-	unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
-	struct change_member *change_tmp;
-	unsigned long current_type, last_type;
-	unsigned long long last_addr;
-	int chgidx, still_changing;
-	int overlap_entries;
-	int new_bios_entry;
-	int old_nr, new_nr, chg_nr;
-	int i;
-
-	/*
-		Visually we're performing the following (1,2,3,4 = memory types)...
-
-		Sample memory map (w/overlaps):
-		   ____22__________________
-		   ______________________4_
-		   ____1111________________
-		   _44_____________________
-		   11111111________________
-		   ____________________33__
-		   ___________44___________
-		   __________33333_________
-		   ______________22________
-		   ___________________2222_
-		   _________111111111______
-		   _____________________11_
-		   _________________4______
-
-		Sanitized equivalent (no overlap):
-		   1_______________________
-		   _44_____________________
-		   ___1____________________
-		   ____22__________________
-		   ______11________________
-		   _________1______________
-		   __________3_____________
-		   ___________44___________
-		   _____________33_________
-		   _______________2________
-		   ________________1_______
-		   _________________4______
-		   ___________________2____
-		   ____________________33__
-		   ______________________4_
-	*/
-
-	/* if there's only one memory region, don't bother */
-	if (*pnr_map < 2)
-		return -1;
-
-	old_nr = *pnr_map;
-
-	/* bail out if we find any unreasonable addresses in bios map */
-	for (i=0; i<old_nr; i++)
-		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
-			return -1;
-
-	/* create pointers for initial change-point information (for sorting) */
-	for (i=0; i < 2*old_nr; i++)
-		change_point[i] = &change_point_list[i];
-
-	/* record all known change-points (starting and ending addresses),
-	   omitting those that are for empty memory regions */
-	chgidx = 0;
-	for (i=0; i < old_nr; i++)	{
-		if (biosmap[i].size != 0) {
-			change_point[chgidx]->addr = biosmap[i].addr;
-			change_point[chgidx++]->pbios = &biosmap[i];
-			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
-			change_point[chgidx++]->pbios = &biosmap[i];
-		}
-	}
-	chg_nr = chgidx;    	/* true number of change-points */
-
-	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i=1; i < chg_nr; i++)  {
-			/* if <current_addr> > <last_addr>, swap */
-			/* or, if current=<start_addr> & last=<end_addr>, swap */
-			if ((change_point[i]->addr < change_point[i-1]->addr) ||
-				((change_point[i]->addr == change_point[i-1]->addr) &&
-				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
-				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-			   )
-			{
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing=1;
-			}
-		}
-	}
-
-	/* create a new bios memory map, removing overlaps */
-	overlap_entries=0;	 /* number of entries in the overlap table */
-	new_bios_entry=0;	 /* index for creating new bios map entries */
-	last_type = 0;		 /* start with undefined memory type */
-	last_addr = 0;		 /* start with 0 as last starting address */
-	/* loop through change-points, determining affect on the new bios map */
-	for (chgidx=0; chgidx < chg_nr; chgidx++)
-	{
-		/* keep track of all overlapping bios entries */
-		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-		{
-			/* add map entry to overlap list (> 1 entry implies an overlap) */
-			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-		}
-		else
-		{
-			/* remove entry from list (order independent, so swap with last) */
-			for (i=0; i<overlap_entries; i++)
-			{
-				if (overlap_list[i] == change_point[chgidx]->pbios)
-					overlap_list[i] = overlap_list[overlap_entries-1];
-			}
-			overlap_entries--;
-		}
-		/* if there are overlapping entries, decide which "type" to use */
-		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
-		current_type = 0;
-		for (i=0; i<overlap_entries; i++)
-			if (overlap_list[i]->type > current_type)
-				current_type = overlap_list[i]->type;
-		/* continue building up new bios map based on this information */
-		if (current_type != last_type)	{
-			if (last_type != 0)	 {
-				new_bios[new_bios_entry].size =
-					change_point[chgidx]->addr - last_addr;
-				/* move forward only if the new size was non-zero */
-				if (new_bios[new_bios_entry].size != 0)
-					if (++new_bios_entry >= E820MAX)
-						break; 	/* no more space left for new bios entries */
-			}
-			if (current_type != 0)	{
-				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
-				new_bios[new_bios_entry].type = current_type;
-				last_addr=change_point[chgidx]->addr;
-			}
-			last_type = current_type;
-		}
-	}
-	new_nr = new_bios_entry;   /* retain count for new bios entries */
-
-	/* copy new bios mapping into original location */
-	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
-	*pnr_map = new_nr;
-
-	return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory.  If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
-{
-	/* Only one memory region (or negative)? Ignore it */
-	if (nr_map < 2)
-		return -1;
-
-	do {
-		unsigned long long start = biosmap->addr;
-		unsigned long long size = biosmap->size;
-		unsigned long long end = start + size;
-		unsigned long type = biosmap->type;
-
-		/* Overflow in 64 bits? Ignore the memory map. */
-		if (start > end)
-			return -1;
-
-		/*
-		 * Some BIOSes claim RAM in the 640k - 1M region.
-		 * Not right. Fix it up.
-		 */
-		if (type == E820_RAM) {
-			if (start < 0x100000ULL && end > 0xA0000ULL) {
-				if (start < 0xA0000ULL)
-					add_memory_region(start, 0xA0000ULL-start, type);
-				if (end <= 0x100000ULL)
-					continue;
-				start = 0x100000ULL;
-				size = end - start;
-			}
-		}
-		add_memory_region(start, size, type);
-	} while (biosmap++,--nr_map);
-	return 0;
-}
-
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
 #ifdef CONFIG_EDD_MODULE
-- 
cgit v0.10.2


From b2dff6a88cbed59d787a8ca7367c76ba385e1187 Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] i386: Move find_max_pfn function to e820.c

Move more code from setup.c into e820.c

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 0db9576..be4934f 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/efi.h>
+#include <linux/pfn.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
@@ -539,3 +540,54 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
 	return 0;
 }
 
+/*
+ * Callback for efi_memory_walk.
+ */
+static int __init
+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
+{
+	unsigned long *max_pfn = arg, pfn;
+
+	if (start < end) {
+		pfn = PFN_UP(end -1);
+		if (pfn > *max_pfn)
+			*max_pfn = pfn;
+	}
+	return 0;
+}
+
+static int __init
+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+{
+	memory_present(0, PFN_UP(start), PFN_DOWN(end));
+	return 0;
+}
+
+/*
+ * Find the highest page frame number we have available
+ */
+void __init find_max_pfn(void)
+{
+	int i;
+
+	max_pfn = 0;
+	if (efi_enabled) {
+		efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+		efi_memmap_walk(efi_memory_present_wrapper, NULL);
+		return;
+	}
+
+	for (i = 0; i < e820.nr_map; i++) {
+		unsigned long start, end;
+		/* RAM? */
+		if (e820.map[i].type != E820_RAM)
+			continue;
+		start = PFN_UP(e820.map[i].addr);
+		end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+		if (start >= end)
+			continue;
+		if (end > max_pfn)
+			max_pfn = end;
+		memory_present(0, start, end);
+	}
+}
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index b7509ae..3d80805 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -63,9 +63,6 @@
 #include <setup_arch.h>
 #include <bios_ebda.h>
 
-/* Forward Declaration. */
-void __init find_max_pfn(void);
-
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
@@ -387,29 +384,6 @@ static int __init parse_reservetop(char *arg)
 }
 early_param("reservetop", parse_reservetop);
 
-/*
- * Callback for efi_memory_walk.
- */
-static int __init
-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
-{
-	unsigned long *max_pfn = arg, pfn;
-
-	if (start < end) {
-		pfn = PFN_UP(end -1);
-		if (pfn > *max_pfn)
-			*max_pfn = pfn;
-	}
-	return 0;
-}
-
-static int __init
-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
-{
-	memory_present(0, PFN_UP(start), PFN_DOWN(end));
-	return 0;
-}
-
  /*
   * This function checks if the entire range <start,end> is mapped with type.
   *
@@ -443,35 +417,6 @@ e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
 }
 
 /*
- * Find the highest page frame number we have available
- */
-void __init find_max_pfn(void)
-{
-	int i;
-
-	max_pfn = 0;
-	if (efi_enabled) {
-		efi_memmap_walk(efi_find_max_pfn, &max_pfn);
-		efi_memmap_walk(efi_memory_present_wrapper, NULL);
-		return;
-	}
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long start, end;
-		/* RAM? */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		start = PFN_UP(e820.map[i].addr);
-		end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-		if (start >= end)
-			continue;
-		if (end > max_pfn)
-			max_pfn = end;
-		memory_present(0, start, end);
-	}
-}
-
-/*
  * Determine low and high memory ranges:
  */
 unsigned long __init find_max_low_pfn(void)
diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h
index f7514fb..1475694 100644
--- a/include/asm-i386/e820.h
+++ b/include/asm-i386/e820.h
@@ -38,6 +38,7 @@ extern struct e820map e820;
 
 extern int e820_all_mapped(unsigned long start, unsigned long end,
 			   unsigned type);
+extern void find_max_pfn(void);
 
 #endif/*!__ASSEMBLY__*/
 
-- 
cgit v0.10.2


From b5b2405706005cc7765f6ecd00965d29e93f090a Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] i386: Move e820/efi memmap walking code to e820.c

This patch moves e820/efi memmap table walking function from
setup.c to e820.c, also this patch adds extern declaration in
header file.

Signed-off-by: bibo,mao <bibo.mao@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/i386/kernel/e820.c  |  115 +++++++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c |  118 -----------------------------------
 include/asm-i386/e820.h  |    2
 arch/i386/kernel/e820.c  |  115 +++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c |  118 -----------------------------------------------
 include/asm-i386/e820.h  |    2
 3 files changed, 117 insertions(+), 118 deletions(-)

diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index be4934f..47c495b 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -28,6 +28,11 @@ static struct change_member change_point_list[2*E820MAX] __initdata;
 static struct change_member *change_point[2*E820MAX] __initdata;
 static struct e820entry *overlap_list[E820MAX] __initdata;
 static struct e820entry new_bios[E820MAX] __initdata;
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0x10000000;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
 struct resource data_resource = {
 	.name	= "Kernel data",
 	.start	= 0,
@@ -591,3 +596,113 @@ void __init find_max_pfn(void)
 		memory_present(0, start, end);
 	}
 }
+
+/*
+ * Free all available memory for boot time allocation.  Used
+ * as a callback function by efi_memory_walk()
+ */
+
+static int __init
+free_available_memory(unsigned long start, unsigned long end, void *arg)
+{
+	/* check max_low_pfn */
+	if (start >= (max_low_pfn << PAGE_SHIFT))
+		return 0;
+	if (end >= (max_low_pfn << PAGE_SHIFT))
+		end = max_low_pfn << PAGE_SHIFT;
+	if (start < end)
+		free_bootmem(start, end - start);
+
+	return 0;
+}
+/*
+ * Register fully available low RAM pages with the bootmem allocator.
+ */
+void __init register_bootmem_low_pages(unsigned long max_low_pfn)
+{
+	int i;
+
+	if (efi_enabled) {
+		efi_memmap_walk(free_available_memory, NULL);
+		return;
+	}
+	for (i = 0; i < e820.nr_map; i++) {
+		unsigned long curr_pfn, last_pfn, size;
+		/*
+		 * Reserve usable low memory
+		 */
+		if (e820.map[i].type != E820_RAM)
+			continue;
+		/*
+		 * We are rounding up the start address of usable memory:
+		 */
+		curr_pfn = PFN_UP(e820.map[i].addr);
+		if (curr_pfn >= max_low_pfn)
+			continue;
+		/*
+		 * ... and at the end of the usable range downwards:
+		 */
+		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+
+		if (last_pfn > max_low_pfn)
+			last_pfn = max_low_pfn;
+
+		/*
+		 * .. finally, did all the rounding and playing
+		 * around just make the area go away?
+		 */
+		if (last_pfn <= curr_pfn)
+			continue;
+
+		size = last_pfn - curr_pfn;
+		free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
+	}
+}
+
+void __init register_memory(void)
+{
+	unsigned long gapstart, gapsize, round;
+	unsigned long long last;
+	int i;
+
+	/*
+	 * Search for the bigest gap in the low 32 bits of the e820
+	 * memory space.
+	 */
+	last = 0x100000000ull;
+	gapstart = 0x10000000;
+	gapsize = 0x400000;
+	i = e820.nr_map;
+	while (--i >= 0) {
+		unsigned long long start = e820.map[i].addr;
+		unsigned long long end = start + e820.map[i].size;
+
+		/*
+		 * Since "last" is at most 4GB, we know we'll
+		 * fit in 32 bits if this condition is true
+		 */
+		if (last > end) {
+			unsigned long gap = last - end;
+
+			if (gap > gapsize) {
+				gapsize = gap;
+				gapstart = end;
+			}
+		}
+		if (start < last)
+			last = start;
+	}
+
+	/*
+	 * See how much we want to round up: start off with
+	 * rounding to the next 1MB area.
+	 */
+	round = 0x100000;
+	while ((gapsize >> 4) > round)
+		round += round;
+	/* Fun with two's complement */
+	pci_mem_start = (gapstart + round) & -round;
+
+	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
+		pci_mem_start, gapstart, gapsize);
+}
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 3d80805..51ed015 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -94,12 +94,6 @@ unsigned int machine_submodel_id;
 unsigned int BIOS_revision;
 unsigned int mca_pentium_flag;
 
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
 
@@ -476,68 +470,6 @@ unsigned long __init find_max_low_pfn(void)
 }
 
 /*
- * Free all available memory for boot time allocation.  Used
- * as a callback function by efi_memory_walk()
- */
-
-static int __init
-free_available_memory(unsigned long start, unsigned long end, void *arg)
-{
-	/* check max_low_pfn */
-	if (start >= (max_low_pfn << PAGE_SHIFT))
-		return 0;
-	if (end >= (max_low_pfn << PAGE_SHIFT))
-		end = max_low_pfn << PAGE_SHIFT;
-	if (start < end)
-		free_bootmem(start, end - start);
-
-	return 0;
-}
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
-	int i;
-
-	if (efi_enabled) {
-		efi_memmap_walk(free_available_memory, NULL);
-		return;
-	}
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long curr_pfn, last_pfn, size;
-		/*
-		 * Reserve usable low memory
-		 */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		/*
-		 * We are rounding up the start address of usable memory:
-		 */
-		curr_pfn = PFN_UP(e820.map[i].addr);
-		if (curr_pfn >= max_low_pfn)
-			continue;
-		/*
-		 * ... and at the end of the usable range downwards:
-		 */
-		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-		if (last_pfn > max_low_pfn)
-			last_pfn = max_low_pfn;
-
-		/*
-		 * .. finally, did all the rounding and playing
-		 * around just make the area go away?
-		 */
-		if (last_pfn <= curr_pfn)
-			continue;
-
-		size = last_pfn - curr_pfn;
-		free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
-	}
-}
-
-/*
  * workaround for Dell systems that neglect to reserve EBDA
  */
 static void __init reserve_ebda_region(void)
@@ -705,56 +637,6 @@ void __init remapped_pgdat_init(void)
 	}
 }
 
-
-
-static void __init register_memory(void)
-{
-	unsigned long gapstart, gapsize, round;
-	unsigned long long last;
-	int i;
-
-	/*
-	 * Search for the bigest gap in the low 32 bits of the e820
-	 * memory space.
-	 */
-	last = 0x100000000ull;
-	gapstart = 0x10000000;
-	gapsize = 0x400000;
-	i = e820.nr_map;
-	while (--i >= 0) {
-		unsigned long long start = e820.map[i].addr;
-		unsigned long long end = start + e820.map[i].size;
-
-		/*
-		 * Since "last" is at most 4GB, we know we'll
-		 * fit in 32 bits if this condition is true
-		 */
-		if (last > end) {
-			unsigned long gap = last - end;
-
-			if (gap > gapsize) {
-				gapsize = gap;
-				gapstart = end;
-			}
-		}
-		if (start < last)
-			last = start;
-	}
-
-	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
-	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
-
-	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
-		pci_mem_start, gapstart, gapsize);
-}
-
 #ifdef CONFIG_MCA
 static void set_mca_bus(int x)
 {
diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h
index 1475694..8da4175 100644
--- a/include/asm-i386/e820.h
+++ b/include/asm-i386/e820.h
@@ -39,6 +39,8 @@ extern struct e820map e820;
 extern int e820_all_mapped(unsigned long start, unsigned long end,
 			   unsigned type);
 extern void find_max_pfn(void);
+extern void register_bootmem_low_pages(unsigned long max_low_pfn);
+extern void register_memory(void);
 
 #endif/*!__ASSEMBLY__*/
 
-- 
cgit v0.10.2


From cef518e88b8ed94ea483c436ef5e5b151a3fabc6 Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] i386: Move memory map printing and other code to e820.c

This patch moves e820 memory map print and memmap boot param
parsing function from setup.c to e820.c, also adds limit_regions
and print_memory_map declaration in header file.

Signed-off-by: bibo,mao <bibo.mao@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>

 arch/i386/kernel/e820.c  |  152 +++++++++++++++++++++++++++
 arch/i386/kernel/setup.c |  158 ---------------------------------
 include/asm-i386/e820.h  |    2
 arch/i386/kernel/e820.c  |  152 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c |  153 -----------------------------------------------
 include/asm-i386/e820.h  |    2
 3 files changed, 155 insertions(+), 152 deletions(-)

diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 47c495b..b755255 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -33,6 +33,7 @@ unsigned long pci_mem_start = 0x10000000;
 #ifdef CONFIG_PCI
 EXPORT_SYMBOL(pci_mem_start);
 #endif
+extern int user_defined_memmap;
 struct resource data_resource = {
 	.name	= "Kernel data",
 	.start	= 0,
@@ -706,3 +707,154 @@ void __init register_memory(void)
 	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
 		pci_mem_start, gapstart, gapsize);
 }
+
+void __init print_memory_map(char *who)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		printk(" %s: %016Lx - %016Lx ", who,
+			e820.map[i].addr,
+			e820.map[i].addr + e820.map[i].size);
+		switch (e820.map[i].type) {
+		case E820_RAM:	printk("(usable)\n");
+				break;
+		case E820_RESERVED:
+				printk("(reserved)\n");
+				break;
+		case E820_ACPI:
+				printk("(ACPI data)\n");
+				break;
+		case E820_NVS:
+				printk("(ACPI NVS)\n");
+				break;
+		default:	printk("type %lu\n", e820.map[i].type);
+				break;
+		}
+	}
+}
+
+void __init limit_regions(unsigned long long size)
+{
+	unsigned long long current_addr = 0;
+	int i;
+
+	print_memory_map("limit_regions start");
+	if (efi_enabled) {
+		efi_memory_desc_t *md;
+		void *p;
+
+		for (p = memmap.map, i = 0; p < memmap.map_end;
+			p += memmap.desc_size, i++) {
+			md = p;
+			current_addr = md->phys_addr + (md->num_pages << 12);
+			if (md->type == EFI_CONVENTIONAL_MEMORY) {
+				if (current_addr >= size) {
+					md->num_pages -=
+						(((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
+					memmap.nr_map = i + 1;
+					return;
+				}
+			}
+		}
+	}
+	for (i = 0; i < e820.nr_map; i++) {
+		current_addr = e820.map[i].addr + e820.map[i].size;
+		if (current_addr < size)
+			continue;
+
+		if (e820.map[i].type != E820_RAM)
+			continue;
+
+		if (e820.map[i].addr >= size) {
+			/*
+			 * This region starts past the end of the
+			 * requested size, skip it completely.
+			 */
+			e820.nr_map = i;
+		} else {
+			e820.nr_map = i + 1;
+			e820.map[i].size -= current_addr - size;
+		}
+		print_memory_map("limit_regions endfor");
+		return;
+	}
+	print_memory_map("limit_regions endfunc");
+}
+
+ /*
+  * This function checks if the entire range <start,end> is mapped with type.
+  *
+  * Note: this function only works correct if the e820 table is sorted and
+  * not-overlapping, which is the case
+  */
+int __init
+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
+{
+	u64 start = s;
+	u64 end = e;
+	int i;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		if (type && ei->type != type)
+			continue;
+		/* is the region (part) in overlap with the current region ?*/
+		if (ei->addr >= end || ei->addr + ei->size <= start)
+			continue;
+		/* if the region is at the beginning of <start,end> we move
+		 * start to the end of the region since it's ok until there
+		 */
+		if (ei->addr <= start)
+			start = ei->addr + ei->size;
+		/* if start is now at or beyond end, we're done, full
+		 * coverage */
+		if (start >= end)
+			return 1; /* we're done */
+	}
+	return 0;
+}
+
+static int __init parse_memmap(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+		/* If we are doing a crash dump, we
+		 * still need to know the real mem
+		 * size before original memory map is
+		 * reset.
+		 */
+		find_max_pfn();
+		saved_max_pfn = max_pfn;
+#endif
+		e820.nr_map = 0;
+		user_defined_memmap = 1;
+	} else {
+		/* If the user specifies memory size, we
+		 * limit the BIOS-provided memory map to
+		 * that size. exactmap can be used to specify
+		 * the exact map. mem=number can be used to
+		 * trim the existing memory map.
+		 */
+		unsigned long long start_at, mem_size;
+
+		mem_size = memparse(arg, &arg);
+		if (*arg == '@') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_RAM);
+		} else if (*arg == '#') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_ACPI);
+		} else if (*arg == '$') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_RESERVED);
+		} else {
+			limit_regions(mem_size);
+			user_defined_memmap = 1;
+		}
+	}
+	return 0;
+}
+early_param("memmap", parse_memmap);
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 51ed015..e5bb87a 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -73,7 +73,6 @@ int disable_pse __devinitdata = 0;
 /*
  * Machine setup..
  */
-extern struct e820map e820;
 extern struct resource code_resource;
 extern struct resource data_resource;
 
@@ -137,79 +136,6 @@ static char command_line[COMMAND_LINE_SIZE];
 
 unsigned char __initdata boot_params[PARAM_SIZE];
 
-static void __init limit_regions(unsigned long long size)
-{
-	unsigned long long current_addr = 0;
-	int i;
-
-	if (efi_enabled) {
-		efi_memory_desc_t *md;
-		void *p;
-
-		for (p = memmap.map, i = 0; p < memmap.map_end;
-			p += memmap.desc_size, i++) {
-			md = p;
-			current_addr = md->phys_addr + (md->num_pages << 12);
-			if (md->type == EFI_CONVENTIONAL_MEMORY) {
-				if (current_addr >= size) {
-					md->num_pages -=
-						(((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
-					memmap.nr_map = i + 1;
-					return;
-				}
-			}
-		}
-	}
-	for (i = 0; i < e820.nr_map; i++) {
-		current_addr = e820.map[i].addr + e820.map[i].size;
-		if (current_addr < size)
-			continue;
-
-		if (e820.map[i].type != E820_RAM)
-			continue;
-
-		if (e820.map[i].addr >= size) {
-			/*
-			 * This region starts past the end of the
-			 * requested size, skip it completely.
-			 */
-			e820.nr_map = i;
-		} else {
-			e820.nr_map = i + 1;
-			e820.map[i].size -= current_addr - size;
-		}
-		return;
-	}
-}
-
-#define E820_DEBUG	1
-
-static void __init print_memory_map(char *who)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		printk(" %s: %016Lx - %016Lx ", who,
-			e820.map[i].addr,
-			e820.map[i].addr + e820.map[i].size);
-		switch (e820.map[i].type) {
-		case E820_RAM:	printk("(usable)\n");
-				break;
-		case E820_RESERVED:
-				printk("(reserved)\n");
-				break;
-		case E820_ACPI:
-				printk("(ACPI data)\n");
-				break;
-		case E820_NVS:
-				printk("(ACPI NVS)\n");
-				break;
-		default:	printk("type %lu\n", e820.map[i].type);
-				break;
-		}
-	}
-}
-
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
 #ifdef CONFIG_EDD_MODULE
@@ -233,7 +159,7 @@ static inline void copy_edd(void)
 }
 #endif
 
-static int __initdata user_defined_memmap = 0;
+int __initdata user_defined_memmap = 0;
 
 /*
  * "mem=nopentium" disables the 4MB page tables.
@@ -270,51 +196,6 @@ static int __init parse_mem(char *arg)
 }
 early_param("mem", parse_mem);
 
-static int __init parse_memmap(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
-		/* If we are doing a crash dump, we
-		 * still need to know the real mem
-		 * size before original memory map is
-		 * reset.
-		 */
-		find_max_pfn();
-		saved_max_pfn = max_pfn;
-#endif
-		e820.nr_map = 0;
-		user_defined_memmap = 1;
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long start_at, mem_size;
-
-		mem_size = memparse(arg, &arg);
-		if (*arg == '@') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RAM);
-		} else if (*arg == '#') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_ACPI);
-		} else if (*arg == '$') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RESERVED);
-		} else {
-			limit_regions(mem_size);
-			user_defined_memmap = 1;
-		}
-	}
-	return 0;
-}
-early_param("memmap", parse_memmap);
-
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel.
@@ -378,38 +259,6 @@ static int __init parse_reservetop(char *arg)
 }
 early_param("reservetop", parse_reservetop);
 
- /*
-  * This function checks if the entire range <start,end> is mapped with type.
-  *
-  * Note: this function only works correct if the e820 table is sorted and
-  * not-overlapping, which is the case
-  */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
-	u64 start = s;
-	u64 end = e;
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full
-		 * coverage */
-		if (start >= end)
-			return 1; /* we're done */
-	}
-	return 0;
-}
-
 /*
  * Determine low and high memory ranges:
  */
diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h
index 8da4175..395077a 100644
--- a/include/asm-i386/e820.h
+++ b/include/asm-i386/e820.h
@@ -41,6 +41,8 @@ extern int e820_all_mapped(unsigned long start, unsigned long end,
 extern void find_max_pfn(void);
 extern void register_bootmem_low_pages(unsigned long max_low_pfn);
 extern void register_memory(void);
+extern void limit_regions(unsigned long long size);
+extern void print_memory_map(char *who);
 
 #endif/*!__ASSEMBLY__*/
 
-- 
cgit v0.10.2


From 58db85482743f5e3495d168c641c60ce1d3dfb06 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] calgary: phb_shift can be int

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index d2ea87a..f53b581 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -740,7 +740,7 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar,
 {
 	u64 val64;
 	void __iomem *target;
-	unsigned long phb_shift = -1;
+	unsigned int phb_shift = ~0; /* silence gcc */
 	u64 mask;
 
 	switch (busno_to_phbid(busnum)) {
-- 
cgit v0.10.2


From b34e90b8f0f30151349134f87b5dc6ef75a5218c Mon Sep 17 00:00:00 2001
From: Laurent Vivier <Laurent.Vivier@bull.net>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] Calgary: use BIOS supplied BBARs and topology information

Find the BBAR register address of each Calgary using the "Extended
BIOS Data Area" rather than calculating it ourselves. Also get the bus
topology (what PHB each bus is on) from Calgary rather than
calculating it ourselves.

This patch fixes http://bugzilla.kernel.org/show_bug.cgi?id=7407.

Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index f53b581..afc0a53 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -41,6 +41,7 @@
 #include <asm/pci-direct.h>
 #include <asm/system.h>
 #include <asm/dma.h>
+#include <asm/rio.h>
 
 #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
 #define PCI_VENDOR_DEVICE_ID_CALGARY \
@@ -115,14 +116,35 @@ static const unsigned long phb_offsets[] = {
 	0xB000 /* PHB3 */
 };
 
+/* PHB debug registers */
+
+static const unsigned long phb_debug_offsets[] = {
+	0x4000	/* PHB 0 DEBUG */,
+	0x5000	/* PHB 1 DEBUG */,
+	0x6000	/* PHB 2 DEBUG */,
+	0x7000	/* PHB 3 DEBUG */
+};
+
+/*
+ * STUFF register for each debug PHB,
+ * byte 1 = start bus number, byte 2 = end bus number
+ */
+
+#define PHB_DEBUG_STUFF_OFFSET	0x0020
+
 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
 static int translate_empty_slots __read_mostly = 0;
 static int calgary_detected __read_mostly = 0;
 
+static struct rio_table_hdr	*rio_table_hdr __initdata;
+static struct scal_detail	*scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail	*rio_devs[MAX_NUMNODES*4] __initdata;
+
 struct calgary_bus_info {
 	void *tce_space;
 	unsigned char translation_disabled;
 	signed char phbid;
+	void __iomem *bbar;
 };
 
 static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
@@ -475,6 +497,11 @@ static struct dma_mapping_ops calgary_dma_ops = {
 	.unmap_sg = calgary_unmap_sg,
 };
 
+static inline void __iomem * busno_to_bbar(unsigned char num)
+{
+	return bus_info[num].bbar;
+}
+
 static inline int busno_to_phbid(unsigned char num)
 {
 	return bus_info[num].phbid;
@@ -828,31 +855,9 @@ static void __init calgary_disable_translation(struct pci_dev *dev)
 	del_timer_sync(&tbl->watchdog_timer);
 }
 
-static inline unsigned int __init locate_register_space(struct pci_dev *dev)
+static inline void __iomem * __init locate_register_space(struct pci_dev *dev)
 {
-	int rionodeid;
-	u32 address;
-
-	/*
-	 * Each Calgary has four busses. The first four busses (first Calgary)
-	 * have RIO node ID 2, then the next four (second Calgary) have RIO
-	 * node ID 3, the next four (third Calgary) have node ID 2 again, etc.
-	 * We use a gross hack - relying on the dev->bus->number ordering,
-	 * modulo 14 - to decide which Calgary a given bus is on. Busses 0, 1,
-	 * 2 and 4 are on the first Calgary (id 2), 6, 8, a and c are on the
-	 * second (id 3), and then it repeats modulo 14.
- 	 */
-	rionodeid = (dev->bus->number % 14 > 4) ? 3 : 2;
-	/*
-	 * register space address calculation as follows:
-	 * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase)
-	 * ChassisBase is always zero for x366/x260/x460
-	 * RioNodeId is 2 for first Calgary, 3 for second Calgary
-	 */
-	address = START_ADDRESS	-
-		(0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 14)) +
-		(0x100000) * (rionodeid - CHASSIS_BASE);
-	return address;
+	return busno_to_bbar(dev->bus->number);
 }
 
 static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
@@ -864,15 +869,12 @@ static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
 
 static int __init calgary_init_one(struct pci_dev *dev)
 {
-	u32 address;
 	void __iomem *bbar;
 	int ret;
 
 	BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
 
-	address = locate_register_space(dev);
-	/* map entire 1MB of Calgary config space */
-	bbar = ioremap_nocache(address, 1024 * 1024);
+	bbar = locate_register_space(dev);
 	if (!bbar) {
 		ret = -ENODATA;
 		goto done;
@@ -898,6 +900,35 @@ static int __init calgary_init(void)
 {
 	int ret = -ENODEV;
 	struct pci_dev *dev = NULL;
+	int rio, phb, bus;
+	void __iomem *bbar;
+	void __iomem *target;
+	u8 start_bus, end_bus;
+	u32 val;
+
+	for (rio = 0; rio < rio_table_hdr->num_rio_dev; rio++) {
+
+		if ( (rio_devs[rio]->type != COMPAT_CALGARY) &&
+		     (rio_devs[rio]->type != ALT_CALGARY) )
+			continue;
+
+		/* map entire 1MB of Calgary config space */
+		bbar = ioremap_nocache(rio_devs[rio]->BBAR, 1024 * 1024);
+
+		for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
+
+			target = calgary_reg(bbar, phb_debug_offsets[phb] |
+						   PHB_DEBUG_STUFF_OFFSET);
+			val = be32_to_cpu(readl(target));
+			start_bus = (u8)((val & 0x00FF0000) >> 16);
+			end_bus =   (u8)((val & 0x0000FF00) >> 8);
+			for (bus = start_bus; bus <= end_bus; bus++) {
+				bus_info[bus].bbar = bbar;
+				bus_info[bus].phbid = phb;
+			}
+		}
+	}
+
 
 	do {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM,
@@ -962,13 +993,55 @@ static inline int __init determine_tce_table_size(u64 ram)
 	return ret;
 }
 
+static int __init build_detail_arrays(void)
+{
+	unsigned long ptr;
+	int i, scal_detail_size, rio_detail_size;
+
+	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
+		printk(KERN_WARNING
+			"Calgary: MAX_NUMNODES too low!  Defined as %d, "
+			"but system has %d nodes.\n",
+			MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+		return -ENODEV;
+	}
+
+	switch (rio_table_hdr->version){
+	default:
+		printk(KERN_WARNING
+		       "Calgary: Invalid Rio Grande Table Version: %d\n",
+		       rio_table_hdr->version);
+		return -ENODEV;
+	case 2:
+		scal_detail_size = 11;
+		rio_detail_size = 13;
+		break;
+	case 3:
+		scal_detail_size = 12;
+		rio_detail_size = 15;
+		break;
+	}
+
+	ptr = ((unsigned long)rio_table_hdr) + 3;
+	for (i = 0; i < rio_table_hdr->num_scal_dev;
+		    i++, ptr += scal_detail_size)
+		scal_devs[i] = (struct scal_detail *)ptr;
+
+	for (i = 0; i < rio_table_hdr->num_rio_dev;
+		    i++, ptr += rio_detail_size)
+		rio_devs[i] = (struct rio_detail *)ptr;
+
+	return 0;
+}
+
 void __init detect_calgary(void)
 {
 	u32 val;
 	int bus;
 	void *tbl;
 	int calgary_found = 0;
-	int phb = -1;
+	unsigned long ptr;
+	int offset;
 
 	/*
 	 * if the user specified iommu=off or iommu=soft or we found
@@ -980,6 +1053,29 @@ void __init detect_calgary(void)
 	if (!early_pci_allowed())
 		return;
 
+	ptr = (unsigned long)phys_to_virt(get_bios_ebda());
+
+	rio_table_hdr = NULL;
+	offset = 0x180;
+	while (offset) {
+		/* The block id is stored in the 2nd word */
+		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+			/* set the pointer past the offset & block id */
+			rio_table_hdr = (struct rio_table_hdr *)(ptr+offset+4);
+			break;
+		}
+		/* The next offset is stored in the 1st word. 0 means no more */
+		offset = *((unsigned short *)(ptr + offset));
+	}
+	if (!rio_table_hdr){
+		printk(KERN_ERR "Calgary: Unable to locate "
+				"Rio Grande Table in EBDA - bailing!\n");
+		return;
+	}
+
+	if (build_detail_arrays())
+		return;
+
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
 	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
@@ -990,12 +1086,6 @@ void __init detect_calgary(void)
 		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
 			continue;
 
-		/*
-		 * There are 4 PHBs per Calgary chip.  Set phb to which phb (0-3)
-		 * it is connected to releative to the clagary chip.
-		 */
-		phb = (phb + 1) % PHBS_PER_CALGARY;
-
 		if (info->translation_disabled)
 			continue;
 
@@ -1010,7 +1100,6 @@ void __init detect_calgary(void)
 				if (!tbl)
 					goto cleanup;
 				info->tce_space = tbl;
-				info->phbid = phb;
 				calgary_found = 1;
 				break;
 			}
diff --git a/include/asm-x86_64/rio.h b/include/asm-x86_64/rio.h
new file mode 100644
index 0000000..1f315c3
--- /dev/null
+++ b/include/asm-x86_64/rio.h
@@ -0,0 +1,76 @@
+/*
+ * Derived from include/asm-i386/mach-summit/mach_mpparse.h
+ *          and include/asm-i386/mach-default/bios_ebda.h
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ */
+
+#ifndef __ASM_RIO_H
+#define __ASM_RIO_H
+
+#define RIO_TABLE_VERSION	3
+
+struct rio_table_hdr {
+	u8 version;      /* Version number of this data structure  */
+	u8 num_scal_dev; /* # of Scalability devices               */
+	u8 num_rio_dev;  /* # of RIO I/O devices                   */
+} __attribute__((packed));
+
+struct scal_detail {
+	u8 node_id;      /* Scalability Node ID                    */
+	u32  CBAR;       /* Address of 1MB register space          */
+	u8 port0node;    /* Node ID port connected to: 0xFF=None   */
+	u8 port0port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 port1node;    /* Node ID port connected to: 0xFF = None */
+	u8 port1port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 port2node;    /* Node ID port connected to: 0xFF = None */
+	u8 port2port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 chassis_num;  /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+	u8 node_id;      /* RIO Node ID                            */
+	u32  BBAR;       /* Address of 1MB register space          */
+	u8 type;         /* Type of device                         */
+	u8 owner_id;     /* Node ID of Hurricane that owns this    */
+	                 /* node                                   */
+	u8 port0node;    /* Node ID port connected to: 0xFF=None   */
+	u8 port0port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 port1node;    /* Node ID port connected to: 0xFF=None   */
+	u8 port1port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 first_slot;   /* Lowest slot number below this Calgary  */
+	u8 status;       /* Bit 0 = 1 : the XAPIC is used          */
+	                 /*       = 0 : the XAPIC is not used, ie: */
+	                 /*            ints fwded to another XAPIC */
+	                 /*           Bits1:7 Reserved             */
+	u8 WP_index;     /* instance index - lower ones have       */
+	                 /*     lower slot numbers/PCI bus numbers */
+	u8 chassis_num;  /* 1 based Chassis number                 */
+} __attribute__((packed));
+
+enum {
+	HURR_SCALABILTY	= 0,  /* Hurricane Scalability info */
+	HURR_RIOIB	= 2,  /* Hurricane RIOIB info       */
+	COMPAT_CALGARY	= 4,  /* Compatibility Calgary      */
+	ALT_CALGARY	= 5,  /* Second Planar Calgary      */
+};
+
+/*
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E.
+ */
+
+static inline unsigned long get_bios_ebda(void)
+{
+	unsigned long address= *(unsigned short *)phys_to_virt(0x40Eul);
+	address <<= 4;
+	return address;
+}
+
+#endif /* __ASM_RIO_H */
-- 
cgit v0.10.2


From eae93755540bae18aff46b8a0e621b5d65bd5380 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] Calgary: check BBAR ioremap success when ioremapping

This patch cleans up the previous "Use BIOS supplied BBAR information"
patch. Mostly stylistic clenaups, but also check for ioremap failure
when we ioremap the BBAR rather than when trying to use it.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Laurent Vivier <Laurent.Vivier@bull.net>

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index afc0a53..8a1e4f3 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -138,7 +138,7 @@ static int calgary_detected __read_mostly = 0;
 
 static struct rio_table_hdr	*rio_table_hdr __initdata;
 static struct scal_detail	*scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail	*rio_devs[MAX_NUMNODES*4] __initdata;
+static struct rio_detail	*rio_devs[MAX_NUMNODES * 4] __initdata;
 
 struct calgary_bus_info {
 	void *tce_space;
@@ -855,11 +855,6 @@ static void __init calgary_disable_translation(struct pci_dev *dev)
 	del_timer_sync(&tbl->watchdog_timer);
 }
 
-static inline void __iomem * __init locate_register_space(struct pci_dev *dev)
-{
-	return busno_to_bbar(dev->bus->number);
-}
-
 static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
 {
 	pci_dev_get(dev);
@@ -874,15 +869,10 @@ static int __init calgary_init_one(struct pci_dev *dev)
 
 	BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
 
-	bbar = locate_register_space(dev);
-	if (!bbar) {
-		ret = -ENODATA;
-		goto done;
-	}
-
+	bbar = busno_to_bbar(dev->bus->number);
 	ret = calgary_setup_tar(dev, bbar);
 	if (ret)
-		goto iounmap;
+		goto done;
 
 	pci_dev_get(dev);
 	dev->bus->self = dev;
@@ -890,38 +880,39 @@ static int __init calgary_init_one(struct pci_dev *dev)
 
 	return 0;
 
-iounmap:
-	iounmap(bbar);
 done:
 	return ret;
 }
 
-static int __init calgary_init(void)
+static int __init calgary_locate_bbars(void)
 {
-	int ret = -ENODEV;
-	struct pci_dev *dev = NULL;
-	int rio, phb, bus;
+	int ret;
+	int rioidx, phb, bus;
 	void __iomem *bbar;
 	void __iomem *target;
+	unsigned long offset;
 	u8 start_bus, end_bus;
 	u32 val;
 
-	for (rio = 0; rio < rio_table_hdr->num_rio_dev; rio++) {
+	ret = -ENODATA;
+	for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
+		struct rio_detail *rio = rio_devs[rioidx];
 
-		if ( (rio_devs[rio]->type != COMPAT_CALGARY) &&
-		     (rio_devs[rio]->type != ALT_CALGARY) )
+		if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
 			continue;
 
 		/* map entire 1MB of Calgary config space */
-		bbar = ioremap_nocache(rio_devs[rio]->BBAR, 1024 * 1024);
+		bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
+		if (!bbar)
+			goto error;
 
 		for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
+			offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
+			target = calgary_reg(bbar, offset);
 
-			target = calgary_reg(bbar, phb_debug_offsets[phb] |
-						   PHB_DEBUG_STUFF_OFFSET);
 			val = be32_to_cpu(readl(target));
 			start_bus = (u8)((val & 0x00FF0000) >> 16);
-			end_bus =   (u8)((val & 0x0000FF00) >> 8);
+			end_bus = (u8)((val & 0x0000FF00) >> 8);
 			for (bus = start_bus; bus <= end_bus; bus++) {
 				bus_info[bus].bbar = bbar;
 				bus_info[bus].phbid = phb;
@@ -929,6 +920,25 @@ static int __init calgary_init(void)
 		}
 	}
 
+	return 0;
+
+error:
+	/* scan bus_info and iounmap any bbars we previously ioremap'd */
+	for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
+		if (bus_info[bus].bbar)
+			iounmap(bus_info[bus].bbar);
+
+	return ret;
+}
+
+static int __init calgary_init(void)
+{
+	int ret;
+	struct pci_dev *dev = NULL;
+
+	ret = calgary_locate_bbars();
+	if (ret)
+		return ret;
 
 	do {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM,
@@ -1000,18 +1010,13 @@ static int __init build_detail_arrays(void)
 
 	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
 		printk(KERN_WARNING
-			"Calgary: MAX_NUMNODES too low!  Defined as %d, "
+			"Calgary: MAX_NUMNODES too low! Defined as %d, "
 			"but system has %d nodes.\n",
 			MAX_NUMNODES, rio_table_hdr->num_scal_dev);
 		return -ENODEV;
 	}
 
 	switch (rio_table_hdr->version){
-	default:
-		printk(KERN_WARNING
-		       "Calgary: Invalid Rio Grande Table Version: %d\n",
-		       rio_table_hdr->version);
-		return -ENODEV;
 	case 2:
 		scal_detail_size = 11;
 		rio_detail_size = 13;
@@ -1020,6 +1025,11 @@ static int __init build_detail_arrays(void)
 		scal_detail_size = 12;
 		rio_detail_size = 15;
 		break;
+	default:
+		printk(KERN_WARNING
+		       "Calgary: Invalid Rio Grande Table Version: %d\n",
+		       rio_table_hdr->version);
+		return -EPROTO;
 	}
 
 	ptr = ((unsigned long)rio_table_hdr) + 3;
@@ -1042,6 +1052,7 @@ void __init detect_calgary(void)
 	int calgary_found = 0;
 	unsigned long ptr;
 	int offset;
+	int ret;
 
 	/*
 	 * if the user specified iommu=off or iommu=soft or we found
@@ -1061,27 +1072,29 @@ void __init detect_calgary(void)
 		/* The block id is stored in the 2nd word */
 		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
 			/* set the pointer past the offset & block id */
-			rio_table_hdr = (struct rio_table_hdr *)(ptr+offset+4);
+			rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
 			break;
 		}
 		/* The next offset is stored in the 1st word. 0 means no more */
 		offset = *((unsigned short *)(ptr + offset));
 	}
-	if (!rio_table_hdr){
+	if (!rio_table_hdr) {
 		printk(KERN_ERR "Calgary: Unable to locate "
 				"Rio Grande Table in EBDA - bailing!\n");
 		return;
 	}
 
-	if (build_detail_arrays())
+	ret = build_detail_arrays();
+	if (ret) {
+		printk(KERN_ERR "Calgary: build_detail_arrays ret %d\n", ret);
 		return;
+	}
 
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
 	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
 		int dev;
 		struct calgary_bus_info *info = &bus_info[bus];
-		info->phbid = -1;
 
 		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
 			continue;
diff --git a/include/asm-x86_64/rio.h b/include/asm-x86_64/rio.h
index 1f315c3..c7350f6 100644
--- a/include/asm-x86_64/rio.h
+++ b/include/asm-x86_64/rio.h
@@ -3,7 +3,6 @@
  *          and include/asm-i386/mach-default/bios_ebda.h
  *
  * Author: Laurent Vivier <Laurent.Vivier@bull.net>
- *
  */
 
 #ifndef __ASM_RIO_H
@@ -19,7 +18,7 @@ struct rio_table_hdr {
 
 struct scal_detail {
 	u8 node_id;      /* Scalability Node ID                    */
-	u32  CBAR;       /* Address of 1MB register space          */
+	u32 CBAR;        /* Address of 1MB register space          */
 	u8 port0node;    /* Node ID port connected to: 0xFF=None   */
 	u8 port0port;    /* Port num port connected to: 0,1,2, or  */
 	                 /* 0xFF=None                              */
@@ -34,7 +33,7 @@ struct scal_detail {
 
 struct rio_detail {
 	u8 node_id;      /* RIO Node ID                            */
-	u32  BBAR;       /* Address of 1MB register space          */
+	u32 BBAR;        /* Address of 1MB register space          */
 	u8 type;         /* Type of device                         */
 	u8 owner_id;     /* Node ID of Hurricane that owns this    */
 	                 /* node                                   */
@@ -65,10 +64,9 @@ enum {
  * there is a real-mode segmented pointer pointing to the
  * 4K EBDA area at 0x40E.
  */
-
 static inline unsigned long get_bios_ebda(void)
 {
-	unsigned long address= *(unsigned short *)phys_to_virt(0x40Eul);
+	unsigned long address = *(unsigned short *)phys_to_virt(0x40EUL);
 	address <<= 4;
 	return address;
 }
-- 
cgit v0.10.2


From bff6547bb6a4e82c399d74e7fba78b12d2f162ed Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] Calgary: allow compiling Calgary in but not using it by
 default

This patch makes it possible to compile Calgary in but not use it by
default. In this mode, use 'iommu=calgary' to activate it.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index ab9b1c0..dbdcaf6 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -179,7 +179,7 @@ PCI
 IOMMU
 
  iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
-         [,forcesac][,fullflush][,nomerge][,noaperture]
+         [,forcesac][,fullflush][,nomerge][,noaperture][,calgary]
    size  set size of iommu (in bytes)
    noagp don't initialize the AGP driver and use full aperture.
    off   don't use the IOMMU
@@ -200,6 +200,7 @@ IOMMU
 	    buffering.
    nodac    Forbid DMA >4GB
    panic    Always panic when IOMMU overflows
+   calgary  Use the Calgary IOMMU if it is available
 
   swiotlb=pages[,force]
 
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 010d226..5cb509d 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -455,6 +455,17 @@ config CALGARY_IOMMU
 	  Normally the kernel will make the right choice by itself.
 	  If unsure, say Y.
 
+config CALGARY_IOMMU_ENABLED_BY_DEFAULT
+	bool "Should Calgary be enabled by default?"
+	default y
+	depends on CALGARY_IOMMU
+	help
+	  Should Calgary be enabled by default? if you choose 'y', Calgary
+	  will be used (if it exists). If you choose 'n', Calgary will not be
+	  used even if it exists. If you choose 'n' and would like to use
+	  Calgary anyway, pass 'iommu=calgary' on the kernel command line.
+	  If unsure, say Y.
+
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 8a1e4f3..0ddf29d 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -43,6 +43,12 @@
 #include <asm/dma.h>
 #include <asm/rio.h>
 
+#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+int use_calgary __read_mostly = 1;
+#else
+int use_calgary __read_mostly = 0;
+#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
+
 #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
 #define PCI_VENDOR_DEVICE_ID_CALGARY \
 	(PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16)
@@ -1061,6 +1067,9 @@ void __init detect_calgary(void)
 	if (swiotlb || no_iommu || iommu_detected)
 		return;
 
+	if (!use_calgary)
+		return;
+
 	if (!early_pci_allowed())
 		return;
 
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index f8d8574..683b7a5 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -296,6 +296,11 @@ __init int iommu_setup(char *p)
 		gart_parse_options(p);
 #endif
 
+#ifdef CONFIG_CALGARY_IOMMU
+		if (!strncmp(p, "calgary", 7))
+			use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
 		p += strcspn(p, ",");
 		if (*p == ',')
 			++p;
diff --git a/include/asm-x86_64/calgary.h b/include/asm-x86_64/calgary.h
index 6b93f5a..7ee9006 100644
--- a/include/asm-x86_64/calgary.h
+++ b/include/asm-x86_64/calgary.h
@@ -51,6 +51,8 @@ struct iommu_table {
 #define TCE_TABLE_SIZE_4M		6
 #define TCE_TABLE_SIZE_8M		7
 
+extern int use_calgary;
+
 #ifdef CONFIG_CALGARY_IOMMU
 extern int calgary_iommu_init(void);
 extern void detect_calgary(void);
-- 
cgit v0.10.2


From b9a8d94a47f8a41766f6f7944adfb1d641349903 Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] x86-64: Make x86_64 udelay() round up instead of down.

Port two patches from i386 to x86_64 delay.c to make sure all rounding is done
upward instead of downward.

There is no sign in commit messages that the mismatch was done on purpose, and
"delay() guarantees sleeping at least for the specified time" is still a valid
rule IMHO.

The original x86 patches are both from pre-GIT era, i.e.:

"[PATCH] round up  in __udelay()" in commit
54c7e1f5cc6771ff644d7bc21a2b829308bd126f

"[PATCH] add 1 in __const_udelay()" in commit
42c77a9801b8877d8b90f65f75db758822a0bccc

(both commits are from converted BK repository to x86_64).

AK: fixed gcc warning

linux/arch/x86_64/lib/delay.c:43: warning: suggest parentheses around + or - inside shift
(did this actually work?)

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index 50be909..2dbebd3 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -40,13 +40,13 @@ EXPORT_SYMBOL(__delay);
 
 inline void __const_udelay(unsigned long xloops)
 {
-	__delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32);
+	__delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1);
 }
 EXPORT_SYMBOL(__const_udelay);
 
 void __udelay(unsigned long usecs)
 {
-	__const_udelay(usecs * 0x000010c6);  /* 2**32 / 1000000 */
+	__const_udelay(usecs * 0x000010c7);  /* 2**32 / 1000000 (rounded up) */
 }
 EXPORT_SYMBOL(__udelay);
 
diff --git a/include/asm-x86_64/delay.h b/include/asm-x86_64/delay.h
index 65f64ac..40146f6 100644
--- a/include/asm-x86_64/delay.h
+++ b/include/asm-x86_64/delay.h
@@ -16,7 +16,7 @@ extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);
 
 #define udelay(n) (__builtin_constant_p(n) ? \
-	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c6ul)) : \
+	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
 	__udelay(n))
 
 #define ndelay(n) (__builtin_constant_p(n) ? \
-- 
cgit v0.10.2


From e6536c1262c56d302e749ab1b44fdb0b9786327d Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] x86: comment magic constants in delay.h

For both i386 and x86_64, copy from arch/$ARCH/lib/delay.c comments about the
used magic constants, plus a few other niceties.

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andi Kleen <ak@suse.de>

 include/asm-i386/delay.h   |    5 ++++-
 include/asm-x86_64/delay.h |    5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/asm-i386/delay.h b/include/asm-i386/delay.h
index b1c7650..9ae5e37 100644
--- a/include/asm-i386/delay.h
+++ b/include/asm-i386/delay.h
@@ -7,6 +7,7 @@
  * Delay routines calling functions in arch/i386/lib/delay.c
  */
  
+/* Undefined functions to get compile-time errors */
 extern void __bad_udelay(void);
 extern void __bad_ndelay(void);
 
@@ -15,10 +16,12 @@ extern void __ndelay(unsigned long nsecs);
 extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);
 
+/* 0x10c7 is 2**32 / 1000000 (rounded up) */
 #define udelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
 	__udelay(n))
-	
+
+/* 0x5 is 2**32 / 1000000000 (rounded up) */
 #define ndelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
 	__ndelay(n))
diff --git a/include/asm-x86_64/delay.h b/include/asm-x86_64/delay.h
index 40146f6..c2669f1f 100644
--- a/include/asm-x86_64/delay.h
+++ b/include/asm-x86_64/delay.h
@@ -7,18 +7,21 @@
  * Delay routines calling functions in arch/x86_64/lib/delay.c
  */
  
+/* Undefined functions to get compile-time errors */
 extern void __bad_udelay(void);
 extern void __bad_ndelay(void);
 
 extern void __udelay(unsigned long usecs);
-extern void __ndelay(unsigned long usecs);
+extern void __ndelay(unsigned long nsecs);
 extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);
 
+/* 0x10c7 is 2**32 / 1000000 (rounded up) */
 #define udelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
 	__udelay(n))
 
+/* 0x5 is 2**32 / 1000000000 (rounded up) */
 #define ndelay(n) (__builtin_constant_p(n) ? \
        ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
        __ndelay(n))
-- 
cgit v0.10.2


From d15512f442ef1ea60f6195b0444fb27b3cf8d0e6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] i386: Fix race in IO-APIC routing entry setup.

Interrupt could happen between setting the IO-APIC entry
and setting its interrupt data.

Pointed out by Linus.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 3b7a63e..e33b7a8 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -153,14 +153,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
  * the interrupt, and we need to make sure the entry is fully populated
  * before that happens.
  */
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
-	unsigned long flags;
 	union entry_union eu;
 	eu.entry = e;
-	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__ioapic_write_entry(apic, pin, e);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -1360,8 +1366,8 @@ static void __init setup_IO_APIC_irqs(void)
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
 		}
-		ioapic_write_entry(apic, pin, entry);
 		spin_lock_irqsave(&ioapic_lock, flags);
+		__ioapic_write_entry(apic, pin, entry);
 		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
@@ -2856,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	if (!ioapic && (irq < 16))
 		disable_8259A_irq(irq);
 
-	ioapic_write_entry(ioapic, pin, entry);
 	spin_lock_irqsave(&ioapic_lock, flags);
+	__ioapic_write_entry(ioapic, pin, entry);
 	set_native_irq_info(irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
-- 
cgit v0.10.2


From 516d2836434d279fef164bb1b964e83425d312d2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] x86-64: Fix race in IO-APIC routing entry setup.

Interrupt could happen between setting the IO-APIC entry
and setting its interrupt data.

Pointed out by Linus.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 88fcc4e..eaf0b70 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -174,14 +174,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
  * the interrupt, and we need to make sure the entry is fully populated
  * before that happens.
  */
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
-	unsigned long flags;
 	union entry_union eu;
 	eu.entry = e;
-	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__ioapic_write_entry(apic, pin, e);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-- 
cgit v0.10.2


From 8c89812684de3b47066d800031dfd7098abbdc74 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] i386: remove IOPL check on task switch

IOPL is implicitly saved and restored on task switch,
so explicit check is no longer needed.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 8f42659..9930851 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -675,12 +675,6 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	write_pda(pcurrent, next_p);
 
 	/*
-	 * Restore IOPL if needed.
-	 */
-	if (unlikely(prev->iopl != next->iopl))
-		set_iopl_mask(next->iopl);
-
-	/*
 	 * Now maybe handle debug registers and/or IO bitmaps
 	 */
 	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
-- 
cgit v0.10.2


From b6bcc4bb1cdfbc3c8612aad63a8703ac3d59f61a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] x86-64: Don't force inlining of do_csum

It's two big and used by two callers. Calls should be cheap enough anyways.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
index 06ae630..bc503f5 100644
--- a/arch/x86_64/lib/csum-partial.c
+++ b/arch/x86_64/lib/csum-partial.c
@@ -9,8 +9,6 @@
 #include <linux/module.h>
 #include <asm/checksum.h>
 
-#define __force_inline inline __attribute__((always_inline))
-
 static inline unsigned short from32to16(unsigned a) 
 {
 	unsigned short b = a >> 16; 
@@ -33,7 +31,7 @@ static inline unsigned short from32to16(unsigned a)
  * Unrolling to an 128 bytes inner loop.
  * Using interleaving with more registers to break the carry chains.
  */
-static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len)
+static unsigned do_csum(const unsigned char *buff, unsigned len)
 {
 	unsigned odd, count;
 	unsigned long result = 0;
-- 
cgit v0.10.2


From 3529833f1ca8790df06ce218b5d9d438776696ed Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] i386: substitute __va lookup with pfn_to_kaddr

Substitutes allocate_pgdat virtual address lookup with pfn_to_kaddr macro.

Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index ddbdb03..103b76e 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -168,7 +168,7 @@ static void __init allocate_pgdat(int nid)
 	if (nid && node_has_online_mem(nid))
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
-		NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
+		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
 		min_low_pfn += PFN_UP(sizeof(pg_data_t));
 	}
 }
-- 
cgit v0.10.2


From db91b882aabd0b3b55a87cbfb344f2798bb740b4 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] i386: Fix double #includes in arch/i386

Fix double #includes in arch/i386

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index ab0c327..5c5d450 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -34,7 +34,6 @@
 #include <linux/major.h>
 #include <linux/fs.h>
 #include <linux/smp_lock.h>
-#include <linux/fs.h>
 #include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index fbc9582..7f22e03 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -13,7 +13,6 @@
 
 #include <asm/delay.h>
 #include <asm/tsc.h>
-#include <asm/delay.h>
 #include <asm/io.h>
 
 #include "mach_timer.h"
-- 
cgit v0.10.2


From d3561b7fa0fb0fc583bab0eeda32bec9e4c4056d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] paravirt: header and stubs for paravirtualisation

Create a paravirt.h header for all the critical operations which need to be
replaced with hypervisor calls, and include that instead of defining native
operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure.  Currently these are
function implementations of native hardware: hypervisors will override the ops
structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assember easier.

And:

+From: Andy Whitcroft <apw@shadowen.org>

The paravirt ops introduce a 'weak' attribute onto memory_setup().
Code ordering leads to the following warnings on x86:

    arch/i386/kernel/setup.c:651: warning: weak declaration of
                `memory_setup' after first use results in unspecified behavior

Move memory_setup() to avoid this.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 1f0f7b6..bb1fa06 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -182,6 +182,17 @@ config X86_ES7000
 
 endchoice
 
+config PARAVIRT
+	bool "Paravirtualization support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Paravirtualization is a way of running multiple instances of
+	  Linux on the same machine, under a hypervisor.  This option
+	  changes the kernel so it can modify itself when it is run
+	  under a hypervisor, improving performance significantly.
+	  However, when run without a hypervisor the kernel is
+	  theoretically slower.  If in doubt, say N.
+
 config ACPI_SRAT
 	bool
 	default y
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index dc15389..c6798c75 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -9,6 +9,7 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+#undef CONFIG_PARAVIRT
 #include <linux/linkage.h>
 #include <linux/vmalloc.h>
 #include <linux/screen_info.h>
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index f614854..4066121 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 
 EXTRA_AFLAGS   := -traditional
 
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 0666eb0e..1b2f3cd 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -101,4 +101,14 @@ void foo(void)
 	BLANK();
  	OFFSET(PDA_cpu, i386_pda, cpu_number);
 	OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+
+#ifdef CONFIG_PARAVIRT
+	BLANK();
+	OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+	OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+	OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+	OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
 }
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 0220bc8..d274612 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -62,13 +62,6 @@ DF_MASK		= 0x00000400
 NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS		cli
-#define ENABLE_INTERRUPTS		sti
-#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
-#define INTERRUPT_RETURN		iret
-#define GET_CR0_INTO_EAX		movl %cr0, %eax
-
 #ifdef CONFIG_PREEMPT
 #define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
 #else
@@ -416,6 +409,20 @@ ldt_ss:
 	jnz restore_nocheck
 	testl $0x00400000, %eax		# returning to 32bit stack?
 	jnz restore_nocheck		# allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+	/*
+	 * The kernel can't run on a non-flat stack if paravirt mode
+	 * is active.  Rather than try to fixup the high bits of
+	 * ESP, bypass this code entirely.  This may break DOSemu
+	 * and/or Wine support in a paravirt VM, although the option
+	 * is still available to implement the setting of the high
+	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
+	 */
+	cmpl $0, paravirt_ops+PARAVIRT_enabled
+	jne restore_nocheck
+#endif
+
 	/* If returning to userspace with 16bit stack,
 	 * try to fix the higher word of ESP, as the CPU
 	 * won't restore it.
@@ -833,6 +840,19 @@ nmi_espfix_stack:
 .previous
 KPROBE_END(nmi)
 
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
+ENTRY(native_irq_enable_sysexit)
+	sti
+	sysexit
+#endif
+
 KPROBE_ENTRY(int3)
 	RING0_INT_FRAME
 	pushl $-1			# mark this as an int
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index 62996cd..c8d4582 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -381,7 +381,10 @@ void __init init_ISA_irqs (void)
 	}
 }
 
-void __init init_IRQ(void)
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
 {
 	int i;
 
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
new file mode 100644
index 0000000..478192c
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,404 @@
+/*  Paravirtualization interfaces
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+
+/* nop stub */
+static void native_nop(void)
+{
+}
+
+static void __init default_banner(void)
+{
+	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+	       paravirt_ops.name);
+}
+
+char *memory_setup(void)
+{
+	return paravirt_ops.memory_setup();
+}
+
+static fastcall unsigned long native_get_debugreg(int regno)
+{
+	unsigned long val = 0; 	/* Damn you, gcc! */
+
+	switch (regno) {
+	case 0:
+		asm("movl %%db0, %0" :"=r" (val)); break;
+	case 1:
+		asm("movl %%db1, %0" :"=r" (val)); break;
+	case 2:
+		asm("movl %%db2, %0" :"=r" (val)); break;
+	case 3:
+		asm("movl %%db3, %0" :"=r" (val)); break;
+	case 6:
+		asm("movl %%db6, %0" :"=r" (val)); break;
+	case 7:
+		asm("movl %%db7, %0" :"=r" (val)); break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+static fastcall void native_set_debugreg(int regno, unsigned long value)
+{
+	switch (regno) {
+	case 0:
+		asm("movl %0,%%db0"	: /* no output */ :"r" (value));
+		break;
+	case 1:
+		asm("movl %0,%%db1"	: /* no output */ :"r" (value));
+		break;
+	case 2:
+		asm("movl %0,%%db2"	: /* no output */ :"r" (value));
+		break;
+	case 3:
+		asm("movl %0,%%db3"	: /* no output */ :"r" (value));
+		break;
+	case 6:
+		asm("movl %0,%%db6"	: /* no output */ :"r" (value));
+		break;
+	case 7:
+		asm("movl %0,%%db7"	: /* no output */ :"r" (value));
+		break;
+	default:
+		BUG();
+	}
+}
+
+void init_IRQ(void)
+{
+	paravirt_ops.init_IRQ();
+}
+
+static fastcall void native_clts(void)
+{
+	asm volatile ("clts");
+}
+
+static fastcall unsigned long native_read_cr0(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr0(unsigned long val)
+{
+	asm volatile("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr2(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr2(unsigned long val)
+{
+	asm volatile("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr3(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr3(unsigned long val)
+{
+	asm volatile("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr4(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall unsigned long native_read_cr4_safe(void)
+{
+	unsigned long val;
+	/* This could fault if %cr4 does not exist */
+	asm("1: movl %%cr4, %0		\n"
+		"2:				\n"
+		".section __ex_table,\"a\"	\n"
+		".long 1b,2b			\n"
+		".previous			\n"
+		: "=r" (val): "0" (0));
+	return val;
+}
+
+static fastcall void native_write_cr4(unsigned long val)
+{
+	asm volatile("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long native_save_fl(void)
+{
+	unsigned long f;
+	asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
+	return f;
+}
+
+static fastcall void native_restore_fl(unsigned long f)
+{
+	asm volatile("pushl %0 ; popfl": /* no output */
+			     :"g" (f)
+			     :"memory", "cc");
+}
+
+static fastcall void native_irq_disable(void)
+{
+	asm volatile("cli": : :"memory");
+}
+
+static fastcall void native_irq_enable(void)
+{
+	asm volatile("sti": : :"memory");
+}
+
+static fastcall void native_safe_halt(void)
+{
+	asm volatile("sti; hlt": : :"memory");
+}
+
+static fastcall void native_halt(void)
+{
+	asm volatile("hlt": : :"memory");
+}
+
+static fastcall void native_wbinvd(void)
+{
+	asm volatile("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+{
+	unsigned long long val;
+
+	asm volatile("2: rdmsr ; xorl %0,%0\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  movl %3,%0 ; jmp 1b\n\t"
+		     ".previous\n\t"
+ 		     ".section __ex_table,\"a\"\n"
+		     "   .align 4\n\t"
+		     "   .long 	2b,3b\n\t"
+		     ".previous"
+		     : "=r" (*err), "=A" (val)
+		     : "c" (msr), "i" (-EFAULT));
+
+	return val;
+}
+
+static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+{
+	int err;
+	asm volatile("2: wrmsr ; xorl %0,%0\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  movl %4,%0 ; jmp 1b\n\t"
+		     ".previous\n\t"
+ 		     ".section __ex_table,\"a\"\n"
+		     "   .align 4\n\t"
+		     "   .long 	2b,3b\n\t"
+		     ".previous"
+		     : "=a" (err)
+		     : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+		       "i" (-EFAULT));
+	return err;
+}
+
+static fastcall unsigned long long native_read_tsc(void)
+{
+	unsigned long long val;
+	asm volatile("rdtsc" : "=A" (val));
+	return val;
+}
+
+static fastcall unsigned long long native_read_pmc(void)
+{
+	unsigned long long val;
+	asm volatile("rdpmc" : "=A" (val));
+	return val;
+}
+
+static fastcall void native_load_tr_desc(void)
+{
+	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+	asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+{
+	asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+{
+	asm ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+{
+	asm ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long native_store_tr(void)
+{
+	unsigned long tr;
+	asm ("str %0":"=r" (tr));
+	return tr;
+}
+
+static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+	C(0); C(1); C(2);
+#undef C
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+	u32 *lp = (u32 *)((char *)dt + entry*8);
+	lp[0] = entry_low;
+	lp[1] = entry_high;
+}
+
+static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_load_esp0(struct tss_struct *tss,
+				      struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+}
+
+static fastcall void native_io_delay(void)
+{
+	asm volatile("outb %al,$0x80");
+}
+
+/* These are in entry.S */
+extern fastcall void native_iret(void);
+extern fastcall void native_irq_enable_sysexit(void);
+
+static int __init print_banner(void)
+{
+	paravirt_ops.banner();
+	return 0;
+}
+core_initcall(print_banner);
+
+struct paravirt_ops paravirt_ops = {
+	.name = "bare hardware",
+	.paravirt_enabled = 0,
+	.kernel_rpl = 0,
+
+	.banner = default_banner,
+	.arch_setup = native_nop,
+	.memory_setup = machine_specific_memory_setup,
+	.get_wallclock = native_get_wallclock,
+	.set_wallclock = native_set_wallclock,
+	.time_init = time_init_hook,
+	.init_IRQ = native_init_IRQ,
+
+	.cpuid = native_cpuid,
+	.get_debugreg = native_get_debugreg,
+	.set_debugreg = native_set_debugreg,
+	.clts = native_clts,
+	.read_cr0 = native_read_cr0,
+	.write_cr0 = native_write_cr0,
+	.read_cr2 = native_read_cr2,
+	.write_cr2 = native_write_cr2,
+	.read_cr3 = native_read_cr3,
+	.write_cr3 = native_write_cr3,
+	.read_cr4 = native_read_cr4,
+	.read_cr4_safe = native_read_cr4_safe,
+	.write_cr4 = native_write_cr4,
+	.save_fl = native_save_fl,
+	.restore_fl = native_restore_fl,
+	.irq_disable = native_irq_disable,
+	.irq_enable = native_irq_enable,
+	.safe_halt = native_safe_halt,
+	.halt = native_halt,
+	.wbinvd = native_wbinvd,
+	.read_msr = native_read_msr,
+	.write_msr = native_write_msr,
+	.read_tsc = native_read_tsc,
+	.read_pmc = native_read_pmc,
+	.load_tr_desc = native_load_tr_desc,
+	.set_ldt = native_set_ldt,
+	.load_gdt = native_load_gdt,
+	.load_idt = native_load_idt,
+	.store_gdt = native_store_gdt,
+	.store_idt = native_store_idt,
+	.store_tr = native_store_tr,
+	.load_tls = native_load_tls,
+	.write_ldt_entry = native_write_ldt_entry,
+	.write_gdt_entry = native_write_gdt_entry,
+	.write_idt_entry = native_write_idt_entry,
+	.load_esp0 = native_load_esp0,
+
+	.set_iopl_mask = native_set_iopl_mask,
+	.io_delay = native_io_delay,
+	.const_udelay = __const_udelay,
+
+	.irq_enable_sysexit = native_irq_enable_sysexit,
+	.iret = native_iret,
+};
+EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index e5bb87a..695d53f 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -495,6 +495,12 @@ static void set_mca_bus(int x)
 static void set_mca_bus(int x) { }
 #endif
 
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __attribute__((weak)) memory_setup(void)
+{
+	return machine_specific_memory_setup();
+}
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -547,7 +553,7 @@ void __init setup_arch(char **cmdline_p)
 		efi_init();
 	else {
 		printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-		print_memory_map(machine_specific_memory_setup());
+		print_memory_map(memory_setup());
 	}
 
 	copy_edd();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 0956366..cd7de9c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
  *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process. */
 
+
+/* SMP boot always wants to use real time delay to allow sufficient time for
+ * the APs to come online */
+#define USE_REAL_TIME_DELAY
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 78af572..c505b16 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/time.h>
 
 #include "mach_time.h"
 
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime)
 	/* gets recalled with irq locally disabled */
 	/* XXX - does irqsave resolve this? -johnstul */
 	spin_lock_irqsave(&rtc_lock, flags);
-	if (efi_enabled)
-		retval = efi_set_rtc_mmss(nowtime);
-	else
-		retval = mach_set_rtc_mmss(nowtime);
+	retval = set_wallclock(nowtime);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	return retval;
@@ -223,10 +221,7 @@ unsigned long get_cmos_time(void)
 
 	spin_lock_irqsave(&rtc_lock, flags);
 
-	if (efi_enabled)
-		retval = efi_get_time();
-	else
-		retval = mach_get_cmos_time();
+	retval = get_wallclock();
 
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
@@ -370,7 +365,7 @@ static void __init hpet_time_init(void)
 		printk("Using HPET for base-timer\n");
 	}
 
-	time_init_hook();
+	do_time_init();
 }
 #endif
 
@@ -392,5 +387,5 @@ void __init time_init(void)
 
 	do_settimeofday(&ts);
 
-	time_init_hook();
+	do_time_init();
 }
diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c
index 5a1abef..2c15500 100644
--- a/arch/i386/power/cpu.c
+++ b/arch/i386/power/cpu.c
@@ -26,8 +26,8 @@ void __save_processor_state(struct saved_context *ctxt)
 	/*
 	 * descriptor tables
 	 */
- 	store_gdt(&ctxt->gdt_limit);
- 	store_idt(&ctxt->idt_limit);
+ 	store_gdt(&ctxt->gdt);
+ 	store_idt(&ctxt->idt);
  	store_tr(ctxt->tr);
 
 	/*
@@ -99,8 +99,8 @@ void __restore_processor_state(struct saved_context *ctxt)
 	 * now restore the descriptor tables to their proper values
 	 * ltr is done i fix_processor_context().
 	 */
- 	load_gdt(&ctxt->gdt_limit);
- 	load_idt(&ctxt->idt_limit);
+ 	load_gdt(&ctxt->gdt);
+ 	load_idt(&ctxt->idt);
 
 	/*
 	 * segment registers
diff --git a/drivers/net/de600.c b/drivers/net/de600.c
index 690bb40..8396e41 100644
--- a/drivers/net/de600.c
+++ b/drivers/net/de600.c
@@ -43,7 +43,6 @@ static const char version[] = "de600.c: $Revision: 1.41-2.5 $,  Bjorn Ekwall (bj
  * modify the following "#define": (see <asm/io.h> for more info)
 #define REALLY_SLOW_IO
  */
-#define SLOW_IO_BY_JUMPING /* Looks "better" than dummy write to port 0x80 :-) */
 
 /* use 0 for production, 1 for verification, >2 for debug */
 #ifdef DE600_DEBUG
diff --git a/include/asm-i386/delay.h b/include/asm-i386/delay.h
index 9ae5e37..32d6678 100644
--- a/include/asm-i386/delay.h
+++ b/include/asm-i386/delay.h
@@ -16,6 +16,13 @@ extern void __ndelay(unsigned long nsecs);
 extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);
 
+#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
+#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+
+#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+
+#else /* !PARAVIRT || USE_REAL_TIME_DELAY */
+
 /* 0x10c7 is 2**32 / 1000000 (rounded up) */
 #define udelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
@@ -25,6 +32,7 @@ extern void __delay(unsigned long loops);
 #define ndelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
 	__ndelay(n))
+#endif
 
 void use_tsc_delay(void);
 
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index 6cf2ac2..f19820f 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -55,6 +55,9 @@ static inline void pack_gate(__u32 *a, __u32 *b,
 #define DESCTYPE_DPL3	0x60	/* DPL-3 */
 #define DESCTYPE_S	0x10	/* !system */
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
 
 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
@@ -105,7 +108,11 @@ static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const vo
 	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
 }
 
-static inline void set_ldt(void *addr, unsigned int entries)
+#define set_ldt native_set_ldt
+#endif /* CONFIG_PARAVIRT */
+
+static inline fastcall void native_set_ldt(const void *addr,
+					   unsigned int entries)
 {
 	if (likely(entries == 0))
 		__asm__ __volatile__("lldt %w0"::"q" (0));
diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h
index 68df0dc3..86ff5e8 100644
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -256,11 +256,11 @@ static inline void flush_write_buffers(void)
 
 #endif /* __KERNEL__ */
 
-#ifdef SLOW_IO_BY_JUMPING
-#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
 #else
+
 #define __SLOW_DOWN_IO "outb %%al,$0x80;"
-#endif
 
 static inline void slow_down_io(void) {
 	__asm__ __volatile__(
@@ -271,6 +271,8 @@ static inline void slow_down_io(void) {
 		: : );
 }
 
+#endif
+
 #ifdef CONFIG_X86_NUMAQ
 extern void *xquad_portio;    /* Where the IO area was mapped */
 #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 331726b..9e15ce0 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -41,4 +41,7 @@ extern int irqbalance_disable(char *str);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+void init_IRQ(void);
+void __init native_init_IRQ(void);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h
index e1bdb97..9ce01f3 100644
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -10,6 +10,9 @@
 #ifndef _ASM_IRQFLAGS_H
 #define _ASM_IRQFLAGS_H
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #ifndef __ASSEMBLY__
 
 static inline unsigned long __raw_local_save_flags(void)
@@ -25,9 +28,6 @@ static inline unsigned long __raw_local_save_flags(void)
 	return flags;
 }
 
-#define raw_local_save_flags(flags) \
-		do { (flags) = __raw_local_save_flags(); } while (0)
-
 static inline void raw_local_irq_restore(unsigned long flags)
 {
 	__asm__ __volatile__(
@@ -66,18 +66,6 @@ static inline void halt(void)
 	__asm__ __volatile__("hlt": : :"memory");
 }
 
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-	return !(flags & (1 << 9));
-}
-
-static inline int raw_irqs_disabled(void)
-{
-	unsigned long flags = __raw_local_save_flags();
-
-	return raw_irqs_disabled_flags(flags);
-}
-
 /*
  * For spinlocks, etc:
  */
@@ -90,9 +78,33 @@ static inline unsigned long __raw_local_irq_save(void)
 	return flags;
 }
 
+#else
+#define DISABLE_INTERRUPTS		cli
+#define ENABLE_INTERRUPTS		sti
+#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
+#define INTERRUPT_RETURN		iret
+#define GET_CR0_INTO_EAX		movl %cr0, %eax
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+		do { (flags) = __raw_local_save_flags(); } while (0)
+
 #define raw_local_irq_save(flags) \
 		do { (flags) = __raw_local_irq_save(); } while (0)
 
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+	return !(flags & (1 << 9));
+}
+
+static inline int raw_irqs_disabled(void)
+{
+	unsigned long flags = __raw_local_save_flags();
+
+	return raw_irqs_disabled_flags(flags);
+}
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/include/asm-i386/mach-default/setup_arch.h b/include/asm-i386/mach-default/setup_arch.h
index fb42099e..605e3cc 100644
--- a/include/asm-i386/mach-default/setup_arch.h
+++ b/include/asm-i386/mach-default/setup_arch.h
@@ -2,4 +2,6 @@
 
 /* no action for generic */
 
+#ifndef ARCH_SETUP
 #define ARCH_SETUP
+#endif
diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 1820d9d..5679d49 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -1,6 +1,10 @@
 #ifndef __ASM_MSR_H
 #define __ASM_MSR_H
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+
 /*
  * Access to machine-specific registers (available on 586 and better only)
  * Note: the rd* operations modify the parameters directly (without using
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
      __asm__ __volatile__("rdpmc" \
 			  : "=a" (low), "=d" (high) \
 			  : "c" (counter))
+#endif	/* !CONFIG_PARAVIRT */
 
 /* symbolic names for some interesting MSRs */
 /* Intel defined MSRs. */
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
new file mode 100644
index 0000000..a7551a4
--- /dev/null
+++ b/include/asm-i386/paravirt.h
@@ -0,0 +1,281 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifdef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct tss_struct;
+struct paravirt_ops
+{
+	unsigned int kernel_rpl;
+ 	int paravirt_enabled;
+	const char *name;
+
+	void (*arch_setup)(void);
+	char *(*memory_setup)(void);
+	void (*init_IRQ)(void);
+
+	void (*banner)(void);
+
+	unsigned long (*get_wallclock)(void);
+	int (*set_wallclock)(unsigned long);
+	void (*time_init)(void);
+
+	/* All the function pointers here are declared as "fastcall"
+	   so that we get a specific register-based calling
+	   convention.  This makes it easier to implement inline
+	   assembler replacements. */
+
+	void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx);
+
+	unsigned long (fastcall *get_debugreg)(int regno);
+	void (fastcall *set_debugreg)(int regno, unsigned long value);
+
+	void (fastcall *clts)(void);
+
+	unsigned long (fastcall *read_cr0)(void);
+	void (fastcall *write_cr0)(unsigned long);
+
+	unsigned long (fastcall *read_cr2)(void);
+	void (fastcall *write_cr2)(unsigned long);
+
+	unsigned long (fastcall *read_cr3)(void);
+	void (fastcall *write_cr3)(unsigned long);
+
+	unsigned long (fastcall *read_cr4_safe)(void);
+	unsigned long (fastcall *read_cr4)(void);
+	void (fastcall *write_cr4)(unsigned long);
+
+	unsigned long (fastcall *save_fl)(void);
+	void (fastcall *restore_fl)(unsigned long);
+	void (fastcall *irq_disable)(void);
+	void (fastcall *irq_enable)(void);
+	void (fastcall *safe_halt)(void);
+	void (fastcall *halt)(void);
+	void (fastcall *wbinvd)(void);
+
+	/* err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
+	u64 (fastcall *read_msr)(unsigned int msr, int *err);
+	int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+	u64 (fastcall *read_tsc)(void);
+	u64 (fastcall *read_pmc)(void);
+
+	void (fastcall *load_tr_desc)(void);
+	void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+	void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+	void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+	void (fastcall *store_idt)(struct Xgt_desc_struct *);
+	void (fastcall *set_ldt)(const void *desc, unsigned entries);
+	unsigned long (fastcall *store_tr)(void);
+	void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+	void (fastcall *write_ldt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *write_gdt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *write_idt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *load_esp0)(struct tss_struct *tss,
+				   struct thread_struct *thread);
+
+	void (fastcall *set_iopl_mask)(unsigned mask);
+
+	void (fastcall *io_delay)(void);
+	void (*const_udelay)(unsigned long loops);
+
+	/* These two are jmp to, not actually called. */
+	void (fastcall *irq_enable_sysexit)(void);
+	void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
+
+static inline void load_esp0(struct tss_struct *tss,
+			     struct thread_struct *thread)
+{
+	paravirt_ops.load_esp0(tss, thread);
+}
+
+#define ARCH_SETUP			paravirt_ops.arch_setup();
+static inline unsigned long get_wallclock(void)
+{
+	return paravirt_ops.get_wallclock();
+}
+
+static inline int set_wallclock(unsigned long nowtime)
+{
+	return paravirt_ops.set_wallclock(nowtime);
+}
+
+static inline void do_time_init(void)
+{
+	return paravirt_ops.time_init();
+}
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+			   unsigned int *ecx, unsigned int *edx)
+{
+	paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+	return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+	return paravirt_ops.restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+	paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+	paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+	unsigned long flags = paravirt_ops.save_fl();
+
+	paravirt_ops.irq_disable();
+
+	return flags;
+}
+
+static inline void raw_safe_halt(void)
+{
+	paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+	paravirt_ops.safe_halt();
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl()  (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do {				\
+	int _err;						\
+	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
+	val1 = (u32)_l;						\
+	val2 = _l >> 32;					\
+} while(0)
+
+#define wrmsr(msr,val1,val2) do {				\
+	u64 _l = ((u64)(val2) << 32) | (val1);			\
+	paravirt_ops.write_msr((msr), _l);			\
+} while(0)
+
+#define rdmsrl(msr,val) do {					\
+	int _err;						\
+	val = paravirt_ops.read_msr((msr),&_err);		\
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({					\
+	u64 _l = ((u64)(b) << 32) | (a);			\
+	paravirt_ops.write_msr((msr),_l);			\
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({					\
+	int _err;						\
+	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
+	(*a) = (u32)_l;						\
+	(*b) = _l >> 32;					\
+	_err; })
+
+#define rdtsc(low,high) do {					\
+	u64 _l = paravirt_ops.read_tsc();			\
+	low = (u32)_l;						\
+	high = _l >> 32;					\
+} while(0)
+
+#define rdtscl(low) do {					\
+	u64 _l = paravirt_ops.read_tsc();			\
+	low = (int)_l;						\
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do {				\
+	u64 _l = paravirt_ops.read_pmc();			\
+	low = (u32)_l;						\
+	high = _l >> 32;					\
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define set_ldt(addr, entries) (paravirt_ops.set_ldt((addr), (entries)))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_ldt_entry((dt), (entry), (low), (high)))
+#define write_gdt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_gdt_entry((dt), (entry), (low), (high)))
+#define write_idt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_idt_entry((dt), (entry), (low), (high)))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void) {
+	paravirt_ops.io_delay();
+#ifdef REALLY_SLOW_IO
+	paravirt_ops.io_delay();
+	paravirt_ops.io_delay();
+	paravirt_ops.io_delay();
+#endif
+}
+
+#define CLI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else  /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN	jmp *%cs:paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX	call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+#endif	/* __ASM_PARAVIRT_H */
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 98fa73b..6c2c445 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -144,8 +144,8 @@ static inline void detect_ht(struct cpuinfo_x86 *c) {}
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
 
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-			   unsigned int *ecx, unsigned int *edx)
+static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx,
+					 unsigned int *ecx, unsigned int *edx)
 {
 	/* ecx is often an input as well as an output. */
 	__asm__("cpuid"
@@ -491,6 +491,12 @@ struct thread_struct {
 	.io_bitmap	= { [ 0 ... IO_BITMAP_LONGS] = ~0 },		\
 }
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_enabled() 0
+#define __cpuid native_cpuid
+
 static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
 {
 	tss->esp0 = thread->esp0;
@@ -524,10 +530,13 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
 			: /* no output */			\
 			:"r" (value))
 
+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
 /*
  * Set IOPL bits in EFLAGS from given mask
  */
-static inline void set_iopl_mask(unsigned mask)
+static fastcall inline void native_set_iopl_mask(unsigned mask)
 {
 	unsigned int reg;
 	__asm__ __volatile__ ("pushfl;"
diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h
index 5bdda79..3c796af 100644
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -131,5 +131,7 @@
 #define SEGMENT_LDT		0x4
 #define SEGMENT_GDT		0x0
 
+#ifndef CONFIG_PARAVIRT
 #define get_kernel_rpl()  0
 #endif
+#endif
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 2734909..9930c5a 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -70,6 +70,7 @@ extern unsigned char boot_params[PARAM_SIZE];
 struct e820entry;
 
 char * __init machine_specific_memory_setup(void);
+char *memory_setup(void);
 
 int __init copy_e820_map(struct e820entry * biosmap, int nr_map);
 int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map);
diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h
index c18b71f..dea6070 100644
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -7,8 +7,12 @@
 #include <asm/processor.h>
 #include <linux/compiler.h>
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define CLI_STRING	"cli"
 #define STI_STRING	"sti"
+#endif /* CONFIG_PARAVIRT */
 
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
diff --git a/include/asm-i386/suspend.h b/include/asm-i386/suspend.h
index 08be1e5..3036152 100644
--- a/include/asm-i386/suspend.h
+++ b/include/asm-i386/suspend.h
@@ -23,12 +23,8 @@ arch_prepare_suspend(void)
 struct saved_context {
   	u16 es, fs, gs, ss;
 	unsigned long cr0, cr2, cr3, cr4;
-	u16 gdt_pad;
-	u16 gdt_limit;
-	unsigned long gdt_base;
-	u16 idt_pad;
-	u16 idt_limit;
-	unsigned long idt_base;
+	struct Xgt_desc_struct gdt;
+	struct Xgt_desc_struct idt;
 	u16 ldt;
 	u16 tss;
 	unsigned long tr;
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index a6dabbc..a6d20d9 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -88,6 +88,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \
 #define savesegment(seg, value) \
 	asm volatile("mov %%" #seg ",%0":"=rm" (value))
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define read_cr0() ({ \
 	unsigned int __dummy; \
 	__asm__ __volatile__( \
@@ -139,17 +142,18 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \
 #define write_cr4(x) \
 	__asm__ __volatile__("movl %0,%%cr4": :"r" (x))
 
-/*
- * Clear and set 'TS' bit respectively
- */
+#define wbinvd() \
+	__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
 #define clts() __asm__ __volatile__ ("clts")
+#endif/* CONFIG_PARAVIRT */
+
+/* Set the 'TS' bit */
 #define stts() write_cr0(8 | read_cr0())
 
 #endif	/* __KERNEL__ */
 
-#define wbinvd() \
-	__asm__ __volatile__ ("wbinvd": : :"memory")
-
 static inline unsigned long get_limit(unsigned long segment)
 {
 	unsigned long __limit;
diff --git a/include/asm-i386/time.h b/include/asm-i386/time.h
new file mode 100644
index 0000000..ea8065a
--- /dev/null
+++ b/include/asm-i386/time.h
@@ -0,0 +1,41 @@
+#ifndef _ASMi386_TIME_H
+#define _ASMi386_TIME_H
+
+#include <linux/efi.h>
+#include "mach_time.h"
+
+static inline unsigned long native_get_wallclock(void)
+{
+	unsigned long retval;
+
+	if (efi_enabled)
+		retval = efi_get_time();
+	else
+		retval = mach_get_cmos_time();
+
+	return retval;
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+	int retval;
+
+	if (efi_enabled)
+		retval = efi_set_rtc_mmss(nowtime);
+	else
+		retval = mach_set_rtc_mmss(nowtime);
+
+	return retval;
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() native_get_wallclock()
+#define set_wallclock(x) native_set_wallclock(x)
+#define do_time_init() time_init_hook()
+
+#endif /* CONFIG_PARAVIRT */
+
+#endif
-- 
cgit v0.10.2


From 139ec7c416248b9ea227d21839235344edfee1e0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Patch inline replacements for paravirt intercepts

It turns out that the most called ops, by several orders of magnitude,
are the interrupt manipulation ops.  These are obvious candidates for
patching, so mark them up and create infrastructure for it.

The method used is that the ops structure has a patch function, which
is called for each place which needs to be patched: this returns a
number of instructions (the rest are NOP-padded).

Usually we can spare a register (%eax) for the binary patched code to
use, but in a couple of critical places in entry.S we can't: we make
the clobbers explicit at the call site, and manually clobber the
allowed registers in debug mode as an extra check.

And:

Don't abuse CONFIG_DEBUG_KERNEL, add CONFIG_DEBUG_PARAVIRT.

And:

AK:  Fix warnings in x86-64 alternative.c build

And:

AK: Fix compilation with defconfig

And:

^From: Andrew Morton <akpm@osdl.org>

Some binutlises still like to emit references to __stop_parainstructions and
__start_parainstructions.

And:

AK: Fix warnings about unused variables when PARAVIRT is disabled.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index b31c080..f68cc6f 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -85,4 +85,14 @@ config DOUBLEFAULT
           option saves about 4k and might cause you much additional grey
           hair.
 
+config DEBUG_PARAVIRT
+	bool "Enable some paravirtualization debugging"
+	default y
+	depends on PARAVIRT && DEBUG_KERNEL
+	help
+	  Currently deliberately clobbers regs which are allowed to be
+	  clobbered in inlined paravirt hooks, even in native mode.
+	  If turning this off solves a problem, then DISABLE_INTERRUPTS() or
+	  ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
+
 endmenu
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 535f979..9eca21b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void)
 
 #endif /* CONFIG_X86_64 */
 
+static void nop_out(void *insns, unsigned int len)
+{
+	unsigned char **noptable = find_nop_table();
+
+	while (len > 0) {
+		unsigned int noplen = len;
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+		memcpy(insns, noptable[noplen], noplen);
+		insns += noplen;
+		len -= noplen;
+	}
+}
+
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
 extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
 
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
-	unsigned char **noptable = find_nop_table();
 	struct alt_instr *a;
 	u8 *instr;
-	int diff, i, k;
+	int diff;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
 	for (a = start; a < end; a++) {
@@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 #endif
 		memcpy(instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
-		/* Pad the rest with nops */
-		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			k = diff;
-			if (k > ASM_NOP_MAX)
-				k = ASM_NOP_MAX;
-			memcpy(a->instr + i, noptable[k], k);
-		}
+		nop_out(instr + a->replacementlen, diff);
 	}
 }
 
@@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
-	unsigned char **noptable = find_nop_table();
 	u8 **ptr;
 
 	for (ptr = start; ptr < end; ptr++) {
@@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 			continue;
 		if (*ptr > text_end)
 			continue;
-		**ptr = noptable[1][0];
+		nop_out(*ptr, 1);
 	};
 }
 
@@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp)
 
 #endif
 
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+	struct paravirt_patch *p;
+
+	for (p = start; p < end; p++) {
+		unsigned int used;
+
+		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
+					  p->len);
+#ifdef CONFIG_DEBUG_PARAVIRT
+		{
+		int i;
+		/* Deliberately clobber regs using "not %reg" to find bugs. */
+		for (i = 0; i < 3; i++) {
+			if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
+				memcpy(p->instr + used, "\xf7\xd0", 2);
+				p->instr[used+1] |= i;
+				used += 2;
+			}
+		}
+		}
+#endif
+		/* Pad the rest with nops */
+		nop_out(p->instr + used, p->len - used);
+	}
+
+	/* Sync to be conservative, in case we patched following instructions */
+	sync_core();
+}
+extern struct paravirt_patch __start_parainstructions[],
+	__stop_parainstructions[];
+#endif	/* CONFIG_PARAVIRT */
+
 void __init alternative_instructions(void)
 {
 	unsigned long flags;
@@ -390,5 +430,6 @@ void __init alternative_instructions(void)
 		alternatives_smp_switch(0);
 	}
 #endif
+ 	apply_paravirt(__start_parainstructions, __stop_parainstructions);
 	local_irq_restore(flags);
 }
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index d274612..de34b7fe 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -53,6 +53,19 @@
 #include <asm/dwarf2.h>
 #include "irq_vectors.h"
 
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization.  The following will never clobber any registers:
+ *   INTERRUPT_RETURN (aka. "iret")
+ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
 #define nr_syscalls ((syscall_table_size)/4)
 
 CF_MASK		= 0x00000001
@@ -63,9 +76,9 @@ NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
+#define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
-#define preempt_stop
+#define preempt_stop(clobbers)
 #define resume_kernel		restore_nocheck
 #endif
 
@@ -226,7 +239,7 @@ ENTRY(ret_from_fork)
 	ALIGN
 	RING0_PTREGS_FRAME
 ret_from_exception:
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 check_userspace:
@@ -237,7 +250,7 @@ check_userspace:
 	jb resume_kernel		# not returning to v8086 or userspace
 
 ENTRY(resume_userspace)
- 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+ 	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -248,7 +261,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -277,7 +290,7 @@ sysenter_past_esp:
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
 	 */
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $(__USER_DS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ss, 0*/
@@ -322,7 +335,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
@@ -364,7 +377,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)		# store the return value
 syscall_exit:
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -393,7 +406,7 @@ restore_nocheck_notrace:
 .section .fixup,"ax"
 iret_exc:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
@@ -436,7 +449,7 @@ ldt_ss:
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_EAX)
 	TRACE_IRQS_OFF
 	lss (%esp), %esp
 	CFI_ADJUST_CFA_OFFSET -8
@@ -451,7 +464,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -509,7 +522,7 @@ syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS		# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -693,7 +706,7 @@ ENTRY(device_not_available)
 	GET_CR0_INTO_EAX
 	testl $0x4, %eax		# EM (math emulation bit)
 	jne device_not_available_emulate
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 	call math_state_restore
 	jmp ret_from_exception
 device_not_available_emulate:
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97..d7d9c8b2 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    const Elf_Shdr *sechdrs,
 		    struct module *me)
 {
-	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+		*para = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
@@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 			alt = s;
 		if (!strcmp(".smp_locks", secstrings + s->sh_name))
 			locks= s;
+		if (!strcmp(".parainstructions", secstrings + s->sh_name))
+			para = s;
 	}
 
 	if (alt) {
@@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr,
 					    lseg, lseg + locks->sh_size,
 					    tseg, tseg + text->sh_size);
 	}
+
+	if (para) {
+		void *pseg = (void *)para->sh_addr;
+		apply_paravirt(pseg, pseg + para->sh_size);
+	}
+
 	return 0;
 }
 
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 478192c..d464604 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -45,6 +45,49 @@ char *memory_setup(void)
 	return paravirt_ops.memory_setup();
 }
 
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code)					\
+	extern const char start_##name[], end_##name[];		\
+	asm("start_" #name ": " code "; end_" #name ":")
+DEF_NATIVE(cli, "cli");
+DEF_NATIVE(sti, "sti");
+DEF_NATIVE(popf, "push %eax; popf");
+DEF_NATIVE(pushf, "pushf; pop %eax");
+DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(sti_sysexit, "sti; sysexit");
+
+static const struct native_insns
+{
+	const char *start, *end;
+} native_insns[] = {
+	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+	[PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
+};
+
+static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement */
+	if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
+		return len;
+
+	insn_len = native_insns[type].end - native_insns[type].start;
+
+	/* Similarly if we can't fit replacement. */
+	if (len < insn_len)
+		return len;
+
+	memcpy(insns, native_insns[type].start, insn_len);
+	return insn_len;
+}
+
 static fastcall unsigned long native_get_debugreg(int regno)
 {
 	unsigned long val = 0; 	/* Damn you, gcc! */
@@ -349,6 +392,7 @@ struct paravirt_ops paravirt_ops = {
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 
+ 	.patch = native_patch,
 	.banner = default_banner,
 	.arch_setup = native_nop,
 	.memory_setup = machine_specific_memory_setup,
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 6860f20..5c69cf0 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -165,6 +165,12 @@ SECTIONS
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
+  . = ALIGN(4);
+  __start_parainstructions = .;
+  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+	*(.parainstructions)
+  }
+  __stop_parainstructions = .;
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h
index b01a7ec..b8fa955 100644
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -4,7 +4,7 @@
 #ifdef __KERNEL__
 
 #include <asm/types.h>
-
+#include <linux/stddef.h>
 #include <linux/types.h>
 
 struct alt_instr {
@@ -118,4 +118,15 @@ static inline void alternatives_smp_switch(int smp) {}
 #define LOCK_PREFIX ""
 #endif
 
+struct paravirt_patch;
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
+#else
+static inline void
+apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{}
+#define __start_parainstructions NULL
+#define __stop_parainstructions NULL
+#endif
+
 #endif /* _I386_ALTERNATIVE_H */
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index f19820f..f398cc4 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -81,31 +81,15 @@ static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
 #undef C
 }
 
-static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
-{
-	__u32 *lp = (__u32 *)((char *)dt + entry*8);
-	*lp = entry_a;
-	*(lp+1) = entry_b;
-}
-
 #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
 #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
 #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
 
-static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
-{
-	__u32 a, b;
-	pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
-	write_idt_entry(idt_table, gate, a, b);
-}
-
-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
+static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
 {
-	__u32 a, b;
-	pack_descriptor(&a, &b, (unsigned long)addr,
-			offsetof(struct tss_struct, __cacheline_filler) - 1,
-			DESCTYPE_TSS, 0);
-	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
+	__u32 *lp = (__u32 *)((char *)dt + entry*8);
+	*lp = entry_a;
+	*(lp+1) = entry_b;
 }
 
 #define set_ldt native_set_ldt
@@ -128,6 +112,23 @@ static inline fastcall void native_set_ldt(const void *addr,
 	}
 }
 
+static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
+{
+	__u32 a, b;
+	pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
+	write_idt_entry(idt_table, gate, a, b);
+}
+
+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
+{
+	__u32 a, b;
+	pack_descriptor(&a, &b, (unsigned long)addr,
+			offsetof(struct tss_struct, __cacheline_filler) - 1,
+			DESCTYPE_TSS, 0);
+	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
+}
+
+
 #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
 
 #define LDT_entry_a(info) \
diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h
index 9ce01f3..17b18cf 100644
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -79,8 +79,8 @@ static inline unsigned long __raw_local_irq_save(void)
 }
 
 #else
-#define DISABLE_INTERRUPTS		cli
-#define ENABLE_INTERRUPTS		sti
+#define DISABLE_INTERRUPTS(clobbers)	cli
+#define ENABLE_INTERRUPTS(clobbers)	sti
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
 #define INTERRUPT_RETURN		iret
 #define GET_CR0_INTO_EAX		movl %cr0, %eax
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index a7551a4..0811947 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -3,8 +3,26 @@
 /* Various instructions on x86 need to be replaced for
  * para-virtualization: those hooks are defined here. */
 #include <linux/linkage.h>
+#include <linux/stringify.h>
 
 #ifdef CONFIG_PARAVIRT
+/* These are the most performance critical ops, so we want to be able to patch
+ * callers */
+#define PARAVIRT_IRQ_DISABLE 0
+#define PARAVIRT_IRQ_ENABLE 1
+#define PARAVIRT_RESTORE_FLAGS 2
+#define PARAVIRT_SAVE_FLAGS 3
+#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4
+#define PARAVIRT_INTERRUPT_RETURN 5
+#define PARAVIRT_STI_SYSEXIT 6
+
+/* Bitmask of what can be clobbered: usually at least eax. */
+#define CLBR_NONE 0x0
+#define CLBR_EAX 0x1
+#define CLBR_ECX 0x2
+#define CLBR_EDX 0x4
+#define CLBR_ANY 0x7
+
 #ifndef __ASSEMBLY__
 struct thread_struct;
 struct Xgt_desc_struct;
@@ -15,6 +33,15 @@ struct paravirt_ops
  	int paravirt_enabled;
 	const char *name;
 
+	/*
+	 * Patch may replace one of the defined code sequences with arbitrary
+	 * code, subject to the same register constraints.  This generally
+	 * means the code is not free to clobber any registers other than EAX.
+	 * The patch function should return the number of bytes of code
+	 * generated, as we nop pad the rest in generic code.
+	 */
+	unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len);
+
 	void (*arch_setup)(void);
 	char *(*memory_setup)(void);
 	void (*init_IRQ)(void);
@@ -147,35 +174,6 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
 #define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
 #define write_cr4(x) paravirt_ops.write_cr4(x)
 
-static inline unsigned long __raw_local_save_flags(void)
-{
-	return paravirt_ops.save_fl();
-}
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
-	return paravirt_ops.restore_fl(flags);
-}
-
-static inline void raw_local_irq_disable(void)
-{
-	paravirt_ops.irq_disable();
-}
-
-static inline void raw_local_irq_enable(void)
-{
-	paravirt_ops.irq_enable();
-}
-
-static inline unsigned long __raw_local_irq_save(void)
-{
-	unsigned long flags = paravirt_ops.save_fl();
-
-	paravirt_ops.irq_disable();
-
-	return flags;
-}
-
 static inline void raw_safe_halt(void)
 {
 	paravirt_ops.safe_halt();
@@ -267,15 +265,134 @@ static inline void slow_down_io(void) {
 #endif
 }
 
-#define CLI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
-#define STI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch {
+	u8 *instr; 		/* original instructions */
+	u8 instrtype;		/* type of this instruction */
+	u8 len;			/* length of original instruction */
+	u16 clobbers;		/* what registers you may clobber */
+};
+
+#define paravirt_alt(insn_string, typenum, clobber)	\
+	"771:\n\t" insn_string "\n" "772:\n"		\
+	".pushsection .parainstructions,\"a\"\n"	\
+	"  .long 771b\n"				\
+	"  .byte " __stringify(typenum) "\n"		\
+	"  .byte 772b-771b\n"				\
+	"  .short " __stringify(clobber) "\n"		\
+	".popsection"
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+	unsigned long f;
+
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_SAVE_FLAGS, CLBR_NONE)
+			     : "=a"(f): "m"(paravirt_ops.save_fl)
+			     : "memory", "cc");
+	return f;
+}
+
+static inline void raw_local_irq_restore(unsigned long f)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
+			     : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
+			     : "memory", "cc");
+}
+
+static inline void raw_local_irq_disable(void)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%0;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_IRQ_DISABLE, CLBR_EAX)
+			     : : "m" (paravirt_ops.irq_disable)
+			     : "memory", "eax", "cc");
+}
+
+static inline void raw_local_irq_enable(void)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%0;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_IRQ_ENABLE, CLBR_EAX)
+			     : : "m" (paravirt_ops.irq_enable)
+			     : "memory", "eax", "cc");
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+	unsigned long f;
+
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1; pushl %%eax;"
+					   "call *%2; popl %%eax;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
+					  CLBR_NONE)
+			     : "=a"(f)
+			     : "m" (paravirt_ops.save_fl),
+			       "m" (paravirt_ops.irq_disable)
+			     : "memory", "cc");
+	return f;
+}
+
+#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;"		\
+		     "call *paravirt_ops+%c[irq_disable];"		\
+		     "popl %%edx; popl %%ecx",				\
+		     PARAVIRT_IRQ_DISABLE, CLBR_EAX)
+
+#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;"		\
+		     "call *paravirt_ops+%c[irq_enable];"		\
+		     "popl %%edx; popl %%ecx",				\
+		     PARAVIRT_IRQ_ENABLE, CLBR_EAX)
+#define CLI_STI_CLOBBERS , "%eax"
+#define CLI_STI_INPUT_ARGS \
+	,								\
+	[irq_disable] "i" (offsetof(struct paravirt_ops, irq_disable)),	\
+	[irq_enable] "i" (offsetof(struct paravirt_ops, irq_enable))
+
 #else  /* __ASSEMBLY__ */
 
-#define INTERRUPT_RETURN	jmp *%cs:paravirt_ops+PARAVIRT_iret
-#define DISABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS_SYSEXIT	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
-#define GET_CR0_INTO_EAX	call *paravirt_ops+PARAVIRT_read_cr0
+#define PARA_PATCH(ptype, clobbers, ops)	\
+771:;						\
+	ops;					\
+772:;						\
+	.pushsection .parainstructions,"a";	\
+	 .long 771b;				\
+	 .byte ptype;				\
+	 .byte 772b-771b;			\
+	 .short clobbers;			\
+	.popsection
+
+#define INTERRUPT_RETURN				\
+	PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY,	\
+	jmp *%cs:paravirt_ops+PARAVIRT_iret)
+
+#define DISABLE_INTERRUPTS(clobbers)			\
+	PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers,	\
+	pushl %ecx; pushl %edx;				\
+	call *paravirt_ops+PARAVIRT_irq_disable;	\
+	popl %edx; popl %ecx)				\
+
+#define ENABLE_INTERRUPTS(clobbers)			\
+	PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers,	\
+	pushl %ecx; pushl %edx;				\
+	call *%cs:paravirt_ops+PARAVIRT_irq_enable;	\
+	popl %edx; popl %ecx)
+
+#define ENABLE_INTERRUPTS_SYSEXIT			\
+	PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY,	\
+	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)
+
+#define GET_CR0_INTO_EAX			\
+	call *paravirt_ops+PARAVIRT_read_cr0
+
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_PARAVIRT */
 #endif	/* __ASM_PARAVIRT_H */
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 6c2c445..5f0418d 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -156,59 +156,6 @@ static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx,
 		: "0" (*eax), "2" (*ecx));
 }
 
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
-{
-	*eax = op;
-	*ecx = 0;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-			       int *edx)
-{
-	*eax = op;
-	*ecx = count;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return edx;
-}
-
 #define load_cr3(pgdir) write_cr3(__pa(pgdir))
 
 /*
@@ -491,22 +438,6 @@ struct thread_struct {
 	.io_bitmap	= { [ 0 ... IO_BITMAP_LONGS] = ~0 },		\
 }
 
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define paravirt_enabled() 0
-#define __cpuid native_cpuid
-
-static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-{
-	tss->esp0 = thread->esp0;
-	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
-		tss->ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-}
-
 #define start_thread(regs, new_eip, new_esp) do {		\
 	__asm__("movl %0,%%fs": :"r" (0));			\
 	regs->xgs = 0;						\
@@ -519,36 +450,6 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
 	regs->esp = new_esp;					\
 } while (0)
 
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register)				\
-		__asm__("movl %%db" #register ", %0"		\
-			:"=r" (var))
-#define set_debugreg(value, register)			\
-		__asm__("movl %0,%%db" #register		\
-			: /* no output */			\
-			:"r" (value))
-
-#define set_iopl_mask native_set_iopl_mask
-#endif /* CONFIG_PARAVIRT */
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static fastcall inline void native_set_iopl_mask(unsigned mask)
-{
-	unsigned int reg;
-	__asm__ __volatile__ ("pushfl;"
-			      "popl %0;"
-			      "andl %1, %0;"
-			      "orl %2, %0;"
-			      "pushl %0;"
-			      "popfl"
-				: "=&r" (reg)
-				: "i" (~X86_EFLAGS_IOPL), "r" (mask));
-}
-
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
@@ -640,6 +541,105 @@ static inline void rep_nop(void)
 
 #define cpu_relax()	rep_nop()
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_enabled() 0
+#define __cpuid native_cpuid
+
+static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register)				\
+		__asm__("movl %%db" #register ", %0"		\
+			:"=r" (var))
+#define set_debugreg(value, register)			\
+		__asm__("movl %0,%%db" #register		\
+			: /* no output */			\
+			:"r" (value))
+
+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static fastcall inline void native_set_iopl_mask(unsigned mask)
+{
+	unsigned int reg;
+	__asm__ __volatile__ ("pushfl;"
+			      "popl %0;"
+			      "andl %1, %0;"
+			      "orl %2, %0;"
+			      "pushl %0;"
+			      "popfl"
+				: "=&r" (reg)
+				: "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
+{
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
+			       int *edx)
+{
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return eax;
+}
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ebx;
+}
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ecx;
+}
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return edx;
+}
+
 /* generic versions from gas */
 #define GENERIC_NOP1	".byte 0x90\n"
 #define GENERIC_NOP2    	".byte 0x89,0xf6\n"
diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h
index dea6070..d3bcebe 100644
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -12,6 +12,8 @@
 #else
 #define CLI_STRING	"cli"
 #define STI_STRING	"sti"
+#define CLI_STI_CLOBBERS
+#define CLI_STI_INPUT_ARGS
 #endif /* CONFIG_PARAVIRT */
 
 /*
@@ -57,25 +59,28 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla
 {
 	asm volatile(
 		"\n1:\t"
-		LOCK_PREFIX " ; decb %0\n\t"
+		LOCK_PREFIX " ; decb %[slock]\n\t"
 		"jns 5f\n"
 		"2:\t"
-		"testl $0x200, %1\n\t"
+		"testl $0x200, %[flags]\n\t"
 		"jz 4f\n\t"
 		STI_STRING "\n"
 		"3:\t"
 		"rep;nop\n\t"
-		"cmpb $0, %0\n\t"
+		"cmpb $0, %[slock]\n\t"
 		"jle 3b\n\t"
 		CLI_STRING "\n\t"
 		"jmp 1b\n"
 		"4:\t"
 		"rep;nop\n\t"
-		"cmpb $0, %0\n\t"
+		"cmpb $0, %[slock]\n\t"
 		"jg 1b\n\t"
 		"jmp 4b\n"
 		"5:\n\t"
-		: "+m" (lock->slock) : "r" (flags) : "memory");
+		: [slock] "+m" (lock->slock)
+		: [flags] "r" (flags)
+	 	  CLI_STI_INPUT_ARGS
+		: "memory" CLI_STI_CLOBBERS);
 }
 #endif
 
diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h
index a584826..a6657b4 100644
--- a/include/asm-x86_64/alternative.h
+++ b/include/asm-x86_64/alternative.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
+#include <linux/stddef.h>
 #include <asm/cpufeature.h>
 
 struct alt_instr {
@@ -133,4 +134,15 @@ static inline void alternatives_smp_switch(int smp) {}
 #define LOCK_PREFIX ""
 #endif
 
+struct paravirt_patch;
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
+#else
+static inline void
+apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{}
+#define __start_parainstructions NULL
+#define __stop_parainstructions NULL
+#endif
+
 #endif /* _X86_64_ALTERNATIVE_H */
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 2e11416..ac0a582 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -911,6 +911,7 @@ static int init_section_ref_ok(const char *name)
 		".toc1",  /* used by ppc64 */
 		".stab",
 		".rodata",
+		".parainstructions",
 		".text.lock",
 		"__bug_table", /* used by powerpc for BUG() */
 		".pci_fixup_header",
@@ -931,6 +932,7 @@ static int init_section_ref_ok(const char *name)
 		".altinstructions",
 		".eh_frame",
 		".debug",
+		".parainstructions",
 		NULL
 	};
 	/* part of section name */
-- 
cgit v0.10.2


From d7cd56111f30259e1b532a12e06f59f8e0a20355 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] i386: cpu_detect extraction

Both lhype and Xen want to call the core of the x86 cpu detect code before
calling start_kernel.

(extracted from larger patch)

AK: folded in start_kernel header patch

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index cda41ae..68bcb68 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -236,29 +236,14 @@ static int __cpuinit have_cpuid_p(void)
 	return flag_is_changeable_p(X86_EFLAGS_ID);
 }
 
-/* Do minimum CPU detection early.
-   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
-   The others are not touched to avoid unwanted side effects.
-
-   WARNING: this function is only called on the BP.  Don't add code here
-   that is supposed to run on all CPUs. */
-static void __init early_cpu_detect(void)
+void __init cpu_detect(struct cpuinfo_x86 *c)
 {
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	c->x86_cache_alignment = 32;
-
-	if (!have_cpuid_p())
-		return;
-
 	/* Get vendor name */
 	cpuid(0x00000000, &c->cpuid_level,
 	      (int *)&c->x86_vendor_id[0],
 	      (int *)&c->x86_vendor_id[8],
 	      (int *)&c->x86_vendor_id[4]);
 
-	get_cpu_vendor(c, 1);
-
 	c->x86 = 4;
 	if (c->cpuid_level >= 0x00000001) {
 		u32 junk, tfms, cap0, misc;
@@ -275,6 +260,26 @@ static void __init early_cpu_detect(void)
 	}
 }
 
+/* Do minimum CPU detection early.
+   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
+   The others are not touched to avoid unwanted side effects.
+
+   WARNING: this function is only called on the BP.  Don't add code here
+   that is supposed to run on all CPUs. */
+static void __init early_cpu_detect(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	c->x86_cache_alignment = 32;
+
+	if (!have_cpuid_p())
+		return;
+
+	cpu_detect(c);
+
+	get_cpu_vendor(c, 1);
+}
+
 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 5f0418d..a52d654 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
 #include <linux/threads.h>
 #include <asm/percpu.h>
 #include <linux/cpumask.h>
+#include <linux/init.h>
 
 /* flag for disabling the tsc */
 extern int tsc_disable;
@@ -112,6 +113,8 @@ extern struct cpuinfo_x86 cpu_data[];
 extern	int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
+void __init cpu_detect(struct cpuinfo_x86 *c);
+
 extern void identify_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h
new file mode 100644
index 0000000..d3e5f27
--- /dev/null
+++ b/include/linux/start_kernel.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_START_KERNEL_H
+#define _LINUX_START_KERNEL_H
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+
+/* Define the prototype for start_kernel here, rather than cluttering
+   up something else. */
+
+extern asmlinkage void __init start_kernel(void);
+
+#endif /* _LINUX_START_KERNEL_H */
diff --git a/init/main.c b/init/main.c
index 36f608a..985c9ed 100644
--- a/init/main.c
+++ b/init/main.c
@@ -29,6 +29,7 @@
 #include <linux/percpu.h>
 #include <linux/kmod.h>
 #include <linux/kernel_stat.h>
+#include <linux/start_kernel.h>
 #include <linux/security.h>
 #include <linux/workqueue.h>
 #include <linux/profile.h>
-- 
cgit v0.10.2


From c9ccf30d77f04064fe5436027ab9d2230c7cdd94 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Add startup infrastructure for paravirtualization

1) Each hypervisor writes a probe function to detect whether we are
   running under that hypervisor.  paravirt_probe() registers this
   function.

2) If vmlinux is booted with ring != 0, we call all the probe
   functions (with registers except %esp intact) in link order: the
   winner will not return.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 4066121..1e8988e 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,6 +39,8 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+
+# Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 
 EXTRA_AFLAGS   := -traditional
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 5b14e95..edef508 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
  */
 ENTRY(startup_32)
 
+#ifdef CONFIG_PARAVIRT
+        movl %cs, %eax
+        testl $0x3, %eax
+        jnz startup_paravirt
+#endif
+
 /*
  * Set segments to known values.
  */
@@ -486,6 +492,33 @@ ignore_int:
 #endif
 	iret
 
+#ifdef CONFIG_PARAVIRT
+startup_paravirt:
+	cld
+ 	movl $(init_thread_union+THREAD_SIZE),%esp
+
+	/* We take pains to preserve all the regs. */
+	pushl	%edx
+	pushl	%ecx
+	pushl	%eax
+
+	/* paravirt.o is last in link, and that probe fn never returns */
+	pushl	$__start_paravirtprobe
+1:
+	movl	0(%esp), %eax
+	pushl	(%eax)
+	movl	8(%esp), %eax
+	call	*(%esp)
+	popl	%eax
+
+	movl	4(%esp), %eax
+	movl	8(%esp), %ecx
+	movl	12(%esp), %edx
+
+	addl	$4, (%esp)
+	jmp	1b
+#endif
+
 /*
  * Real beginning of normal "text" segment
  */
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index d464604..5a9bd32 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/efi.h>
 #include <linux/bcd.h>
+#include <linux/start_kernel.h>
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
@@ -387,6 +388,9 @@ static int __init print_banner(void)
 }
 core_initcall(print_banner);
 
+/* We simply declare start_kernel to be the paravirt probe of last resort. */
+paravirt_probe(start_kernel);
+
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 5c69cf0..877dc5c 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -65,6 +65,12 @@ SECTIONS
 	CONSTRUCTORS
 	} :data
 
+  __start_paravirtprobe = .;
+  .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
+	*(.paravirtprobe)
+  }
+  __stop_paravirtprobe = .;
+
   . = ALIGN(4096);
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
   	__nosave_begin = .;
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 0811947..dd707d8 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -120,6 +120,11 @@ struct paravirt_ops
 	void (fastcall *iret)(void);
 };
 
+/* Mark a paravirt probe function. */
+#define paravirt_probe(fn)						\
+ static asmlinkage void (*__paravirtprobe_##fn)(void) __attribute_used__ \
+		__attribute__((__section__(".paravirtprobe"))) = fn
+
 extern struct paravirt_ops paravirt_ops;
 
 #define paravirt_enabled() (paravirt_ops.paravirt_enabled)
-- 
cgit v0.10.2


From 4f205fd45a5c192907188d6f8f6d7e66db859248 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Allow selected bug checks to be

Allow selected bug checks to be skipped by paravirt kernels.  The two most
important are the F00F workaround (which is either done by the hypervisor,
or not required), and the 'hlt' instruction check, which can break under
some hypervisors.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 798c2f6..3ae795e 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	 * Note that the workaround only should be initialized once...
 	 */
 	c->f00f_bug = 0;
-	if ( c->x86 == 5 ) {
+	if (!paravirt_enabled() && c->x86 == 5) {
 		static int f00f_workaround_enabled = 0;
 
 		c->f00f_bug = 1;
diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h
index 592ffee..38f1aeb 100644
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -21,6 +21,7 @@
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
+#include <asm/paravirt.h>
 
 static int __init no_halt(char *s)
 {
@@ -91,6 +92,9 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
+	if (paravirt_enabled())
+		return;
+
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
 	if (!boot_cpu_data.hlt_works_ok) {
 		printk("disabled\n");
-- 
cgit v0.10.2


From 3bbf54725467d604698721384d858b5983b87e8f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Disable vdso by default when CONFIG_PARAVIRT is
 enabled

They don't work together and this way even glibc still works.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 713ba39..92849c7 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -27,7 +27,11 @@
  * Should the kernel map a VDSO page into processes and pass its
  * address down to glibc upon exec()?
  */
+#ifdef CONFIG_PARAVIRT
+unsigned int __read_mostly vdso_enabled = 0;
+#else
 unsigned int __read_mostly vdso_enabled = 1;
+#endif
 
 EXPORT_SYMBOL_GPL(vdso_enabled);
 
-- 
cgit v0.10.2


From 6020c8f315709a508b027ef6749e85b125190947 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Allow disable power management under hypervisor

Two legacy power management modes are much easier to just explicitly disable
when running in paravirtualized mode - neither APM nor PnP is still relevant.
The status of ACPI is still debatable, and noacpi is still a common enough
boot parameter that it is not necessary to explicitly disable ACPI.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index a60358f..a97847d 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/i8253.h>
+#include <asm/paravirt.h>
 
 #include "io_ports.h"
 
@@ -2235,7 +2236,7 @@ static int __init apm_init(void)
 
 	dmi_check_system(apm_dmi_table);
 
-	if (apm_info.bios.version == 0) {
+	if (apm_info.bios.version == 0 || paravirt_enabled()) {
 		printk(KERN_INFO "apm: BIOS not found.\n");
 		return -ENODEV;
 	}
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index 81a6c83..80066ad 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -530,7 +530,8 @@ static int __init pnpbios_init(void)
 	if (check_legacy_ioport(PNPBIOS_BASE))
 		return -ENODEV;
 #endif
-	if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table)) {
+	if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) ||
+	    paravirt_enabled()) {
 		printk(KERN_INFO "PnPBIOS: Disabled\n");
 		return -ENODEV;
 	}
-- 
cgit v0.10.2


From 13623d79309dd82e1964458fa017979d16f33fa8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Add APIC accessors to paravirt-ops.

Add APIC accessors to paravirt-ops.  Unfortunately, we need two write
functions, as some older broken hardware requires workarounds for
Pentium APIC errata - this is the purpose of apic_write_atomic.

AK: replaced __inline with inline

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 5a9bd32..fe82eb3 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -29,6 +29,8 @@
 #include <asm/time.h>
 #include <asm/irq.h>
 #include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
 
 /* nop stub */
 static void native_nop(void)
@@ -446,6 +448,12 @@ struct paravirt_ops paravirt_ops = {
 	.io_delay = native_io_delay,
 	.const_udelay = __const_udelay,
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	.apic_write = native_apic_write,
+	.apic_write_atomic = native_apic_write_atomic,
+	.apic_read = native_apic_read,
+#endif
+
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h
index b952957..41a4431 100644
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -37,18 +37,27 @@ extern void generic_apic_probe(void);
 /*
  * Basic functions accessing APICs.
  */
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define apic_write native_apic_write
+#define apic_write_atomic native_apic_write_atomic
+#define apic_read native_apic_read
+#endif
 
-static __inline void apic_write(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write(unsigned long reg,
+						unsigned long v)
 {
 	*((volatile unsigned long *)(APIC_BASE+reg)) = v;
 }
 
-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write_atomic(unsigned long reg,
+						       unsigned long v)
 {
 	xchg((volatile unsigned long *)(APIC_BASE+reg), v);
 }
 
-static __inline unsigned long apic_read(unsigned long reg)
+static __inline fastcall unsigned long native_apic_read(unsigned long reg)
 {
 	return *((volatile unsigned long *)(APIC_BASE+reg));
 }
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index dd707d8..e2c803f 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -115,6 +115,12 @@ struct paravirt_ops
 	void (fastcall *io_delay)(void);
 	void (*const_udelay)(unsigned long loops);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	void (fastcall *apic_write)(unsigned long reg, unsigned long v);
+	void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
+	unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif
+
 	/* These two are jmp to, not actually called. */
 	void (fastcall *irq_enable_sysexit)(void);
 	void (fastcall *iret)(void);
@@ -270,6 +276,27 @@ static inline void slow_down_io(void) {
 #endif
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Basic functions accessing APICs.
+ */
+static inline void apic_write(unsigned long reg, unsigned long v)
+{
+	paravirt_ops.apic_write(reg,v);
+}
+
+static inline void apic_write_atomic(unsigned long reg, unsigned long v)
+{
+	paravirt_ops.apic_write_atomic(reg,v);
+}
+
+static inline unsigned long apic_read(unsigned long reg)
+{
+	return paravirt_ops.apic_read(reg);
+}
+#endif
+
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch {
 	u8 *instr; 		/* original instructions */
-- 
cgit v0.10.2


From da181a8b3916aa7f2e3c5775d2bd2fe3454cf82d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Add MMU virtualization to paravirt_ops

Add the three bare TLB accessor functions to paravirt-ops.  Most amusingly,
flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb.
Instead, I chose to indicate the actual flush type, kernel (global) vs. user
(non-global).  Global in this sense means using the global bit in the page
table entry, which makes TLB entries persistent across CR3 reloads, not
global as in the SMP sense of invoking remote shootdowns, so the term is
confusingly overloaded.

AK: folded in fix from Zach for PAE compilation

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index fe82eb3..3dceab5 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -31,6 +31,7 @@
 #include <asm/delay.h>
 #include <asm/fixmap.h>
 #include <asm/apic.h>
+#include <asm/tlbflush.h>
 
 /* nop stub */
 static void native_nop(void)
@@ -379,6 +380,97 @@ static fastcall void native_io_delay(void)
 	asm volatile("outb %al,$0x80");
 }
 
+static fastcall void native_flush_tlb(void)
+{
+	__native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static fastcall void native_flush_tlb_global(void)
+{
+	__native_flush_tlb_global();
+}
+
+static fastcall void native_flush_tlb_single(u32 addr)
+{
+	__native_flush_tlb_single(addr);
+}
+
+#ifndef CONFIG_X86_PAE
+static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	*pmdp = pmdval;
+}
+
+#else /* CONFIG_X86_PAE */
+
+static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	set_64bit((unsigned long long *)ptep,pte_val(pteval));
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
+}
+
+static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
+{
+	*pudp = pudval;
+}
+
+static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = 0;
+}
+
+static fastcall void native_pmd_clear(pmd_t *pmd)
+{
+	u32 *tmp = (u32 *)pmd;
+	*tmp = 0;
+	smp_wmb();
+	*(tmp + 1) = 0;
+}
+#endif /* CONFIG_X86_PAE */
+
 /* These are in entry.S */
 extern fastcall void native_iret(void);
 extern fastcall void native_irq_enable_sysexit(void);
@@ -454,6 +546,23 @@ struct paravirt_ops paravirt_ops = {
 	.apic_read = native_apic_read,
 #endif
 
+	.flush_tlb_user = native_flush_tlb,
+	.flush_tlb_kernel = native_flush_tlb_global,
+	.flush_tlb_single = native_flush_tlb_single,
+
+	.set_pte = native_set_pte,
+	.set_pte_at = native_set_pte_at,
+	.set_pmd = native_set_pmd,
+	.pte_update = (void *)native_nop,
+	.pte_update_defer = (void *)native_nop,
+#ifdef CONFIG_X86_PAE
+	.set_pte_atomic = native_set_pte_atomic,
+	.set_pte_present = native_set_pte_present,
+	.set_pud = native_set_pud,
+	.pte_clear = native_pte_clear,
+	.pmd_clear = native_pmd_clear,
+#endif
+
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c
index 4de11f5..4de95a1 100644
--- a/arch/i386/mm/boot_ioremap.c
+++ b/arch/i386/mm/boot_ioremap.c
@@ -16,6 +16,7 @@
  */
 
 #undef CONFIG_X86_PAE
+#undef CONFIG_PARAVIRT
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index e2c803f..9f06265 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -4,6 +4,7 @@
  * para-virtualization: those hooks are defined here. */
 #include <linux/linkage.h>
 #include <linux/stringify.h>
+#include <asm/page.h>
 
 #ifdef CONFIG_PARAVIRT
 /* These are the most performance critical ops, so we want to be able to patch
@@ -27,6 +28,7 @@
 struct thread_struct;
 struct Xgt_desc_struct;
 struct tss_struct;
+struct mm_struct;
 struct paravirt_ops
 {
 	unsigned int kernel_rpl;
@@ -121,6 +123,23 @@ struct paravirt_ops
 	unsigned long (fastcall *apic_read)(unsigned long reg);
 #endif
 
+	void (fastcall *flush_tlb_user)(void);
+	void (fastcall *flush_tlb_kernel)(void);
+	void (fastcall *flush_tlb_single)(u32 addr);
+
+	void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+	void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+	void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+#ifdef CONFIG_X86_PAE
+	void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
+	void (fastcall *set_pud)(pud_t *pudp, pud_t pudval);
+	void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+	void (fastcall *pmd_clear)(pmd_t *pmdp);
+#endif
+
 	/* These two are jmp to, not actually called. */
 	void (fastcall *irq_enable_sysexit)(void);
 	void (fastcall *iret)(void);
@@ -297,6 +316,62 @@ static inline unsigned long apic_read(unsigned long reg)
 #endif
 
 
+#define __flush_tlb() paravirt_ops.flush_tlb_user()
+#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
+#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
+
+static inline void set_pte(pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte(ptep, pteval);
+}
+
+static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte_at(mm, addr, ptep, pteval);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	paravirt_ops.set_pmd(pmdp, pmdval);
+}
+
+static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	paravirt_ops.pte_update(mm, addr, ptep);
+}
+
+static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	paravirt_ops.pte_update_defer(mm, addr, ptep);
+}
+
+#ifdef CONFIG_X86_PAE
+static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte_atomic(ptep, pteval);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	paravirt_ops.set_pte_present(mm, addr, ptep, pte);
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pudval)
+{
+	paravirt_ops.set_pud(pudp, pudval);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	paravirt_ops.pte_clear(mm, addr, ptep);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	paravirt_ops.pmd_clear(pmdp);
+}
+#endif
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch {
 	u8 *instr; 		/* original instructions */
diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index 8d8d3b9..04d6186 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -13,11 +13,14 @@
  * within a page table are directly modified.  Thus, the following
  * hook is made available.
  */
+#ifndef CONFIG_PARAVIRT
 #define set_pte(pteptr, pteval) (*(pteptr) = pteval)
 #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
+#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
+#endif
+
 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
 #define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
-#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
 
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index c2d701e..2a6e67d 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -44,6 +44,7 @@ static inline int pte_exec_kernel(pte_t pte)
 	return pte_x(pte);
 }
 
+#ifndef CONFIG_PARAVIRT
 /* Rules for using set_pte: the pte being assigned *must* be
  * either not present or in a state where the hardware will
  * not attempt to update the pte.  In places where this is
@@ -81,25 +82,6 @@ static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte
 		(*(pudptr) = (pudval))
 
 /*
- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
- * the TLB via cr3 if the top-level pgd is changed...
- * We do not let the generic code free and clear pgd entries due to
- * this erratum.
- */
-static inline void pud_clear (pud_t * pud) { }
-
-#define pud_page(pud) \
-((struct page *) __va(pud_val(pud) & PAGE_MASK))
-
-#define pud_page_vaddr(pud) \
-((unsigned long) __va(pud_val(pud) & PAGE_MASK))
-
-
-/* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
-			pmd_index(address))
-
-/*
  * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
  * entry, so clear the bottom half first and enforce ordering with a compiler
  * barrier.
@@ -118,6 +100,26 @@ static inline void pmd_clear(pmd_t *pmd)
 	smp_wmb();
 	*(tmp + 1) = 0;
 }
+#endif
+
+/*
+ * Pentium-II erratum A13: in PAE mode we explicitly have to flush
+ * the TLB via cr3 if the top-level pgd is changed...
+ * We do not let the generic code free and clear pgd entries due to
+ * this erratum.
+ */
+static inline void pud_clear (pud_t * pud) { }
+
+#define pud_page(pud) \
+((struct page *) __va(pud_val(pud) & PAGE_MASK))
+
+#define pud_page_vaddr(pud) \
+((unsigned long) __va(pud_val(pud) & PAGE_MASK))
+
+
+/* Find an entry in the second-level page table.. */
+#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
+			pmd_index(address))
 
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 7d398f4..efd7d90 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -15,6 +15,7 @@
 #include <asm/processor.h>
 #include <asm/fixmap.h>
 #include <linux/threads.h>
+#include <asm/paravirt.h>
 
 #ifndef _I386_BITOPS_H
 #include <asm/bitops.h>
@@ -246,6 +247,7 @@ static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return p
 # include <asm/pgtable-2level.h>
 #endif
 
+#ifndef CONFIG_PARAVIRT
 /*
  * Rules for using pte_update - it must be called after any PTE update which
  * has not been done using the set_pte / clear_pte interfaces.  It is used by
@@ -261,7 +263,7 @@ static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return p
  */
 #define pte_update(mm, addr, ptep)		do { } while (0)
 #define pte_update_defer(mm, addr, ptep)	do { } while (0)
-
+#endif
 
 /*
  * We only update the dirty/accessed state if we set
diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h
index 360648b..4dd8284 100644
--- a/include/asm-i386/tlbflush.h
+++ b/include/asm-i386/tlbflush.h
@@ -4,7 +4,15 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 
-#define __flush_tlb()							\
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+#define __native_flush_tlb()						\
 	do {								\
 		unsigned int tmpreg;					\
 									\
@@ -19,7 +27,7 @@
  * Global pages have to be flushed a bit differently. Not a real
  * performance problem because this does not happen often.
  */
-#define __flush_tlb_global()						\
+#define __native_flush_tlb_global()					\
 	do {								\
 		unsigned int tmpreg, cr4, cr4_orig;			\
 									\
@@ -36,6 +44,9 @@
 			: "memory");					\
 	} while (0)
 
+#define __native_flush_tlb_single(addr) 				\
+	__asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
+
 # define __flush_tlb_all()						\
 	do {								\
 		if (cpu_has_pge)					\
@@ -46,9 +57,6 @@
 
 #define cpu_has_invlpg	(boot_cpu_data.x86 > 3)
 
-#define __flush_tlb_single(addr) \
-	__asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
-
 #ifdef CONFIG_X86_INVLPG
 # define __flush_tlb_one(addr) __flush_tlb_single(addr)
 #else
-- 
cgit v0.10.2


From bd472c794bbf6771c3fc1c58f188bc16c393d2fe Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Be careful about touching BIOS address space

BIOS ROM areas may not be mapped into the guest address space, so be careful
when touching those addresses to make sure they appear to be mapped.

[akpm@osdl.org: fix unused var warning]
AK: Changed __get_user to probe_kernel_address

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index b755255..b704790f 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/efi.h>
 #include <linux/pfn.h>
+#include <linux/uaccess.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
@@ -155,7 +156,14 @@ static struct resource standard_io_resources[] = { {
 	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
 } };
 
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
+static int romsignature(const unsigned char *x)
+{
+	unsigned short sig;
+	int ret = 0;
+	if (probe_kernel_address((const unsigned short *)x, sig) == 0)
+		ret = (sig == 0xaa55);
+	return ret;
+}
 
 static int __init romchecksum(unsigned char *rom, unsigned long length)
 {
diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c
index ed1512a..5f51934 100644
--- a/arch/i386/pci/pcbios.c
+++ b/arch/i386/pci/pcbios.c
@@ -5,6 +5,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include "pci.h"
 #include "pci-functions.h"
 
@@ -314,6 +315,10 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
 	for (check = (union bios32 *) __va(0xe0000);
 	     check <= (union bios32 *) __va(0xffff0);
 	     ++check) {
+		long sig;
+		if (probe_kernel_address(&check->fields.signature, sig))
+			continue;
+
 		if (check->fields.signature != BIOS32_SIGNATURE)
 			continue;
 		length = check->fields.length * 16;
@@ -331,11 +336,13 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
 		}
 		DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
 		if (check->fields.entry >= 0x100000) {
-			printk("PCI: BIOS32 entry (0x%p) in high memory, cannot use.\n", check);
+			printk("PCI: BIOS32 entry (0x%p) in high memory, "
+					"cannot use.\n", check);
 			return NULL;
 		} else {
 			unsigned long bios32_entry = check->fields.entry;
-			DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry);
+			DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
+					bios32_entry);
 			bios32_indirect.address = bios32_entry + PAGE_OFFSET;
 			if (check_pcibios())
 				return &pci_bios_access;
-- 
cgit v0.10.2


From a2952d8949bb0b37c1be92a89c4f180c74292857 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Preparatory mmu header movement

Move header includes for the nopud / nopmd types to the location of the actual
pte / pgd type definitions.  This allows generic 4-level page type code to be
written before the split 2/3 level page table headers are included.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@muc.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 2b69686..fd3f64a 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -52,6 +52,7 @@ typedef struct { unsigned long long pgprot; } pgprot_t;
 #define pte_val(x)	((x).pte_low | ((unsigned long long)(x).pte_high << 32))
 #define __pmd(x) ((pmd_t) { (x) } )
 #define HPAGE_SHIFT	21
+#include <asm-generic/pgtable-nopud.h>
 #else
 typedef struct { unsigned long pte_low; } pte_t;
 typedef struct { unsigned long pgd; } pgd_t;
@@ -59,6 +60,7 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 #define boot_pte_t pte_t /* or would you rather have a typedef */
 #define pte_val(x)	((x).pte_low)
 #define HPAGE_SHIFT	22
+#include <asm-generic/pgtable-nopmd.h>
 #endif
 #define PTE_MASK	PAGE_MASK
 
diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index 04d6186..c08cbc4 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -1,8 +1,6 @@
 #ifndef _I386_PGTABLE_2LEVEL_H
 #define _I386_PGTABLE_2LEVEL_H
 
-#include <asm-generic/pgtable-nopmd.h>
-
 #define pte_ERROR(e) \
 	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
 #define pgd_ERROR(e) \
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index 2a6e67d..54287f1 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -1,8 +1,6 @@
 #ifndef _I386_PGTABLE_3LEVEL_H
 #define _I386_PGTABLE_3LEVEL_H
 
-#include <asm-generic/pgtable-nopud.h>
-
 /*
  * Intel Physical Address Extension (PAE) Mode - three-level page
  * tables on PPro+ CPUs.
-- 
cgit v0.10.2


From dfbea0ad50e08c52539bddce977b07f77a762ba4 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: fix parameter names in mmu operations

Make parameter names match function argument names for the yet to be defined
pte_update_defer accessor.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@muc.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index efd7d90..04dd39b 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -277,7 +277,7 @@ static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return p
 do {									\
 	if (dirty) {							\
 		(ptep)->pte_low = (entry).pte_low;			\
-		pte_update_defer((vma)->vm_mm, (addr), (ptep));		\
+		pte_update_defer((vma)->vm_mm, (address), (ptep));	\
 		flush_tlb_page(vma, address);				\
 	}								\
 } while (0)
@@ -307,7 +307,7 @@ do {									\
 	__dirty = pte_dirty(*(ptep));					\
 	if (__dirty) {							\
 		clear_bit(_PAGE_BIT_DIRTY, &(ptep)->pte_low);		\
-		pte_update_defer((vma)->vm_mm, (addr), (ptep));		\
+		pte_update_defer((vma)->vm_mm, (address), (ptep));	\
 		flush_tlb_page(vma, address);				\
 	}								\
 	__dirty;							\
@@ -320,7 +320,7 @@ do {									\
 	__young = pte_young(*(ptep));					\
 	if (__young) {							\
 		clear_bit(_PAGE_BIT_ACCESSED, &(ptep)->pte_low);	\
-		pte_update_defer((vma)->vm_mm, (addr), (ptep));		\
+		pte_update_defer((vma)->vm_mm, (address), (ptep));	\
 		flush_tlb_page(vma, address);				\
 	}								\
 	__young;							\
-- 
cgit v0.10.2


From 8ecb8950695e907ed25acffec9e98c6806e311c8 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] paravirt: fix missing pte update

The function ptep_get_and_clear uses an atomic instruction sequence to get and
clear an active pte.  Rather than add such an atomic operator to all virtual
machine implementations in paravirt-ops, it is easier to support the raw
atomic sequence and use either a trapping writable pagetable approach, or a
post-update notification.  For the post update notification, we require the
pte_update function to be called after the access.  Combine the 2-level and
3-level paging operators into one common function which does the post-update
notification, and rename the actual atomic sequences to raw_ptep_xxx
operators.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@muc.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index c08cbc4..38c3fcc 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -23,8 +23,7 @@
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define ptep_get_and_clear(mm,addr,xp)	__pte(xchg(&(xp)->pte_low, 0))
+#define raw_ptep_get_and_clear(xp)	__pte(xchg(&(xp)->pte_low, 0))
 
 #define pte_page(x)		pfn_to_page(pte_pfn(x))
 #define pte_none(x)		(!(x).pte_low)
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index 54287f1..7a2318f 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -119,8 +119,7 @@ static inline void pud_clear (pud_t * pud) { }
 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
 			pmd_index(address))
 
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline pte_t raw_ptep_get_and_clear(pte_t *ptep)
 {
 	pte_t res;
 
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 04dd39b..b4a301f 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -326,6 +326,14 @@ do {									\
 	__young;							\
 })
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = raw_ptep_get_and_clear(ptep);
+	pte_update(mm, addr, ptep);
+	return pte;
+}
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
 {
-- 
cgit v0.10.2


From 8542b200cbe5609edd7aae0c304c091a1c290452 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] paravirt: Add option to allow skipping the timer check

Add a way to disable the timer IRQ routing check via a boot option.  The
VMI timer code uses this to avoid triggering the pester Mingo code, which
probes for some very unusual and broken motherboard routings.  It fires
100% of the time when using a paravirtual delay mechanism instead of using
a realtime delay, since there is no elapsed real time, and the 4 timer IRQs
have not yet been delivered.

In addition, it is entirely possible, though improbable, that this bug
could surface on real hardware which picks a particularly bad time to enter
SMM mode, causing a long latency during one of the timer IRQs.

While here, make check_timer be __init.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
[chrisw: use no_timer_check to bring inline with x86_64 as per Andi's request]
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2e1898e..4e90aa4 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -599,8 +599,6 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	hugepages=	[HW,IA-32,IA-64] Maximal number of HugeTLB pages.
 
-	noirqbalance	[IA-32,SMP,KNL] Disable kernel irq balancing
-
 	i8042.direct	[HW] Put keyboard port into non-translated mode
 	i8042.dumbkbd	[HW] Pretend that controller can only read data from
 			     keyboard and cannot control its state
@@ -1052,9 +1050,14 @@ and is between 256 and 4096 characters. It is defined in the file
 			in certain environments such as networked servers or
 			real-time systems.
 
+	noirqbalance	[IA-32,SMP,KNL] Disable kernel irq balancing
+
 	noirqdebug	[IA-32] Disables the code which attempts to detect and
 			disable unhandled interrupt sources.
 
+	no_timer_check	[IA-32,X86_64,APIC] Disables the code which tests for
+			broken timer IRQ sources.
+
 	noisapnp	[ISAPNP] Disables ISA PnP code.
 
 	noinitrd	[RAM] Tells the kernel not to load any configured
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index e33b7a8..993150f 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1932,6 +1932,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
 static void __init setup_ioapic_ids_from_mpc(void) { }
 #endif
 
+static int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+	no_timer_check = 1;
+	return 1;
+}
+__setup("no_timer_check", notimercheck);
+
 /*
  * There is a nasty bug in some older SMP boards, their mptable lies
  * about the timer IRQ. We do the following to work around the situation:
@@ -1940,10 +1949,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
  *	- if this function detects that timer IRQs are defunct, then we fall
  *	  back to ISA timer IRQs
  */
-static int __init timer_irq_works(void)
+int __init timer_irq_works(void)
 {
 	unsigned long t1 = jiffies;
 
+	if (no_timer_check)
+		return 1;
+
 	local_irq_enable();
 	/* Let ten ticks pass... */
 	mdelay((10 * 1000) / HZ);
@@ -2214,7 +2226,7 @@ int timer_uses_ioapic_pin_0;
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
  */
-static inline void check_timer(void)
+static inline void __init check_timer(void)
 {
 	int apic1, pin1, apic2, pin2;
 	int vector;
-- 
cgit v0.10.2


From 713819989aa4dd141a37074dbc369e7c620bc619 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] x86-64: Add option to compile for Core2

Add an option to compile for Intel's Core 2

The Kconfig help is a mouthful due to the inventiveness of Intel's
product naming department.

Mainly for the 64bit cache line sizes because gcc doesn't support
optimizing for core2 yet. However it will and then the kernel
should be ready by passing the right option

Also fix the old MPSC help text to confirm better to reality.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 5cb509d..6eece27 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -122,7 +122,7 @@ endchoice
 
 choice
 	prompt "Processor family"
-	default MK8
+	default GENERIC_CPU
 
 config MK8
 	bool "AMD-Opteron/Athlon64"
@@ -130,16 +130,31 @@ config MK8
 	  Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs.
 
 config MPSC
-       bool "Intel EM64T"
+       bool "Intel P4 / older Netburst based Xeon"
        help
-	  Optimize for Intel Pentium 4 and Xeon CPUs with Intel
-	  Extended Memory 64 Technology(EM64T). For details see
+	  Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs
+	  with Intel Extended Memory 64 Technology(EM64T). For details see
 	  <http://www.intel.com/technology/64bitextensions/>.
+	  Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the
+          Netburst core and shouldn't use this option. You can distingush them
+	  using the cpu family field
+	  in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one
+	  (this rule only applies to system that support EM64T)
+
+config MCORE2
+	bool "Intel Core2 / newer Xeon"
+	help
+	  Optimize for Intel Core2 and newer Xeons (51xx)
+	  You can distingush the newer Xeons from the older ones using
+	  the cpu family field in /proc/cpuinfo. 15 is a older Xeon
+	  (use CONFIG_MPSC then), 6 is a newer one. This rule only
+	  applies to CPUs that support EM64T.
 
 config GENERIC_CPU
 	bool "Generic-x86-64"
 	help
 	  Generic x86-64 CPU.
+	  Run equally well on all x86-64 CPUs.
 
 endchoice
 
@@ -149,12 +164,12 @@ endchoice
 config X86_L1_CACHE_BYTES
 	int
 	default "128" if GENERIC_CPU || MPSC
-	default "64" if MK8
+	default "64" if MK8 || MCORE2
 
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if GENERIC_CPU || MPSC
-	default "6" if MK8
+	default "6" if MK8 || MCORE2
 
 config X86_INTERNODE_CACHE_BYTES
 	int
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 6e38d4d..b471b855 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -30,6 +30,10 @@ cflags-y	:=
 cflags-kernel-y	:=
 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
+# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it
+# will eventually. Use -mtune=generic as fallback
+cflags-$(CONFIG_MCORE2) += \
+	$(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
 cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
 
 cflags-y += -m64
-- 
cgit v0.10.2


From c55d92d141b9c40c67db249de91f5c224eb49859 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] i386: Add support for compilation for Core2

gcc doesn't support -mtune=core2 yet, but will be soon. Use -mtune=generic or -mtune=i686
as fallback

TBD need benchmarking for INTEL_USERCOPY etc. So far I used the same defaults as MPENTIUMM

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index fc4f2ab..821fd26 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -103,8 +103,15 @@ config MPENTIUMM
 	  Select this for Intel Pentium M (not Pentium-4 M)
 	  notebook chips.
 
+config MCORE2
+	bool "Core 2/newer Xeon"
+	help
+	  Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx)
+	  CPUs. You can distingush newer from older Xeons by the CPU family
+	  in /proc/cpuinfo. Newer ones have 6.
+
 config MPENTIUM4
-	bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon"
+	bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
 	help
 	  Select this for Intel Pentium 4 chips.  This includes the
 	  Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M
@@ -229,7 +236,7 @@ config X86_L1_CACHE_SHIFT
 	default "7" if MPENTIUM4 || X86_GENERIC
 	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
-	default "6" if MK7 || MK8 || MPENTIUMM
+	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2
 
 config RWSEM_GENERIC_SPINLOCK
 	bool
@@ -287,17 +294,17 @@ config X86_ALIGNMENT_16
 
 config X86_GOOD_APIC
 	bool
-	depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON
+	depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2
 	default y
 
 config X86_INTEL_USERCOPY
 	bool
-	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON
+	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
 	default y
 
 config X86_USE_PPRO_CHECKSUM
 	bool
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX
+	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
 	default y
 
 config X86_USE_3DNOW
@@ -312,5 +319,5 @@ config X86_OOSTORE
 
 config X86_TSC
 	bool
-	depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ
+	depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ
 	default y
diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu
index a11befb..a32c031 100644
--- a/arch/i386/Makefile.cpu
+++ b/arch/i386/Makefile.cpu
@@ -32,6 +32,7 @@ cflags-$(CONFIG_MWINCHIP2)	+= $(call cc-option,-march=winchip2,-march=i586)
 cflags-$(CONFIG_MWINCHIP3D)	+= $(call cc-option,-march=winchip2,-march=i586)
 cflags-$(CONFIG_MCYRIXIII)	+= $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MVIAC3_2)	+= $(call cc-option,-march=c3-2,-march=i686)
+cflags-$(CONFIG_MCORE2)		+= -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686))
 
 # AMD Elan support
 cflags-$(CONFIG_X86_ELAN)	+= -march=i486
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index 424661d..fe5ae42 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -20,6 +20,8 @@ struct mod_arch_specific
 #define MODULE_PROC_FAMILY "586TSC "
 #elif defined CONFIG_M586MMX
 #define MODULE_PROC_FAMILY "586MMX "
+#elif defined CONFIG_MCORE2
+#define MODULE_PROC_FAMILY "CORE2 "
 #elif defined CONFIG_M686
 #define MODULE_PROC_FAMILY "686 "
 #elif defined CONFIG_MPENTIUMII
-- 
cgit v0.10.2


From 103efcd9aac1de4da6a1477f2f3b9fcf35273a18 Mon Sep 17 00:00:00 2001
From: Ernie Petrides <petrides@redhat.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] x86-64: fix perms/range of vsyscall vma in /proc/*/maps

The final line of /proc/<pid>/maps on x86_64 for native 64-bit
tasks shows an incorrect ending address and incorrect permissions.  There
is only a single page mapped in this vsyscall region, and it is accessible
for both read and execute.

The patch below fixes this.  (Since 32-bit-compat tasks have a real vma
with correct perms/range, no change is necessary for that scenario.)

Before the patch, a "cat /proc/self/maps | tail -1" shows this:

        ffffffffff600000-ffffffffffe00000 ---p 00000000 [...]

After the patch, this is the output:

        ffffffffff600000-ffffffffff601000 r-xp 00000000 [...]

Signed-off-by: Ernie Petrides <petrides@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 92546c1..c3de9a0 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -290,6 +290,7 @@ static void __init map_vsyscall(void)
 	extern char __vsyscall_0;
 	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
 
+	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
 }
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 4c0c00e..2968b90e 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -730,14 +730,15 @@ static __init int x8664_sysctl_init(void)
 __initcall(x8664_sysctl_init);
 #endif
 
-/* A pseudo VMAs to allow ptrace access for the vsyscall page.   This only
+/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
    covers the 64bit vsyscall page now. 32bit has a real VMA now and does
    not need special handling anymore. */
 
 static struct vm_area_struct gate_vma = {
 	.vm_start = VSYSCALL_START,
-	.vm_end = VSYSCALL_END,
-	.vm_page_prot = PAGE_READONLY
+	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
+	.vm_page_prot = PAGE_READONLY_EXEC,
+	.vm_flags = VM_READ | VM_EXEC
 };
 
 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index 01d1c17..05cb8dd 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -10,6 +10,7 @@ enum vsyscall_num {
 #define VSYSCALL_START (-10UL << 20)
 #define VSYSCALL_SIZE 1024
 #define VSYSCALL_END (-2UL << 20)
+#define VSYSCALL_MAPPED_PAGES 1
 #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
 
 #ifdef __KERNEL__
-- 
cgit v0.10.2


From 9a457324229db34d3bcb0b67360130c287289401 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] x86-64: Rate limit no irq handler messages

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index e46c558..0c06af6 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -120,7 +120,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 
 	if (likely(irq < NR_IRQS))
 		generic_handle_irq(irq);
-	else
+	else if (printk_ratelimit())
 		printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
 			__func__, smp_processor_id(), vector);
 
-- 
cgit v0.10.2


From b0bfece40b1988aa8e3d910938691dce7859d82d Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] i386: clear_fixmap() should not use set_pte()

While not strictly required with the current code (as the upper half of
page table entries generated by __set_fixmap() cannot be non-zero due
to the second parameter of this function being 'unsigned long'), the
use of set_pte() in __set_fixmap() in the context of clear_fixmap() is
still improper with CONFIG_X86_PAE (see the respective comment in
include/asm-i386/pgtable-3level.h) and would turn into a bug if that
second parameter ever gets changed to a 64-bit type.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 10126e3..65b5c09 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -95,8 +95,11 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
 		return;
 	}
 	pte = pte_offset_kernel(pmd, vaddr);
-	/* <pfn,flags> stored as-is, to permit clearing entries */
-	set_pte(pte, pfn_pte(pfn, flags));
+	if (pgprot_val(flags))
+		/* <pfn,flags> stored as-is, to permit clearing entries */
+		set_pte(pte, pfn_pte(pfn, flags));
+	else
+		pte_clear(&init_mm, vaddr, pte);
 
 	/*
 	 * It's enough to flush this one mapping.
-- 
cgit v0.10.2


From c6ea396de6836bdeb2d2433368130642bf0f6e15 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] i386: Don't touch per cpu memory of offline CPUs in
 touch_nmi_watchdog

Just like on x86-64, don't touch foreign CPUs' memory if the watchdog
isn't enabled at all.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 171194c..f5bc7e1 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -870,14 +870,16 @@ static unsigned int
 
 void touch_nmi_watchdog (void)
 {
-	int i;
+	if (nmi_watchdog > 0) {
+		unsigned cpu;
 
-	/*
-	 * Just reset the alert counters, (other CPUs might be
-	 * spinning on locks we hold):
-	 */
-	for_each_possible_cpu(i)
-		alert_counter[i] = 0;
+		/*
+		 * Just reset the alert counters, (other CPUs might be
+		 * spinning on locks we hold):
+		 */
+		for_each_present_cpu (cpu)
+			alert_counter[cpu] = 0;
+	}
 
 	/*
 	 * Tickle the softlockup detector too:
-- 
cgit v0.10.2


From 4a1c42275078f48b90428cdb062f5220d79ec9da Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] x86-64: remove prototype of free_bootmem_generic()

The function doesn't exist (anymore).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index e72cfcd..21fa5af 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -61,7 +61,6 @@ extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
 extern unsigned long numa_free_all_bootmem(void);
 
 extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
-extern void free_bootmem_generic(unsigned long phys, unsigned len);
 
 extern void load_gs_index(unsigned gs);
 
-- 
cgit v0.10.2


From 475850c86b908ae026d5a4be02a1b1e9c408c75a Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] i386: conditionalize inclusion of some MTRR flavors

Avoid inclusion of code that's dead for x86-64.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
index a25b701..191fc05 100644
--- a/arch/i386/kernel/cpu/mtrr/Makefile
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -1,5 +1,3 @@
 obj-y		:= main.o if.o generic.o state.o
-obj-y		+= amd.o
-obj-y		+= cyrix.o
-obj-y		+= centaur.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
 
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 2b8b0b3..a4de30b 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL;
 static void set_mtrr(unsigned int reg, unsigned long base,
 		     unsigned long size, mtrr_type type);
 
+#ifndef CONFIG_X86_64
 extern int arr3_protected;
+#else
+#define arr3_protected 0
+#endif
 
 void set_mtrr_ops(struct mtrr_ops * ops)
 {
@@ -544,9 +548,11 @@ extern void centaur_init_mtrr(void);
 
 static void __init init_ifs(void)
 {
+#ifndef CONFIG_X86_64
 	amd_init_mtrr();
 	cyrix_init_mtrr();
 	centaur_init_mtrr();
+#endif
 }
 
 /* The suspend/resume methods are only for CPU without MTRR. CPU using generic
-- 
cgit v0.10.2


From eab724e5df17af0ed0dac03da8f75aa336c31206 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] x86-64: adjust pmd_bad()

Make pmd_bad() symmetrical to pgd_bad() and pud_bad(). At once,
simplify them all.

TBD: tighten down the checks again as suggested by Hugh D.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 0555c1c..59901c6 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -221,20 +221,19 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define __S110	PAGE_SHARED_EXEC
 #define __S111	PAGE_SHARED_EXEC
 
-static inline unsigned long pgd_bad(pgd_t pgd) 
-{ 
-       unsigned long val = pgd_val(pgd);
-       val &= ~PTE_MASK; 
-       val &= ~(_PAGE_USER | _PAGE_DIRTY); 
-       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);      
-} 
+static inline unsigned long pgd_bad(pgd_t pgd)
+{
+	return pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}
 
 static inline unsigned long pud_bad(pud_t pud)
 {
-       unsigned long val = pud_val(pud);
-       val &= ~PTE_MASK;
-       val &= ~(_PAGE_USER | _PAGE_DIRTY);
-       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
+	return pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}
+
+static inline unsigned long pmd_bad(pmd_t pmd)
+{
+	return pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
 }
 
 #define pte_none(x)	(!pte_val(x))
@@ -347,7 +346,6 @@ static inline int pmd_large(pmd_t pte) {
 #define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define	pmd_bad(x)	((pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE )
 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
-- 
cgit v0.10.2


From 365bff806e9faba000fb4956c7486fbf3a746d96 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] i386: fix MTRR code

Until not so long ago, there were system log messages pointing to
inconsistent MTRR setup of the video frame buffer caused by the way vesafb
and X worked. While vesafb was fixed meanwhile, I believe fixing it there
only hides a shortcoming in the MTRR code itself, in that that code is not
symmetric with respect to the ordering of attempts to set up two (or more)
regions where one contains the other. In the current shape, it permits
only setting up sub-regions of pre-exisiting ones. The patch below makes
this symmetric.

While working on that I noticed a few more inconsistencies in that code,
namely
- use of 'unsigned int' for sizes in many, but not all places (the patch
  is converting this to use 'unsigned long' everywhere, which specifically
  might be necessary for x86-64 once a processor supporting more than 44
  physical address bits would become available)
- the code to correct inconsistent settings during secondary processor
  startup tried (if necessary) to correct, among other things, the value
  in IA32_MTRR_DEF_TYPE, however the newly computed value would never get
  used (i.e. stored in the respective MSR)
- the generic range validation code checked that the end of the
  to-be-added range would be above 1MB; the value checked should have been
  the start of the range
- when contained regions are detected, previously this was allowed only
  when the old region was uncacheable; this can be symmetric (i.e. the new
  region can also be uncacheable) and even further as per Intel's
  documentation write-trough and write-back for either region is also
  compatible with the respective opposite in the other

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
index 1a1e04b..0949cdb 100644
--- a/arch/i386/kernel/cpu/mtrr/amd.c
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -7,7 +7,7 @@
 
 static void
 amd_get_mtrr(unsigned int reg, unsigned long *base,
-	     unsigned int *size, mtrr_type * type)
+	     unsigned long *size, mtrr_type * type)
 {
 	unsigned long low, high;
 
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
index 33f00ac..cb9aa3a 100644
--- a/arch/i386/kernel/cpu/mtrr/centaur.c
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -17,7 +17,7 @@ static u8 centaur_mcr_type;	/* 0 for winchip, 1 for winchip2 */
  */
 
 static int
-centaur_get_free_region(unsigned long base, unsigned long size)
+centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
 /*  [SUMMARY] Get a free MTRR.
     <base> The starting (base) address of the region.
     <size> The size (in bytes) of the region.
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size)
 {
 	int i, max;
 	mtrr_type ltype;
-	unsigned long lbase;
-	unsigned int lsize;
+	unsigned long lbase, lsize;
 
 	max = num_var_ranges;
+	if (replace_reg >= 0 && replace_reg < max)
+		return replace_reg;
 	for (i = 0; i < max; ++i) {
 		if (centaur_mcr_reserved & (1 << i))
 			continue;
@@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
 
 static void
 centaur_get_mcr(unsigned int reg, unsigned long *base,
-		unsigned int *size, mtrr_type * type)
+		unsigned long *size, mtrr_type * type)
 {
 	*base = centaur_mcr[reg].high >> PAGE_SHIFT;
 	*size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 9027a98..0737a59 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -9,7 +9,7 @@ int arr3_protected;
 
 static void
 cyrix_get_arr(unsigned int reg, unsigned long *base,
-	      unsigned int *size, mtrr_type * type)
+	      unsigned long *size, mtrr_type * type)
 {
 	unsigned long flags;
 	unsigned char arr, ccr3, rcr, shift;
@@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
 }
 
 static int
-cyrix_get_free_region(unsigned long base, unsigned long size)
+cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
 /*  [SUMMARY] Get a free ARR.
     <base> The starting (base) address of the region.
     <size> The size (in bytes) of the region.
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size)
 {
 	int i;
 	mtrr_type ltype;
-	unsigned long lbase;
-	unsigned int  lsize;
+	unsigned long lbase, lsize;
 
+	switch (replace_reg) {
+	case 7:
+		if (size < 0x40)
+			break;
+	case 6:
+	case 5:
+	case 4:
+		return replace_reg;
+	case 3:
+		if (arr3_protected)
+			break;
+	case 2:
+	case 1:
+	case 0:
+		return replace_reg;
+	}
 	/* If we are to set up a region >32M then look at ARR7 immediately */
 	if (size > 0x2000) {
 		cyrix_get_arr(7, &lbase, &lsize, &ltype);
@@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
 
 typedef struct {
 	unsigned long base;
-	unsigned int size;
+	unsigned long size;
 	mtrr_type type;
 } arr_state_t;
 
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index ee8dc67..f77fc53 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -3,6 +3,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <asm/io.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
@@ -15,12 +16,19 @@ struct mtrr_state {
 	struct mtrr_var_range *var_ranges;
 	mtrr_type fixed_ranges[NUM_FIXED_RANGES];
 	unsigned char enabled;
+	unsigned char have_fixed;
 	mtrr_type def_type;
 };
 
 static unsigned long smp_changes_mask;
 static struct mtrr_state mtrr_state = {};
 
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "mtrr."
+
+static __initdata int mtrr_show;
+module_param_named(show, mtrr_show, bool, 0);
+
 /*  Get the MSR pair relating to a var range  */
 static void __init
 get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs)
 		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
 }
 
+static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+{
+	unsigned i;
+
+	for (i = 0; i < 8; ++i, ++types, base += step)
+		printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
+}
+
 /*  Grab all of the MTRR state for this CPU into *state  */
 void __init get_mtrr_state(void)
 {
@@ -58,13 +74,49 @@ void __init get_mtrr_state(void)
 	} 
 	vrs = mtrr_state.var_ranges;
 
+	rdmsr(MTRRcap_MSR, lo, dummy);
+	mtrr_state.have_fixed = (lo >> 8) & 1;
+
 	for (i = 0; i < num_var_ranges; i++)
 		get_mtrr_var_range(i, &vrs[i]);
-	get_fixed_ranges(mtrr_state.fixed_ranges);
+	if (mtrr_state.have_fixed)
+		get_fixed_ranges(mtrr_state.fixed_ranges);
 
 	rdmsr(MTRRdefType_MSR, lo, dummy);
 	mtrr_state.def_type = (lo & 0xff);
 	mtrr_state.enabled = (lo & 0xc00) >> 10;
+
+	if (mtrr_show) {
+		int high_width;
+
+		printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
+		if (mtrr_state.have_fixed) {
+			printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
+			       mtrr_state.enabled & 1 ? "en" : "dis");
+			print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+			for (i = 0; i < 2; ++i)
+				print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+			for (i = 0; i < 8; ++i)
+				print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+		}
+		printk(KERN_INFO "MTRR variable ranges %sabled:\n",
+		       mtrr_state.enabled & 2 ? "en" : "dis");
+		high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+		for (i = 0; i < num_var_ranges; ++i) {
+			if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+				printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+				       i,
+				       high_width,
+				       mtrr_state.var_ranges[i].base_hi,
+				       mtrr_state.var_ranges[i].base_lo >> 12,
+				       high_width,
+				       mtrr_state.var_ranges[i].mask_hi,
+				       mtrr_state.var_ranges[i].mask_lo >> 12,
+				       mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+			else
+				printk(KERN_INFO "MTRR %u disabled\n", i);
+		}
+	}
 }
 
 /*  Some BIOS's are fucked and don't set all MTRRs the same!  */
@@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
 			smp_processor_id(), msr, a, b);
 }
 
-int generic_get_free_region(unsigned long base, unsigned long size)
+int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
 /*  [SUMMARY] Get a free MTRR.
     <base> The starting (base) address of the region.
     <size> The size (in bytes) of the region.
@@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size)
 {
 	int i, max;
 	mtrr_type ltype;
-	unsigned long lbase;
-	unsigned lsize;
+	unsigned long lbase, lsize;
 
 	max = num_var_ranges;
+	if (replace_reg >= 0 && replace_reg < max)
+		return replace_reg;
 	for (i = 0; i < max; ++i) {
 		mtrr_if->get(i, &lbase, &lsize, &ltype);
 		if (lsize == 0)
@@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size)
 }
 
 static void generic_get_mtrr(unsigned int reg, unsigned long *base,
-			     unsigned int *size, mtrr_type * type)
+			     unsigned long *size, mtrr_type *type)
 {
 	unsigned int mask_lo, mask_hi, base_lo, base_hi;
 
@@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
 	return changed;
 }
 
-static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
+static u32 deftype_lo, deftype_hi;
+
+static unsigned long set_mtrr_state(void)
 /*  [SUMMARY] Set the MTRR state for this CPU.
     <state> The MTRR state information to read.
     <ctxt> Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
 		if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
 			change_mask |= MTRR_CHANGE_MASK_VARIABLE;
 
-	if (set_fixed_ranges(mtrr_state.fixed_ranges))
+	if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
 		change_mask |= MTRR_CHANGE_MASK_FIXED;
 
 	/*  Set_mtrr_restore restores the old value of MTRRdefType,
 	   so to set it we fiddle with the saved value  */
 	if ((deftype_lo & 0xff) != mtrr_state.def_type
 	    || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
-		deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10);
+		deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
 		change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
 	}
 
@@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
 
 
 static unsigned long cr4 = 0;
-static u32 deftype_lo, deftype_hi;
 static DEFINE_SPINLOCK(set_atomicity_lock);
 
 /*
@@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
 
 	/*  Disable MTRRs, and set the default type to uncached  */
-	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi);
+	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
 }
 
 static void post_set(void) __releases(set_atomicity_lock)
@@ -300,7 +354,7 @@ static void generic_set_all(void)
 	prepare_set();
 
 	/* Actually set the state */
-	mask = set_mtrr_state(deftype_lo,deftype_hi);
+	mask = set_mtrr_state();
 
 	post_set();
 	local_irq_restore(flags);
@@ -374,7 +428,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
 		}
 	}
 
-	if (base < 0x100) {
+	if (base + size < 0x100) {
 		printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
 		       base, size);
 		return -EINVAL;
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ac051b..9753bc6 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -17,7 +17,7 @@ extern unsigned int *usage_table;
 
 #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
 
-static char *mtrr_strings[MTRR_NUM_TYPES] =
+static const char *const mtrr_strings[MTRR_NUM_TYPES] =
 {
     "uncachable",               /* 0 */
     "write-combining",          /* 1 */
@@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
     "write-back",               /* 6 */
 };
 
-char *mtrr_attrib_to_str(int x)
+const char *mtrr_attrib_to_str(int x)
 {
 	return (x <= 6) ? mtrr_strings[x] : "?";
 }
@@ -155,6 +155,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 {
 	int err = 0;
 	mtrr_type type;
+	unsigned long size;
 	struct mtrr_sentry sentry;
 	struct mtrr_gentry gentry;
 	void __user *arg = (void __user *) __arg;
@@ -235,15 +236,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 	case MTRRIOC_GET_ENTRY:
 		if (gentry.regnum >= num_var_ranges)
 			return -EINVAL;
-		mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type);
+		mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
 
 		/* Hide entries that go above 4GB */
-		if (gentry.base + gentry.size > 0x100000
-		    || gentry.size == 0x100000)
+		if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+		    || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
 			gentry.base = gentry.size = gentry.type = 0;
 		else {
 			gentry.base <<= PAGE_SHIFT;
-			gentry.size <<= PAGE_SHIFT;
+			gentry.size = size << PAGE_SHIFT;
 			gentry.type = type;
 		}
 
@@ -273,8 +274,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 	case MTRRIOC_GET_PAGE_ENTRY:
 		if (gentry.regnum >= num_var_ranges)
 			return -EINVAL;
-		mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type);
-		gentry.type = type;
+		mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+		/* Hide entries that would overflow */
+		if (size != (__typeof__(gentry.size))size)
+			gentry.base = gentry.size = gentry.type = 0;
+		else {
+			gentry.size = size;
+			gentry.type = type;
+		}
 		break;
 	}
 
@@ -353,8 +360,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
 	char factor;
 	int i, max, len;
 	mtrr_type type;
-	unsigned long base;
-	unsigned int size;
+	unsigned long base, size;
 
 	len = 0;
 	max = num_var_ranges;
@@ -373,7 +379,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
 			}
 			/* RED-PEN: base can be > 32bit */ 
 			len += seq_printf(seq, 
-				   "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n",
+				   "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
 			     i, base, base >> (20 - PAGE_SHIFT), size, factor,
 			     mtrr_attrib_to_str(type), usage_table[i]);
 		}
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index a4de30b..aeea23e 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -172,6 +172,13 @@ static void ipi_handler(void *info)
 
 #endif
 
+static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
+	return type1 == MTRR_TYPE_UNCACHABLE ||
+	       type2 == MTRR_TYPE_UNCACHABLE ||
+	       (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
+	       (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
+}
+
 /**
  * set_mtrr - update mtrrs on all processors
  * @reg:	mtrr in question
@@ -304,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base,
 int mtrr_add_page(unsigned long base, unsigned long size, 
 		  unsigned int type, char increment)
 {
-	int i;
+	int i, replace, error;
 	mtrr_type ltype;
-	unsigned long lbase;
-	unsigned int lsize;
-	int error;
+	unsigned long lbase, lsize;
 
 	if (!mtrr_if)
 		return -ENXIO;
@@ -328,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 		return -ENOSYS;
 	}
 
+	if (!size) {
+		printk(KERN_WARNING "mtrr: zero sized request\n");
+		return -EINVAL;
+	}
+
 	if (base & size_or_mask || size & size_or_mask) {
 		printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
 		return -EINVAL;
 	}
 
 	error = -EINVAL;
+	replace = -1;
 
 	/* No CPU hotplug when we change MTRR entries */
 	lock_cpu_hotplug();
@@ -341,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 	mutex_lock(&mtrr_mutex);
 	for (i = 0; i < num_var_ranges; ++i) {
 		mtrr_if->get(i, &lbase, &lsize, &ltype);
-		if (base >= lbase + lsize)
-			continue;
-		if ((base < lbase) && (base + size <= lbase))
+		if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
 			continue;
 		/*  At this point we know there is some kind of overlap/enclosure  */
-		if ((base < lbase) || (base + size > lbase + lsize)) {
+		if (base < lbase || base + size - 1 > lbase + lsize - 1) {
+			if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
+				/*  New region encloses an existing region  */
+				if (type == ltype) {
+					replace = replace == -1 ? i : -2;
+					continue;
+				}
+				else if (types_compatible(type, ltype))
+					continue;
+			}
 			printk(KERN_WARNING
 			       "mtrr: 0x%lx000,0x%lx000 overlaps existing"
-			       " 0x%lx000,0x%x000\n", base, size, lbase,
+			       " 0x%lx000,0x%lx000\n", base, size, lbase,
 			       lsize);
 			goto out;
 		}
 		/*  New region is enclosed by an existing region  */
 		if (ltype != type) {
-			if (type == MTRR_TYPE_UNCACHABLE)
+			if (types_compatible(type, ltype))
 				continue;
 			printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
 			     base, size, mtrr_attrib_to_str(ltype),
@@ -368,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 		goto out;
 	}
 	/*  Search for an empty MTRR  */
-	i = mtrr_if->get_free_region(base, size);
+	i = mtrr_if->get_free_region(base, size, replace);
 	if (i >= 0) {
 		set_mtrr(i, base, size, type);
-		usage_table[i] = 1;
+		if (likely(replace < 0))
+			usage_table[i] = 1;
+		else {
+			usage_table[i] = usage_table[replace] + !!increment;
+			if (unlikely(replace != i)) {
+				set_mtrr(replace, 0, 0, 0);
+				usage_table[replace] = 0;
+			}
+		}
 	} else
 		printk(KERN_INFO "mtrr: no more MTRRs available\n");
 	error = i;
@@ -459,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 {
 	int i, max;
 	mtrr_type ltype;
-	unsigned long lbase;
-	unsigned int lsize;
+	unsigned long lbase, lsize;
 	int error = -EINVAL;
 
 	if (!mtrr_if)
@@ -561,7 +586,7 @@ static void __init init_ifs(void)
 struct mtrr_value {
 	mtrr_type	ltype;
 	unsigned long	lbase;
-	unsigned int	lsize;
+	unsigned long	lsize;
 };
 
 static struct mtrr_value * mtrr_state;
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 99c9f26..d61ea9d 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -43,15 +43,16 @@ struct mtrr_ops {
 	void	(*set_all)(void);
 
 	void	(*get)(unsigned int reg, unsigned long *base,
-		       unsigned int *size, mtrr_type * type);
-	int	(*get_free_region) (unsigned long base, unsigned long size);
-
+		       unsigned long *size, mtrr_type * type);
+	int	(*get_free_region)(unsigned long base, unsigned long size,
+				   int replace_reg);
 	int	(*validate_add_page)(unsigned long base, unsigned long size,
 				     unsigned int type);
 	int	(*have_wrcomb)(void);
 };
 
-extern int generic_get_free_region(unsigned long base, unsigned long size);
+extern int generic_get_free_region(unsigned long base, unsigned long size,
+				   int replace_reg);
 extern int generic_validate_add_page(unsigned long base, unsigned long size,
 				     unsigned int type);
 
@@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void);
 /* library functions for processor-specific routines */
 struct set_mtrr_context {
 	unsigned long flags;
-	unsigned long deftype_lo;
-	unsigned long deftype_hi;
 	unsigned long cr4val;
-	unsigned long ccr3;
+	u32 deftype_lo;
+	u32 deftype_hi;
+	u32 ccr3;
 };
 
 struct mtrr_var_range {
-	unsigned long base_lo;
-	unsigned long base_hi;
-	unsigned long mask_lo;
-	unsigned long mask_hi;
+	u32 base_lo;
+	u32 base_hi;
+	u32 mask_lo;
+	u32 mask_hi;
 };
 
 void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if;
 extern unsigned int num_var_ranges;
 
 void mtrr_state_warn(void);
-char *mtrr_attrib_to_str(int x);
+const char *mtrr_attrib_to_str(int x);
 void mtrr_wrmsr(unsigned, unsigned, unsigned);
 
-- 
cgit v0.10.2


From ba10650a880c2df23bd1db6c0570ddb66f389641 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] i386: alloc_gdt() static

Make the needlessly global alloc_gdt() static.

(against) pda-percpu-init

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@muc.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 68bcb68..1b34c56 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -609,7 +609,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
 	return regs;
 }
 
-__cpuinit int alloc_gdt(int cpu)
+static __cpuinit int alloc_gdt(int cpu)
 {
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 	struct desc_struct *gdt;
-- 
cgit v0.10.2


From 2c22d8baa98a92022acb85b0b7c6f4a60df55f47 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] relocatable kernel: Fix kallsyms on avr32 after relocatable
 kernel changes

o On some platforms like avr32, section init comes before .text and
  not necessarily a symbol's relative position w.r.t _text is positive.
  In such cases assembler detects the overflow and emits warning. This
  patch fixes it.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 4c1ad0a..f359b73 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -277,8 +277,12 @@ static void write_src(void)
 	output_label("kallsyms_addresses");
 	for (i = 0; i < table_cnt; i++) {
 		if (toupper(table[i].sym[0]) != 'A') {
-			printf("\tPTR\t_text + %#llx\n",
-				table[i].addr - _text);
+			if (_text <= table[i].addr)
+				printf("\tPTR\t_text + %#llx\n",
+					table[i].addr - _text);
+			else
+				printf("\tPTR\t_text - %#llx\n",
+					_text - table[i].addr);
 		} else {
 			printf("\tPTR\t%#llx\n", table[i].addr);
 		}
-- 
cgit v0.10.2


From 79929fd1c1887d2a057cbb80d487a2e2f1c01a02 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] i386: Convert more absolute symbols to section relative

o Convert more absolute symbols to section relative to keep the theme in
  vmlinux.lds.S file and to avoid problem if kernel is relocated.

o Also put a message so that in future people can be aware of it and
  avoid introducing absolute symbols.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 877dc5c..25581e8 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -8,6 +8,12 @@
  * put it inside the section definition.
  */
 
+/* Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
 #define LOAD_OFFSET __PAGE_OFFSET
 
 #include <asm-generic/vmlinux.lds.h>
@@ -65,11 +71,11 @@ SECTIONS
 	CONSTRUCTORS
 	} :data
 
-  __start_paravirtprobe = .;
   .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
+  	__start_paravirtprobe = .;
 	*(.paravirtprobe)
+  	__stop_paravirtprobe = .;
   }
-  __stop_paravirtprobe = .;
 
   . = ALIGN(4096);
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
@@ -172,11 +178,11 @@ SECTIONS
 	*(.altinstr_replacement)
   }
   . = ALIGN(4);
-  __start_parainstructions = .;
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+  	__start_parainstructions = .;
 	*(.parainstructions)
+  	__stop_parainstructions = .;
   }
-  __stop_parainstructions = .;
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
-- 
cgit v0.10.2


From 274e1bbdeeaf16e71418f11f5f305ab26061f2c2 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] x86: add write_pci_config_byte() to direct PCI access
 routines

Mechanism of selecting physical mode in genapic when cpu hotplug is enabled on
x86_64, broke the quirk(quirk_intel_irqbalance()) introduced for working
around the transposing interrupt message errata in E7520/E7320/E7525 (revision
ID 0x9 and below.  errata #23 in
http://download.intel.com/design/chipsets/specupdt/30304203.pdf).

This errata requires the mode to be in logical flat, so that interrupts can be
directed to more than one cpu(and thus use hardware IRQ balancing enabled by
BIOS on these platforms).

Following four patches fixes this by moving the quirk to early quirk and
forcing the x86_64 genapic selection to logical flat on these platforms.

Thanks to Shaohua for pointing out the breakage.

This patch:

Add write_pci_config_byte() to direct PCI access  routines

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c
index 713d6c8..42df4b6 100644
--- a/arch/i386/pci/early.c
+++ b/arch/i386/pci/early.c
@@ -45,6 +45,13 @@ void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
 	outl(val, 0xcfc);
 }
 
+void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
+{
+	PDprintk("%x writing to %x: %x\n", slot, offset, val);
+	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+	outb(val, 0xcfc);
+}
+
 int early_pci_allowed(void)
 {
 	return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
diff --git a/include/asm-x86_64/pci-direct.h b/include/asm-x86_64/pci-direct.h
index eba9cb4..6823fa4 100644
--- a/include/asm-x86_64/pci-direct.h
+++ b/include/asm-x86_64/pci-direct.h
@@ -10,6 +10,7 @@ extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset);
 extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
 extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
 extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
+extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val);
 
 extern int early_pci_allowed(void);
 
-- 
cgit v0.10.2


From fd6d7d26897dec834d0b9fbdc59819b0332a1257 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] i386: introduce the mechanism of disabling cpu hotplug
 control

Add 'enable_cpu_hotplug' flag and when cleared, the hotplug control file
("online") will not be added under /sys/devices/system/cpu/cpuX/

Next patch doing PCI quirks will use this.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 07d6da3..844c08f 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -40,14 +40,18 @@ int arch_register_cpu(int num)
 	 * restrictions and assumptions in kernel. This basically
 	 * doesnt add a control file, one cannot attempt to offline
 	 * BSP.
+	 *
+	 * Also certain PCI quirks require not to enable hotplug control
+	 * for all CPU's.
 	 */
-	if (!num)
+	if (!num || !enable_cpu_hotplug)
 		cpu_devices[num].cpu.no_control = 1;
 
 	return register_cpu(&cpu_devices[num].cpu, num);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+int enable_cpu_hotplug = 1;
 
 void arch_unregister_cpu(int num) {
 	return unregister_cpu(&cpu_devices[num].cpu);
diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h
index b1bc7b1..9d914e1e 100644
--- a/include/asm-i386/cpu.h
+++ b/include/asm-i386/cpu.h
@@ -13,6 +13,9 @@ struct i386_cpu {
 extern int arch_register_cpu(int num);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void arch_unregister_cpu(int);
+extern int enable_cpu_hotplug;
+#else
+#define enable_cpu_hotplug	0
 #endif
 
 DECLARE_PER_CPU(int, cpu_state);
-- 
cgit v0.10.2


From 72486f1f8f0a2bc828b9d30cf4690cf2dd6807fc Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] i386: change the 'no_control' field to 'hotpluggable' in the
 struct cpu

Change the 'no_control' field in the cpu struct to a more positive
and better term 'hotpluggable'. And change(/cleanup) the logic accordingly.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 844c08f..79cf608 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -44,8 +44,8 @@ int arch_register_cpu(int num)
 	 * Also certain PCI quirks require not to enable hotplug control
 	 * for all CPU's.
 	 */
-	if (!num || !enable_cpu_hotplug)
-		cpu_devices[num].cpu.no_control = 1;
+	if (num && enable_cpu_hotplug)
+		cpu_devices[num].cpu.hotpluggable = 1;
 
 	return register_cpu(&cpu_devices[num].cpu, num);
 }
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 5629b45..687500d 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -31,11 +31,11 @@ int arch_register_cpu(int num)
 {
 #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
 	/*
-	 * If CPEI cannot be re-targetted, and this is
-	 * CPEI target, then dont create the control file
+	 * If CPEI can be re-targetted or if this is not
+	 * CPEI target, then it is hotpluggable
 	 */
-	if (!can_cpei_retarget() && is_cpu_cpei_target(num))
-		sysfs_cpus[num].cpu.no_control = 1;
+	if (can_cpei_retarget() || !is_cpu_cpei_target(num))
+		sysfs_cpus[num].cpu.hotpluggable = 1;
 	map_cpu_to_node(num, node_cpuid[num].nid);
 #endif
 
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 22123a0..63ed265 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -239,7 +239,7 @@ static void unregister_cpu_online(unsigned int cpu)
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
 	struct sys_device *s = &c->sysdev;
 
-	BUG_ON(c->no_control);
+	BUG_ON(!c->hotpluggable);
 
 	if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
 			cpu_has_feature(CPU_FTR_SMT))
@@ -424,10 +424,10 @@ static int __init topology_init(void)
 		 * CPU.  For instance, the boot cpu might never be valid
 		 * for hotplugging.
 		 */
-		if (!ppc_md.cpu_die)
-			c->no_control = 1;
+		if (ppc_md.cpu_die)
+			c->hotpluggable = 1;
 
-		if (cpu_online(cpu) || (c->no_control == 0)) {
+		if (cpu_online(cpu) || c->hotpluggable) {
 			register_cpu(c, cpu);
 
 			sysdev_create_file(&c->sysdev, &attr_physical_id);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 1f745f1..7fd095e 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -104,8 +104,8 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
 
 /*
  * register_cpu - Setup a driverfs device for a CPU.
- * @cpu - Callers can set the cpu->no_control field to 1, to indicate not to
- *		  generate a control file in sysfs for this CPU.
+ * @cpu - cpu->hotpluggable field set to 1 will generate a control file in
+ *	  sysfs for this CPU.
  * @num - CPU number to use when creating the device.
  *
  * Initialize and register the CPU device.
@@ -119,7 +119,7 @@ int __devinit register_cpu(struct cpu *cpu, int num)
 
 	error = sysdev_register(&cpu->sysdev);
 
-	if (!error && !cpu->no_control)
+	if (!error && cpu->hotpluggable)
 		register_cpu_control(cpu);
 	if (!error)
 		cpu_sys_devices[num] = &cpu->sysdev;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f02d71b..ad90340 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -27,7 +27,7 @@
 
 struct cpu {
 	int node_id;		/* The node which contains the CPU */
-	int no_control;		/* Should the sysfs control file be created? */
+	int hotpluggable;	/* creates sysfs control file if hotpluggable */
 	struct sys_device sysdev;
 };
 
-- 
cgit v0.10.2


From 9899f826fc90beba4f78083f6230e06cbe1050c9 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] x86-64: add genapic_force

Add genapic_force. Used by the next Intel quirks patch.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 8e78a75..b007433 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -33,7 +33,7 @@ extern struct genapic apic_flat;
 extern struct genapic apic_physflat;
 
 struct genapic *genapic = &apic_flat;
-
+struct genapic *genapic_force;
 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
@@ -46,6 +46,13 @@ void __init clustered_apic_check(void)
 	u8 cluster_cnt[NUM_APIC_CLUSTERS];
 	int max_apic = 0;
 
+	/* genapic selection can be forced because of certain quirks.
+	 */
+	if (genapic_force) {
+		genapic = genapic_force;
+		goto print;
+	}
+
 #if defined(CONFIG_ACPI)
 	/*
 	 * Some x86_64 machines use physical APIC mode regardless of how many
diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h
index a0e9a4b..b80f4bb 100644
--- a/include/asm-x86_64/genapic.h
+++ b/include/asm-x86_64/genapic.h
@@ -30,6 +30,6 @@ struct genapic {
 };
 
 
-extern struct genapic *genapic;
+extern struct genapic *genapic, *genapic_force, apic_flat;
 
 #endif
-- 
cgit v0.10.2


From b0d0a4ba45760b10ecee9035ed45b442c1a6cc84 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] x86: fix the irqbalance quirk for E7320/E7520/E7525

Move the irqbalance quirks for E7320/E7520/E7525(Errata 23 in
http://download.intel.com/design/chipsets/specupdt/30304203.pdf) to early
quirks.

And add a PCI quirk for these platforms to check(which happens very late
during the boot) if the APIC routing is indeed set to default flat mode.

This fixes the breakage(in x86_64) of this quirk due to cpu hotplug which
selects physical mode instead of the logical flat(as needed for this errata
workaround).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index c984169..4b60af7 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
 #include <asm/pci-direct.h>
 #include <asm/acpi.h>
 #include <asm/apic.h>
+#include <asm/irq.h>
 
 #ifdef CONFIG_ACPI
 
@@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
 	return 0;
 }
 
+static void check_intel(void)
+{
+	u16 vendor, device;
+
+	vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
+
+	if (vendor != PCI_VENDOR_ID_INTEL)
+		return;
+
+	device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
+#ifdef CONFIG_SMP
+	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
+		quirk_intel_irqbalance();
+#endif
+}
+
 void __init check_acpi_pci(void)
 {
 	int num, slot, func;
@@ -60,6 +79,8 @@ void __init check_acpi_pci(void)
 	if (!early_pci_allowed())
 		return;
 
+	check_intel();
+
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab17..a01320a 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,23 @@
  */
 #include <linux/pci.h>
 #include <linux/irq.h>
+#include <asm/pci-direct.h>
+#include <asm/genapic.h>
+#include <asm/cpu.h>
 
 #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
+{
+#ifdef CONFIG_X86_64
+	if (genapic !=  &apic_flat)
+		panic("APIC mode must be flat on this system\n");
+#elif defined(CONFIG_X86_GENERICARCH)
+	if (genapic != &apic_default)
+		panic("APIC mode must be default(flat) on this system. Use apic=default\n");
+#endif
+}
 
-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+void __init quirk_intel_irqbalance(void)
 {
 	u8 config, rev;
 	u32 word;
@@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	 * based platforms.
 	 * Disable SW irqbalance/affinity on those platforms.
 	 */
-	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+	rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
 	if (rev > 0x9)
 		return;
 
 	printk(KERN_INFO "Intel E7520/7320/7525 detected.");
 
-	/* enable access to config space*/
-	pci_read_config_byte(dev, 0xf4, &config);
-	pci_write_config_byte(dev, 0xf4, config|0x2);
+	/* enable access to config space */
+	config = read_pci_config_byte(0, 0, 0, 0xf4);
+	write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
 
 	/* read xTPR register */
-	raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
+	word = read_pci_config_16(0, 0, 0x40, 0x4c);
 
 	if (!(word & (1 << 13))) {
 		printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -38,13 +51,24 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+		printk(KERN_INFO "Disabling cpu hotplug control\n");
+		enable_cpu_hotplug = 0;
+#endif
+#ifdef CONFIG_X86_64
+		/* force the genapic selection to flat mode so that
+		 * interrupts can be redirected to more than one CPU.
+		 */
+		genapic_force = &apic_flat;
+#endif
 	}
 
-	/* put back the original value for config space*/
+	/* put back the original value for config space */
 	if (!(config & 0x2))
-		pci_write_config_byte(dev, 0xf4, config);
+		write_pci_config_byte(0, 0, 0, 0xf4, config);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7320_MCH,	quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7525_MCH,	quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7520_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  verify_quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  verify_quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  verify_quirk_intel_irqbalance);
+
 #endif
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index cd7de9c..346f27f 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -58,6 +58,7 @@
 #include <asm/arch_hooks.h>
 #include <asm/nmi.h>
 #include <asm/pda.h>
+#include <asm/genapic.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
@@ -1482,6 +1483,12 @@ int __devinit __cpu_up(unsigned int cpu)
 	cpu_set(cpu, smp_commenced_mask);
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
+
+#ifdef CONFIG_X86_GENERICARCH
+	if (num_online_cpus() > 8 && genapic == &apic_default)
+		panic("Default flat APIC routing can't be used with > 8 cpus\n");
+#endif
+
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index fb0c6da..829698f 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -71,6 +71,18 @@ static void ati_bugs(void)
 {
 }
 
+static void intel_bugs(void)
+{
+	u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
+
+#ifdef CONFIG_SMP
+	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
+		quirk_intel_irqbalance();
+#endif
+}
+
 struct chipset {
 	u16 vendor;
 	void (*f)(void);
@@ -80,6 +92,7 @@ static struct chipset early_qrk[] = {
 	{ PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
 	{ PCI_VENDOR_ID_VIA, via_bugs },
 	{ PCI_VENDOR_ID_ATI, ati_bugs },
+	{ PCI_VENDOR_ID_INTEL, intel_bugs},
 	{}
 };
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 62c2e74..4c161c2 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -60,6 +60,7 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/numa.h>
+#include <asm/genapic.h>
 
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
@@ -1167,6 +1168,13 @@ int __cpuinit __cpu_up(unsigned int cpu)
 
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
+
+	if (num_online_cpus() > 8 && genapic == &apic_flat) {
+		printk(KERN_WARNING
+		       "flat APIC routing can't be used with > 8 cpus\n");
+		BUG();
+	}
+
 	err = 0;
 
 	return err;
diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h
index 8ffbb0f..fd2be59 100644
--- a/include/asm-i386/genapic.h
+++ b/include/asm-i386/genapic.h
@@ -122,6 +122,6 @@ struct genapic {
 	APICFUNC(phys_pkg_id) \
 	}
 
-extern struct genapic *genapic;
+extern struct genapic *genapic, apic_default;
 
 #endif
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 9e15ce0..11761cd 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -37,6 +37,8 @@ static __inline__ int irq_canonicalize(int irq)
 extern int irqbalance_disable(char *str);
 #endif
 
+extern void quirk_intel_irqbalance(void);
+
 #ifdef CONFIG_HOTPLUG_CPU
 extern void fixup_irqs(cpumask_t map);
 #endif
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 21fa5af..6d324b8 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -87,6 +87,7 @@ extern void syscall32_cpu_init(void);
 extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);
 
 extern void early_quirks(void);
+extern void quirk_intel_irqbalance(void);
 extern void check_efer(void);
 
 extern int unhandled_signal(struct task_struct *tsk, int sig);
-- 
cgit v0.10.2


From e1cccf48b182dd743c3c83a4fdf8dc570a43b393 Mon Sep 17 00:00:00 2001
From: Artiom Myaskouvskey <artiom.myaskouvskey@intel.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] i386: call efi_get_time during suspend

Function efi_get_time called not only during init kernel phase but also
during suspend (from get_cmos_time).

When it is called from get_cmos_time the corresponding runtime service
should be called in virtual and not in physical mode.

Signed-off-by: Artiom Myaskouvskey <artiom.myaskouvskey@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: "Narayanan, Chandramouli" <chandramouli.narayanan@intel.com>
Cc: "Jiossy, Rami" <rami.jiossy@intel.com>
Cc: "Satt, Shai" <shai.satt@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8b40648..b92c7f0 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime)
 	return 0;
 }
 /*
- * This should only be used during kernel init and before runtime
- * services have been remapped, therefore, we'll need to call in physical
- * mode.  Note, this call isn't used later, so mark it __init.
+ * This is used during kernel init before runtime
+ * services have been remapped and also during suspend, therefore,
+ * we'll need to call both in physical and virtual modes.
  */
-inline unsigned long __init efi_get_time(void)
+inline unsigned long efi_get_time(void)
 {
 	efi_status_t status;
 	efi_time_t eft;
 	efi_time_cap_t cap;
 
-	status = phys_efi_get_time(&eft, &cap);
+	if (efi.get_time) {
+		/* if we are in virtual mode use remapped function */
+ 		status = efi.get_time(&eft, &cap);
+	} else {
+		/* we are in physical mode */
+		status = phys_efi_get_time(&eft, &cap);
+	}
+
 	if (status != EFI_SUCCESS)
 		printk("Oops: efitime: can't read time status: 0x%lx\n",status);
 
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 66d621d..91ecf49 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -300,7 +300,7 @@ extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size,
 extern int __init efi_uart_console_only (void);
 extern void efi_initialize_iomem_resources(struct resource *code_resource,
 					struct resource *data_resource);
-extern unsigned long __init efi_get_time(void);
+extern unsigned long efi_get_time(void);
 extern int __init efi_set_rtc_mmss(unsigned long nowtime);
 extern struct efi_memory_map memmap;
 
-- 
cgit v0.10.2


From 956fb53197f82257974f1f9835485aeeef4510b3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] i386: handle a negative return value

The Coverity checker noted that bad things might happen if
find_isa_irq_apic() returned -1.

[akpm@osdl.org: add debugging checks]
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Acked-by: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 993150f..7bfd6c3 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -2179,9 +2179,15 @@ static inline void unlock_ExtINT_logic(void)
 	unsigned char save_control, save_freq_select;
 
 	pin  = find_isa_irq_pin(8, mp_INT);
+	if (pin == -1) {
+		WARN_ON_ONCE(1);
+		return;
+	}
 	apic = find_isa_irq_apic(8, mp_INT);
-	if (pin == -1)
+	if (apic == -1) {
+		WARN_ON_ONCE(1);
 		return;
+	}
 
 	entry0 = ioapic_read_entry(apic, pin);
 	clear_IO_APIC_pin(apic, pin);
-- 
cgit v0.10.2


From 7e95b593a1aeb6fe1d3904e799d23a45261f2c19 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] i386: Make irq_vector static

irq_vector[] can now become static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 7bfd6c3..56f571c 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1241,7 +1241,7 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
 
 static int __assign_irq_vector(int irq)
 {
-- 
cgit v0.10.2


From ee58fad51a2a767cb2567706ace967705233d881 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86-64: x86-64 add Intel BTS cpufeature bit and detection
 (take 2)

Here is a small patch for x86-64 which adds a cpufeature flag and
detection code for Intel's Branch Trace Store (BTS) feature. This
feature can be found on Intel P4 and Core 2 processors among others.
It can also be used by perfmon.

changelog:
	- add CPU_FEATURE_BTS
	- add Branch Trace Store detection

signed-off-by: stephane eranian <eranian@hpl.hp.com>

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 619af2e..a570c81 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -838,6 +838,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has_ds) {
 		unsigned int l1, l2;
 		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<11)))
+			set_bit(X86_FEATURE_BTS, c->x86_capability);
 		if (!(l1 & (1<<12)))
 			set_bit(X86_FEATURE_PEBS, c->x86_capability);
 	}
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index d280384..0b3c686 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -69,6 +69,7 @@
 #define X86_FEATURE_UP		(3*32+8) /* SMP kernel running on UP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+9) /* Intel Architectural PerfMon */
 #define X86_FEATURE_PEBS	(3*32+10) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS		(3*32+11) /* Branch Trace Store */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -115,5 +116,6 @@
 #define cpu_has_clflush	       boot_cpu_has(X86_FEATURE_CLFLSH)
 #define cpu_has_ds 	       boot_cpu_has(X86_FEATURE_DS)
 #define cpu_has_pebs 	       boot_cpu_has(X86_FEATURE_PEBS)
+#define cpu_has_bts 	       boot_cpu_has(X86_FEATURE_BTS)
 
 #endif /* __ASM_X8664_CPUFEATURE_H */
-- 
cgit v0.10.2


From 538f188e03c821c93b355c9fc346806cdd34e286 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] i386: i386 add Intel BTS cpufeature bit and detection (take
 2)

Here is a small patch for i386 which adds a cpufeature flag and
detection code for Intel's Branch Trace Store (BTS) feature. This
feature can be found on Intel P4 and Core 2 processors among others.
It can also be used by perfmon.

changelog:
	- add CPU_FEATURE_BTS
	- add Branch Trace Store detection

signed-off-by: stephane eranian <eranian@hpl.hp.com>

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 3ae795e..56fe265 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -199,6 +199,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has_ds) {
 		unsigned int l1;
 		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<11)))
+			set_bit(X86_FEATURE_BTS, c->x86_capability);
 		if (!(l1 & (1<<12)))
 			set_bit(X86_FEATURE_PEBS, c->x86_capability);
 	}
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 4c83e05..3f92b94 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -74,6 +74,7 @@
 #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
 #define X86_FEATURE_PEBS	(3*32+12)  /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS		(3*32+13)  /* Branch Trace Store */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -138,6 +139,7 @@
 #define cpu_has_ds		boot_cpu_has(X86_FEATURE_DS)
 #define cpu_has_pebs 		boot_cpu_has(X86_FEATURE_PEBS)
 #define cpu_has_clflush		boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_bts 		boot_cpu_has(X86_FEATURE_BTS)
 
 #endif /* __ASM_I386_CPUFEATURE_H */
 
-- 
cgit v0.10.2


From 9a8cb626a08f2c8251291f3c0a049b29665895d2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] i386: Avoid boot warning with apic=debug

There are two consumers of apic=: the apic debug level and the low
level generic architecture code. early_param would warn when the
low level code rejected "debug". Avoid this.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c
index 94b1fd9..a7b3999 100644
--- a/arch/i386/mach-generic/probe.c
+++ b/arch/i386/mach-generic/probe.c
@@ -45,7 +45,9 @@ static int __init parse_apic(char *arg)
 			return 0;
 		}
 	}
-	return -ENOENT;
+
+	/* Parsed again by __setup for debug/verbose */
+	return 0;
 }
 early_param("apic", parse_apic);
 
-- 
cgit v0.10.2


From f990fff427d68af3e4e1d16fe799c106abc0bf53 Mon Sep 17 00:00:00 2001
From: Karsten Wiese <fzu@wemgehoertderstaat.de>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86: Regard MSRs in lapic_suspend()/lapic_resume()

Read/Write APIC_LVTPC and APIC_LVTTHMR only,
if get_maxlvt() returns certain values.
This is done like everywhere else in i386/kernel/apic.c,
so I guess its correct.
Suspends/Resumes to disk fine and eleminates an smp_error_interrupt()
here on a K8.

AK: ported to x86-64 too

Signed-off-by: Karsten Wiese <fzu@wemgehoertderstaat.de>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d..776d9be 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
 static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	apic_pm_state.apic_id = apic_read(APIC_ID);
 	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
 	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
 	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
 	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
 	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-	apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+	if (maxlvt >= 4)
+		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
 	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
 	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
 	
 	local_irq_save(flags);
 	disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
 {
 	unsigned int l, h;
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	local_irq_save(flags);
 
 	/*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-	apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-	apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+	if (maxlvt >= 4)
+		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
 	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
 	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
 	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 5c46897..f0b00d8 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -459,23 +459,30 @@ static struct {
 static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	apic_pm_state.apic_id = apic_read(APIC_ID);
 	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
 	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
 	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
 	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
 	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-	apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+	if (maxlvt >= 4)
+		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
 	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
 	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 5)
+		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
 	local_irq_save(flags);
 	disable_local_APIC();
 	local_irq_restore(flags);
@@ -486,10 +493,13 @@ static int lapic_resume(struct sys_device *dev)
 {
 	unsigned int l, h;
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	local_irq_save(flags);
 	rdmsr(MSR_IA32_APICBASE, l, h);
 	l &= ~MSR_IA32_APICBASE_BASE;
@@ -503,8 +513,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-	apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-	apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 5)
+		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+	if (maxlvt >= 4)
+		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
 	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
 	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
 	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
-- 
cgit v0.10.2


From bf7e6a196318316e921f357557fca9d11d15f486 Mon Sep 17 00:00:00 2001
From: Artiom Myaskouvskey <artiom.myaskouvskey@intel.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] i386: Preserve EFI run time regions with memmap parameter

When using memmap kernel parameter in EFI boot we should also add to memory map
memory regions of runtime services to enable their mapping later.

AK: merged and cleaned up the patch

Signed-off-by: Artiom Myaskouvskey <artiom.myaskouvskey@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index b704790f..2f7d0a9 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -742,29 +742,55 @@ void __init print_memory_map(char *who)
 	}
 }
 
-void __init limit_regions(unsigned long long size)
+static __init __always_inline void efi_limit_regions(unsigned long long size)
 {
 	unsigned long long current_addr = 0;
+	efi_memory_desc_t *md, *next_md;
+	void *p, *p1;
+	int i, j;
+
+	j = 0;
+	p1 = memmap.map;
+	for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+		md = p;
+		next_md = p1;
+		current_addr = md->phys_addr +
+			PFN_PHYS(md->num_pages);
+		if (is_available_memory(md)) {
+			if (md->phys_addr >= size) continue;
+			memcpy(next_md, md, memmap.desc_size);
+			if (current_addr >= size) {
+				next_md->num_pages -=
+					PFN_UP(current_addr-size);
+			}
+			p1 += memmap.desc_size;
+			next_md = p1;
+			j++;
+		} else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
+			   EFI_MEMORY_RUNTIME) {
+			/* In order to make runtime services
+			 * available we have to include runtime
+			 * memory regions in memory map */
+			memcpy(next_md, md, memmap.desc_size);
+			p1 += memmap.desc_size;
+			next_md = p1;
+			j++;
+		}
+	}
+	memmap.nr_map = j;
+	memmap.map_end = memmap.map +
+		(memmap.nr_map * memmap.desc_size);
+}
+
+void __init limit_regions(unsigned long long size)
+{
+	unsigned long long current_addr;
 	int i;
 
 	print_memory_map("limit_regions start");
 	if (efi_enabled) {
-		efi_memory_desc_t *md;
-		void *p;
-
-		for (p = memmap.map, i = 0; p < memmap.map_end;
-			p += memmap.desc_size, i++) {
-			md = p;
-			current_addr = md->phys_addr + (md->num_pages << 12);
-			if (md->type == EFI_CONVENTIONAL_MEMORY) {
-				if (current_addr >= size) {
-					md->num_pages -=
-						(((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
-					memmap.nr_map = i + 1;
-					return;
-				}
-			}
-		}
+		efi_limit_regions(size);
+		return;
 	}
 	for (i = 0; i < e820.nr_map; i++) {
 		current_addr = e820.map[i].addr + e820.map[i].size;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 1674161..f4dd048 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -192,8 +192,6 @@ static inline int page_kills_ppro(unsigned long pagenr)
 	return 0;
 }
 
-extern int is_available_memory(efi_memory_desc_t *);
-
 int page_is_ram(unsigned long pagenr)
 {
 	int i;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 91ecf49..df1c918 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -302,6 +302,7 @@ extern void efi_initialize_iomem_resources(struct resource *code_resource,
 					struct resource *data_resource);
 extern unsigned long efi_get_time(void);
 extern int __init efi_set_rtc_mmss(unsigned long nowtime);
+extern int is_available_memory(efi_memory_desc_t * md);
 extern struct efi_memory_map memmap;
 
 /**
-- 
cgit v0.10.2


From 6df0532eef0187c293d3ab1d4c158f92e8f24f8a Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] i386: remove duplicate printk

We do the exact same printk about a dozen lines above
with no intermediate printk's.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e475809..41cfea5 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 					f_vide();
 				rdtscl(d2);
 				d = d2-d;
-				
-				/* Knock these two lines out if it debugs out ok */
-				printk(KERN_INFO "AMD K6 stepping B detected - ");
-				/* -- cut here -- */
+
 				if (d > 20*K6_BUG_LOOP) 
 					printk("system stability may be impaired when more than 32 MB are used.\n");
 				else 
-- 
cgit v0.10.2


From 86bd58bf4c383fde4e99b83ce917b091a072040d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86-64: Remove unused GET_APIC_VERSION call from
 clear_local_APIC

Remove unused GET_APIC_VERSION call from clear_local_APIC() and
__setup_APIC_LVTT().

Reported by D Binderman <dcb314@hotmail.com>.

Cc: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f0b00d8..124b2d2 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -140,7 +140,6 @@ void clear_local_APIC(void)
 		apic_write(APIC_LVTERR, APIC_LVT_MASKED);
 	if (maxlvt >= 4)
 		apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-	v = GET_APIC_VERSION(apic_read(APIC_LVR));
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 }
@@ -736,10 +735,9 @@ void __init init_apic_mappings(void)
 
 static void __setup_APIC_LVTT(unsigned int clocks)
 {
-	unsigned int lvtt_value, tmp_value, ver;
+	unsigned int lvtt_value, tmp_value;
 	int cpu = smp_processor_id();
 
-	ver = GET_APIC_VERSION(apic_read(APIC_LVR));
 	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
 
 	if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
-- 
cgit v0.10.2


From c7a3392e9e53e43c44a971de3dd480a8e2788e75 Mon Sep 17 00:00:00 2001
From: Wink Saville <wink@saville.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86-64: Fix comments for MSR_FS_BASE and MSR_GS_BASE.

The comments for MSR_FS_BASE & MSR_GS_BASE were transposed.

Signed-off-by: Wink Saville <wink@saville.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index a745c50..952783d 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -169,8 +169,8 @@ static inline unsigned int cpuid_edx(unsigned int op)
 #define MSR_LSTAR 0xc0000082 		/* long mode SYSCALL target */
 #define MSR_CSTAR 0xc0000083		/* compatibility mode SYSCALL target */
 #define MSR_SYSCALL_MASK 0xc0000084	/* EFLAGS mask for syscall */
-#define MSR_FS_BASE 0xc0000100		/* 64bit GS base */
-#define MSR_GS_BASE 0xc0000101		/* 64bit FS base */
+#define MSR_FS_BASE 0xc0000100		/* 64bit FS base */
+#define MSR_GS_BASE 0xc0000101		/* 64bit GS base */
 #define MSR_KERNEL_GS_BASE  0xc0000102	/* SwapGS GS shadow (or USER_GS from kernel) */ 
 /* EFER bits: */ 
 #define _EFER_SCE 0  /* SYSCALL/SYSRET */
-- 
cgit v0.10.2


From 0741f4d207a644482d7a040f05cd264c98cf7ee8 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86: add sysctl for kstack_depth_to_print

Add sysctl for kstack_depth_to_print. This lets users change
the amount of raw stack data printed in dump_stack() without
having to reboot.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 0bc7f1e..5922e84 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -27,6 +27,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/kmod.txt
 - msgmax
@@ -170,6 +171,13 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kstack_depth_to_print: (X86 only)
+
+Controls the number of words to print when dumping the raw
+kernel stack.
+
+==============================================================
+
 osrelease, ostype & version:
 
 # cat osrelease
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7b2f9f0..1d48a75 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -91,7 +91,7 @@ asmlinkage void alignment_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
-static int kstack_depth_to_print = 24;
+int kstack_depth_to_print = 24;
 #ifdef CONFIG_STACK_UNWIND
 static int call_trace = 1;
 #else
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 264db33..75cecce 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -108,7 +108,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	preempt_enable_no_resched();
 }
 
-static int kstack_depth_to_print = 12;
+int kstack_depth_to_print = 12;
 #ifdef CONFIG_STACK_UNWIND
 static int call_trace = 1;
 #else
diff --git a/include/asm-x86_64/stacktrace.h b/include/asm-x86_64/stacktrace.h
index 5eb9799..6f0b545 100644
--- a/include/asm-x86_64/stacktrace.h
+++ b/include/asm-x86_64/stacktrace.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_STACKTRACE_H
 #define _ASM_STACKTRACE_H 1
 
+extern int kstack_depth_to_print;
+
 /* Generic stack tracer with callbacks */
 
 struct stacktrace_ops {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09e569f4..6fc5e17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
 
 #ifdef CONFIG_X86
 #include <asm/nmi.h>
+#include <asm/stacktrace.h>
 #endif
 
 #if defined(CONFIG_SYSCTL)
@@ -707,6 +708,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "kstack_depth_to_print",
+		.data		= &kstack_depth_to_print,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 #if defined(CONFIG_MMU)
 	{
-- 
cgit v0.10.2


From 3df0af0eb064a16bbdbe81b46bc72a4089f88d54 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86_64: clear_bss before set_intr_gate with early_idt_handler

idt_table is in the .bss section, so clear_bss need to called at first

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 9561eb3..cc230b9 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -57,10 +57,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
 {
 	int i;
 
-	for (i = 0; i < 256; i++)
+	/* clear bss before set_intr_gate with early_idt_handler */
+	clear_bss();
+
+	for (i = 0; i < IDT_ENTRIES; i++)
 		set_intr_gate(i, early_idt_handler);
 	asm volatile("lidt %0" :: "m" (idt_descr));
-	clear_bss();
 
 	early_printk("Kernel alive\n");
 
-- 
cgit v0.10.2


From 8fb6e5f5db860113e71ce7b854382ed40559395b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@amd.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86_64: interrupt array size should be aligned to NR_VECTORS

interrupt array is referred for idt vectors instead of NR_IRQS, so change size
to NR_VECTORS - FIRST_EXTERNAL_VECTOR. Also change to static.

Signed-off-by: Yinghai Lu <yinghai@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index c4ef801..d73c79e 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -76,7 +76,8 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
 	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
 	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
 
-void (*interrupt[NR_IRQS])(void) = {
+/* for the irq vectors */
+static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
 					  IRQLIST_16(0x2), IRQLIST_16(0x3),
 	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
 	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-- 
cgit v0.10.2


From 026c66bdda5f07959da7d74d29b18a7c480242f7 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: remove duplicate ARCH_DISCONTIGMEM_ENABLE option

One ARCH_DISCONTIGMEM_ENABLE option is enough.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 6eece27..bfbb9bc 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -359,11 +359,6 @@ config ARCH_DISCONTIGMEM_ENABLE
        depends on NUMA
        default y
 
-
-config ARCH_DISCONTIGMEM_ENABLE
-	def_bool y
-	depends on NUMA
-
 config ARCH_DISCONTIGMEM_DEFAULT
 	def_bool y
 	depends on NUMA
-- 
cgit v0.10.2


From d4c45718b3c0d3045eab803cd15fabbed1ea5bcd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: Fix kobject_init() WARN_ON on resume from disk

Make mce_remove_device() clean up the kobject in per_cpu(device_mce, cpu)
after it has been unregistered.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bbea888..a7440cb 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -652,6 +652,7 @@ static void mce_remove_device(unsigned int cpu)
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
+	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-- 
cgit v0.10.2


From a36df98ab1cdd8a9e7daa4c1b5c48ffa2ad6ea09 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] i386: touch softlockup during backtracing

Sometimes the soft watchdog fires after we're done oopsing.
See http://projects.info-pull.com/mokb/MOKB-25-11-2006.html for an example.

AK: changed to touch_nmi_watchdog()

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 1d48a75..86d8476 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -29,6 +29,7 @@
 #include <linux/kexec.h>
 #include <linux/unwind.h>
 #include <linux/uaccess.h>
+#include <linux/nmi.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -248,6 +249,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		stack = (unsigned long*)context->previous_esp;
 		if (!stack)
 			break;
+		touch_nmi_watchdog();
 	}
 }
 EXPORT_SYMBOL(dump_trace);
-- 
cgit v0.10.2


From 73ad8355d7db6a3cdcf313d4a4586a8f81b19c2f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: remove unused acpi_found_madt in mparse.

remove unused acpi_found_madt in mparse.c

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index b147ab1..0807256 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -35,8 +35,6 @@
 int smp_found_config;
 unsigned int __initdata maxcpus = NR_CPUS;
 
-int acpi_found_madt;
-
 /*
  * Various Linux-internal data structures created from the
  * MP-table.
-- 
cgit v0.10.2


From a1a70c25bed75ed36ed48bbe18b9029428d2452d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] i386: always enable regparm

-mregparm=3 has been enabled by default for some time on i386, and AFAIK
there aren't any problems with it left.

This patch removes the REGPARM config option and sets -mregparm=3
unconditionally.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/Documentation/stable_api_nonsense.txt b/Documentation/stable_api_nonsense.txt
index f39c9d7..a2afca3 100644
--- a/Documentation/stable_api_nonsense.txt
+++ b/Documentation/stable_api_nonsense.txt
@@ -62,9 +62,6 @@ consider the following facts about the Linux kernel:
       - different structures can contain different fields
       - Some functions may not be implemented at all, (i.e. some locks
 	compile away to nothing for non-SMP builds.)
-      - Parameter passing of variables from function to function can be
-	done in different ways (the CONFIG_REGPARM option controls
-	this.)
       - Memory within the kernel can be aligned in different ways,
 	depending on the build options.
   - Linux runs on a wide range of different processor architectures.
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index bb1fa06..b6b2df4 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -721,20 +721,6 @@ config BOOT_IOREMAP
 	depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
 	default y
 
-config REGPARM
-	bool "Use register arguments"
-	default y
-	help
-	Compile the kernel with -mregparm=3. This instructs gcc to use
-	a more efficient function call ABI which passes the first three
-	arguments of a function call via registers, which results in denser
-	and faster code.
-
-	If this option is disabled, then the default ABI of passing
-	arguments via the stack is used.
-
-	If unsure, say Y.
-
 config SECCOMP
 	bool "Enable seccomp to safely compute untrusted bytecode"
 	depends on PROC_FS
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index d1aca52..f7ac1ae 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -31,7 +31,7 @@ LDFLAGS_vmlinux := --emit-relocs
 endif
 CHECKFLAGS	+= -D__i386__
 
-CFLAGS += -pipe -msoft-float
+CFLAGS += -pipe -msoft-float -mregparm=3
 
 # prevent gcc from keeping the stack 16 byte aligned
 CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
@@ -39,8 +39,6 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
 # CPU-specific tuning. Anything which can be shared with UML should go here.
 include $(srctree)/arch/i386/Makefile.cpu
 
-cflags-$(CONFIG_REGPARM) += -mregparm=3
-
 # temporary until string.h is fixed
 cflags-y += -ffreestanding
 
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index fe5ae42..02f8f54 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -62,18 +62,12 @@ struct mod_arch_specific
 #error unknown processor family
 #endif
 
-#ifdef CONFIG_REGPARM
-#define MODULE_REGPARM "REGPARM "
-#else
-#define MODULE_REGPARM ""
-#endif
-
 #ifdef CONFIG_4KSTACKS
 #define MODULE_STACKSIZE "4KSTACKS "
 #else
 #define MODULE_STACKSIZE ""
 #endif
 
-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE
+#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
 
 #endif /* _ASM_I386_MODULE_H */
-- 
cgit v0.10.2


From 616779656989cb8c59177e35cb13e87028b1edc8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: Synchronize RDTSC on single core AMD

There is no guarantee that two RDTSCs in a row are monotonic,
so don't assume it on single core AMD systems.
This will make gettimeofday slower again
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a570c81..05eaca4 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -732,11 +732,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	/* Fix cpuid4 emulation for more */
 	num_cache_leaves = 3;
 
-	/* When there is only one core no need to synchronize RDTSC */
-	if (num_possible_cpus() == 1)
-	        set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-	else
-	        clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	/* RDTSC can be speculated around */
+	clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
 }
 
 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-- 
cgit v0.10.2


From e496a0da7f8110054d0ad4039245b0f49f9540f5 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] Calgary: remove unused variables

Spotted by d binderman <dcb314@hotmail.com>.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 0ddf29d..3215675 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -653,14 +653,9 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
 static void __init calgary_reserve_regions(struct pci_dev *dev)
 {
 	unsigned int npages;
-	void __iomem *bbar;
-	unsigned char busnum;
 	u64 start;
 	struct iommu_table *tbl = dev->sysdata;
 
-	bbar = tbl->bbar;
-	busnum = dev->bus->number;
-
 	/* reserve bad_dma_address in case it's a legal address */
 	iommu_range_reserve(tbl, bad_dma_address, 1);
 
-- 
cgit v0.10.2


From f3d73707a1e84f0687a05144b70b660441e999c7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: Mark rdtsc as sync only for netburst, not for core2

On the Core2 cpus, the rdtsc instruction is not serializing (as defined
in the architecture reference since rdtsc exists) and due to the deep
speculation of these cores, it's possible that you can observe time go
backwards between cores due to this speculation. Since the kernel
already deals with this with the SYNC_RDTSC flag, the solution is
simple, only assume that the instruction is serializing on family 15...

The price one pays for this is a slightly slower gettimeofday (by a
dozen or two cycles), but that increase is quite small to pay for a
really-going-forward tsc counter.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 05eaca4..6595a4e 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -860,7 +860,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
 	if (c->x86 == 6)
 		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-	set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	if (c->x86 == 15)
+		set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	else
+		clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
  	c->x86_max_cores = intel_num_cpu_cores(c);
 
 	srat_detect_node();
-- 
cgit v0.10.2


From e2124bb8d369a4bc1afde1959040e33d71c41d5e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] unwinder: Use probe_kernel_address instead of __get_user in
 kernel/unwind.c

This avoids trouble with the page fault handler if the fault
happens inside an interrupt context.

Suggested by Linus

Cc: jbeulich@novell.com
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/kernel/unwind.c b/kernel/unwind.c
index ed0a21d..af48168 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -14,6 +14,7 @@
 #include <linux/bootmem.h>
 #include <linux/sort.h>
 #include <linux/stop_machine.h>
+#include <linux/uaccess.h>
 #include <asm/sections.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
@@ -550,7 +551,7 @@ static unsigned long read_pointer(const u8 **pLoc,
 		return 0;
 	}
 	if ((ptrType & DW_EH_PE_indirect)
-	    && __get_user(value, (unsigned long *)value))
+	    && probe_kernel_address((unsigned long *)value, value))
 		return 0;
 	*pLoc = ptr.p8;
 
@@ -982,18 +983,19 @@ int unwind(struct unwind_frame_info *frame)
 		        & (sizeof(unsigned long) - 1))) {
 			unsigned long link;
 
-			if (!__get_user(link,
+			if (!probe_kernel_address(
 			                (unsigned long *)(UNW_FP(frame)
-			                                  + FRAME_LINK_OFFSET))
+			                                  + FRAME_LINK_OFFSET),
+						  link)
 # if FRAME_RETADDR_OFFSET < 0
 			   && link > bottom && link < UNW_FP(frame)
 # else
 			   && link > UNW_FP(frame) && link < bottom
 # endif
 			   && !(link & (sizeof(link) - 1))
-			   && !__get_user(UNW_PC(frame),
+			   && !probe_kernel_address(
 			                  (unsigned long *)(UNW_FP(frame)
-			                                    + FRAME_RETADDR_OFFSET))) {
+			                                    + FRAME_RETADDR_OFFSET), UNW_PC(frame))) {
 				UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
 # if FRAME_RETADDR_OFFSET < 0
 					-
@@ -1104,7 +1106,7 @@ int unwind(struct unwind_frame_info *frame)
 					return -EIO;
 				switch(reg_info[i].width) {
 #define CASE(n)     case sizeof(u##n): \
-					__get_user(FRAME_REG(i, u##n), (u##n *)addr); \
+					probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
 					break
 				CASES;
 #undef CASE
-- 
cgit v0.10.2


From 446f713ba1afd68568139ae4691335ba273fa7f4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] unwinder: always use unlocked module list access in unwinder
 fallback

We're already well protected against module unloads because module
unload uses stop_machine(). The only exception is NMIs, but other
users already risk lockless accesses here.

This avoids some hackery in lockdep and also a potential deadlock

This matches what i386 does.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 75cecce..9864d19 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -317,9 +317,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 #define HANDLE_STACK(cond) \
 	do while (cond) { \
 		unsigned long addr = *stack++; \
-		if (oops_in_progress ? 		\
-			__kernel_text_address(addr) : \
-			kernel_text_address(addr)) { \
+		/* Use unlocked access here because except for NMIs	\
+		   we should be already protected against module unloads */ \
+		if (__kernel_text_address(addr)) { \
 			/* \
 			 * If the address is either in the text segment of the \
 			 * kernel, or in the region which contains vmalloc'ed \
-- 
cgit v0.10.2


From eef5e0d185fc049bda11fa14ba286fbd357da896 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] unwinder: Remove lockdep disabling of nested locks for
 unwinder

Shouldn't be needed anymore since __kernel_text_address
is used unconditionally on x86-64

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c9fefdb..9bb8d78 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -228,11 +228,7 @@ static int save_trace(struct stack_trace *trace)
 	trace->skip = 3;
 	trace->all_contexts = 0;
 
-	/* Make sure to not recurse in case the the unwinder needs to tak
-e	   locks. */
-	lockdep_off();
 	save_stack_trace(trace, NULL);
-	lockdep_on();
 
 	trace->max_entries = trace->nr_entries;
 
-- 
cgit v0.10.2


From 359ad0d4015a9ab39243f2ebc4eb07915bd618b2 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] unwinder: more sanity checks in Dwarf2 unwinder

Tighten the requirements on both input to and output from the Dwarf2
unwinder.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 86d8476..c447807 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -161,12 +161,19 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
 {
 	struct ops_and_data *oad = (struct ops_and_data *)data;
 	int n = 0;
+	unsigned long sp = UNW_SP(info);
 
+	if (arch_unw_user_mode(info))
+		return -1;
 	while (unwind(info) == 0 && UNW_PC(info)) {
 		n++;
 		oad->ops->address(oad->data, UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
+		if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+		    && sp > UNW_SP(info))
+			break;
+		sp = UNW_SP(info);
 	}
 	return n;
 }
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 9864d19..4fdd162 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -225,12 +225,19 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
 {
 	struct ops_and_data *oad = (struct ops_and_data *)context;
 	int n = 0;
+	unsigned long sp = UNW_SP(info);
 
+	if (arch_unw_user_mode(info))
+		return -1;
 	while (unwind(info) == 0 && UNW_PC(info)) {
 		n++;
 		oad->ops->address(oad->data, UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
+		if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+		    && sp > UNW_SP(info))
+			break;
+		sp = UNW_SP(info);
 	}
 	return n;
 }
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index 601fc67..aa2c931 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -79,17 +79,13 @@ extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *,
                                                                           void *arg),
                                                void *arg);
 
-static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
+static inline int arch_unw_user_mode(/*const*/ struct unwind_frame_info *info)
 {
-#if 0 /* This can only work when selector register and EFLAGS saves/restores
-         are properly annotated (and tracked in UNW_REGISTER_INFO). */
-	return user_mode_vm(&info->regs);
-#else
-	return info->regs.eip < PAGE_OFFSET
+	return user_mode_vm(&info->regs)
+	       || info->regs.eip < PAGE_OFFSET
 	       || (info->regs.eip >= __fix_to_virt(FIX_VDSO)
-	            && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
+	           && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
 	       || info->regs.esp < PAGE_OFFSET;
-#endif
 }
 
 #else
diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h
index 2e7ff10..2f6349e 100644
--- a/include/asm-x86_64/unwind.h
+++ b/include/asm-x86_64/unwind.h
@@ -87,14 +87,10 @@ extern int arch_unwind_init_running(struct unwind_frame_info *,
 
 static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
 {
-#if 0 /* This can only work when selector register saves/restores
-         are properly annotated (and tracked in UNW_REGISTER_INFO). */
-	return user_mode(&info->regs);
-#else
-	return (long)info->regs.rip >= 0
+	return user_mode(&info->regs)
+	       || (long)info->regs.rip >= 0
 	       || (info->regs.rip >= VSYSCALL_START && info->regs.rip < VSYSCALL_END)
 	       || (long)info->regs.rsp >= 0;
-#endif
 }
 
 #else
diff --git a/kernel/unwind.c b/kernel/unwind.c
index af48168..7e721f1 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -95,6 +95,7 @@ static const struct {
 
 typedef unsigned long uleb128_t;
 typedef   signed long sleb128_t;
+#define sleb128abs __builtin_labs
 
 static struct unwind_table {
 	struct {
@@ -787,7 +788,7 @@ int unwind(struct unwind_frame_info *frame)
 #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
 	const u32 *fde = NULL, *cie = NULL;
 	const u8 *ptr = NULL, *end = NULL;
-	unsigned long pc = UNW_PC(frame) - frame->call_frame;
+	unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
 	unsigned long startLoc = 0, endLoc = 0, cfa;
 	unsigned i;
 	signed ptrType = -1;
@@ -936,6 +937,9 @@ int unwind(struct unwind_frame_info *frame)
 		state.dataAlign = get_sleb128(&ptr, end);
 		if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
 			cie = NULL;
+		else if (UNW_PC(frame) % state.codeAlign
+		         || UNW_SP(frame) % sleb128abs(state.dataAlign))
+			return -EPERM;
 		else {
 			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
 			/* skip augmentation */
@@ -968,6 +972,8 @@ int unwind(struct unwind_frame_info *frame)
 #ifdef CONFIG_FRAME_POINTER
 		unsigned long top, bottom;
 
+		if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
+			return -EPERM;
 		top = STACK_TOP(frame->task);
 		bottom = STACK_BOTTOM(frame->task);
 # if FRAME_RETADDR_OFFSET < 0
@@ -1018,6 +1024,7 @@ int unwind(struct unwind_frame_info *frame)
 	   || state.regs[retAddrReg].where == Nowhere
 	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
 	   || reg_info[state.cfa.reg].width != sizeof(unsigned long)
+	   || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
 	   || state.cfa.offs % sizeof(unsigned long))
 		return -EIO;
 	/* update frame */
@@ -1038,6 +1045,8 @@ int unwind(struct unwind_frame_info *frame)
 #else
 # define CASES CASE(8); CASE(16); CASE(32); CASE(64)
 #endif
+	pc = UNW_PC(frame);
+	sp = UNW_SP(frame);
 	for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
 		if (REG_INVALID(i)) {
 			if (state.regs[i].where == Nowhere)
@@ -1118,6 +1127,11 @@ int unwind(struct unwind_frame_info *frame)
 		}
 	}
 
+	if (UNW_PC(frame) % state.codeAlign
+	    || UNW_SP(frame) % sleb128abs(state.dataAlign)
+	    || (pc == UNW_PC(frame) && sp == UNW_SP(frame)))
+		return -EIO;
+
 	return 0;
 #undef CASES
 #undef FRAME_REG
-- 
cgit v0.10.2


From a0429d0d7a6116dedcb71d9128da904bf135f189 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Remove unwind stack pointer alignment forcing again

This was added as a workaround for the fallback unwinder not supporting
unaligned stack pointers properly. But now it was fixed to do that,
so it's not needed anymore

Cc: mingo@elte.hu
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 4fdd162..a1641ff 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -309,12 +309,6 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		if (tsk && tsk != current)
 			stack = (unsigned long *)tsk->thread.rsp;
 	}
-	/*
-	 * Align the stack pointer on word boundary, later loops
-	 * rely on that (and corruption / debug info bugs can cause
-	 * unaligned values here):
-	 */
-	stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
 
 	/*
 	 * Print function call entries within a stack. 'cond' is the
-- 
cgit v0.10.2


From d331e739f5ad2aaa9d8553891ba6ca823bdbce37 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Fix interrupt race in idle callback (3rd try)

Idle callbacks has some races when enter_idle() sets isidle and subsequent
interrupts that can happen on that CPU, before CPU goes to idle. Due to this,
an IDLE_END can get called before IDLE_START. To avoid these races, disable
interrupts before enter_idle and make sure that all idle routines do not
enable interrupts before entering idle.

Note that poll_idle() still has a this race as it has to enable interrupts
before going to idle. But, all other idle routines have the race fixed.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 0b7b4ca..a418ee4 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -127,6 +127,7 @@ static void default_idle(void)
  */
 static void poll_idle (void)
 {
+	local_irq_enable();
 	cpu_relax();
 }
 
@@ -208,6 +209,12 @@ void cpu_idle (void)
 				idle = default_idle;
 			if (cpu_is_offline(smp_processor_id()))
 				play_dead();
+			/*
+			 * Idle routines should keep interrupts disabled
+			 * from here on, until they go to idle.
+			 * Otherwise, idle callbacks can misfire.
+			 */
+			local_irq_disable();
 			enter_idle();
 			idle();
 			/* In many cases the interrupt that ended idle
@@ -245,8 +252,16 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
-	local_irq_enable();
-	mwait_idle_with_hints(0,0);
+	if (!need_resched()) {
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__sti_mwait(0, 0);
+		else
+			local_irq_enable();
+	} else {
+		local_irq_enable();
+	}
 }
 
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index cef17e0..76552d7 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -475,6 +475,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 		: :"a" (eax), "c" (ecx));
 }
 
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax,%ecx;" */
+	asm volatile(
+		"sti; .byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
 
 #define stack_current() \
-- 
cgit v0.10.2


From 3807fd46e94ab9f09e5ee3bff5e6515a94e9b3c7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Clarify error message in GART code

- Remove "Disabling IOMMU" message because it confuses people
- Clarify that the GART IOMMU is refered to in other message

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 16261a8..fc1960f 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -601,10 +601,9 @@ void __init gart_iommu_init(void)
 	    (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
 	    !iommu_aperture ||
 	    (no_agp && init_k8_gatt(&info) < 0)) {
-		printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
 		if (end_pfn > MAX_DMA32_PFN) {
 			printk(KERN_ERR "WARNING more than 4GB of memory "
-					"but IOMMU not available.\n"
+					"but GART IOMMU not available.\n"
 			       KERN_ERR "WARNING 32bit PCI may malfunction.\n");
 		}
 		return;
-- 
cgit v0.10.2


From 6d0185ea611276fdf81991d7774d396bdc1ae392 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] unwinder: Add debugging output to the Dwarf2 unwinder

Add debugging printks to the unwinder to allow easier debugging
when something goes wrong with it.

This can be controlled with the new unwinder_debug=N option
Most output is given by N=1

AK: Added documentation of unwinder_debug=

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4e90aa4..d34fd6a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1735,6 +1735,9 @@ and is between 256 and 4096 characters. It is defined in the file
 	norandmaps	Don't use address space randomization
 			Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
 
+ 	unwind_debug=N 	N > 0 will enable dwarf2 unwinder debugging
+			This is useful to get more information why
+			you got a "dwarf2 unwinder stuck"
 
 ______________________________________________________________________
 
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 7e721f1..209e248 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -137,6 +137,17 @@ struct unwind_state {
 
 static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
 
+static unsigned unwind_debug;
+static int __init unwind_debug_setup(char *s)
+{
+	unwind_debug = simple_strtoul(s, NULL, 0);
+	return 1;
+}
+__setup("unwind_debug=", unwind_debug_setup);
+#define dprintk(lvl, fmt, args...) \
+	((void)(lvl > unwind_debug \
+	 || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
+
 static struct unwind_table *find_table(unsigned long pc)
 {
 	struct unwind_table *table;
@@ -281,6 +292,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
 
 	hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
 	        + 2 * n * sizeof(unsigned long);
+	dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
 	header = alloc(hdrSize);
 	if (!header)
 		return;
@@ -500,13 +512,17 @@ static unsigned long read_pointer(const u8 **pLoc,
 		const unsigned long *pul;
 	} ptr;
 
-	if (ptrType < 0 || ptrType == DW_EH_PE_omit)
+	if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
+		dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
 		return 0;
+	}
 	ptr.p8 = *pLoc;
 	switch(ptrType & DW_EH_PE_FORM) {
 	case DW_EH_PE_data2:
-		if (end < (const void *)(ptr.p16u + 1))
+		if (end < (const void *)(ptr.p16u + 1)) {
+			dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		if(ptrType & DW_EH_PE_signed)
 			value = get_unaligned(ptr.p16s++);
 		else
@@ -514,8 +530,10 @@ static unsigned long read_pointer(const u8 **pLoc,
 		break;
 	case DW_EH_PE_data4:
 #ifdef CONFIG_64BIT
-		if (end < (const void *)(ptr.p32u + 1))
+		if (end < (const void *)(ptr.p32u + 1)) {
+			dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		if(ptrType & DW_EH_PE_signed)
 			value = get_unaligned(ptr.p32s++);
 		else
@@ -527,8 +545,10 @@ static unsigned long read_pointer(const u8 **pLoc,
 		BUILD_BUG_ON(sizeof(u32) != sizeof(value));
 #endif
 	case DW_EH_PE_native:
-		if (end < (const void *)(ptr.pul + 1))
+		if (end < (const void *)(ptr.pul + 1)) {
+			dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		value = get_unaligned(ptr.pul++);
 		break;
 	case DW_EH_PE_leb128:
@@ -536,10 +556,14 @@ static unsigned long read_pointer(const u8 **pLoc,
 		value = ptrType & DW_EH_PE_signed
 		        ? get_sleb128(&ptr.p8, end)
 		        : get_uleb128(&ptr.p8, end);
-		if ((const void *)ptr.p8 > end)
+		if ((const void *)ptr.p8 > end) {
+			dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		break;
 	default:
+		dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
+		        ptrType, ptr.p8, end);
 		return 0;
 	}
 	switch(ptrType & DW_EH_PE_ADJUST) {
@@ -549,11 +573,16 @@ static unsigned long read_pointer(const u8 **pLoc,
 		value += (unsigned long)*pLoc;
 		break;
 	default:
+		dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
+		        ptrType, *pLoc, end);
 		return 0;
 	}
 	if ((ptrType & DW_EH_PE_indirect)
-	    && probe_kernel_address((unsigned long *)value, value))
+	    && probe_kernel_address((unsigned long *)value, value)) {
+		dprintk(1, "Cannot read indirect value %lx (%p,%p).",
+		        value, *pLoc, end);
 		return 0;
+	}
 	*pLoc = ptr.p8;
 
 	return value;
@@ -702,8 +731,10 @@ static int processCFI(const u8 *start,
 					state->label = NULL;
 					return 1;
 				}
-				if (state->stackDepth >= MAX_STACK_DEPTH)
+				if (state->stackDepth >= MAX_STACK_DEPTH) {
+					dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
 					return 0;
+				}
 				state->stack[state->stackDepth++] = ptr.p8;
 				break;
 			case DW_CFA_restore_state:
@@ -718,8 +749,10 @@ static int processCFI(const u8 *start,
 					result = processCFI(start, end, 0, ptrType, state);
 					state->loc = loc;
 					state->label = label;
-				} else
+				} else {
+					dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
 					return 0;
+				}
 				break;
 			case DW_CFA_def_cfa:
 				state->cfa.reg = get_uleb128(&ptr.p8, end);
@@ -751,6 +784,7 @@ static int processCFI(const u8 *start,
 				break;
 			case DW_CFA_GNU_window_save:
 			default:
+				dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
 				result = 0;
 				break;
 			}
@@ -766,12 +800,17 @@ static int processCFI(const u8 *start,
 			set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
 			break;
 		}
-		if (ptr.p8 > end)
+		if (ptr.p8 > end) {
+			dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
 			result = 0;
+		}
 		if (result && targetLoc != 0 && targetLoc < state->loc)
 			return 1;
 	}
 
+	if (result && ptr.p8 < end)
+		dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
+
 	return result
 	   && ptr.p8 == end
 	   && (targetLoc == 0
@@ -843,6 +882,8 @@ int unwind(struct unwind_frame_info *frame)
 					                           hdr[3]);
 			}
 		}
+		if(hdr && !fde)
+			dprintk(3, "Binary lookup for %lx failed.", pc);
 
 		if (fde != NULL) {
 			cie = cie_for_fde(fde, table);
@@ -864,6 +905,8 @@ int unwind(struct unwind_frame_info *frame)
 					fde = NULL;
 			} else
 				fde = NULL;
+			if(!fde)
+				dprintk(1, "Binary lookup result for %lx discarded.", pc);
 		}
 		if (fde == NULL) {
 			for (fde = table->address, tableSize = table->size;
@@ -895,6 +938,8 @@ int unwind(struct unwind_frame_info *frame)
 				if (pc >= startLoc && pc < endLoc)
 					break;
 			}
+			if(!fde)
+				dprintk(3, "Linear lookup for %lx failed.", pc);
 		}
 	}
 	if (cie != NULL) {
@@ -928,6 +973,8 @@ int unwind(struct unwind_frame_info *frame)
 			if (ptr >= end || *ptr)
 				cie = NULL;
 		}
+		if(!cie)
+			dprintk(1, "CIE unusable (%p,%p).", ptr, end);
 		++ptr;
 	}
 	if (cie != NULL) {
@@ -938,9 +985,11 @@ int unwind(struct unwind_frame_info *frame)
 		if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
 			cie = NULL;
 		else if (UNW_PC(frame) % state.codeAlign
-		         || UNW_SP(frame) % sleb128abs(state.dataAlign))
+		         || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+			dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
+			        UNW_PC(frame), UNW_SP(frame));
 			return -EPERM;
-		else {
+		} else {
 			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
 			/* skip augmentation */
 			if (((const char *)(cie + 2))[1] == 'z') {
@@ -954,6 +1003,8 @@ int unwind(struct unwind_frame_info *frame)
 			   || reg_info[retAddrReg].width != sizeof(unsigned long))
 				cie = NULL;
 		}
+		if(!cie)
+			dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
 	}
 	if (cie != NULL) {
 		state.cieStart = ptr;
@@ -967,6 +1018,8 @@ int unwind(struct unwind_frame_info *frame)
 			if ((ptr += augSize) > end)
 				fde = NULL;
 		}
+		if(!fde)
+			dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
 	}
 	if (cie == NULL || fde == NULL) {
 #ifdef CONFIG_FRAME_POINTER
@@ -1025,8 +1078,10 @@ int unwind(struct unwind_frame_info *frame)
 	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
 	   || reg_info[state.cfa.reg].width != sizeof(unsigned long)
 	   || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
-	   || state.cfa.offs % sizeof(unsigned long))
+	   || state.cfa.offs % sizeof(unsigned long)) {
+		dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
 		return -EIO;
+	}
 	/* update frame */
 #ifndef CONFIG_AS_CFI_SIGNAL_FRAME
 	if(frame->call_frame
@@ -1051,6 +1106,8 @@ int unwind(struct unwind_frame_info *frame)
 		if (REG_INVALID(i)) {
 			if (state.regs[i].where == Nowhere)
 				continue;
+			dprintk(1, "Cannot restore register %u (%d).",
+			        i, state.regs[i].where);
 			return -EIO;
 		}
 		switch(state.regs[i].where) {
@@ -1059,8 +1116,11 @@ int unwind(struct unwind_frame_info *frame)
 		case Register:
 			if (state.regs[i].value >= ARRAY_SIZE(reg_info)
 			   || REG_INVALID(state.regs[i].value)
-			   || reg_info[i].width > reg_info[state.regs[i].value].width)
+			   || reg_info[i].width > reg_info[state.regs[i].value].width) {
+				dprintk(1, "Cannot restore register %u from register %lu.",
+				        i, state.regs[i].value);
 				return -EIO;
+			}
 			switch(reg_info[state.regs[i].value].width) {
 #define CASE(n) \
 			case sizeof(u##n): \
@@ -1070,6 +1130,9 @@ int unwind(struct unwind_frame_info *frame)
 			CASES;
 #undef CASE
 			default:
+				dprintk(1, "Unsupported register size %u (%lu).",
+				        reg_info[state.regs[i].value].width,
+				        state.regs[i].value);
 				return -EIO;
 			}
 			break;
@@ -1094,12 +1157,17 @@ int unwind(struct unwind_frame_info *frame)
 			CASES;
 #undef CASE
 			default:
+				dprintk(1, "Unsupported register size %u (%u).",
+				        reg_info[i].width, i);
 				return -EIO;
 			}
 			break;
 		case Value:
-			if (reg_info[i].width != sizeof(unsigned long))
+			if (reg_info[i].width != sizeof(unsigned long)) {
+				dprintk(1, "Unsupported value size %u (%u).",
+				        reg_info[i].width, i);
 				return -EIO;
+			}
 			FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
 			                                    * state.dataAlign;
 			break;
@@ -1111,8 +1179,11 @@ int unwind(struct unwind_frame_info *frame)
 				    % sizeof(unsigned long)
 				    || addr < startLoc
 				    || addr + sizeof(unsigned long) < addr
-				    || addr + sizeof(unsigned long) > endLoc)
+				    || addr + sizeof(unsigned long) > endLoc) {
+					dprintk(1, "Bad memory location %lx (%lx).",
+					        addr, state.regs[i].value);
 					return -EIO;
+				}
 				switch(reg_info[i].width) {
 #define CASE(n)     case sizeof(u##n): \
 					probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
@@ -1120,6 +1191,8 @@ int unwind(struct unwind_frame_info *frame)
 				CASES;
 #undef CASE
 				default:
+					dprintk(1, "Unsupported memory size %u (%u).",
+					        reg_info[i].width, i);
 					return -EIO;
 				}
 			}
@@ -1128,9 +1201,15 @@ int unwind(struct unwind_frame_info *frame)
 	}
 
 	if (UNW_PC(frame) % state.codeAlign
-	    || UNW_SP(frame) % sleb128abs(state.dataAlign)
-	    || (pc == UNW_PC(frame) && sp == UNW_SP(frame)))
+	    || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+		dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
+		        UNW_PC(frame), UNW_SP(frame));
 		return -EIO;
+	}
+	if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
+		dprintk(1, "No progress (%lx,%lx).", pc, sp);
+		return -EIO;
+	}
 
 	return 0;
 #undef CASES
-- 
cgit v0.10.2


From 9ee4016888f1b226d7bb9a10ddb782868f658b31 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: include/asm-x86_64/cpufeature.h isn't a userspace
 header

Nothing in include/asm-x86_64/cpufeature.h is part of the
userspace<->kernel interface.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild
index 1ee9b07..6c455b4 100644
--- a/include/asm-x86_64/Kbuild
+++ b/include/asm-x86_64/Kbuild
@@ -6,7 +6,6 @@ ALTARCHDEF := defined __i386__
 
 header-y += boot.h
 header-y += bootsetup.h
-header-y += cpufeature.h
 header-y += debugreg.h
 header-y += ldt.h
 header-y += msr.h
-- 
cgit v0.10.2


From d5d2448d896fbb9a427ee12eb8e5f6309f2473f7 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Fix numaq build error

  CC      arch/i386/boot/compressed/misc.o
arch/i386/boot/compressed/misc.c:120: error: static declaration of 'xquad_portio' follows non-static declaration
include/asm/io.h:275: error: previous declaration of 'xquad_portio' was here
make[2]: *** [arch/i386/boot/compressed/misc.o] Error 1

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index c6798c75..1ce7017 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -196,7 +196,7 @@ static int vidport;
 static int lines, cols;
 
 #ifdef CONFIG_X86_NUMAQ
-static void * xquad_portio = NULL;
+void *xquad_portio;
 #endif
 
 #include "../../../../lib/inflate.c"
-- 
cgit v0.10.2


From 9cfa5b5dfafcfe64c1a48906f243cdd302f82471 Mon Sep 17 00:00:00 2001
From: Burman Yan <yan_952@hotmail.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: replace kmalloc+memset with kzalloc in MTRR code

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 9753bc6..5ae1705 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size,
 
 	max = num_var_ranges;
 	if (fcount == NULL) {
-		fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL);
+		fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
 		if (!fcount)
 			return -ENOMEM;
-		memset(fcount, 0, max * sizeof *fcount);
 		FILE_FCOUNT(file) = fcount;
 	}
 	if (!page) {
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index aeea23e..16bb7ea 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -596,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 	int i;
 	int size = num_var_ranges * sizeof(struct mtrr_value);
 
-	mtrr_state = kmalloc(size,GFP_ATOMIC);
-	if (mtrr_state)
-		memset(mtrr_state,0,size);
-	else
+	mtrr_state = kzalloc(size,GFP_ATOMIC);
+	if (!mtrr_state)
 		return -ENOMEM;
 
 	for (i = 0; i < num_var_ranges; i++) {
-- 
cgit v0.10.2


From d263b213577a1e8f166b0a7212d85175e36d6c19 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Correct documentation for bzImage protocol v2.05

Correct the documentation for bzImage protocol extension due to relocatable
bzImage.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt
index cb28254..9575de3 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/i386/boot.txt
@@ -2,7 +2,7 @@
 		     ----------------------------
 
 		    H. Peter Anvin <hpa@zytor.com>
-			Last update 2005-09-02
+			Last update 2006-11-17
 
 On the i386 platform, the Linux kernel uses a rather complicated boot
 convention.  This has evolved partially due to historical aspects, as
@@ -131,8 +131,8 @@ Offset	Proto	Name		Meaning
 0226/2	N/A	pad1		Unused
 0228/4	2.02+	cmd_line_ptr	32-bit pointer to the kernel command line
 022C/4	2.03+	initrd_addr_max	Highest legal initrd address
-0230/4	2.04+	kernel_alignment Physical addr alignment required for kernel
-0234/1	2.04+	relocatable_kernel Whether kernel is relocatable or not
+0230/4	2.05+	kernel_alignment Physical addr alignment required for kernel
+0234/1	2.05+	relocatable_kernel Whether kernel is relocatable or not
 
 (1) For backwards compatibility, if the setup_sects field contains 0, the
     real value is 4.
-- 
cgit v0.10.2


From e4b522d7ef144fb2ad6a4cb23d9cb5ec154be8bc Mon Sep 17 00:00:00 2001
From: Duncan Sands <baldrick@free.fr>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: fix asm constraints in i386 atomic_add_return

Since v->counter is both read and written, it should be an output as well
as an input for the asm.  The current code only gets away with this because
counter is volatile.  Also, according to Documents/atomic_ops.txt,
atomic_add_return should provide a memory barrier, in particular a compiler
barrier, so the asm should be marked as clobbering memory.

Test case:

#include <stdio.h>

typedef struct { int counter; } atomic_t; /* NB: no "volatile" */

#define ATOMIC_INIT(i)	{ (i) }

#define atomic_read(v)		((v)->counter)

static __inline__ int atomic_add_return(int i, atomic_t *v)
{
	int __i = i;

	__asm__ __volatile__(
		"lock; xaddl %0, %1;"
		:"=r"(i)
		:"m"(v->counter), "0"(i));
/*	__asm__ __volatile__(
		"lock; xaddl %0, %1"
		:"+r" (i), "+m" (v->counter)
		: : "memory"); */
	return i + __i;
}

int main (void) {
	atomic_t a = ATOMIC_INIT(0);
	int x;

	x = atomic_add_return (1, &a);
	if ((x!=1) || (atomic_read(&a)!=1))
		printf("fail: %i, %i\n", x, atomic_read(&a));
}

Signed-off-by: Duncan Sands <baldrick@free.fr>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h
index 51a1662..6aab7a1 100644
--- a/include/asm-i386/atomic.h
+++ b/include/asm-i386/atomic.h
@@ -187,9 +187,9 @@ static __inline__ int atomic_add_return(int i, atomic_t *v)
 	/* Modern 486+ processor */
 	__i = i;
 	__asm__ __volatile__(
-		LOCK_PREFIX "xaddl %0, %1;"
-		:"=r"(i)
-		:"m"(v->counter), "0"(i));
+		LOCK_PREFIX "xaddl %0, %1"
+		:"+r" (i), "+m" (v->counter)
+		: : "memory");
 	return i + __i;
 
 #ifdef CONFIG_M386
-- 
cgit v0.10.2


From 9dc452ba2d47f376987a99c0819833af0b46cc3f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Fix constraints in atomic_add_return()

Following i386 from Duncan Sands
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/include/asm-x86_64/atomic.h b/include/asm-x86_64/atomic.h
index 007e88d..626d371 100644
--- a/include/asm-x86_64/atomic.h
+++ b/include/asm-x86_64/atomic.h
@@ -189,9 +189,9 @@ static __inline__ int atomic_add_return(int i, atomic_t *v)
 {
 	int __i = i;
 	__asm__ __volatile__(
-		LOCK_PREFIX "xaddl %0, %1;"
-		:"=r"(i)
-		:"m"(v->counter), "0"(i));
+		LOCK_PREFIX "xaddl %0, %1"
+		:"+r" (i), "+m" (v->counter)
+		: : "memory");
 	return i + __i;
 }
 
-- 
cgit v0.10.2


From f475ff352c5e05d473c462b97c3a13a5b803af5a Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: remove unused variable

Remove unused variable in msr_write().

Reported by D Binderman <dcb314@hotmail.com>.

Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index a773f77..fd45059 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -195,7 +195,6 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
 {
 	const u32 __user *tmp = (const u32 __user *)buf;
 	u32 data[2];
-	size_t rv;
 	u32 reg = *ppos;
 	int cpu = iminor(file->f_dentry->d_inode);
 	int err;
@@ -203,7 +202,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
 	if (count % 8)
 		return -EINVAL;	/* Invalid chunk size */
 
-	for (rv = 0; count; count -= 8) {
+	for (; count; count -= 8) {
 		if (copy_from_user(&data, tmp, 8))
 			return -EFAULT;
 		err = do_wrmsr(cpu, reg, data[0], data[1]);
-- 
cgit v0.10.2


From d7fb02712818643bab79a6b3cb8270a747d0227b Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] x86-64: remove remaining pc98 code

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 56f571c..7f015a7 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -842,8 +842,7 @@ static int __init find_isa_irq_pin(int irq, int type)
 
 		if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
 		     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_NEC98
+		     mp_bus_id_to_type[lbus] == MP_BUS_MCA
 		    ) &&
 		    (mp_irqs[i].mpc_irqtype == type) &&
 		    (mp_irqs[i].mpc_srcbusirq == irq))
@@ -862,8 +861,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 
 		if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
 		     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
-		     mp_bus_id_to_type[lbus] == MP_BUS_NEC98
+		     mp_bus_id_to_type[lbus] == MP_BUS_MCA
 		    ) &&
 		    (mp_irqs[i].mpc_irqtype == type) &&
 		    (mp_irqs[i].mpc_srcbusirq == irq))
@@ -993,12 +991,6 @@ static int EISA_ELCR(unsigned int irq)
 #define default_MCA_trigger(idx)	(1)
 #define default_MCA_polarity(idx)	(0)
 
-/* NEC98 interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_NEC98_trigger(idx)     (0)
-#define default_NEC98_polarity(idx)    (0)
-
 static int __init MPBIOS_polarity(int idx)
 {
 	int bus = mp_irqs[idx].mpc_srcbus;
@@ -1033,11 +1025,6 @@ static int __init MPBIOS_polarity(int idx)
 					polarity = default_MCA_polarity(idx);
 					break;
 				}
-				case MP_BUS_NEC98: /* NEC 98 pin */
-				{
-					polarity = default_NEC98_polarity(idx);
-					break;
-				}
 				default:
 				{
 					printk(KERN_WARNING "broken BIOS!!\n");
@@ -1107,11 +1094,6 @@ static int MPBIOS_trigger(int idx)
 					trigger = default_MCA_trigger(idx);
 					break;
 				}
-				case MP_BUS_NEC98: /* NEC 98 pin */
-				{
-					trigger = default_NEC98_trigger(idx);
-					break;
-				}
 				default:
 				{
 					printk(KERN_WARNING "broken BIOS!!\n");
@@ -1173,7 +1155,6 @@ static int pin_2_irq(int idx, int apic, int pin)
 		case MP_BUS_ISA: /* ISA pin */
 		case MP_BUS_EISA:
 		case MP_BUS_MCA:
-		case MP_BUS_NEC98:
 		{
 			irq = mp_irqs[idx].mpc_srcbusirq;
 			break;
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 442aaf8..2ce6722 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
 		mp_current_pci_id++;
 	} else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
-	} else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
-		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
 	} else {
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 	}
diff --git a/include/asm-i386/mpspec_def.h b/include/asm-i386/mpspec_def.h
index 76feedf..13bafb1 100644
--- a/include/asm-i386/mpspec_def.h
+++ b/include/asm-i386/mpspec_def.h
@@ -97,7 +97,6 @@ struct mpc_config_bus
 #define BUSTYPE_TC	"TC"
 #define BUSTYPE_VME	"VME"
 #define BUSTYPE_XPRESS	"XPRESS"
-#define BUSTYPE_NEC98	"NEC98"
 
 struct mpc_config_ioapic
 {
@@ -182,7 +181,6 @@ enum mp_bustype {
 	MP_BUS_EISA,
 	MP_BUS_PCI,
 	MP_BUS_MCA,
-	MP_BUS_NEC98
 };
 #endif
 
-- 
cgit v0.10.2


From 116780fc04d9f6cd3ceeab0251681f1dfda53367 Mon Sep 17 00:00:00 2001
From: Burman Yan <yan_952@hotmail.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] i386: replace kmalloc+memset with kzalloc

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 5c43be4..80b4c5d 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
 	if (num_cache_leaves == 0)
 		return -ENOENT;
 
-	cpuid4_info[cpu] = kmalloc(
+	cpuid4_info[cpu] = kzalloc(
 	    sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
 	if (unlikely(cpuid4_info[cpu] == NULL))
 		return -ENOMEM;
-	memset(cpuid4_info[cpu], 0,
-	    sizeof(struct _cpuid4_info) * num_cache_leaves);
 
 	oldmask = current->cpus_allowed;
 	retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
 		return -ENOENT;
 
 	/* Allocate all required memory */
-	cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL);
+	cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
 	if (unlikely(cache_kobject[cpu] == NULL))
 		goto err_out;
-	memset(cache_kobject[cpu], 0, sizeof(struct kobject));
 
-	index_kobject[cpu] = kmalloc(
+	index_kobject[cpu] = kzalloc(
 	    sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
 	if (unlikely(index_kobject[cpu] == NULL))
 		goto err_out;
-	memset(index_kobject[cpu], 0,
-	    sizeof(struct _index_kobject) * num_cache_leaves);
 
 	return 0;
 
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index eb57a85..b83672b 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -283,10 +283,9 @@ static int __init mca_init(void)
 	bus->f.mca_transform_memory = mca_dummy_transform_memory;
 
 	/* get the motherboard device */
-	mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL);
+	mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
 	if(unlikely(!mca_dev))
 		goto out_nomem;
-	memset(mca_dev, 0, sizeof(struct mca_device));
 
 	/*
 	 * We do not expect many MCA interrupts during initialization,
@@ -310,11 +309,9 @@ static int __init mca_init(void)
 	mca_dev->slot = MCA_MOTHERBOARD;
 	mca_register_device(MCA_PRIMARY_BUS, mca_dev);
 
-	mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
+	mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
 	if(unlikely(!mca_dev))
 		goto out_unlock_nomem;
-	memset(mca_dev, 0, sizeof(struct mca_device));
-
 
 	/* Put motherboard into video setup mode, read integrated video
 	 * POS registers, and turn motherboard setup off.
@@ -349,10 +346,9 @@ static int __init mca_init(void)
 	}
 	if(which_scsi) {
 		/* found a scsi card */
-		mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
+		mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
 		if(unlikely(!mca_dev))
 			goto out_unlock_nomem;
-		memset(mca_dev, 0, sizeof(struct mca_device));
 
 		for(j = 0; j < 8; j++)
 			mca_dev->pos[j] = pos[j];
@@ -378,10 +374,9 @@ static int __init mca_init(void)
 		if(!mca_read_and_store_pos(pos))
 			continue;
 
-		mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
+		mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
 		if(unlikely(!mca_dev))
 			goto out_unlock_nomem;
-		memset(mca_dev, 0, sizeof(struct mca_device));
 
 		for(j=0; j<8; j++)
 			mca_dev->pos[j]=pos[j];
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 5c8c6ef..41af692 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
 	if (!mem_base)
 		goto out;
 
-	dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+	dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
 	if (!dev->dma_mem)
 		goto out;
-	memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
-	dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
+	dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
 	if (!dev->dma_mem->bitmap)
 		goto free1_out;
-	memset(dev->dma_mem->bitmap, 0, bitmap_size);
 
 	dev->dma_mem->virt_base = mem_base;
 	dev->dma_mem->device_base = device_addr;
diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c
index f50c6c6..943a947 100644
--- a/arch/i386/mach-voyager/voyager_cat.c
+++ b/arch/i386/mach-voyager/voyager_cat.c
@@ -776,7 +776,7 @@ voyager_cat_init(void)
 		for(asic=0; asic < (*modpp)->num_asics; asic++) {
 			int j;
 			voyager_asic_t *asicp = *asicpp 
-				= kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
+				= kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
 			voyager_sp_table_t *sp_table;
 			voyager_at_t *asic_table;
 			voyager_jtt_t *jtag_table;
@@ -785,7 +785,6 @@ voyager_cat_init(void)
 				printk("**WARNING** kmalloc failure in cat_init\n");
 				continue;
 			}
-			memset(asicp, 0, sizeof(voyager_asic_t));
 			asicpp = &(asicp->next);
 			asicp->asic_location = asic;
 			sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset);
@@ -851,8 +850,7 @@ voyager_cat_init(void)
 #endif
 
 		{
-			struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL);
-			memset(res, 0, sizeof(struct resource));
+			struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL);
 			res->name = kmalloc(128, GFP_KERNEL);
 			sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i));
 			res->start = qic_addr;
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 54d6d7a..46d32ad 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -95,15 +95,7 @@ static inline struct thread_info *current_thread_info(void)
 
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
-#define alloc_thread_info(tsk)					\
-	({							\
-		struct thread_info *ret;			\
-								\
-		ret = kmalloc(THREAD_SIZE, GFP_KERNEL);		\
-		if (ret)					\
-			memset(ret, 0, THREAD_SIZE);		\
-		ret;						\
-	})
+#define alloc_thread_info(tsk) kzalloc(THREAD_SIZE, GFP_KERNEL)
 #else
 #define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL)
 #endif
-- 
cgit v0.10.2


From f6ca8083c261864fc9de94ef99c3311ea259c5c3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] x86-64: Make ix86 default to HIGHMEM4G instead of NOHIGHMEM

Generally better for allmodconfig coverage.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index b6b2df4..ea70359 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -454,7 +454,8 @@ source "drivers/firmware/Kconfig"
 
 choice
 	prompt "High Memory Support"
-	default NOHIGHMEM
+	default HIGHMEM4G if !X86_NUMAQ
+	default HIGHMEM64G if X86_NUMAQ
 
 config NOHIGHMEM
 	bool "off"
-- 
cgit v0.10.2


From ad892f5e0d01f3c3b475a688d1ddc211cf3ea56d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] x86-64: check vector in setup_ioapic_dest to verify if need
 setup_IO_APIC_irq

setup_IO_APIC_irqs could fail to get vector for some device when you have too
many devices, because at that time only boot cpu is online.  So check vector
for irq in setup_ioapic_dest and call setup_IO_APIC_irq to make sure IO-APIC
irq-routing table is initialized.

Also seperate setup_IO_APIC_irq from setup_IO_APIC_irqs.

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index eaf0b70..2a1dcd5 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -789,27 +789,65 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 					      handle_edge_irq, "edge");
 	}
 }
-
-static void __init setup_IO_APIC_irqs(void)
+static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
 {
 	struct IO_APIC_route_entry entry;
-	int apic, pin, idx, irq, first_notcon = 1, vector;
+	int vector;
 	unsigned long flags;
 
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-	for (apic = 0; apic < nr_ioapics; apic++) {
-	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+	/*
+	 * add it to the IO-APIC irq-routing table:
+	 */
+	memset(&entry,0,sizeof(entry));
 
-		/*
-		 * add it to the IO-APIC irq-routing table:
-		 */
-		memset(&entry,0,sizeof(entry));
+	entry.delivery_mode = INT_DELIVERY_MODE;
+	entry.dest_mode = INT_DEST_MODE;
+	entry.mask = 0;				/* enable IRQ */
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+	entry.trigger = irq_trigger(idx);
+	entry.polarity = irq_polarity(idx);
 
-		entry.delivery_mode = INT_DELIVERY_MODE;
-		entry.dest_mode = INT_DEST_MODE;
-		entry.mask = 0;				/* enable IRQ */
+	if (irq_trigger(idx)) {
+		entry.trigger = 1;
+		entry.mask = 1;
 		entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	}
+
+	if (!apic && !IO_APIC_IRQ(irq))
+		return;
+
+	if (IO_APIC_IRQ(irq)) {
+		cpumask_t mask;
+		vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
+		if (vector < 0)
+			return;
+
+		entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+		entry.vector = vector;
+
+		ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+		if (!apic && (irq < 16))
+			disable_8259A_irq(irq);
+	}
+
+	ioapic_write_entry(apic, pin, entry);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	set_native_irq_info(irq, TARGET_CPUS);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+	int apic, pin, idx, irq, first_notcon = 1;
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
@@ -821,39 +859,11 @@ static void __init setup_IO_APIC_irqs(void)
 			continue;
 		}
 
-		entry.trigger = irq_trigger(idx);
-		entry.polarity = irq_polarity(idx);
-
-		if (irq_trigger(idx)) {
-			entry.trigger = 1;
-			entry.mask = 1;
-			entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
-		}
-
 		irq = pin_2_irq(idx, apic, pin);
 		add_pin_to_irq(irq, apic, pin);
 
-		if (!apic && !IO_APIC_IRQ(irq))
-			continue;
+		setup_IO_APIC_irq(apic, pin, idx, irq);
 
-		if (IO_APIC_IRQ(irq)) {
-			cpumask_t mask;
-			vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
-			if (vector < 0)
-				continue;
-
-			entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
-			entry.vector = vector;
-
-			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-			if (!apic && (irq < 16))
-				disable_8259A_irq(irq);
-		}
-		ioapic_write_entry(apic, pin, entry);
-
-		spin_lock_irqsave(&ioapic_lock, flags);
-		set_native_irq_info(irq, TARGET_CPUS);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
 
@@ -2139,7 +2149,15 @@ void __init setup_ioapic_dest(void)
 			if (irq_entry == -1)
 				continue;
 			irq = pin_2_irq(irq_entry, ioapic, pin);
-			set_ioapic_affinity_irq(irq, TARGET_CPUS);
+
+			/* setup_IO_APIC_irqs could fail to get vector for some device
+			 * when you have too many devices, because at that time only boot
+			 * cpu is online.
+			 */
+			if(!irq_vector[irq])
+				setup_IO_APIC_irq(ioapic, pin, irq_entry, irq);
+			else
+				set_ioapic_affinity_irq(irq, TARGET_CPUS);
 		}
 
 	}
-- 
cgit v0.10.2


From 6bedb2ccb02dcc70ffc8eb76df71c746378190ad Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] x86-64: don't use set_irq_regs()

We don't need to setup _irq_regs in smp_xxx_interrupt (except apic timer).
These handlers run with irqs disabled and do not call functions which need
"struct pt_regs".

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@suse.de>
Acked-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c65..1b080ab 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu)
 
 fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
 {
-	struct pt_regs *old_regs = set_irq_regs(regs);
 	unsigned long cpu;
 
 	cpu = get_cpu();
@@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
 	smp_mb__after_clear_bit();
 out:
 	put_cpu_no_resched();
-	set_irq_regs(old_regs);
 }
 
 static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -607,14 +605,11 @@ void smp_send_stop(void)
  */
 fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
 {
-	struct pt_regs *old_regs = set_irq_regs(regs);
 	ack_APIC_irq();
-	set_irq_regs(old_regs);
 }
 
 fastcall void smp_call_function_interrupt(struct pt_regs *regs)
 {
-	struct pt_regs *old_regs = set_irq_regs(regs);
 	void (*func) (void *info) = call_data->func;
 	void *info = call_data->info;
 	int wait = call_data->wait;
@@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
 		mb();
 		atomic_inc(&call_data->finished);
 	}
-	set_irq_regs(old_regs);
 }
 
 /*
-- 
cgit v0.10.2


From c65f38d911aa301cea109d38d40925750dd6c2da Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] unwinder: fully support linker generated .eh_frame_hdr
 section

Now that binutils' ld is able to properly populate .eh_frame_hdr in the
Linux kernel case, here's a patch to add some functionality to the Dwarf2
unwinder to actually be able to make use of this (applies on firstfloor
tree with the previously sent patch to add debug output, but not on plain
2.6.19).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/kernel/unwind.c b/kernel/unwind.c
index 209e248..08645aa 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -164,7 +164,9 @@ static struct unwind_table *find_table(unsigned long pc)
 
 static unsigned long read_pointer(const u8 **pLoc,
                                   const void *end,
-                                  signed ptrType);
+                                  signed ptrType,
+                                  unsigned long text_base,
+                                  unsigned long data_base);
 
 static void init_unwind_table(struct unwind_table *table,
                               const char *name,
@@ -189,10 +191,13 @@ static void init_unwind_table(struct unwind_table *table,
 	/* See if the linker provided table looks valid. */
 	if (header_size <= 4
 	    || header_start[0] != 1
-	    || (void *)read_pointer(&ptr, end, header_start[1]) != table_start
-	    || header_start[2] == DW_EH_PE_omit
-	    || read_pointer(&ptr, end, header_start[2]) <= 0
-	    || header_start[3] == DW_EH_PE_omit)
+	    || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
+	       != table_start
+	    || !read_pointer(&ptr, end, header_start[2], 0, 0)
+	    || !read_pointer(&ptr, end, header_start[3], 0,
+	                     (unsigned long)header_start)
+	    || !read_pointer(&ptr, end, header_start[3], 0,
+	                     (unsigned long)header_start))
 		header_start = NULL;
 	table->hdrsz = header_size;
 	smp_wmb();
@@ -282,7 +287,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
 		ptr = (const u8 *)(fde + 2);
 		if (!read_pointer(&ptr,
 		                  (const u8 *)(fde + 1) + *fde,
-		                  ptrType))
+		                  ptrType, 0, 0))
 			return;
 		++n;
 	}
@@ -317,7 +322,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
 		ptr = (const u8 *)(fde + 2);
 		header->table[n].start = read_pointer(&ptr,
 		                                      (const u8 *)(fde + 1) + *fde,
-		                                      fde_pointer_type(cie));
+		                                      fde_pointer_type(cie), 0, 0);
 		header->table[n].fde = (unsigned long)fde;
 		++n;
 	}
@@ -500,7 +505,9 @@ static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
 
 static unsigned long read_pointer(const u8 **pLoc,
                                   const void *end,
-                                  signed ptrType)
+                                  signed ptrType,
+                                  unsigned long text_base,
+                                  unsigned long data_base)
 {
 	unsigned long value = 0;
 	union {
@@ -572,6 +579,22 @@ static unsigned long read_pointer(const u8 **pLoc,
 	case DW_EH_PE_pcrel:
 		value += (unsigned long)*pLoc;
 		break;
+	case DW_EH_PE_textrel:
+		if (likely(text_base)) {
+			value += text_base;
+			break;
+		}
+		dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
+		        ptrType, *pLoc, end);
+		return 0;
+	case DW_EH_PE_datarel:
+		if (likely(data_base)) {
+			value += data_base;
+			break;
+		}
+		dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
+		        ptrType, *pLoc, end);
+		return 0;
 	default:
 		dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
 		        ptrType, *pLoc, end);
@@ -625,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie)
 			case 'P': {
 					signed ptrType = *ptr++;
 
-					if (!read_pointer(&ptr, end, ptrType) || ptr > end)
+					if (!read_pointer(&ptr, end, ptrType, 0, 0)
+					    || ptr > end)
 						return -1;
 				}
 				break;
@@ -685,7 +709,8 @@ static int processCFI(const u8 *start,
 			case DW_CFA_nop:
 				break;
 			case DW_CFA_set_loc:
-				if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
+				state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
+				if (state->loc == 0)
 					result = 0;
 				break;
 			case DW_CFA_advance_loc1:
@@ -854,9 +879,9 @@ int unwind(struct unwind_frame_info *frame)
 			ptr = hdr + 4;
 			end = hdr + table->hdrsz;
 			if (tableSize
-			    && read_pointer(&ptr, end, hdr[1])
+			    && read_pointer(&ptr, end, hdr[1], 0, 0)
 			       == (unsigned long)table->address
-			    && (i = read_pointer(&ptr, end, hdr[2])) > 0
+			    && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
 			    && i == (end - ptr) / (2 * tableSize)
 			    && !((end - ptr) % (2 * tableSize))) {
 				do {
@@ -864,7 +889,8 @@ int unwind(struct unwind_frame_info *frame)
 
 					startLoc = read_pointer(&cur,
 					                        cur + tableSize,
-					                        hdr[3]);
+					                        hdr[3], 0,
+					                        (unsigned long)hdr);
 					if (pc < startLoc)
 						i /= 2;
 					else {
@@ -875,11 +901,13 @@ int unwind(struct unwind_frame_info *frame)
 				if (i == 1
 				    && (startLoc = read_pointer(&ptr,
 				                                ptr + tableSize,
-				                                hdr[3])) != 0
+				                                hdr[3], 0,
+				                                (unsigned long)hdr)) != 0
 				    && pc >= startLoc)
 					fde = (void *)read_pointer(&ptr,
 					                           ptr + tableSize,
-					                           hdr[3]);
+					                           hdr[3], 0,
+					                           (unsigned long)hdr);
 			}
 		}
 		if(hdr && !fde)
@@ -894,13 +922,13 @@ int unwind(struct unwind_frame_info *frame)
 			   && (ptrType = fde_pointer_type(cie)) >= 0
 			   && read_pointer(&ptr,
 			                   (const u8 *)(fde + 1) + *fde,
-			                   ptrType) == startLoc) {
+			                   ptrType, 0, 0) == startLoc) {
 				if (!(ptrType & DW_EH_PE_indirect))
 					ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
 				endLoc = startLoc
 				         + read_pointer(&ptr,
 				                        (const u8 *)(fde + 1) + *fde,
-				                        ptrType);
+				                        ptrType, 0, 0);
 				if(pc >= endLoc)
 					fde = NULL;
 			} else
@@ -926,7 +954,7 @@ int unwind(struct unwind_frame_info *frame)
 				ptr = (const u8 *)(fde + 2);
 				startLoc = read_pointer(&ptr,
 				                        (const u8 *)(fde + 1) + *fde,
-				                        ptrType);
+				                        ptrType, 0, 0);
 				if (!startLoc)
 					continue;
 				if (!(ptrType & DW_EH_PE_indirect))
@@ -934,7 +962,7 @@ int unwind(struct unwind_frame_info *frame)
 				endLoc = startLoc
 				         + read_pointer(&ptr,
 				                        (const u8 *)(fde + 1) + *fde,
-				                        ptrType);
+				                        ptrType, 0, 0);
 				if (pc >= startLoc && pc < endLoc)
 					break;
 			}
-- 
cgit v0.10.2


From b65780e123ba9b762276482bbfb52836e4d41fd9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] unwinder: move .eh_frame to RODATA

The .eh_frame section contents is never written to, so it can as well
benefit from CONFIG_DEBUG_RODATA.

Diff-ed against firstfloor tree.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 25581e8..56e6ad5c 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -102,15 +102,6 @@ SECTIONS
 	_edata = .;		/* End of data section */
   }
 
-#ifdef CONFIG_STACK_UNWIND
-  . = ALIGN(4);
-  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
-	__start_unwind = .;
-  	*(.eh_frame)
-	__end_unwind = .;
-  }
-#endif
-
   . = ALIGN(THREAD_SIZE);	/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 	*(.data.init_task)
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index d9534e7..6a1f8f4 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -51,15 +51,6 @@ SECTIONS
 
   RODATA
 
-#ifdef CONFIG_STACK_UNWIND
-  . = ALIGN(8);
-  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
-	__start_unwind = .;
-  	*(.eh_frame)
-	__end_unwind = .;
-  }
-#endif
-
   . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 9f47477..4d4c62d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -119,8 +119,7 @@
 		*(__ksymtab_strings)					\
 	}								\
 									\
-	/* Unwind data binary search table */				\
-	EH_FRAME_HDR							\
+	EH_FRAME							\
 									\
 	/* Built-in module parameters. */				\
 	__param : AT(ADDR(__param) - LOAD_OFFSET) {			\
@@ -162,15 +161,23 @@
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
 #ifdef CONFIG_STACK_UNWIND
-		/* Unwind data binary search table */
-#define EH_FRAME_HDR							\
+#define EH_FRAME							\
+		/* Unwind data binary search table */			\
+		. = ALIGN(8);						\
         	.eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - LOAD_OFFSET) {	\
 			VMLINUX_SYMBOL(__start_unwind_hdr) = .;		\
 			*(.eh_frame_hdr)				\
 			VMLINUX_SYMBOL(__end_unwind_hdr) = .;		\
+		}							\
+		/* Unwind data */					\
+		. = ALIGN(8);						\
+		.eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {		\
+			VMLINUX_SYMBOL(__start_unwind) = .;		\
+		  	*(.eh_frame)					\
+			VMLINUX_SYMBOL(__end_unwind) = .;		\
 		}
 #else
-#define EH_FRAME_HDR
+#define EH_FRAME
 #endif
 
 		/* DWARF debug sections.
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 08645aa..09c2613 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -19,7 +19,7 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 
-extern char __start_unwind[], __end_unwind[];
+extern const char __start_unwind[], __end_unwind[];
 extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
 
 #define MAX_STACK_DEPTH 8
-- 
cgit v0.10.2


From d9408cefe677636bc1c100fdcfac0b2ab9ff87bf Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] i386: Clean up smp_tune_scheduling()

- remove the write-only local variable "bandwidth"
- don't set "max_cache_size" in the (cachesize < 0) case:
  that's already handled in kernel/sched.c:measure_migration_cost()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 346f27f..b4e6f32 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1130,34 +1130,15 @@ exit:
 }
 #endif
 
-static void smp_tune_scheduling (void)
+static void smp_tune_scheduling(void)
 {
 	unsigned long cachesize;       /* kB   */
-	unsigned long bandwidth = 350; /* MB/s */
-	/*
-	 * Rough estimation for SMP scheduling, this is the number of
-	 * cycles it takes for a fully memory-limited process to flush
-	 * the SMP-local cache.
-	 *
-	 * (For a P5 this pretty much means we will choose another idle
-	 *  CPU almost always at wakeup time (this is due to the small
-	 *  L1 cache), on PIIs it's around 50-100 usecs, depending on
-	 *  the cache size)
-	 */
 
-	if (!cpu_khz) {
-		/*
-		 * this basically disables processor-affinity
-		 * scheduling on SMP without a TSC.
-		 */
-		return;
-	} else {
+	if (cpu_khz) {
 		cachesize = boot_cpu_data.x86_cache_size;
-		if (cachesize == -1) {
-			cachesize = 16; /* Pentiums, 2x8kB cache */
-			bandwidth = 100;
-		}
-		max_cache_size = cachesize * 1024;
+
+		if (cachesize > 0)
+			max_cache_size = cachesize * 1024;
 	}
 }
 
-- 
cgit v0.10.2


From 64a26a731235b59c9d73bbe82c1f896d57400d37 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] x86-64: Export smp_call_function_single

smp_call_function() is exported, makes sense to export this one too.

Signed-off-by: Andi Kleen <ak@suse.de>

diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9f74c88..1bf0e09 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -385,6 +385,7 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 	put_cpu();
 	return 0;
 }
+EXPORT_SYMBOL(smp_call_function_single);
 
 /*
  * this function sends a 'generic call function' IPI to all other CPUs
-- 
cgit v0.10.2


From 334c29a64507dda187565dd0db0403de3d70ec8b Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Mon, 4 Dec 2006 19:31:51 -0800
Subject: [GENETLINK]: Move command capabilities to flags.

This patch moves command capabilities to command flags. Other than
being cleaner, saves several bytes.
We increment the nlctrl version so as to signal to user space that
to not expect the attributes. We will try to be careful
not to do this too often ;->

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
index 9049dc6..f7a9377 100644
--- a/include/linux/genetlink.h
+++ b/include/linux/genetlink.h
@@ -17,6 +17,9 @@ struct genlmsghdr {
 #define GENL_HDRLEN	NLMSG_ALIGN(sizeof(struct genlmsghdr))
 
 #define GENL_ADMIN_PERM		0x01
+#define GENL_CMD_CAP_DO		0x02
+#define GENL_CMD_CAP_DUMP	0x04
+#define GENL_CMD_CAP_HASPOL	0x08
 
 /*
  * List of reserved static generic netlink identifiers:
@@ -58,9 +61,6 @@ enum {
 	CTRL_ATTR_OP_UNSPEC,
 	CTRL_ATTR_OP_ID,
 	CTRL_ATTR_OP_FLAGS,
-	CTRL_ATTR_OP_POLICY,
-	CTRL_ATTR_OP_DOIT,
-	CTRL_ATTR_OP_DUMPIT,
 	__CTRL_ATTR_OP_MAX,
 };
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index b9b0374..b5df749 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -143,6 +143,13 @@ int genl_register_ops(struct genl_family *family, struct genl_ops *ops)
 		goto errout;
 	}
 
+	if (ops->dumpit)
+		ops->flags |= GENL_CMD_CAP_DO;
+	if (ops->doit)
+		ops->flags |= GENL_CMD_CAP_DUMP;
+	if (ops->policy)
+		ops->flags |= GENL_CMD_CAP_HASPOL;
+
 	genl_lock();
 	list_add_tail(&ops->ops_list, &family->ops_list);
 	genl_unlock();
@@ -387,7 +394,7 @@ static void genl_rcv(struct sock *sk, int len)
 static struct genl_family genl_ctrl = {
 	.id = GENL_ID_CTRL,
 	.name = "nlctrl",
-	.version = 0x1,
+	.version = 0x2,
 	.maxattr = CTRL_ATTR_MAX,
 };
 
@@ -425,15 +432,6 @@ static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
 			NLA_PUT_U32(skb, CTRL_ATTR_OP_ID, ops->cmd);
 			NLA_PUT_U32(skb, CTRL_ATTR_OP_FLAGS, ops->flags);
 
-			if (ops->policy)
-				NLA_PUT_FLAG(skb, CTRL_ATTR_OP_POLICY);
-
-			if (ops->doit)
-				NLA_PUT_FLAG(skb, CTRL_ATTR_OP_DOIT);
-
-			if (ops->dumpit)
-				NLA_PUT_FLAG(skb, CTRL_ATTR_OP_DUMPIT);
-
 			nla_nest_end(skb, nest);
 		}
 
-- 
cgit v0.10.2


From 170b828a31bbeaee3a80a05acefe3596e38f09e0 Mon Sep 17 00:00:00 2001
From: Ben Collins <bcollins@ubuntu.com>
Date: Mon, 4 Dec 2006 19:33:47 -0800
Subject: [ATM]: Add CPPFLAGS to byteorder.h check

O= builds produced errors in the shell command because of unfound headers.

Signed-off-by: Ben Collins <bcollins@ubuntu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/atm/Makefile b/drivers/atm/Makefile
index b5077ce..1b16f81 100644
--- a/drivers/atm/Makefile
+++ b/drivers/atm/Makefile
@@ -41,7 +41,7 @@ ifeq ($(CONFIG_ATM_FORE200E_PCA),y)
   # guess the target endianess to choose the right PCA-200E firmware image
   ifeq ($(CONFIG_ATM_FORE200E_PCA_DEFAULT_FW),y)
     byteorder.h			:= include$(if $(patsubst $(srctree),,$(objtree)),2)/asm/byteorder.h
-    CONFIG_ATM_FORE200E_PCA_FW	:= $(obj)/pca200e$(if $(shell $(CC) -E -dM $(byteorder.h) | grep ' __LITTLE_ENDIAN '),.bin,_ecd.bin2)
+    CONFIG_ATM_FORE200E_PCA_FW	:= $(obj)/pca200e$(if $(shell $(CC) $(CPPFLAGS) -E -dM $(byteorder.h) | grep ' __LITTLE_ENDIAN '),.bin,_ecd.bin2)
   endif
 endif
 
-- 
cgit v0.10.2


From 1b6651f1bf2453d593478aa88af267f057fd73e2 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 4 Dec 2006 19:59:00 -0800
Subject: [XFRM]: Use output device disable_xfrm for forwarded packets

Currently the behaviour of disable_xfrm is inconsistent between
locally generated and forwarded packets. For locally generated
packets disable_xfrm disables the policy lookup if it is set on
the output device, for forwarded traffic however it looks at the
input device. This makes it impossible to disable xfrm on all
devices but a dummy device and use normal routing to direct
traffic to that device.

Always use the output device when checking disable_xfrm.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9f3924c..11c1671 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1780,7 +1780,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
 #endif
 	if (in_dev->cnf.no_policy)
 		rth->u.dst.flags |= DST_NOPOLICY;
-	if (in_dev->cnf.no_xfrm)
+	if (out_dev->cnf.no_xfrm)
 		rth->u.dst.flags |= DST_NOXFRM;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
-- 
cgit v0.10.2


From baf5d743d1b8783fdbd5c1260ada2926e5bbaaee Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Mon, 4 Dec 2006 20:02:37 -0800
Subject: [XFRM] Optimize policy dumping

This change optimizes the dumping of Security policies.

1) Before this change ..
speedopolis:~# time ./ip xf pol

real    0m22.274s
user    0m0.000s
sys     0m22.269s

2) Turn off sub-policies

speedopolis:~# ./ip xf pol

real    0m13.496s
user    0m0.000s
sys     0m13.493s

i suppose the above is to be expected

3) With this change ..
speedopolis:~# time ./ip x policy

real    0m7.901s
user    0m0.008s
sys     0m7.896s

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f6c77bd..4f04222 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -860,33 +860,12 @@ EXPORT_SYMBOL(xfrm_policy_flush);
 int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*),
 		     void *data)
 {
-	struct xfrm_policy *pol;
+	struct xfrm_policy *pol, *last = NULL;
 	struct hlist_node *entry;
-	int dir, count, error;
+	int dir, last_dir = 0, count, error;
 
 	read_lock_bh(&xfrm_policy_lock);
 	count = 0;
-	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
-		struct hlist_head *table = xfrm_policy_bydst[dir].table;
-		int i;
-
-		hlist_for_each_entry(pol, entry,
-				     &xfrm_policy_inexact[dir], bydst) {
-			if (pol->type == type)
-				count++;
-		}
-		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
-			hlist_for_each_entry(pol, entry, table + i, bydst) {
-				if (pol->type == type)
-					count++;
-			}
-		}
-	}
-
-	if (count == 0) {
-		error = -ENOENT;
-		goto out;
-	}
 
 	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
 		struct hlist_head *table = xfrm_policy_bydst[dir].table;
@@ -896,21 +875,37 @@ int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*)
 				     &xfrm_policy_inexact[dir], bydst) {
 			if (pol->type != type)
 				continue;
-			error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
-			if (error)
-				goto out;
+			if (last) {
+				error = func(last, last_dir % XFRM_POLICY_MAX,
+					     count, data);
+				if (error)
+					goto out;
+			}
+			last = pol;
+			last_dir = dir;
+			count++;
 		}
 		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
 			hlist_for_each_entry(pol, entry, table + i, bydst) {
 				if (pol->type != type)
 					continue;
-				error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
-				if (error)
-					goto out;
+				if (last) {
+					error = func(last, last_dir % XFRM_POLICY_MAX,
+						     count, data);
+					if (error)
+						goto out;
+				}
+				last = pol;
+				last_dir = dir;
+				count++;
 			}
 		}
 	}
-	error = 0;
+	if (count == 0) {
+		error = -ENOENT;
+		goto out;
+	}
+	error = func(last, last_dir % XFRM_POLICY_MAX, 0, data);
 out:
 	read_unlock_bh(&xfrm_policy_lock);
 	return error;
-- 
cgit v0.10.2


From 94b9bb5480e73cec4552b19fc3f809742b4ebf67 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Mon, 4 Dec 2006 20:03:35 -0800
Subject: [XFRM] Optimize SA dumping

Same comments as in "[XFRM] Optimize policy dumping"

The numbers are (20K SAs):

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index da54a64..a14c88b 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1099,7 +1099,7 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 		    void *data)
 {
 	int i;
-	struct xfrm_state *x;
+	struct xfrm_state *x, *last = NULL;
 	struct hlist_node *entry;
 	int count = 0;
 	int err = 0;
@@ -1107,24 +1107,22 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 	spin_lock_bh(&xfrm_state_lock);
 	for (i = 0; i <= xfrm_state_hmask; i++) {
 		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
-			if (xfrm_id_proto_match(x->id.proto, proto))
-				count++;
+			if (!xfrm_id_proto_match(x->id.proto, proto))
+				continue;
+			if (last) {
+				err = func(last, count, data);
+				if (err)
+					goto out;
+			}
+			last = x;
+			count++;
 		}
 	}
 	if (count == 0) {
 		err = -ENOENT;
 		goto out;
 	}
-
-	for (i = 0; i <= xfrm_state_hmask; i++) {
-		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
-			if (!xfrm_id_proto_match(x->id.proto, proto))
-				continue;
-			err = func(x, --count, data);
-			if (err)
-				goto out;
-		}
-	}
+	err = func(last, 0, data);
 out:
 	spin_unlock_bh(&xfrm_state_lock);
 	return err;
-- 
cgit v0.10.2


From 9fe757b0cfcee0724027a675c533077287a21b96 Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jordan.crouse@amd.com>
Date: Wed, 4 Oct 2006 18:48:57 +1000
Subject: [PATCH] crypto: Add support for the Geode LX AES hardware

Add a driver to support the AES hardware on the Geode LX processor.

Signed-off-by: Jordan Crouse <jordan.crouse@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index adb5541..e816535 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -51,4 +51,17 @@ config CRYPTO_DEV_PADLOCK_SHA
 	  If unsure say M. The compiled module will be
 	  called padlock-sha.ko
 
+config CRYPTO_DEV_GEODE
+	tristate "Support for the Geode LX AES engine"
+	depends on CRYPTO && X86_32
+	select CRYPTO_ALGAPI
+	select CRYPTO_BLKCIPHER
+	default m
+	help
+	  Say 'Y' here to use the AMD Geode LX processor on-board AES
+	  engine for the CryptoAPI AES alogrithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called geode-aes.
+
 endmenu
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index 4c3d0ec..6059cf8 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_CRYPTO_DEV_PADLOCK) += padlock.o
 obj-$(CONFIG_CRYPTO_DEV_PADLOCK_AES) += padlock-aes.o
 obj-$(CONFIG_CRYPTO_DEV_PADLOCK_SHA) += padlock-sha.o
+obj-$(CONFIG_CRYPTO_DEV_GEODE) += geode-aes.o
diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c
new file mode 100644
index 0000000..da2d35d
--- /dev/null
+++ b/drivers/crypto/geode-aes.c
@@ -0,0 +1,474 @@
+ /* Copyright (C) 2004-2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/crypto.h>
+#include <linux/spinlock.h>
+#include <crypto/algapi.h>
+
+#include <asm/io.h>
+#include <asm/delay.h>
+
+#include "geode-aes.h"
+
+/* Register definitions */
+
+#define AES_CTRLA_REG  0x0000
+
+#define AES_CTRL_START     0x01
+#define AES_CTRL_DECRYPT   0x00
+#define AES_CTRL_ENCRYPT   0x02
+#define AES_CTRL_WRKEY     0x04
+#define AES_CTRL_DCA       0x08
+#define AES_CTRL_SCA       0x10
+#define AES_CTRL_CBC       0x20
+
+#define AES_INTR_REG  0x0008
+
+#define AES_INTRA_PENDING (1 << 16)
+#define AES_INTRB_PENDING (1 << 17)
+
+#define AES_INTR_PENDING  (AES_INTRA_PENDING | AES_INTRB_PENDING)
+#define AES_INTR_MASK     0x07
+
+#define AES_SOURCEA_REG   0x0010
+#define AES_DSTA_REG      0x0014
+#define AES_LENA_REG      0x0018
+#define AES_WRITEKEY0_REG 0x0030
+#define AES_WRITEIV0_REG  0x0040
+
+/*  A very large counter that is used to gracefully bail out of an
+ *  operation in case of trouble
+ */
+
+#define AES_OP_TIMEOUT    0x50000
+
+/* Static structures */
+
+static void __iomem * _iobase;
+static spinlock_t lock;
+
+/* Write a 128 bit field (either a writable key or IV) */
+static inline void
+_writefield(u32 offset, void *value)
+{
+	int i;
+	for(i = 0; i < 4; i++)
+		iowrite32(((u32 *) value)[i], _iobase + offset + (i * 4));
+}
+
+/* Read a 128 bit field (either a writable key or IV) */
+static inline void
+_readfield(u32 offset, void *value)
+{
+	int i;
+	for(i = 0; i < 4; i++)
+		((u32 *) value)[i] = ioread32(_iobase + offset + (i * 4));
+}
+
+static int
+do_crypt(void *src, void *dst, int len, u32 flags)
+{
+	u32 status;
+	u32 counter = AES_OP_TIMEOUT;
+
+	iowrite32(virt_to_phys(src), _iobase + AES_SOURCEA_REG);
+	iowrite32(virt_to_phys(dst), _iobase + AES_DSTA_REG);
+	iowrite32(len,  _iobase + AES_LENA_REG);
+
+	/* Start the operation */
+	iowrite32(AES_CTRL_START | flags, _iobase + AES_CTRLA_REG);
+
+	do
+		status = ioread32(_iobase + AES_INTR_REG);
+	while(!(status & AES_INTRA_PENDING) && --counter);
+
+	/* Clear the event */
+	iowrite32((status & 0xFF) | AES_INTRA_PENDING, _iobase + AES_INTR_REG);
+	return counter ? 0 : 1;
+}
+
+unsigned int
+geode_aes_crypt(struct geode_aes_op *op)
+{
+
+	u32 flags = 0;
+	int iflags;
+
+	if (op->len == 0 || op->src == op->dst)
+		return 0;
+
+	if (op->flags & AES_FLAGS_COHERENT)
+		flags |= (AES_CTRL_DCA | AES_CTRL_SCA);
+
+	if (op->dir == AES_DIR_ENCRYPT)
+		flags |= AES_CTRL_ENCRYPT;
+
+	/* Start the critical section */
+
+	spin_lock_irqsave(&lock, iflags);
+
+	if (op->mode == AES_MODE_CBC) {
+		flags |= AES_CTRL_CBC;
+		_writefield(AES_WRITEIV0_REG, op->iv);
+	}
+
+	if (op->flags & AES_FLAGS_USRKEY) {
+		flags |= AES_CTRL_WRKEY;
+		_writefield(AES_WRITEKEY0_REG, op->key);
+	}
+
+	do_crypt(op->src, op->dst, op->len, flags);
+
+	if (op->mode == AES_MODE_CBC)
+		_readfield(AES_WRITEIV0_REG, op->iv);
+
+	spin_unlock_irqrestore(&lock, iflags);
+
+	return op->len;
+}
+
+/* CRYPTO-API Functions */
+
+static int
+geode_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int len)
+{
+	struct geode_aes_op *op = crypto_tfm_ctx(tfm);
+
+	if (len != AES_KEY_LENGTH) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	memcpy(op->key, key, len);
+	return 0;
+}
+
+static void
+geode_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct geode_aes_op *op = crypto_tfm_ctx(tfm);
+
+	if ((out == NULL) || (in == NULL))
+		return;
+
+	op->src = (void *) in;
+	op->dst = (void *) out;
+	op->mode = AES_MODE_ECB;
+	op->flags = 0;
+	op->len = AES_MIN_BLOCK_SIZE;
+	op->dir = AES_DIR_ENCRYPT;
+
+	geode_aes_crypt(op);
+}
+
+
+static void
+geode_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct geode_aes_op *op = crypto_tfm_ctx(tfm);
+
+	if ((out == NULL) || (in == NULL))
+		return;
+
+	op->src = (void *) in;
+	op->dst = (void *) out;
+	op->mode = AES_MODE_ECB;
+	op->flags = 0;
+	op->len = AES_MIN_BLOCK_SIZE;
+	op->dir = AES_DIR_DECRYPT;
+
+	geode_aes_crypt(op);
+}
+
+
+static struct crypto_alg geode_alg = {
+	.cra_name               =       "aes",
+	.cra_driver_name	=       "geode-aes-128",
+	.cra_priority           =       300,
+	.cra_alignmask          =       15,
+	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		=	AES_MIN_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct geode_aes_op),
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(geode_alg.cra_list),
+	.cra_u			=	{
+		.cipher = {
+			.cia_min_keysize	=  AES_KEY_LENGTH,
+			.cia_max_keysize	=  AES_KEY_LENGTH,
+			.cia_setkey		=  geode_setkey,
+			.cia_encrypt		=  geode_encrypt,
+			.cia_decrypt		=  geode_decrypt
+		}
+	}
+};
+
+static int
+geode_cbc_decrypt(struct blkcipher_desc *desc,
+		  struct scatterlist *dst, struct scatterlist *src,
+		  unsigned int nbytes)
+{
+	struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err, ret;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while((nbytes = walk.nbytes)) {
+		op->src = walk.src.virt.addr,
+		op->dst = walk.dst.virt.addr;
+		op->mode = AES_MODE_CBC;
+		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
+		op->dir = AES_DIR_DECRYPT;
+
+		memcpy(op->iv, walk.iv, AES_IV_LENGTH);
+
+		ret = geode_aes_crypt(op);
+
+		memcpy(walk.iv, op->iv, AES_IV_LENGTH);
+		nbytes -= ret;
+
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static int
+geode_cbc_encrypt(struct blkcipher_desc *desc,
+		  struct scatterlist *dst, struct scatterlist *src,
+		  unsigned int nbytes)
+{
+	struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err, ret;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while((nbytes = walk.nbytes)) {
+		op->src = walk.src.virt.addr,
+		op->dst = walk.dst.virt.addr;
+		op->mode = AES_MODE_CBC;
+		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
+		op->dir = AES_DIR_ENCRYPT;
+
+		memcpy(op->iv, walk.iv, AES_IV_LENGTH);
+
+		ret = geode_aes_crypt(op);
+		nbytes -= ret;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static struct crypto_alg geode_cbc_alg = {
+	.cra_name		=	"cbc(aes)",
+	.cra_driver_name	=	"cbc-aes-geode-128",
+	.cra_priority		=	400,
+	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		=	AES_MIN_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct geode_aes_op),
+	.cra_alignmask		=	15,
+	.cra_type		=	&crypto_blkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(geode_cbc_alg.cra_list),
+	.cra_u			=	{
+		.blkcipher = {
+			.min_keysize		=	AES_KEY_LENGTH,
+			.max_keysize		=	AES_KEY_LENGTH,
+			.setkey			=	geode_setkey,
+			.encrypt		=	geode_cbc_encrypt,
+			.decrypt		=	geode_cbc_decrypt,
+		}
+	}
+};
+
+static int
+geode_ecb_decrypt(struct blkcipher_desc *desc,
+		  struct scatterlist *dst, struct scatterlist *src,
+		  unsigned int nbytes)
+{
+	struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err, ret;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while((nbytes = walk.nbytes)) {
+		op->src = walk.src.virt.addr,
+		op->dst = walk.dst.virt.addr;
+		op->mode = AES_MODE_ECB;
+		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
+		op->dir = AES_DIR_DECRYPT;
+
+		ret = geode_aes_crypt(op);
+		nbytes -= ret;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static int
+geode_ecb_encrypt(struct blkcipher_desc *desc,
+		  struct scatterlist *dst, struct scatterlist *src,
+		  unsigned int nbytes)
+{
+	struct geode_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err, ret;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while((nbytes = walk.nbytes)) {
+		op->src = walk.src.virt.addr,
+		op->dst = walk.dst.virt.addr;
+		op->mode = AES_MODE_ECB;
+		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
+		op->dir = AES_DIR_ENCRYPT;
+
+		ret = geode_aes_crypt(op);
+		nbytes -= ret;
+		ret =  blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static struct crypto_alg geode_ecb_alg = {
+	.cra_name		=	"ecb(aes)",
+	.cra_driver_name	=	"ecb-aes-geode-128",
+	.cra_priority		=	400,
+	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		=	AES_MIN_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct geode_aes_op),
+	.cra_alignmask		=	15,
+	.cra_type		=	&crypto_blkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(geode_ecb_alg.cra_list),
+	.cra_u			=	{
+		.blkcipher = {
+			.min_keysize		=	AES_KEY_LENGTH,
+			.max_keysize		=	AES_KEY_LENGTH,
+			.setkey			=	geode_setkey,
+			.encrypt		=	geode_ecb_encrypt,
+			.decrypt		=	geode_ecb_decrypt,
+		}
+	}
+};
+
+static void
+geode_aes_remove(struct pci_dev *dev)
+{
+	crypto_unregister_alg(&geode_alg);
+	crypto_unregister_alg(&geode_ecb_alg);
+	crypto_unregister_alg(&geode_cbc_alg);
+
+	pci_iounmap(dev, _iobase);
+	_iobase = NULL;
+
+	pci_release_regions(dev);
+	pci_disable_device(dev);
+}
+
+
+static int
+geode_aes_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	int ret;
+
+	if ((ret = pci_enable_device(dev)))
+		return ret;
+
+	if ((ret = pci_request_regions(dev, "geode-aes-128")))
+		goto eenable;
+
+	_iobase = pci_iomap(dev, 0, 0);
+
+	if (_iobase == NULL) {
+		ret = -ENOMEM;
+		goto erequest;
+	}
+
+	spin_lock_init(&lock);
+
+	/* Clear any pending activity */
+	iowrite32(AES_INTR_PENDING | AES_INTR_MASK, _iobase + AES_INTR_REG);
+
+	if ((ret = crypto_register_alg(&geode_alg)))
+		goto eiomap;
+
+	if ((ret = crypto_register_alg(&geode_ecb_alg)))
+		goto ealg;
+
+	if ((ret = crypto_register_alg(&geode_cbc_alg)))
+		goto eecb;
+
+	printk(KERN_NOTICE "geode-aes: GEODE AES engine enabled.\n");
+	return 0;
+
+ eecb:
+	crypto_unregister_alg(&geode_ecb_alg);
+
+ ealg:
+	crypto_unregister_alg(&geode_alg);
+
+ eiomap:
+	pci_iounmap(dev, _iobase);
+
+ erequest:
+	pci_release_regions(dev);
+
+ eenable:
+	pci_disable_device(dev);
+
+	printk(KERN_ERR "geode-aes:  GEODE AES initialization failed.\n");
+	return ret;
+}
+
+static struct pci_device_id geode_aes_tbl[] = {
+	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LX_AES, PCI_ANY_ID, PCI_ANY_ID} ,
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, geode_aes_tbl);
+
+static struct pci_driver geode_aes_driver = {
+	.name = "Geode LX AES",
+	.id_table = geode_aes_tbl,
+	.probe = geode_aes_probe,
+	.remove = __devexit_p(geode_aes_remove)
+};
+
+static int __init
+geode_aes_init(void)
+{
+	return pci_module_init(&geode_aes_driver);
+}
+
+static void __exit
+geode_aes_exit(void)
+{
+	pci_unregister_driver(&geode_aes_driver);
+}
+
+MODULE_AUTHOR("Advanced Micro Devices, Inc.");
+MODULE_DESCRIPTION("Geode LX Hardware AES driver");
+MODULE_LICENSE("GPL");
+
+module_init(geode_aes_init);
+module_exit(geode_aes_exit);
diff --git a/drivers/crypto/geode-aes.h b/drivers/crypto/geode-aes.h
new file mode 100644
index 0000000..3e3a571
--- /dev/null
+++ b/drivers/crypto/geode-aes.h
@@ -0,0 +1,42 @@
+/* Copyright (C) 2003-2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _GEODE_AES_H_
+#define _GEODE_AES_H_
+
+#define AES_KEY_LENGTH 16
+#define AES_IV_LENGTH  16
+
+#define AES_MIN_BLOCK_SIZE 16
+
+#define AES_MODE_ECB 0
+#define AES_MODE_CBC 1
+
+#define AES_DIR_DECRYPT 0
+#define AES_DIR_ENCRYPT 1
+
+#define AES_FLAGS_USRKEY   (1 << 0)
+#define AES_FLAGS_COHERENT (1 << 1)
+
+struct geode_aes_op {
+
+	void *src;
+	void *dst;
+
+	u32 mode;
+	u32 dir;
+	u32 flags;
+	int len;
+
+	u8 key[AES_KEY_LENGTH];
+	u8 iv[AES_IV_LENGTH];
+};
+
+unsigned int geode_aes_crypt(struct geode_aes_op *);
+
+#endif
-- 
cgit v0.10.2


From 48527fa7cf7fefb84e9fe03cddd08ddafc9f15f3 Mon Sep 17 00:00:00 2001
From: Rik Snel <rsnel@cube.dyndns.org>
Date: Sun, 3 Sep 2006 08:56:39 +1000
Subject: [BLOCK] dm-crypt: benbi IV, big endian narrow block count for
 LRW-32-AES

LRW-32-AES needs a certain IV. This IV should be provided dm-crypt.
The block cipher mode could, in principle generate the correct IV from
the plain IV, but I think that it is cleaner to supply the right IV
directly.

The sector -> narrow block calculation uses a shift for performance reasons.
This shift is computed in .ctr and stored in cc->iv_gen_private (as a void *).

Signed-off-by: Rik Snel <rsnel@cube.dyndns.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ed2d4ef..6dbaeee 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -20,6 +20,7 @@
 #include <asm/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
+#include <asm/unaligned.h>
 
 #include "dm.h"
 
@@ -113,6 +114,9 @@ static kmem_cache_t *_crypt_io_pool;
  *        encrypted with the bulk cipher using a salt as key. The salt
  *        should be derived from the bulk cipher's key via hashing.
  *
+ * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1
+ *        (needed for LRW-32-AES and possible other narrow block modes)
+ *
  * plumb: unimplemented, see:
  * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
@@ -209,6 +213,44 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 	return 0;
 }
 
+static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
+			      const char *opts)
+{
+	unsigned int bs = crypto_blkcipher_blocksize(cc->tfm);
+	int log = long_log2(bs);
+
+	/* we need to calculate how far we must shift the sector count
+	 * to get the cipher block count, we use this shift in _gen */
+
+	if (1 << log != bs) {
+		ti->error = "cypher blocksize is not a power of 2";
+		return -EINVAL;
+	}
+
+	if (log > 9) {
+		ti->error = "cypher blocksize is > 512";
+		return -EINVAL;
+	}
+
+	cc->iv_gen_private = (void *)(9 - log);
+
+	return 0;
+}
+
+static void crypt_iv_benbi_dtr(struct crypt_config *cc)
+{
+	cc->iv_gen_private = NULL;
+}
+
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
+	put_unaligned(cpu_to_be64(((u64)sector << (u32)cc->iv_gen_private) + 1),
+		      (__be64 *)(iv + cc->iv_size - sizeof(u64)));
+
+	return 0;
+}
+
 static struct crypt_iv_operations crypt_iv_plain_ops = {
 	.generator = crypt_iv_plain_gen
 };
@@ -219,6 +261,11 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = {
 	.generator = crypt_iv_essiv_gen
 };
 
+static struct crypt_iv_operations crypt_iv_benbi_ops = {
+	.ctr	   = crypt_iv_benbi_ctr,
+	.dtr	   = crypt_iv_benbi_dtr,
+	.generator = crypt_iv_benbi_gen
+};
 
 static int
 crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
@@ -768,7 +815,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	cc->tfm = tfm;
 
 	/*
-	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>".
+	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
 	 * See comments at iv code
 	 */
 
@@ -778,6 +825,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		cc->iv_gen_ops = &crypt_iv_plain_ops;
 	else if (strcmp(ivmode, "essiv") == 0)
 		cc->iv_gen_ops = &crypt_iv_essiv_ops;
+	else if (strcmp(ivmode, "benbi") == 0)
+		cc->iv_gen_ops = &crypt_iv_benbi_ops;
 	else {
 		ti->error = "Invalid IV mode";
 		goto bad2;
-- 
cgit v0.10.2


From 45789328e5aa2de96d4467e4445418364e5378d7 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 3 Sep 2006 08:58:41 +1000
Subject: [BLOCK] dm-crypt: Align IV to u64 for essiv

This patch makes the IV u64-aligned since essiv does a u64 store to it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6dbaeee..facf859 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -272,7 +272,7 @@ crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
                           struct scatterlist *in, unsigned int length,
                           int write, sector_t sector)
 {
-	u8 iv[cc->iv_size];
+	u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64))));
 	struct blkcipher_desc desc = {
 		.tfm = cc->tfm,
 		.info = iv,
-- 
cgit v0.10.2


From 333b0d7eeacbd47159daf23757aa81368470c409 Mon Sep 17 00:00:00 2001
From: Kazunori MIYAZAWA <miyazawa@linux-ipv6.org>
Date: Sat, 28 Oct 2006 13:15:24 +1000
Subject: [CRYPTO] xcbc: New algorithm

This is core code of XCBC.

XCBC is an algorithm that forms a MAC algorithm out of a cipher algorithm.
For example, AES-XCBC-MAC is a MAC algorithm based on the AES cipher
algorithm.

Signed-off-by: Kazunori MIYAZAWA <miyazawa@linux-ipv6.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/Kconfig b/crypto/Kconfig
index cbae839..4495e466 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -39,6 +39,17 @@ config CRYPTO_HMAC
 	  HMAC: Keyed-Hashing for Message Authentication (RFC2104).
 	  This is required for IPSec.
 
+config CRYPTO_XCBC
+	tristate "XCBC support"
+	depends on EXPERIMENTAL
+	select CRYPTO_HASH
+	select CRYPTO_MANAGER
+	help
+	  XCBC: Keyed-Hashing with encryption algorithm
+		http://www.ietf.org/rfc/rfc3566.txt
+		http://csrc.nist.gov/encryption/modes/proposedmodes/
+		 xcbc-mac/xcbc-mac-spec.pdf
+
 config CRYPTO_NULL
 	tristate "Null algorithms"
 	select CRYPTO_ALGAPI
diff --git a/crypto/Makefile b/crypto/Makefile
index 7236620..aba9625f 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_CRYPTO_HASH) += crypto_hash.o
 
 obj-$(CONFIG_CRYPTO_MANAGER) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
+obj-$(CONFIG_CRYPTO_XCBC) += xcbc.o
 obj-$(CONFIG_CRYPTO_NULL) += crypto_null.o
 obj-$(CONFIG_CRYPTO_MD4) += md4.o
 obj-$(CONFIG_CRYPTO_MD5) += md5.o
diff --git a/crypto/xcbc.c b/crypto/xcbc.c
new file mode 100644
index 0000000..f592950
--- /dev/null
+++ b/crypto/xcbc.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (C)2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Author:
+ * 	Kazunori Miyazawa <miyazawa@linux-ipv6.org>
+ */
+
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include "internal.h"
+
+u_int32_t ks[12] = {0x01010101, 0x01010101, 0x01010101, 0x01010101,
+		    0x02020202, 0x02020202, 0x02020202, 0x02020202,
+		    0x03030303, 0x03030303, 0x03030303, 0x03030303};
+/*
+ * +------------------------
+ * | <parent tfm>
+ * +------------------------
+ * | crypto_xcbc_ctx
+ * +------------------------
+ * | odds (block size)
+ * +------------------------
+ * | prev (block size)
+ * +------------------------
+ * | key (block size)
+ * +------------------------
+ * | consts (block size * 3)
+ * +------------------------
+ */
+struct crypto_xcbc_ctx {
+	struct crypto_tfm *child;
+	u8 *odds;
+	u8 *prev;
+	u8 *key;
+	u8 *consts;
+	void (*xor)(u8 *a, const u8 *b, unsigned int bs);
+	unsigned int keylen;
+	unsigned int len;
+};
+
+static void xor_128(u8 *a, const u8 *b, unsigned int bs)
+{
+	((u32 *)a)[0] ^= ((u32 *)b)[0];
+	((u32 *)a)[1] ^= ((u32 *)b)[1];
+	((u32 *)a)[2] ^= ((u32 *)b)[2];
+	((u32 *)a)[3] ^= ((u32 *)b)[3];
+}
+
+static int _crypto_xcbc_digest_setkey(struct crypto_hash *parent,
+				      struct crypto_xcbc_ctx *ctx)
+{
+	int bs = crypto_hash_blocksize(parent);
+	int err = 0;
+	u8 key1[bs];
+
+	if ((err = crypto_cipher_setkey(ctx->child, ctx->key, ctx->keylen)))
+	    return err;
+
+	ctx->child->__crt_alg->cra_cipher.cia_encrypt(ctx->child, key1,
+			ctx->consts);
+
+	return crypto_cipher_setkey(ctx->child, key1, bs);
+}
+
+static int crypto_xcbc_digest_setkey(struct crypto_hash *parent,
+				     const u8 *inkey, unsigned int keylen)
+{
+	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(parent);
+
+	if (keylen != crypto_tfm_alg_blocksize(ctx->child))
+		return -EINVAL;
+
+	ctx->keylen = keylen;
+	memcpy(ctx->key, inkey, keylen);
+	ctx->consts = (u8*)ks;
+
+	return _crypto_xcbc_digest_setkey(parent, ctx);
+}
+
+int crypto_xcbc_digest_init(struct hash_desc *pdesc)
+{
+	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(pdesc->tfm);
+	int bs = crypto_hash_blocksize(pdesc->tfm);
+
+	ctx->len = 0;
+	memset(ctx->odds, 0, bs);
+	memset(ctx->prev, 0, bs);
+
+	return 0;
+}
+
+int crypto_xcbc_digest_update(struct hash_desc *pdesc, struct scatterlist *sg, unsigned int nbytes)
+{
+	struct crypto_hash *parent = pdesc->tfm;
+	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(parent);
+	struct crypto_tfm *tfm = ctx->child;
+	int bs = crypto_hash_blocksize(parent);
+	unsigned int i = 0;
+
+	do {
+
+		struct page *pg = sg[i].page;
+		unsigned int offset = sg[i].offset;
+		unsigned int slen = sg[i].length;
+
+		while (slen > 0) {
+			unsigned int len = min(slen, ((unsigned int)(PAGE_SIZE)) - offset);
+			char *p = crypto_kmap(pg, 0) + offset;
+
+			/* checking the data can fill the block */
+			if ((ctx->len + len) <= bs) {
+				memcpy(ctx->odds + ctx->len, p, len);
+				ctx->len += len;
+				slen -= len;
+
+				/* checking the rest of the page */
+				if (len + offset >= PAGE_SIZE) {
+					offset = 0;
+					pg++;
+				} else
+					offset += len;
+
+				crypto_kunmap(p, 0);
+				crypto_yield(tfm->crt_flags);
+				continue;
+			}
+
+			/* filling odds with new data and encrypting it */
+			memcpy(ctx->odds + ctx->len, p, bs - ctx->len);
+			len -= bs - ctx->len;
+			p += bs - ctx->len;
+
+			ctx->xor(ctx->prev, ctx->odds, bs);
+			tfm->__crt_alg->cra_cipher.cia_encrypt(tfm, ctx->prev, ctx->prev);
+
+			/* clearing the length */
+			ctx->len = 0;
+
+			/* encrypting the rest of data */
+			while (len > bs) {
+				ctx->xor(ctx->prev, p, bs);
+				tfm->__crt_alg->cra_cipher.cia_encrypt(tfm, ctx->prev, ctx->prev);
+				p += bs;
+				len -= bs;
+			}
+
+			/* keeping the surplus of blocksize */
+			if (len) {
+				memcpy(ctx->odds, p, len);
+				ctx->len = len;
+			}
+			crypto_kunmap(p, 0);
+			crypto_yield(tfm->crt_flags);
+			slen -= min(slen, ((unsigned int)(PAGE_SIZE)) - offset);
+			offset = 0;
+			pg++;
+		}
+		nbytes-=sg[i].length;
+		i++;
+	} while (nbytes>0);
+
+	return 0;
+}
+
+int crypto_xcbc_digest_final(struct hash_desc *pdesc, u8 *out)
+{
+	struct crypto_hash *parent = pdesc->tfm;
+	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(parent);
+	struct crypto_tfm *tfm = ctx->child;
+	int bs = crypto_hash_blocksize(parent);
+	int err = 0;
+
+	if (ctx->len == bs) {
+		u8 key2[bs];
+
+		if ((err = crypto_cipher_setkey(tfm, ctx->key, ctx->keylen)) != 0)
+			return err;
+
+		tfm->__crt_alg->cra_cipher.cia_encrypt(tfm, key2, (const u8*)(ctx->consts+bs));
+
+		ctx->xor(ctx->prev, ctx->odds, bs);
+		ctx->xor(ctx->prev, key2, bs);
+		_crypto_xcbc_digest_setkey(parent, ctx);
+
+		tfm->__crt_alg->cra_cipher.cia_encrypt(tfm, out, ctx->prev);
+	} else {
+		u8 key3[bs];
+		unsigned int rlen;
+		u8 *p = ctx->odds + ctx->len;
+		*p = 0x80;
+		p++;
+
+		rlen = bs - ctx->len -1;
+		if (rlen)
+			memset(p, 0, rlen);
+
+		if ((err = crypto_cipher_setkey(tfm, ctx->key, ctx->keylen)) != 0)
+			return err;
+
+		tfm->__crt_alg->cra_cipher.cia_encrypt(tfm, key3, (const u8*)(ctx->consts+bs*2));
+
+		ctx->xor(ctx->prev, ctx->odds, bs);
+		ctx->xor(ctx->prev, key3, bs);
+
+		_crypto_xcbc_digest_setkey(parent, ctx);
+
+		tfm->__crt_alg->cra_cipher.cia_encrypt(tfm, out, ctx->prev);
+	}
+
+	return 0;
+}
+
+static int crypto_xcbc_digest(struct hash_desc *pdesc,
+		  struct scatterlist *sg, unsigned int nbytes, u8 *out)
+{
+	crypto_xcbc_digest_init(pdesc);
+	crypto_xcbc_digest_update(pdesc, sg, nbytes);
+	return crypto_xcbc_digest_final(pdesc, out);
+}
+
+static int xcbc_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(__crypto_hash_cast(tfm));
+	int bs = crypto_hash_blocksize(__crypto_hash_cast(tfm));
+
+	tfm = crypto_spawn_tfm(spawn);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	switch(bs) {
+	case 16:
+		ctx->xor = xor_128;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ctx->child = crypto_cipher_cast(tfm);
+	ctx->odds = (u8*)(ctx+1);
+	ctx->prev = ctx->odds + bs;
+	ctx->key = ctx->prev + bs;
+
+	return 0;
+};
+
+static void xcbc_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(__crypto_hash_cast(tfm));
+	crypto_free_cipher(ctx->child);
+}
+
+static struct crypto_instance *xcbc_alloc(void *param, unsigned int len)
+{
+	struct crypto_instance *inst;
+	struct crypto_alg *alg;
+	alg = crypto_get_attr_alg(param, len, CRYPTO_ALG_TYPE_CIPHER,
+				  CRYPTO_ALG_TYPE_HASH_MASK | CRYPTO_ALG_ASYNC);
+	if (IS_ERR(alg))
+		return ERR_PTR(PTR_ERR(alg));
+
+	switch(alg->cra_blocksize) {
+	case 16:
+		break;
+	default:
+		return ERR_PTR(PTR_ERR(alg));
+	}
+
+	inst = crypto_alloc_instance("xcbc", alg);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	inst->alg.cra_flags = CRYPTO_ALG_TYPE_HASH;
+	inst->alg.cra_priority = alg->cra_priority;
+	inst->alg.cra_blocksize = alg->cra_blocksize;
+	inst->alg.cra_alignmask = alg->cra_alignmask;
+	inst->alg.cra_type = &crypto_hash_type;
+
+	inst->alg.cra_hash.digestsize =
+		(alg->cra_flags & CRYPTO_ALG_TYPE_MASK) ==
+		CRYPTO_ALG_TYPE_HASH ? alg->cra_hash.digestsize :
+				       alg->cra_blocksize;
+	inst->alg.cra_ctxsize = sizeof(struct crypto_xcbc_ctx) +
+				ALIGN(inst->alg.cra_blocksize * 3, sizeof(void *));
+	inst->alg.cra_init = xcbc_init_tfm;
+	inst->alg.cra_exit = xcbc_exit_tfm;
+
+	inst->alg.cra_hash.init = crypto_xcbc_digest_init;
+	inst->alg.cra_hash.update = crypto_xcbc_digest_update;
+	inst->alg.cra_hash.final = crypto_xcbc_digest_final;
+	inst->alg.cra_hash.digest = crypto_xcbc_digest;
+	inst->alg.cra_hash.setkey = crypto_xcbc_digest_setkey;
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return inst;
+}
+
+static void xcbc_free(struct crypto_instance *inst)
+{
+	crypto_drop_spawn(crypto_instance_ctx(inst));
+	kfree(inst);
+}
+
+static struct crypto_template crypto_xcbc_tmpl = {
+	.name = "xcbc",
+	.alloc = xcbc_alloc,
+	.free = xcbc_free,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_xcbc_module_init(void)
+{
+	return crypto_register_template(&crypto_xcbc_tmpl);
+}
+
+static void __exit crypto_xcbc_module_exit(void)
+{
+	crypto_unregister_template(&crypto_xcbc_tmpl);
+}
+
+module_init(crypto_xcbc_module_init);
+module_exit(crypto_xcbc_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("XCBC keyed hash algorithm");
-- 
cgit v0.10.2


From 5b2becf5dc8ebb760b0d1653604831dc0582a121 Mon Sep 17 00:00:00 2001
From: Kazunori MIYAZAWA <miyazawa@linux-ipv6.org>
Date: Sat, 28 Oct 2006 13:18:53 +1000
Subject: [CRYPTO] tcrypt: Add test vectors of AES_XCBC

est vectors of XCBC with AES-128.

Signed-off-by: Kazunori MIYAZAWA <miyazawa@linux-ipv6.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 8330742..d1a5f2b8 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -977,6 +977,9 @@ static void do_test(void)
 		test_hash("hmac(sha256)", hmac_sha256_tv_template,
 			  HMAC_SHA256_TEST_VECTORS);
 
+		test_hash("xcbc(aes)", aes_xcbc128_tv_template,
+			  XCBC_AES_TEST_VECTORS);
+
 		test_hash("michael_mic", michael_mic_tv_template, MICHAEL_MIC_TEST_VECTORS);
 		break;
 
diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h
index a40c441..2d07e8a 100644
--- a/crypto/tcrypt.h
+++ b/crypto/tcrypt.h
@@ -933,6 +933,74 @@ static struct hash_testvec hmac_sha256_tv_template[] = {
 	},
 };
 
+#define XCBC_AES_TEST_VECTORS 6
+
+static struct hash_testvec aes_xcbc128_tv_template[] = {
+	{
+		.key	= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+		.plaintext = { [0 ... 15] = 0 },
+		.digest = { 0x75, 0xf0, 0x25, 0x1d, 0x52, 0x8a, 0xc0, 0x1c,
+			    0x45, 0x73, 0xdf, 0xd5, 0x84, 0xd7, 0x9f, 0x29 },
+		.psize	= 0,
+		.ksize	= 16,
+	}, {
+		.key	= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+		.plaintext = { 0x00, 0x01, 0x02 },
+		.digest	= { 0x5b, 0x37, 0x65, 0x80, 0xae, 0x2f, 0x19, 0xaf,
+			    0xe7, 0x21, 0x9c, 0xee, 0xf1, 0x72, 0x75, 0x6f },
+		.psize	= 3,
+		.ksize	= 16,
+	} , {
+		.key	= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+		.plaintext = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+		.digest = { 0xd2, 0xa2, 0x46, 0xfa, 0x34, 0x9b, 0x68, 0xa7,
+			    0x99, 0x98, 0xa4, 0x39, 0x4f, 0xf7, 0xa2, 0x63 },
+		.psize	= 16,
+		.ksize	= 16,
+	}, {
+		.key	= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+		.plaintext = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+			       0x10, 0x11, 0x12, 0x13 },
+		.digest = { 0x47, 0xf5, 0x1b, 0x45, 0x64, 0x96, 0x62, 0x15,
+			    0xb8, 0x98, 0x5c, 0x63, 0x05, 0x5e, 0xd3, 0x08 },
+		.tap	= { 10, 10 },
+		.psize	= 20,
+		.np	= 2,
+		.ksize	= 16,
+	}, {
+		.key	= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+		.plaintext = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+			       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+			       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
+		.digest = { 0xf5, 0x4f, 0x0e, 0xc8, 0xd2, 0xb9, 0xf3, 0xd3,
+			    0x68, 0x07, 0x73, 0x4b, 0xd5, 0x28, 0x3f, 0xd4 },
+		.psize	= 32,
+		.ksize	= 16,
+	}, {
+		.key	= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+		.plaintext = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+			       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+			       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+			       0x20, 0x21 },
+		.digest = { 0xbe, 0xcb, 0xb3, 0xbc, 0xcd, 0xb5, 0x18, 0xa3,
+			    0x06, 0x77, 0xd5, 0x48, 0x1f, 0xb6, 0xb4, 0xd8 },
+		.tap	= { 17, 17 },
+		.psize	= 34,
+		.np	= 2,
+		.ksize	= 16,
+	}
+};
+
 /*
  * DES test vectors.
  */
-- 
cgit v0.10.2


From 7cf4c1a5fd13820d7591179c0b925d739b2be9a7 Mon Sep 17 00:00:00 2001
From: Kazunori MIYAZAWA <miyazawa@linux-ipv6.org>
Date: Sat, 28 Oct 2006 13:21:22 +1000
Subject: [IPSEC]: Add support for AES-XCBC-MAC

The glue of xfrm.

Signed-off-by: Kazunori MIYAZAWA <miyazawa@linux-ipv6.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index 0f0b880..265bafa 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -285,6 +285,7 @@ struct sadb_x_sec_ctx {
 #define SADB_X_AALG_SHA2_384HMAC	6
 #define SADB_X_AALG_SHA2_512HMAC	7
 #define SADB_X_AALG_RIPEMD160HMAC	8
+#define SADB_X_AALG_AES_XCBC_MAC	9
 #define SADB_X_AALG_NULL		251	/* kame */
 #define SADB_AALG_MAX			251
 
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 5a0dbeb..6b381fc03 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -119,6 +119,23 @@ static struct xfrm_algo_desc aalg_list[] = {
 		.sadb_alg_maxbits = 160
 	}
 },
+{
+	.name = "xcbc(aes)",
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_AALG_AES_XCBC_MAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 128
+	}
+},
 };
 
 static struct xfrm_algo_desc ealg_list[] = {
-- 
cgit v0.10.2


From 5b37538a514cf4c8746be9d09e8a9f564e7df939 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 17 Nov 2006 13:43:04 +1100
Subject: [CRYPTO] xcbc: Make needlessly global code static

On Tue, Nov 14, 2006 at 01:41:25AM -0800, Andrew Morton wrote:
>...
> Changes since 2.6.19-rc5-mm2:
>...
>  git-cryptodev.patch
>...
>  git trees
>...

This patch makes some needlessly global code static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/xcbc.c b/crypto/xcbc.c
index f592950..9347eb6 100644
--- a/crypto/xcbc.c
+++ b/crypto/xcbc.c
@@ -28,9 +28,9 @@
 #include <linux/scatterlist.h>
 #include "internal.h"
 
-u_int32_t ks[12] = {0x01010101, 0x01010101, 0x01010101, 0x01010101,
-		    0x02020202, 0x02020202, 0x02020202, 0x02020202,
-		    0x03030303, 0x03030303, 0x03030303, 0x03030303};
+static u_int32_t ks[12] = {0x01010101, 0x01010101, 0x01010101, 0x01010101,
+			   0x02020202, 0x02020202, 0x02020202, 0x02020202,
+			   0x03030303, 0x03030303, 0x03030303, 0x03030303};
 /*
  * +------------------------
  * | <parent tfm>
@@ -96,7 +96,7 @@ static int crypto_xcbc_digest_setkey(struct crypto_hash *parent,
 	return _crypto_xcbc_digest_setkey(parent, ctx);
 }
 
-int crypto_xcbc_digest_init(struct hash_desc *pdesc)
+static int crypto_xcbc_digest_init(struct hash_desc *pdesc)
 {
 	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(pdesc->tfm);
 	int bs = crypto_hash_blocksize(pdesc->tfm);
@@ -108,7 +108,9 @@ int crypto_xcbc_digest_init(struct hash_desc *pdesc)
 	return 0;
 }
 
-int crypto_xcbc_digest_update(struct hash_desc *pdesc, struct scatterlist *sg, unsigned int nbytes)
+static int crypto_xcbc_digest_update(struct hash_desc *pdesc,
+				     struct scatterlist *sg,
+				     unsigned int nbytes)
 {
 	struct crypto_hash *parent = pdesc->tfm;
 	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(parent);
@@ -181,7 +183,7 @@ int crypto_xcbc_digest_update(struct hash_desc *pdesc, struct scatterlist *sg, u
 	return 0;
 }
 
-int crypto_xcbc_digest_final(struct hash_desc *pdesc, u8 *out)
+static int crypto_xcbc_digest_final(struct hash_desc *pdesc, u8 *out)
 {
 	struct crypto_hash *parent = pdesc->tfm;
 	struct crypto_xcbc_ctx *ctx = crypto_hash_ctx_aligned(parent);
-- 
cgit v0.10.2


From ab7827059adbbcc3624afbc58880287eabf6d277 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 17 Nov 2006 13:43:55 +1100
Subject: [CRYPTO] geode: Make needlessly global geode_aes_crypt() static

On Tue, Nov 14, 2006 at 01:41:25AM -0800, Andrew Morton wrote:
>...
> Changes since 2.6.19-rc5-mm2:
>...
>  git-cryptodev.patch
>...
>  git trees
>...

This patch makes the needlessly global geode_aes_crypt() static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c
index da2d35d..43a6839 100644
--- a/drivers/crypto/geode-aes.c
+++ b/drivers/crypto/geode-aes.c
@@ -97,7 +97,7 @@ do_crypt(void *src, void *dst, int len, u32 flags)
 	return counter ? 0 : 1;
 }
 
-unsigned int
+static unsigned int
 geode_aes_crypt(struct geode_aes_op *op)
 {
 
diff --git a/drivers/crypto/geode-aes.h b/drivers/crypto/geode-aes.h
index 3e3a571..8003a36 100644
--- a/drivers/crypto/geode-aes.h
+++ b/drivers/crypto/geode-aes.h
@@ -37,6 +37,4 @@ struct geode_aes_op {
 	u8 iv[AES_IV_LENGTH];
 };
 
-unsigned int geode_aes_crypt(struct geode_aes_op *);
-
 #endif
-- 
cgit v0.10.2


From cc44215eaaa5e4032946b962353526ae6c370c0e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 22 Nov 2006 17:55:00 +1100
Subject: [CRYPTO] api: Remove unused functions

This patch removes the following no longer used functions:
- api.c: crypto_alg_available()
- digest.c: crypto_digest_init()
- digest.c: crypto_digest_update()
- digest.c: crypto_digest_final()
- digest.c: crypto_digest_digest()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/api.c b/crypto/api.c
index 4fb7fa4..8c44687 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -466,23 +466,8 @@ void crypto_free_tfm(struct crypto_tfm *tfm)
 	kfree(tfm);
 }
 
-int crypto_alg_available(const char *name, u32 flags)
-{
-	int ret = 0;
-	struct crypto_alg *alg = crypto_alg_mod_lookup(name, 0,
-						       CRYPTO_ALG_ASYNC);
-	
-	if (!IS_ERR(alg)) {
-		crypto_mod_put(alg);
-		ret = 1;
-	}
-	
-	return ret;
-}
-
 EXPORT_SYMBOL_GPL(crypto_alloc_tfm);
 EXPORT_SYMBOL_GPL(crypto_free_tfm);
-EXPORT_SYMBOL_GPL(crypto_alg_available);
 
 int crypto_has_alg(const char *name, u32 type, u32 mask)
 {
diff --git a/crypto/digest.c b/crypto/digest.c
index 0155a94e..8f45932 100644
--- a/crypto/digest.c
+++ b/crypto/digest.c
@@ -21,54 +21,6 @@
 #include "internal.h"
 #include "scatterwalk.h"
 
-void crypto_digest_init(struct crypto_tfm *tfm)
-{
-	struct crypto_hash *hash = crypto_hash_cast(tfm);
-	struct hash_desc desc = { .tfm = hash, .flags = tfm->crt_flags };
-
-	crypto_hash_init(&desc);
-}
-EXPORT_SYMBOL_GPL(crypto_digest_init);
-
-void crypto_digest_update(struct crypto_tfm *tfm,
-			  struct scatterlist *sg, unsigned int nsg)
-{
-	struct crypto_hash *hash = crypto_hash_cast(tfm);
-	struct hash_desc desc = { .tfm = hash, .flags = tfm->crt_flags };
-	unsigned int nbytes = 0;
-	unsigned int i;
-
-	for (i = 0; i < nsg; i++)
-		nbytes += sg[i].length;
-
-	crypto_hash_update(&desc, sg, nbytes);
-}
-EXPORT_SYMBOL_GPL(crypto_digest_update);
-
-void crypto_digest_final(struct crypto_tfm *tfm, u8 *out)
-{
-	struct crypto_hash *hash = crypto_hash_cast(tfm);
-	struct hash_desc desc = { .tfm = hash, .flags = tfm->crt_flags };
-
-	crypto_hash_final(&desc, out);
-}
-EXPORT_SYMBOL_GPL(crypto_digest_final);
-
-void crypto_digest_digest(struct crypto_tfm *tfm,
-			  struct scatterlist *sg, unsigned int nsg, u8 *out)
-{
-	struct crypto_hash *hash = crypto_hash_cast(tfm);
-	struct hash_desc desc = { .tfm = hash, .flags = tfm->crt_flags };
-	unsigned int nbytes = 0;
-	unsigned int i;
-
-	for (i = 0; i < nsg; i++)
-		nbytes += sg[i].length;
-
-	crypto_hash_digest(&desc, sg, nbytes, out);
-}
-EXPORT_SYMBOL_GPL(crypto_digest_digest);
-
 static int init(struct hash_desc *desc)
 {
 	struct crypto_tfm *tfm = crypto_hash_tfm(desc->tfm);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 6485e97..4aa9046 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -241,12 +241,8 @@ int crypto_unregister_alg(struct crypto_alg *alg);
  * Algorithm query interface.
  */
 #ifdef CONFIG_CRYPTO
-int crypto_alg_available(const char *name, u32 flags)
-	__deprecated_for_modules;
 int crypto_has_alg(const char *name, u32 type, u32 mask);
 #else
-static int crypto_alg_available(const char *name, u32 flags)
-	__deprecated_for_modules;
 static inline int crypto_alg_available(const char *name, u32 flags)
 {
 	return 0;
@@ -707,16 +703,6 @@ static inline void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
 						dst, src);
 }
 
-void crypto_digest_init(struct crypto_tfm *tfm) __deprecated_for_modules;
-void crypto_digest_update(struct crypto_tfm *tfm,
-			  struct scatterlist *sg, unsigned int nsg)
-	__deprecated_for_modules;
-void crypto_digest_final(struct crypto_tfm *tfm, u8 *out)
-	__deprecated_for_modules;
-void crypto_digest_digest(struct crypto_tfm *tfm,
-			  struct scatterlist *sg, unsigned int nsg, u8 *out)
-	__deprecated_for_modules;
-
 static inline struct crypto_hash *__crypto_hash_cast(struct crypto_tfm *tfm)
 {
 	return (struct crypto_hash *)tfm;
@@ -729,14 +715,6 @@ static inline struct crypto_hash *crypto_hash_cast(struct crypto_tfm *tfm)
 	return __crypto_hash_cast(tfm);
 }
 
-static int crypto_digest_setkey(struct crypto_tfm *tfm, const u8 *key,
-				unsigned int keylen) __deprecated;
-static inline int crypto_digest_setkey(struct crypto_tfm *tfm,
-                                       const u8 *key, unsigned int keylen)
-{
-	return tfm->crt_hash.setkey(crypto_hash_cast(tfm), key, keylen);
-}
-
 static inline struct crypto_hash *crypto_alloc_hash(const char *alg_name,
 						    u32 type, u32 mask)
 {
-- 
cgit v0.10.2


From aec3694b987900de7ab789ea5749d673e0d634c4 Mon Sep 17 00:00:00 2001
From: Rik Snel <rsnel@cube.dyndns.org>
Date: Sun, 29 Oct 2006 11:02:07 +1100
Subject: [CRYPTO] lib: some common 128-bit block operations, nicely
 centralized

128bit is a common blocksize in linux kernel cryptography, so it helps to
centralize some common operations.

The code, while mostly trivial, is based on a header file mode_hdr.h in
http://fp.gladman.plus.com/AES/modes.vc8.19-06-06.zip

The original copyright (and GPL statement) of the original author,
Dr Brian Gladman, is preserved.

Signed-off-by: Rik Snel <rsnel@cube.dyndns.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/crypto/b128ops.h b/include/crypto/b128ops.h
new file mode 100644
index 0000000..0b8e6bc
--- /dev/null
+++ b/include/crypto/b128ops.h
@@ -0,0 +1,80 @@
+/* b128ops.h - common 128-bit block operations
+ *
+ * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
+ * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
+ *
+ * Based on Dr Brian Gladman's (GPL'd) work published at
+ * http://fp.gladman.plus.com/cryptography_technology/index.htm
+ * See the original copyright notice below.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 13/06/2006
+*/
+
+#ifndef _CRYPTO_B128OPS_H
+#define _CRYPTO_B128OPS_H
+
+#include <linux/types.h>
+
+typedef struct {
+	u64 a, b;
+} u128;
+
+typedef struct {
+	__be64 a, b;
+} be128;
+
+typedef struct {
+	__le64 b, a;
+} le128;
+
+static inline void u128_xor(u128 *r, const u128 *p, const u128 *q)
+{
+	r->a = p->a ^ q->a;
+	r->b = p->b ^ q->b;
+}
+
+static inline void be128_xor(be128 *r, const be128 *p, const be128 *q)
+{
+	u128_xor((u128 *)r, (u128 *)p, (u128 *)q);
+}
+
+static inline void le128_xor(le128 *r, const le128 *p, const le128 *q)
+{
+	u128_xor((u128 *)r, (u128 *)p, (u128 *)q);
+}
+
+#endif /* _CRYPTO_B128OPS_H */
-- 
cgit v0.10.2


From c494e0705d670c51ac736c8c4d92750705fe3187 Mon Sep 17 00:00:00 2001
From: Rik Snel <rsnel@cube.dyndns.org>
Date: Wed, 29 Nov 2006 18:59:44 +1100
Subject: [CRYPTO] lib: table driven multiplications in GF(2^128)

A lot of cypher modes need multiplications in GF(2^128). LRW, ABL, GCM...
I use functions from this library in my LRW implementation and I will
also use them in my ABL (Arbitrary Block Length, an unencumbered (correct
me if I am wrong, wide block cipher mode).

Elements of GF(2^128) must be presented as u128 *, it encourages automatic
and proper alignment.

The library contains support for two different representations of GF(2^128),
see the comment in gf128mul.h. There different levels of optimization
(memory/speed tradeoff).

The code is based on work by Dr Brian Gladman. Notable changes:
- deletion of two optimization modes
- change from u32 to u64 for faster handling on 64bit machines
- support for 'bbe' representation in addition to the, already implemented,
  'lle' representation.
- move 'inline void' functions from header to 'static void' in the
  source file
- update to use the linux coding style conventions

The original can be found at:
http://fp.gladman.plus.com/AES/modes.vc8.19-06-06.zip

The copyright (and GPL statement) of the original author is preserved.

Signed-off-by: Rik Snel <rsnel@cube.dyndns.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 4495e466..f941ffb 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -139,6 +139,16 @@ config CRYPTO_TGR192
 	  See also:
 	  <http://www.cs.technion.ac.il/~biham/Reports/Tiger/>.
 
+config CRYPTO_GF128MUL
+	tristate "GF(2^128) multiplication functions (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Efficient table driven implementation of multiplications in the
+	  field GF(2^128).  This is needed by some cypher modes. This
+	  option will be selected automatically if you select such a
+	  cipher mode.  Only select this option by hand if you expect to load
+	  an external module that requires these functions.
+
 config CRYPTO_ECB
 	tristate "ECB support"
 	select CRYPTO_BLKCIPHER
diff --git a/crypto/Makefile b/crypto/Makefile
index aba9625f..0ab9ff0 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_CRYPTO_SHA256) += sha256.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
 obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
+obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
 obj-$(CONFIG_CRYPTO_ECB) += ecb.o
 obj-$(CONFIG_CRYPTO_CBC) += cbc.o
 obj-$(CONFIG_CRYPTO_DES) += des.o
diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
new file mode 100644
index 0000000..0a2aadf
--- /dev/null
+++ b/crypto/gf128mul.c
@@ -0,0 +1,466 @@
+/* gf128mul.c - GF(2^128) multiplication functions
+ *
+ * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
+ * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
+ *
+ * Based on Dr Brian Gladman's (GPL'd) work published at
+ * http://fp.gladman.plus.com/cryptography_technology/index.htm
+ * See the original copyright notice below.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 31/01/2006
+
+ This file provides fast multiplication in GF(128) as required by several
+ cryptographic authentication modes
+*/
+
+#include <crypto/gf128mul.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#define gf128mul_dat(q) { \
+	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
+	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
+	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
+	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
+	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
+	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
+	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
+	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
+	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
+	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
+	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
+	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
+	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
+	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
+	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
+	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
+	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
+	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
+	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
+	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
+	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
+	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
+	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
+	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
+	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
+	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
+	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
+	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
+	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
+	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
+	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
+	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
+}
+
+/*	Given the value i in 0..255 as the byte overflow when a field element
+    in GHASH is multipled by x^8, this function will return the values that
+    are generated in the lo 16-bit word of the field value by applying the
+    modular polynomial. The values lo_byte and hi_byte are returned via the
+    macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into
+    memory as required by a suitable definition of this macro operating on
+    the table above
+*/
+
+#define xx(p, q)	0x##p##q
+
+#define xda_bbe(i) ( \
+	(i & 0x80 ? xx(43, 80) : 0) ^ (i & 0x40 ? xx(21, c0) : 0) ^ \
+	(i & 0x20 ? xx(10, e0) : 0) ^ (i & 0x10 ? xx(08, 70) : 0) ^ \
+	(i & 0x08 ? xx(04, 38) : 0) ^ (i & 0x04 ? xx(02, 1c) : 0) ^ \
+	(i & 0x02 ? xx(01, 0e) : 0) ^ (i & 0x01 ? xx(00, 87) : 0) \
+)
+
+#define xda_lle(i) ( \
+	(i & 0x80 ? xx(e1, 00) : 0) ^ (i & 0x40 ? xx(70, 80) : 0) ^ \
+	(i & 0x20 ? xx(38, 40) : 0) ^ (i & 0x10 ? xx(1c, 20) : 0) ^ \
+	(i & 0x08 ? xx(0e, 10) : 0) ^ (i & 0x04 ? xx(07, 08) : 0) ^ \
+	(i & 0x02 ? xx(03, 84) : 0) ^ (i & 0x01 ? xx(01, c2) : 0) \
+)
+
+static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle);
+static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe);
+
+/* These functions multiply a field element by x, by x^4 and by x^8
+ * in the polynomial field representation. It uses 32-bit word operations
+ * to gain speed but compensates for machine endianess and hence works
+ * correctly on both styles of machine.
+ */
+
+static void gf128mul_x_lle(be128 *r, const be128 *x)
+{
+	u64 a = be64_to_cpu(x->a);
+	u64 b = be64_to_cpu(x->b);
+	u64 _tt = gf128mul_table_lle[(b << 7) & 0xff];
+
+	r->b = cpu_to_be64((b >> 1) | (a << 63));
+	r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
+}
+
+static void gf128mul_x_bbe(be128 *r, const be128 *x)
+{
+	u64 a = be64_to_cpu(x->a);
+	u64 b = be64_to_cpu(x->b);
+	u64 _tt = gf128mul_table_bbe[a >> 63];
+
+	r->a = cpu_to_be64((a << 1) | (b >> 63));
+	r->b = cpu_to_be64((b << 1) ^ _tt);
+}
+
+static void gf128mul_x8_lle(be128 *x)
+{
+	u64 a = be64_to_cpu(x->a);
+	u64 b = be64_to_cpu(x->b);
+	u64 _tt = gf128mul_table_lle[b & 0xff];
+
+	x->b = cpu_to_be64((b >> 8) | (a << 56));
+	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
+}
+
+static void gf128mul_x8_bbe(be128 *x)
+{
+	u64 a = be64_to_cpu(x->a);
+	u64 b = be64_to_cpu(x->b);
+	u64 _tt = gf128mul_table_bbe[a >> 56];
+
+	x->a = cpu_to_be64((a << 8) | (b >> 56));
+	x->b = cpu_to_be64((b << 8) ^ _tt);
+}
+
+void gf128mul_lle(be128 *r, const be128 *b)
+{
+	be128 p[8];
+	int i;
+
+	p[0] = *r;
+	for (i = 0; i < 7; ++i)
+		gf128mul_x_lle(&p[i + 1], &p[i]);
+
+	memset(r, 0, sizeof(r));
+	for (i = 0;;) {
+		u8 ch = ((u8 *)b)[15 - i];
+
+		if (ch & 0x80)
+			be128_xor(r, r, &p[0]);
+		if (ch & 0x40)
+			be128_xor(r, r, &p[1]);
+		if (ch & 0x20)
+			be128_xor(r, r, &p[2]);
+		if (ch & 0x10)
+			be128_xor(r, r, &p[3]);
+		if (ch & 0x08)
+			be128_xor(r, r, &p[4]);
+		if (ch & 0x04)
+			be128_xor(r, r, &p[5]);
+		if (ch & 0x02)
+			be128_xor(r, r, &p[6]);
+		if (ch & 0x01)
+			be128_xor(r, r, &p[7]);
+
+		if (++i >= 16)
+			break;
+
+		gf128mul_x8_lle(r);
+	}
+}
+EXPORT_SYMBOL(gf128mul_lle);
+
+void gf128mul_bbe(be128 *r, const be128 *b)
+{
+	be128 p[8];
+	int i;
+
+	p[0] = *r;
+	for (i = 0; i < 7; ++i)
+		gf128mul_x_bbe(&p[i + 1], &p[i]);
+
+	memset(r, 0, sizeof(r));
+	for (i = 0;;) {
+		u8 ch = ((u8 *)b)[i];
+
+		if (ch & 0x80)
+			be128_xor(r, r, &p[7]);
+		if (ch & 0x40)
+			be128_xor(r, r, &p[6]);
+		if (ch & 0x20)
+			be128_xor(r, r, &p[5]);
+		if (ch & 0x10)
+			be128_xor(r, r, &p[4]);
+		if (ch & 0x08)
+			be128_xor(r, r, &p[3]);
+		if (ch & 0x04)
+			be128_xor(r, r, &p[2]);
+		if (ch & 0x02)
+			be128_xor(r, r, &p[1]);
+		if (ch & 0x01)
+			be128_xor(r, r, &p[0]);
+
+		if (++i >= 16)
+			break;
+
+		gf128mul_x8_bbe(r);
+	}
+}
+EXPORT_SYMBOL(gf128mul_bbe);
+
+/*      This version uses 64k bytes of table space.
+    A 16 byte buffer has to be multiplied by a 16 byte key
+    value in GF(128).  If we consider a GF(128) value in
+    the buffer's lowest byte, we can construct a table of
+    the 256 16 byte values that result from the 256 values
+    of this byte.  This requires 4096 bytes. But we also
+    need tables for each of the 16 higher bytes in the
+    buffer as well, which makes 64 kbytes in total.
+*/
+/* additional explanation
+ * t[0][BYTE] contains g*BYTE
+ * t[1][BYTE] contains g*x^8*BYTE
+ *  ..
+ * t[15][BYTE] contains g*x^120*BYTE */
+struct gf128mul_64k *gf128mul_init_64k_lle(const be128 *g)
+{
+	struct gf128mul_64k *t;
+	int i, j, k;
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	for (i = 0; i < 16; i++) {
+		t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
+		if (!t->t[i]) {
+			gf128mul_free_64k(t);
+			t = NULL;
+			goto out;
+		}
+	}
+
+	t->t[0]->t[128] = *g;
+	for (j = 64; j > 0; j >>= 1)
+		gf128mul_x_lle(&t->t[0]->t[j], &t->t[0]->t[j + j]);
+
+	for (i = 0;;) {
+		for (j = 2; j < 256; j += j)
+			for (k = 1; k < j; ++k)
+				be128_xor(&t->t[i]->t[j + k],
+					  &t->t[i]->t[j], &t->t[i]->t[k]);
+
+		if (++i >= 16)
+			break;
+
+		for (j = 128; j > 0; j >>= 1) {
+			t->t[i]->t[j] = t->t[i - 1]->t[j];
+			gf128mul_x8_lle(&t->t[i]->t[j]);
+		}
+	}
+
+out:
+	return t;
+}
+EXPORT_SYMBOL(gf128mul_init_64k_lle);
+
+struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
+{
+	struct gf128mul_64k *t;
+	int i, j, k;
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	for (i = 0; i < 16; i++) {
+		t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
+		if (!t->t[i]) {
+			gf128mul_free_64k(t);
+			t = NULL;
+			goto out;
+		}
+	}
+
+	t->t[0]->t[1] = *g;
+	for (j = 1; j <= 64; j <<= 1)
+		gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
+
+	for (i = 0;;) {
+		for (j = 2; j < 256; j += j)
+			for (k = 1; k < j; ++k)
+				be128_xor(&t->t[i]->t[j + k],
+					  &t->t[i]->t[j], &t->t[i]->t[k]);
+
+		if (++i >= 16)
+			break;
+
+		for (j = 128; j > 0; j >>= 1) {
+			t->t[i]->t[j] = t->t[i - 1]->t[j];
+			gf128mul_x8_bbe(&t->t[i]->t[j]);
+		}
+	}
+
+out:
+	return t;
+}
+EXPORT_SYMBOL(gf128mul_init_64k_bbe);
+
+void gf128mul_free_64k(struct gf128mul_64k *t)
+{
+	int i;
+
+	for (i = 0; i < 16; i++)
+		kfree(t->t[i]);
+	kfree(t);
+}
+EXPORT_SYMBOL(gf128mul_free_64k);
+
+void gf128mul_64k_lle(be128 *a, struct gf128mul_64k *t)
+{
+	u8 *ap = (u8 *)a;
+	be128 r[1];
+	int i;
+
+	*r = t->t[0]->t[ap[0]];
+	for (i = 1; i < 16; ++i)
+		be128_xor(r, r, &t->t[i]->t[ap[i]]);
+	*a = *r;
+}
+EXPORT_SYMBOL(gf128mul_64k_lle);
+
+void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t)
+{
+	u8 *ap = (u8 *)a;
+	be128 r[1];
+	int i;
+
+	*r = t->t[0]->t[ap[15]];
+	for (i = 1; i < 16; ++i)
+		be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
+	*a = *r;
+}
+EXPORT_SYMBOL(gf128mul_64k_bbe);
+
+/*      This version uses 4k bytes of table space.
+    A 16 byte buffer has to be multiplied by a 16 byte key
+    value in GF(128).  If we consider a GF(128) value in a
+    single byte, we can construct a table of the 256 16 byte
+    values that result from the 256 values of this byte.
+    This requires 4096 bytes. If we take the highest byte in
+    the buffer and use this table to get the result, we then
+    have to multiply by x^120 to get the final value. For the
+    next highest byte the result has to be multiplied by x^112
+    and so on. But we can do this by accumulating the result
+    in an accumulator starting with the result for the top
+    byte.  We repeatedly multiply the accumulator value by
+    x^8 and then add in (i.e. xor) the 16 bytes of the next
+    lower byte in the buffer, stopping when we reach the
+    lowest byte. This requires a 4096 byte table.
+*/
+struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
+{
+	struct gf128mul_4k *t;
+	int j, k;
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	t->t[128] = *g;
+	for (j = 64; j > 0; j >>= 1)
+		gf128mul_x_lle(&t->t[j], &t->t[j+j]);
+
+	for (j = 2; j < 256; j += j)
+		for (k = 1; k < j; ++k)
+			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
+
+out:
+	return t;
+}
+EXPORT_SYMBOL(gf128mul_init_4k_lle);
+
+struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
+{
+	struct gf128mul_4k *t;
+	int j, k;
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	t->t[1] = *g;
+	for (j = 1; j <= 64; j <<= 1)
+		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
+
+	for (j = 2; j < 256; j += j)
+		for (k = 1; k < j; ++k)
+			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
+
+out:
+	return t;
+}
+EXPORT_SYMBOL(gf128mul_init_4k_bbe);
+
+void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
+{
+	u8 *ap = (u8 *)a;
+	be128 r[1];
+	int i = 15;
+
+	*r = t->t[ap[15]];
+	while (i--) {
+		gf128mul_x8_lle(r);
+		be128_xor(r, r, &t->t[ap[i]]);
+	}
+	*a = *r;
+}
+EXPORT_SYMBOL(gf128mul_4k_lle);
+
+void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t)
+{
+	u8 *ap = (u8 *)a;
+	be128 r[1];
+	int i = 0;
+
+	*r = t->t[ap[0]];
+	while (++i < 16) {
+		gf128mul_x8_bbe(r);
+		be128_xor(r, r, &t->t[ap[i]]);
+	}
+	*a = *r;
+}
+EXPORT_SYMBOL(gf128mul_4k_bbe);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
new file mode 100644
index 0000000..4fd3152
--- /dev/null
+++ b/include/crypto/gf128mul.h
@@ -0,0 +1,198 @@
+/* gf128mul.h - GF(2^128) multiplication functions
+ *
+ * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
+ * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org>
+ *
+ * Based on Dr Brian Gladman's (GPL'd) work published at
+ * http://fp.gladman.plus.com/cryptography_technology/index.htm
+ * See the original copyright notice below.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 31/01/2006
+
+ An implementation of field multiplication in Galois Field GF(128)
+*/
+
+#ifndef _CRYPTO_GF128MUL_H
+#define _CRYPTO_GF128MUL_H
+
+#include <crypto/b128ops.h>
+#include <linux/slab.h>
+
+/* Comment by Rik:
+ *
+ * For some background on GF(2^128) see for example: http://-
+ * csrc.nist.gov/CryptoToolkit/modes/proposedmodes/gcm/gcm-revised-spec.pdf
+ *
+ * The elements of GF(2^128) := GF(2)[X]/(X^128-X^7-X^2-X^1-1) can
+ * be mapped to computer memory in a variety of ways. Let's examine
+ * three common cases.
+ *
+ * Take a look at the 16 binary octets below in memory order. The msb's
+ * are left and the lsb's are right. char b[16] is an array and b[0] is
+ * the first octet.
+ *
+ * 80000000 00000000 00000000 00000000 .... 00000000 00000000 00000000
+ *   b[0]     b[1]     b[2]     b[3]          b[13]    b[14]    b[15]
+ *
+ * Every bit is a coefficient of some power of X. We can store the bits
+ * in every byte in little-endian order and the bytes themselves also in
+ * little endian order. I will call this lle (little-little-endian).
+ * The above buffer represents the polynomial 1, and X^7+X^2+X^1+1 looks
+ * like 11100001 00000000 .... 00000000 = { 0xE1, 0x00, }.
+ * This format was originally implemented in gf128mul and is used
+ * in GCM (Galois/Counter mode) and in ABL (Arbitrary Block Length).
+ *
+ * Another convention says: store the bits in bigendian order and the
+ * bytes also. This is bbe (big-big-endian). Now the buffer above
+ * represents X^127. X^7+X^2+X^1+1 looks like 00000000 .... 10000111,
+ * b[15] = 0x87 and the rest is 0. LRW uses this convention and bbe
+ * is partly implemented.
+ *
+ * Both of the above formats are easy to implement on big-endian
+ * machines.
+ *
+ * EME (which is patent encumbered) uses the ble format (bits are stored
+ * in big endian order and the bytes in little endian). The above buffer
+ * represents X^7 in this case and the primitive polynomial is b[0] = 0x87.
+ *
+ * The common machine word-size is smaller than 128 bits, so to make
+ * an efficient implementation we must split into machine word sizes.
+ * This file uses one 32bit for the moment. Machine endianness comes into
+ * play. The lle format in relation to machine endianness is discussed
+ * below by the original author of gf128mul Dr Brian Gladman.
+ *
+ * Let's look at the bbe and ble format on a little endian machine.
+ *
+ * bbe on a little endian machine u32 x[4]:
+ *
+ *  MS            x[0]           LS  MS            x[1]		  LS
+ *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+ *  103..96 111.104 119.112 127.120  71...64 79...72 87...80 95...88
+ *
+ *  MS            x[2]           LS  MS            x[3]		  LS
+ *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+ *  39...32 47...40 55...48 63...56  07...00 15...08 23...16 31...24
+ *
+ * ble on a little endian machine
+ *
+ *  MS            x[0]           LS  MS            x[1]		  LS
+ *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+ *  31...24 23...16 15...08 07...00  63...56 55...48 47...40 39...32
+ *
+ *  MS            x[2]           LS  MS            x[3]		  LS
+ *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+ *  95...88 87...80 79...72 71...64  127.120 199.112 111.104 103..96
+ *
+ * Multiplications in GF(2^128) are mostly bit-shifts, so you see why
+ * ble (and lbe also) are easier to implement on a little-endian
+ * machine than on a big-endian machine. The converse holds for bbe
+ * and lle.
+ *
+ * Note: to have good alignment, it seems to me that it is sufficient
+ * to keep elements of GF(2^128) in type u64[2]. On 32-bit wordsize
+ * machines this will automatically aligned to wordsize and on a 64-bit
+ * machine also.
+ */
+/*	Multiply a GF128 field element by x. Field elements are held in arrays
+    of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower
+    indexed bits placed in the more numerically significant bit positions
+    within bytes.
+
+    On little endian machines the bit indexes translate into the bit
+    positions within four 32-bit words in the following way
+
+    MS            x[0]           LS  MS            x[1]		  LS
+    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+    24...31 16...23 08...15 00...07  56...63 48...55 40...47 32...39
+
+    MS            x[2]           LS  MS            x[3]		  LS
+    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+    88...95 80...87 72...79 64...71  120.127 112.119 104.111 96..103
+
+    On big endian machines the bit indexes translate into the bit
+    positions within four 32-bit words in the following way
+
+    MS            x[0]           LS  MS            x[1]		  LS
+    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+    00...07 08...15 16...23 24...31  32...39 40...47 48...55 56...63
+
+    MS            x[2]           LS  MS            x[3]		  LS
+    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+    64...71 72...79 80...87 88...95  96..103 104.111 112.119 120.127
+*/
+
+/*	A slow generic version of gf_mul, implemented for lle and bbe
+ * 	It multiplies a and b and puts the result in a */
+void gf128mul_lle(be128 *a, const be128 *b);
+
+void gf128mul_bbe(be128 *a, const be128 *b);
+
+
+/* 4k table optimization */
+
+struct gf128mul_4k {
+	be128 t[256];
+};
+
+struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g);
+struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g);
+void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t);
+void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t);
+
+static inline void gf128mul_free_4k(struct gf128mul_4k *t)
+{
+	kfree(t);
+}
+
+
+/* 64k table optimization, implemented for lle and bbe */
+
+struct gf128mul_64k {
+	struct gf128mul_4k *t[16];
+};
+
+/* first initialize with the constant factor with which you
+ * want to multiply and then call gf128_64k_lle with the other
+ * factor in the first argument, the table in the second and a
+ * scratch register in the third. Afterwards *a = *r. */
+struct gf128mul_64k *gf128mul_init_64k_lle(const be128 *g);
+struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g);
+void gf128mul_free_64k(struct gf128mul_64k *t);
+void gf128mul_64k_lle(be128 *a, struct gf128mul_64k *t);
+void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t);
+
+#endif /* _CRYPTO_GF128MUL_H */
-- 
cgit v0.10.2


From 64470f1b8510699dc357a44004dc924bc139c917 Mon Sep 17 00:00:00 2001
From: Rik Snel <rsnel@cube.dyndns.org>
Date: Sun, 26 Nov 2006 09:43:10 +1100
Subject: [CRYPTO] lrw: Liskov Rivest Wagner, a tweakable narrow block cipher
 mode

Main module, this implements the Liskov Rivest Wagner block cipher mode
in the new blockcipher API. The implementation is based on ecb.c.

The LRW-32-AES specification I used can be found at:
http://grouper.ieee.org/groups/1619/email/pdf00017.pdf

It implements the optimization specified as optional in the
specification, and in addition it uses optimized multiplication
routines from gf128mul.c.

Since gf128mul.[ch] is not tested on bigendian, this cipher mode
may currently fail badly on bigendian machines.

Signed-off-by: Rik Snel <rsnel@cube.dyndns.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/Kconfig b/crypto/Kconfig
index f941ffb..92ba249 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -168,6 +168,19 @@ config CRYPTO_CBC
 	  CBC: Cipher Block Chaining mode
 	  This block cipher algorithm is required for IPSec.
 
+config CRYPTO_LRW
+	tristate "LRW support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_MANAGER
+	select CRYPTO_GF128MUL
+	help
+	  LRW: Liskov Rivest Wagner, a tweakable, non malleable, non movable
+	  narrow block cipher mode for dm-crypt.  Use it with cipher
+	  specification string aes-lrw-benbi, the key must be 256, 320 or 384.
+	  The first 128, 192 or 256 bits in the key are used for AES and the
+	  rest is used to tie each cipher block to its logical position.
+
 config CRYPTO_DES
 	tristate "DES and Triple DES EDE cipher algorithms"
 	select CRYPTO_ALGAPI
diff --git a/crypto/Makefile b/crypto/Makefile
index 0ab9ff0..60e3d24 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
 obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
 obj-$(CONFIG_CRYPTO_ECB) += ecb.o
 obj-$(CONFIG_CRYPTO_CBC) += cbc.o
+obj-$(CONFIG_CRYPTO_LRW) += lrw.o
 obj-$(CONFIG_CRYPTO_DES) += des.o
 obj-$(CONFIG_CRYPTO_BLOWFISH) += blowfish.o
 obj-$(CONFIG_CRYPTO_TWOFISH) += twofish.o
diff --git a/crypto/lrw.c b/crypto/lrw.c
new file mode 100644
index 0000000..5d04315
--- /dev/null
+++ b/crypto/lrw.c
@@ -0,0 +1,301 @@
+/* LRW: as defined by Cyril Guyot in
+ *	http://grouper.ieee.org/groups/1619/email/pdf00017.pdf
+ *
+ * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org>
+ *
+ * Based om ecb.c
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+/* This implementation is checked against the test vectors in the above
+ * document and by a test vector provided by Ken Buchanan at
+ * http://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html
+ *
+ * The test vectors are included in the testing module tcrypt.[ch] */
+#include <crypto/algapi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+
+#include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
+
+struct priv {
+	struct crypto_cipher *child;
+	/* optimizes multiplying a random (non incrementing, as at the
+	 * start of a new sector) value with key2, we could also have
+	 * used 4k optimization tables or no optimization at all. In the
+	 * latter case we would have to store key2 here */
+	struct gf128mul_64k *table;
+	/* stores:
+	 *  key2*{ 0,0,...0,0,0,0,1 }, key2*{ 0,0,...0,0,0,1,1 },
+	 *  key2*{ 0,0,...0,0,1,1,1 }, key2*{ 0,0,...0,1,1,1,1 }
+	 *  key2*{ 0,0,...1,1,1,1,1 }, etc
+	 * needed for optimized multiplication of incrementing values
+	 * with key2 */
+	be128 mulinc[128];
+};
+
+static inline void setbit128_bbe(void *b, int bit)
+{
+	__set_bit(bit ^ 0x78, b);
+}
+
+static int setkey(struct crypto_tfm *parent, const u8 *key,
+		  unsigned int keylen)
+{
+	struct priv *ctx = crypto_tfm_ctx(parent);
+	struct crypto_cipher *child = ctx->child;
+	int err, i;
+	be128 tmp = { 0 };
+	int bsize = crypto_cipher_blocksize(child);
+
+	crypto_cipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_cipher_set_flags(child, crypto_tfm_get_flags(parent) &
+				       CRYPTO_TFM_REQ_MASK);
+	if ((err = crypto_cipher_setkey(child, key, keylen - bsize)))
+		return err;
+	crypto_tfm_set_flags(parent, crypto_cipher_get_flags(child) &
+				     CRYPTO_TFM_RES_MASK);
+
+	if (ctx->table)
+		gf128mul_free_64k(ctx->table);
+
+	/* initialize multiplication table for Key2 */
+	ctx->table = gf128mul_init_64k_bbe((be128 *)(key + keylen - bsize));
+	if (!ctx->table)
+		return -ENOMEM;
+
+	/* initialize optimization table */
+	for (i = 0; i < 128; i++) {
+		setbit128_bbe(&tmp, i);
+		ctx->mulinc[i] = tmp;
+		gf128mul_64k_bbe(&ctx->mulinc[i], ctx->table);
+	}
+
+	return 0;
+}
+
+struct sinfo {
+	be128 t;
+	struct crypto_tfm *tfm;
+	void (*fn)(struct crypto_tfm *, u8 *, const u8 *);
+};
+
+static inline void inc(be128 *iv)
+{
+	if (!(iv->b = cpu_to_be64(be64_to_cpu(iv->b) + 1)))
+		iv->a = cpu_to_be64(be64_to_cpu(iv->a) + 1);
+}
+
+static inline void round(struct sinfo *s, void *dst, const void *src)
+{
+	be128_xor(dst, &s->t, src);		/* PP <- T xor P */
+	s->fn(s->tfm, dst, dst);		/* CC <- E(Key2,PP) */
+	be128_xor(dst, dst, &s->t);		/* C <- T xor CC */
+}
+
+/* this returns the number of consequative 1 bits starting
+ * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
+static inline int get_index128(be128 *block)
+{
+	int x;
+	__be32 *p = (__be32 *) block;
+
+	for (p += 3, x = 0; x < 128; p--, x += 32) {
+		u32 val = be32_to_cpup(p);
+
+		if (!~val)
+			continue;
+
+		return x + ffz(val);
+	}
+
+	return x;
+}
+
+static int crypt(struct blkcipher_desc *d,
+		 struct blkcipher_walk *w, struct priv *ctx,
+		 void (*fn)(struct crypto_tfm *, u8 *, const u8 *))
+{
+	int err;
+	unsigned int avail;
+	const int bs = crypto_cipher_blocksize(ctx->child);
+	struct sinfo s = {
+		.tfm = crypto_cipher_tfm(ctx->child),
+		.fn = fn
+	};
+	be128 *iv;
+	u8 *wsrc;
+	u8 *wdst;
+
+	err = blkcipher_walk_virt(d, w);
+	if (!(avail = w->nbytes))
+		return err;
+
+	wsrc = w->src.virt.addr;
+	wdst = w->dst.virt.addr;
+
+	/* calculate first value of T */
+	iv = (be128 *)w->iv;
+	s.t = *iv;
+
+	/* T <- I*Key2 */
+	gf128mul_64k_bbe(&s.t, ctx->table);
+
+	goto first;
+
+	for (;;) {
+		do {
+			/* T <- I*Key2, using the optimization
+			 * discussed in the specification */
+			be128_xor(&s.t, &s.t, &ctx->mulinc[get_index128(iv)]);
+			inc(iv);
+
+first:
+			round(&s, wdst, wsrc);
+
+			wsrc += bs;
+			wdst += bs;
+		} while ((avail -= bs) >= bs);
+
+		err = blkcipher_walk_done(d, w, avail);
+		if (!(avail = w->nbytes))
+			break;
+
+		wsrc = w->src.virt.addr;
+		wdst = w->dst.virt.addr;
+	}
+
+	return err;
+}
+
+static int encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		   struct scatterlist *src, unsigned int nbytes)
+{
+	struct priv *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk w;
+
+	blkcipher_walk_init(&w, dst, src, nbytes);
+	return crypt(desc, &w, ctx,
+		     crypto_cipher_alg(ctx->child)->cia_encrypt);
+}
+
+static int decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		   struct scatterlist *src, unsigned int nbytes)
+{
+	struct priv *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk w;
+
+	blkcipher_walk_init(&w, dst, src, nbytes);
+	return crypt(desc, &w, ctx,
+		     crypto_cipher_alg(ctx->child)->cia_decrypt);
+}
+
+static int init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct priv *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+
+	tfm = crypto_spawn_tfm(spawn);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	if (crypto_tfm_alg_blocksize(tfm) != 16) {
+		*flags |= CRYPTO_TFM_RES_BAD_BLOCK_LEN;
+		return -EINVAL;
+	}
+
+	ctx->child = crypto_cipher_cast(tfm);
+	return 0;
+}
+
+static void exit_tfm(struct crypto_tfm *tfm)
+{
+	struct priv *ctx = crypto_tfm_ctx(tfm);
+	if (ctx->table)
+		gf128mul_free_64k(ctx->table);
+	crypto_free_cipher(ctx->child);
+}
+
+static struct crypto_instance *alloc(void *param, unsigned int len)
+{
+	struct crypto_instance *inst;
+	struct crypto_alg *alg;
+
+	alg = crypto_get_attr_alg(param, len, CRYPTO_ALG_TYPE_CIPHER,
+				  CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_ASYNC);
+	if (IS_ERR(alg))
+		return ERR_PTR(PTR_ERR(alg));
+
+	inst = crypto_alloc_instance("lrw", alg);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	inst->alg.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER;
+	inst->alg.cra_priority = alg->cra_priority;
+	inst->alg.cra_blocksize = alg->cra_blocksize;
+
+	if (alg->cra_alignmask < 7) inst->alg.cra_alignmask = 7;
+	else inst->alg.cra_alignmask = alg->cra_alignmask;
+	inst->alg.cra_type = &crypto_blkcipher_type;
+
+	if (!(alg->cra_blocksize % 4))
+		inst->alg.cra_alignmask |= 3;
+	inst->alg.cra_blkcipher.ivsize = alg->cra_blocksize;
+	inst->alg.cra_blkcipher.min_keysize =
+		alg->cra_cipher.cia_min_keysize + alg->cra_blocksize;
+	inst->alg.cra_blkcipher.max_keysize =
+		alg->cra_cipher.cia_max_keysize + alg->cra_blocksize;
+
+	inst->alg.cra_ctxsize = sizeof(struct priv);
+
+	inst->alg.cra_init = init_tfm;
+	inst->alg.cra_exit = exit_tfm;
+
+	inst->alg.cra_blkcipher.setkey = setkey;
+	inst->alg.cra_blkcipher.encrypt = encrypt;
+	inst->alg.cra_blkcipher.decrypt = decrypt;
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return inst;
+}
+
+static void free(struct crypto_instance *inst)
+{
+	crypto_drop_spawn(crypto_instance_ctx(inst));
+	kfree(inst);
+}
+
+static struct crypto_template crypto_tmpl = {
+	.name = "lrw",
+	.alloc = alloc,
+	.free = free,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_module_init(void)
+{
+	return crypto_register_template(&crypto_tmpl);
+}
+
+static void __exit crypto_module_exit(void)
+{
+	crypto_unregister_template(&crypto_tmpl);
+}
+
+module_init(crypto_module_init);
+module_exit(crypto_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LRW block cipher mode");
-- 
cgit v0.10.2


From f3d1044cd0a9b427a25b2492f4d503d2dd54cfd7 Mon Sep 17 00:00:00 2001
From: Rik Snel <rsnel@cube.dyndns.org>
Date: Wed, 29 Nov 2006 19:01:41 +1100
Subject: [CRYPTO] tcrypt: LRW test vectors

Do modprobe tcrypt mode=10 to check the included test vectors, they are
from: http://grouper.ieee.org/groups/1619/email/pdf00017.pdf and from
http://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html.

To make the last test vector fit, I had to increase the buffer size of
input and result to 512 bytes.

Signed-off-by: Rik Snel <rsnel@cube.dyndns.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index d1a5f2b8..d671e894 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -906,6 +906,10 @@ static void do_test(void)
 			    AES_CBC_ENC_TEST_VECTORS);
 		test_cipher("cbc(aes)", DECRYPT, aes_cbc_dec_tv_template,
 			    AES_CBC_DEC_TEST_VECTORS);
+		test_cipher("lrw(aes)", ENCRYPT, aes_lrw_enc_tv_template,
+			    AES_LRW_ENC_TEST_VECTORS);
+		test_cipher("lrw(aes)", DECRYPT, aes_lrw_dec_tv_template,
+			    AES_LRW_DEC_TEST_VECTORS);
 
 		//CAST5
 		test_cipher("ecb(cast5)", ENCRYPT, cast5_enc_tv_template,
@@ -1055,6 +1059,10 @@ static void do_test(void)
 			    AES_CBC_ENC_TEST_VECTORS);
 		test_cipher("cbc(aes)", DECRYPT, aes_cbc_dec_tv_template,
 			    AES_CBC_DEC_TEST_VECTORS);
+		test_cipher("lrw(aes)", ENCRYPT, aes_lrw_enc_tv_template,
+			    AES_LRW_ENC_TEST_VECTORS);
+		test_cipher("lrw(aes)", DECRYPT, aes_lrw_dec_tv_template,
+			    AES_LRW_DEC_TEST_VECTORS);
 		break;
 
 	case 11:
@@ -1194,6 +1202,10 @@ static void do_test(void)
 				  aes_speed_template);
 		test_cipher_speed("cbc(aes)", DECRYPT, sec, NULL, 0,
 				  aes_speed_template);
+		test_cipher_speed("lrw(aes)", ENCRYPT, sec, NULL, 0,
+				  aes_lrw_speed_template);
+		test_cipher_speed("lrw(aes)", DECRYPT, sec, NULL, 0,
+				  aes_lrw_speed_template);
 		break;
 
 	case 201:
diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h
index 2d07e8a..48a8136 100644
--- a/crypto/tcrypt.h
+++ b/crypto/tcrypt.h
@@ -39,15 +39,15 @@ struct hash_testvec {
 struct cipher_testvec {
 	char key[MAX_KEYLEN] __attribute__ ((__aligned__(4)));
 	char iv[MAX_IVLEN];
-	char input[48];
-	char result[48];
+	char input[512];
+	char result[512];
 	unsigned char tap[MAX_TAP];
 	int np;
 	unsigned char fail;
 	unsigned char wk; /* weak key flag */
 	unsigned char klen;
-	unsigned char ilen;
-	unsigned char rlen;
+	unsigned short ilen;
+	unsigned short rlen;
 };
 
 struct cipher_speed {
@@ -1899,6 +1899,8 @@ static struct cipher_testvec cast6_dec_tv_template[] = {
 #define AES_DEC_TEST_VECTORS 3
 #define AES_CBC_ENC_TEST_VECTORS 2
 #define AES_CBC_DEC_TEST_VECTORS 2
+#define AES_LRW_ENC_TEST_VECTORS 8
+#define AES_LRW_DEC_TEST_VECTORS 8
 
 static struct cipher_testvec aes_enc_tv_template[] = {
 	{ /* From FIPS-197 */
@@ -2036,6 +2038,509 @@ static struct cipher_testvec aes_cbc_dec_tv_template[] = {
 	},
 };
 
+static struct cipher_testvec aes_lrw_enc_tv_template[] = {
+	/* from http://grouper.ieee.org/groups/1619/email/pdf00017.pdf */
+	{ /* LRW-32-AES 1 */
+		.key    = { 0x45, 0x62, 0xac, 0x25, 0xf8, 0x28, 0x17, 0x6d,
+			    0x4c, 0x26, 0x84, 0x14, 0xb5, 0x68, 0x01, 0x85,
+			    0x25, 0x8e, 0x2a, 0x05, 0xe7, 0x3e, 0x9d, 0x03,
+			    0xee, 0x5a, 0x83, 0x0c, 0xcc, 0x09, 0x4c, 0x87 },
+		.klen   = 32,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.ilen   = 16,
+		.result = { 0xf1, 0xb2, 0x73, 0xcd, 0x65, 0xa3, 0xdf, 0x5f,
+			    0xe9, 0x5d, 0x48, 0x92, 0x54, 0x63, 0x4e, 0xb8 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 2 */
+		.key    = { 0x59, 0x70, 0x47, 0x14, 0xf5, 0x57, 0x47, 0x8c,
+		  	    0xd7, 0x79, 0xe8, 0x0f, 0x54, 0x88, 0x79, 0x44,
+			    0x0d, 0x48, 0xf0, 0xb7, 0xb1, 0x5a, 0x53, 0xea,
+			    0x1c, 0xaa, 0x6b, 0x29, 0xc2, 0xca, 0xfb, 0xaf
+		},
+		.klen   = 32,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 },
+		.input  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.ilen   = 16,
+		.result = { 0x00, 0xc8, 0x2b, 0xae, 0x95, 0xbb, 0xcd, 0xe5,
+			    0x27, 0x4f, 0x07, 0x69, 0xb2, 0x60, 0xe1, 0x36 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 3 */
+		.key    = { 0xd8, 0x2a, 0x91, 0x34, 0xb2, 0x6a, 0x56, 0x50,
+		  	    0x30, 0xfe, 0x69, 0xe2, 0x37, 0x7f, 0x98, 0x47,
+			    0xcd, 0xf9, 0x0b, 0x16, 0x0c, 0x64, 0x8f, 0xb6,
+			    0xb0, 0x0d, 0x0d, 0x1b, 0xae, 0x85, 0x87, 0x1f },
+		.klen   = 32,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00 },
+		.input  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.ilen   = 16,
+		.result = { 0x76, 0x32, 0x21, 0x83, 0xed, 0x8f, 0xf1, 0x82,
+			    0xf9, 0x59, 0x62, 0x03, 0x69, 0x0e, 0x5e, 0x01 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 4 */
+		.key    = { 0x0f, 0x6a, 0xef, 0xf8, 0xd3, 0xd2, 0xbb, 0x15,
+		  	    0x25, 0x83, 0xf7, 0x3c, 0x1f, 0x01, 0x28, 0x74,
+			    0xca, 0xc6, 0xbc, 0x35, 0x4d, 0x4a, 0x65, 0x54,
+			    0x90, 0xae, 0x61, 0xcf, 0x7b, 0xae, 0xbd, 0xcc,
+			    0xad, 0xe4, 0x94, 0xc5, 0x4a, 0x29, 0xae, 0x70 },
+		.klen   = 40,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.ilen   = 16,
+		.result = { 0x9c, 0x0f, 0x15, 0x2f, 0x55, 0xa2, 0xd8, 0xf0,
+			    0xd6, 0x7b, 0x8f, 0x9e, 0x28, 0x22, 0xbc, 0x41 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 5 */
+		.key    = { 0x8a, 0xd4, 0xee, 0x10, 0x2f, 0xbd, 0x81, 0xff,
+		  	    0xf8, 0x86, 0xce, 0xac, 0x93, 0xc5, 0xad, 0xc6,
+			    0xa0, 0x19, 0x07, 0xc0, 0x9d, 0xf7, 0xbb, 0xdd,
+			    0x52, 0x13, 0xb2, 0xb7, 0xf0, 0xff, 0x11, 0xd8,
+			    0xd6, 0x08, 0xd0, 0xcd, 0x2e, 0xb1, 0x17, 0x6f },
+		.klen   = 40,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00 },
+		.input  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.ilen   = 16,
+		.result = { 0xd4, 0x27, 0x6a, 0x7f, 0x14, 0x91, 0x3d, 0x65,
+			    0xc8, 0x60, 0x48, 0x02, 0x87, 0xe3, 0x34, 0x06 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 6 */
+		.key    = { 0xf8, 0xd4, 0x76, 0xff, 0xd6, 0x46, 0xee, 0x6c,
+		  	    0x23, 0x84, 0xcb, 0x1c, 0x77, 0xd6, 0x19, 0x5d,
+			    0xfe, 0xf1, 0xa9, 0xf3, 0x7b, 0xbc, 0x8d, 0x21,
+			    0xa7, 0x9c, 0x21, 0xf8, 0xcb, 0x90, 0x02, 0x89,
+			    0xa8, 0x45, 0x34, 0x8e, 0xc8, 0xc5, 0xb5, 0xf1,
+			    0x26, 0xf5, 0x0e, 0x76, 0xfe, 0xfd, 0x1b, 0x1e },
+		.klen   = 48,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.ilen   = 16,
+		.result = { 0xbd, 0x06, 0xb8, 0xe1, 0xdb, 0x98, 0x89, 0x9e,
+			    0xc4, 0x98, 0xe4, 0x91, 0xcf, 0x1c, 0x70, 0x2b },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 7 */
+		.key    = { 0xfb, 0x76, 0x15, 0xb2, 0x3d, 0x80, 0x89, 0x1d,
+		  	    0xd4, 0x70, 0x98, 0x0b, 0xc7, 0x95, 0x84, 0xc8,
+			    0xb2, 0xfb, 0x64, 0xce, 0x60, 0x97, 0x87, 0x8d,
+			    0x17, 0xfc, 0xe4, 0x5a, 0x49, 0xe8, 0x30, 0xb7,
+			    0x6e, 0x78, 0x17, 0xe7, 0x2d, 0x5e, 0x12, 0xd4,
+			    0x60, 0x64, 0x04, 0x7a, 0xf1, 0x2f, 0x9e, 0x0c },
+		.klen   = 48,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00 },
+		.input  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.ilen   = 16,
+		.result = { 0x5b, 0x90, 0x8e, 0xc1, 0xab, 0xdd, 0x67, 0x5f,
+			    0x3d, 0x69, 0x8a, 0x95, 0x53, 0xc8, 0x9c, 0xe5 },
+		.rlen   = 16,
+	}, {
+/* http://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html */
+		.key    = { 0xf8, 0xd4, 0x76, 0xff, 0xd6, 0x46, 0xee, 0x6c,
+			    0x23, 0x84, 0xcb, 0x1c, 0x77, 0xd6, 0x19, 0x5d,
+			    0xfe, 0xf1, 0xa9, 0xf3, 0x7b, 0xbc, 0x8d, 0x21,
+			    0xa7, 0x9c, 0x21, 0xf8, 0xcb, 0x90, 0x02, 0x89,
+			    0xa8, 0x45, 0x34, 0x8e, 0xc8, 0xc5, 0xb5, 0xf1,
+			    0x26, 0xf5, 0x0e, 0x76, 0xfe, 0xfd, 0x1b, 0x1e },
+		.klen   = 48,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input  = { 0x05, 0x11, 0xb7, 0x18, 0xab, 0xc6, 0x2d, 0xac,
+			    0x70, 0x5d, 0xf6, 0x22, 0x94, 0xcd, 0xe5, 0x6c,
+			    0x17, 0x6b, 0xf6, 0x1c, 0xf0, 0xf3, 0x6e, 0xf8,
+			    0x50, 0x38, 0x1f, 0x71, 0x49, 0xb6, 0x57, 0xd6,
+			    0x8f, 0xcb, 0x8d, 0x6b, 0xe3, 0xa6, 0x29, 0x90,
+			    0xfe, 0x2a, 0x62, 0x82, 0xae, 0x6d, 0x8b, 0xf6,
+			    0xad, 0x1e, 0x9e, 0x20, 0x5f, 0x38, 0xbe, 0x04,
+			    0xda, 0x10, 0x8e, 0xed, 0xa2, 0xa4, 0x87, 0xab,
+			    0xda, 0x6b, 0xb4, 0x0c, 0x75, 0xba, 0xd3, 0x7c,
+			    0xc9, 0xac, 0x42, 0x31, 0x95, 0x7c, 0xc9, 0x04,
+			    0xeb, 0xd5, 0x6e, 0x32, 0x69, 0x8a, 0xdb, 0xa6,
+			    0x15, 0xd7, 0x3f, 0x4f, 0x2f, 0x66, 0x69, 0x03,
+			    0x9c, 0x1f, 0x54, 0x0f, 0xde, 0x1f, 0xf3, 0x65,
+			    0x4c, 0x96, 0x12, 0xed, 0x7c, 0x92, 0x03, 0x01,
+			    0x6f, 0xbc, 0x35, 0x93, 0xac, 0xf1, 0x27, 0xf1,
+			    0xb4, 0x96, 0x82, 0x5a, 0x5f, 0xb0, 0xa0, 0x50,
+			    0x89, 0xa4, 0x8e, 0x66, 0x44, 0x85, 0xcc, 0xfd,
+			    0x33, 0x14, 0x70, 0xe3, 0x96, 0xb2, 0xc3, 0xd3,
+			    0xbb, 0x54, 0x5a, 0x1a, 0xf9, 0x74, 0xa2, 0xc5,
+			    0x2d, 0x64, 0x75, 0xdd, 0xb4, 0x54, 0xe6, 0x74,
+			    0x8c, 0xd3, 0x9d, 0x9e, 0x86, 0xab, 0x51, 0x53,
+			    0xb7, 0x93, 0x3e, 0x6f, 0xd0, 0x4e, 0x2c, 0x40,
+			    0xf6, 0xa8, 0x2e, 0x3e, 0x9d, 0xf4, 0x66, 0xa5,
+			    0x76, 0x12, 0x73, 0x44, 0x1a, 0x56, 0xd7, 0x72,
+			    0x88, 0xcd, 0x21, 0x8c, 0x4c, 0x0f, 0xfe, 0xda,
+			    0x95, 0xe0, 0x3a, 0xa6, 0xa5, 0x84, 0x46, 0xcd,
+			    0xd5, 0x3e, 0x9d, 0x3a, 0xe2, 0x67, 0xe6, 0x60,
+			    0x1a, 0xe2, 0x70, 0x85, 0x58, 0xc2, 0x1b, 0x09,
+			    0xe1, 0xd7, 0x2c, 0xca, 0xad, 0xa8, 0x8f, 0xf9,
+			    0xac, 0xb3, 0x0e, 0xdb, 0xca, 0x2e, 0xe2, 0xb8,
+			    0x51, 0x71, 0xd9, 0x3c, 0x6c, 0xf1, 0x56, 0xf8,
+			    0xea, 0x9c, 0xf1, 0xfb, 0x0c, 0xe6, 0xb7, 0x10,
+			    0x1c, 0xf8, 0xa9, 0x7c, 0xe8, 0x53, 0x35, 0xc1,
+			    0x90, 0x3e, 0x76, 0x4a, 0x74, 0xa4, 0x21, 0x2c,
+			    0xf6, 0x2c, 0x4e, 0x0f, 0x94, 0x3a, 0x88, 0x2e,
+			    0x41, 0x09, 0x6a, 0x33, 0x7d, 0xf6, 0xdd, 0x3f,
+			    0x8d, 0x23, 0x31, 0x74, 0x84, 0xeb, 0x88, 0x6e,
+			    0xcc, 0xb9, 0xbc, 0x22, 0x83, 0x19, 0x07, 0x22,
+			    0xa5, 0x2d, 0xdf, 0xa5, 0xf3, 0x80, 0x85, 0x78,
+			    0x84, 0x39, 0x6a, 0x6d, 0x6a, 0x99, 0x4f, 0xa5,
+			    0x15, 0xfe, 0x46, 0xb0, 0xe4, 0x6c, 0xa5, 0x41,
+			    0x3c, 0xce, 0x8f, 0x42, 0x60, 0x71, 0xa7, 0x75,
+			    0x08, 0x40, 0x65, 0x8a, 0x82, 0xbf, 0xf5, 0x43,
+			    0x71, 0x96, 0xa9, 0x4d, 0x44, 0x8a, 0x20, 0xbe,
+			    0xfa, 0x4d, 0xbb, 0xc0, 0x7d, 0x31, 0x96, 0x65,
+			    0xe7, 0x75, 0xe5, 0x3e, 0xfd, 0x92, 0x3b, 0xc9,
+			    0x55, 0xbb, 0x16, 0x7e, 0xf7, 0xc2, 0x8c, 0xa4,
+			    0x40, 0x1d, 0xe5, 0xef, 0x0e, 0xdf, 0xe4, 0x9a,
+			    0x62, 0x73, 0x65, 0xfd, 0x46, 0x63, 0x25, 0x3d,
+			    0x2b, 0xaf, 0xe5, 0x64, 0xfe, 0xa5, 0x5c, 0xcf,
+			    0x24, 0xf3, 0xb4, 0xac, 0x64, 0xba, 0xdf, 0x4b,
+			    0xc6, 0x96, 0x7d, 0x81, 0x2d, 0x8d, 0x97, 0xf7,
+			    0xc5, 0x68, 0x77, 0x84, 0x32, 0x2b, 0xcc, 0x85,
+			    0x74, 0x96, 0xf0, 0x12, 0x77, 0x61, 0xb9, 0xeb,
+			    0x71, 0xaa, 0x82, 0xcb, 0x1c, 0xdb, 0x89, 0xc8,
+			    0xc6, 0xb5, 0xe3, 0x5c, 0x7d, 0x39, 0x07, 0x24,
+			    0xda, 0x39, 0x87, 0x45, 0xc0, 0x2b, 0xbb, 0x01,
+			    0xac, 0xbc, 0x2a, 0x5c, 0x7f, 0xfc, 0xe8, 0xce,
+			    0x6d, 0x9c, 0x6f, 0xed, 0xd3, 0xc1, 0xa1, 0xd6,
+			    0xc5, 0x55, 0xa9, 0x66, 0x2f, 0xe1, 0xc8, 0x32,
+			    0xa6, 0x5d, 0xa4, 0x3a, 0x98, 0x73, 0xe8, 0x45,
+			    0xa4, 0xc7, 0xa8, 0xb4, 0xf6, 0x13, 0x03, 0xf6,
+			    0xe9, 0x2e, 0xc4, 0x29, 0x0f, 0x84, 0xdb, 0xc4,
+			    0x21, 0xc4, 0xc2, 0x75, 0x67, 0x89, 0x37, 0x0a },
+		.ilen   = 512,
+		.result = { 0x1a, 0x1d, 0xa9, 0x30, 0xad, 0xf9, 0x2f, 0x9b,
+			    0xb6, 0x1d, 0xae, 0xef, 0xf0, 0x2f, 0xf8, 0x5a,
+			    0x39, 0x3c, 0xbf, 0x2a, 0xb2, 0x45, 0xb2, 0x23,
+			    0x1b, 0x63, 0x3c, 0xcf, 0xaa, 0xbe, 0xcf, 0x4e,
+			    0xfa, 0xe8, 0x29, 0xc2, 0x20, 0x68, 0x2b, 0x3c,
+			    0x2e, 0x8b, 0xf7, 0x6e, 0x25, 0xbd, 0xe3, 0x3d,
+			    0x66, 0x27, 0xd6, 0xaf, 0xd6, 0x64, 0x3e, 0xe3,
+			    0xe8, 0x58, 0x46, 0x97, 0x39, 0x51, 0x07, 0xde,
+			    0xcb, 0x37, 0xbc, 0xa9, 0xc0, 0x5f, 0x75, 0xc3,
+			    0x0e, 0x84, 0x23, 0x1d, 0x16, 0xd4, 0x1c, 0x59,
+			    0x9c, 0x1a, 0x02, 0x55, 0xab, 0x3a, 0x97, 0x1d,
+			    0xdf, 0xdd, 0xc7, 0x06, 0x51, 0xd7, 0x70, 0xae,
+			    0x23, 0xc6, 0x8c, 0xf5, 0x1e, 0xa0, 0xe5, 0x82,
+			    0xb8, 0xb2, 0xbf, 0x04, 0xa0, 0x32, 0x8e, 0x68,
+			    0xeb, 0xaf, 0x6e, 0x2d, 0x94, 0x22, 0x2f, 0xce,
+			    0x4c, 0xb5, 0x59, 0xe2, 0xa2, 0x2f, 0xa0, 0x98,
+			    0x1a, 0x97, 0xc6, 0xd4, 0xb5, 0x00, 0x59, 0xf2,
+			    0x84, 0x14, 0x72, 0xb1, 0x9a, 0x6e, 0xa3, 0x7f,
+			    0xea, 0x20, 0xe7, 0xcb, 0x65, 0x77, 0x3a, 0xdf,
+			    0xc8, 0x97, 0x67, 0x15, 0xc2, 0x2a, 0x27, 0xcc,
+			    0x18, 0x55, 0xa1, 0x24, 0x0b, 0x24, 0x24, 0xaf,
+			    0x5b, 0xec, 0x68, 0xb8, 0xc8, 0xf5, 0xba, 0x63,
+			    0xff, 0xed, 0x89, 0xce, 0xd5, 0x3d, 0x88, 0xf3,
+			    0x25, 0xef, 0x05, 0x7c, 0x3a, 0xef, 0xeb, 0xd8,
+			    0x7a, 0x32, 0x0d, 0xd1, 0x1e, 0x58, 0x59, 0x99,
+			    0x90, 0x25, 0xb5, 0x26, 0xb0, 0xe3, 0x2b, 0x6c,
+			    0x4c, 0xa9, 0x8b, 0x84, 0x4f, 0x5e, 0x01, 0x50,
+			    0x41, 0x30, 0x58, 0xc5, 0x62, 0x74, 0x52, 0x1d,
+			    0x45, 0x24, 0x6a, 0x42, 0x64, 0x4f, 0x97, 0x1c,
+			    0xa8, 0x66, 0xb5, 0x6d, 0x79, 0xd4, 0x0d, 0x48,
+			    0xc5, 0x5f, 0xf3, 0x90, 0x32, 0xdd, 0xdd, 0xe1,
+			    0xe4, 0xa9, 0x9f, 0xfc, 0xc3, 0x52, 0x5a, 0x46,
+			    0xe4, 0x81, 0x84, 0x95, 0x36, 0x59, 0x7a, 0x6b,
+			    0xaa, 0xb3, 0x60, 0xad, 0xce, 0x9f, 0x9f, 0x28,
+			    0xe0, 0x01, 0x75, 0x22, 0xc4, 0x4e, 0xa9, 0x62,
+			    0x5c, 0x62, 0x0d, 0x00, 0xcb, 0x13, 0xe8, 0x43,
+			    0x72, 0xd4, 0x2d, 0x53, 0x46, 0xb5, 0xd1, 0x16,
+			    0x22, 0x18, 0xdf, 0x34, 0x33, 0xf5, 0xd6, 0x1c,
+			    0xb8, 0x79, 0x78, 0x97, 0x94, 0xff, 0x72, 0x13,
+			    0x4c, 0x27, 0xfc, 0xcb, 0xbf, 0x01, 0x53, 0xa6,
+			    0xb4, 0x50, 0x6e, 0xde, 0xdf, 0xb5, 0x43, 0xa4,
+			    0x59, 0xdf, 0x52, 0xf9, 0x7c, 0xe0, 0x11, 0x6f,
+			    0x2d, 0x14, 0x8e, 0x24, 0x61, 0x2c, 0xe1, 0x17,
+			    0xcc, 0xce, 0x51, 0x0c, 0x19, 0x8a, 0x82, 0x30,
+			    0x94, 0xd5, 0x3d, 0x6a, 0x53, 0x06, 0x5e, 0xbd,
+			    0xb7, 0xeb, 0xfa, 0xfd, 0x27, 0x51, 0xde, 0x85,
+			    0x1e, 0x86, 0x53, 0x11, 0x53, 0x94, 0x00, 0xee,
+			    0x2b, 0x8c, 0x08, 0x2a, 0xbf, 0xdd, 0xae, 0x11,
+			    0xcb, 0x1e, 0xa2, 0x07, 0x9a, 0x80, 0xcf, 0x62,
+			    0x9b, 0x09, 0xdc, 0x95, 0x3c, 0x96, 0x8e, 0xb1,
+			    0x09, 0xbd, 0xe4, 0xeb, 0xdb, 0xca, 0x70, 0x7a,
+			    0x9e, 0xfa, 0x31, 0x18, 0x45, 0x3c, 0x21, 0x33,
+			    0xb0, 0xb3, 0x2b, 0xea, 0xf3, 0x71, 0x2d, 0xe1,
+			    0x03, 0xad, 0x1b, 0x48, 0xd4, 0x67, 0x27, 0xf0,
+			    0x62, 0xe4, 0x3d, 0xfb, 0x9b, 0x08, 0x76, 0xe7,
+			    0xdd, 0x2b, 0x01, 0x39, 0x04, 0x5a, 0x58, 0x7a,
+			    0xf7, 0x11, 0x90, 0xec, 0xbd, 0x51, 0x5c, 0x32,
+			    0x6b, 0xd7, 0x35, 0x39, 0x02, 0x6b, 0xf2, 0xa6,
+			    0xd0, 0x0d, 0x07, 0xe1, 0x06, 0xc4, 0x5b, 0x7d,
+			    0xe4, 0x6a, 0xd7, 0xee, 0x15, 0x1f, 0x83, 0xb4,
+			    0xa3, 0xa7, 0x5e, 0xc3, 0x90, 0xb7, 0xef, 0xd3,
+			    0xb7, 0x4f, 0xf8, 0x92, 0x4c, 0xb7, 0x3c, 0x29,
+			    0xcd, 0x7e, 0x2b, 0x5d, 0x43, 0xea, 0x42, 0xe7,
+			    0x74, 0x3f, 0x7d, 0x58, 0x88, 0x75, 0xde, 0x3e },
+		.rlen   = 512,
+	}
+};
+
+static struct cipher_testvec aes_lrw_dec_tv_template[] = {
+	/* from http://grouper.ieee.org/groups/1619/email/pdf00017.pdf */
+	/* same as enc vectors with input and result reversed */
+	{ /* LRW-32-AES 1 */
+		.key    = { 0x45, 0x62, 0xac, 0x25, 0xf8, 0x28, 0x17, 0x6d,
+			    0x4c, 0x26, 0x84, 0x14, 0xb5, 0x68, 0x01, 0x85,
+			    0x25, 0x8e, 0x2a, 0x05, 0xe7, 0x3e, 0x9d, 0x03,
+			    0xee, 0x5a, 0x83, 0x0c, 0xcc, 0x09, 0x4c, 0x87 },
+		.klen   = 32,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input  = { 0xf1, 0xb2, 0x73, 0xcd, 0x65, 0xa3, 0xdf, 0x5f,
+			    0xe9, 0x5d, 0x48, 0x92, 0x54, 0x63, 0x4e, 0xb8 },
+		.ilen   = 16,
+		.result = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 2 */
+		.key    = { 0x59, 0x70, 0x47, 0x14, 0xf5, 0x57, 0x47, 0x8c,
+		  	    0xd7, 0x79, 0xe8, 0x0f, 0x54, 0x88, 0x79, 0x44,
+			    0x0d, 0x48, 0xf0, 0xb7, 0xb1, 0x5a, 0x53, 0xea,
+			    0x1c, 0xaa, 0x6b, 0x29, 0xc2, 0xca, 0xfb, 0xaf
+		},
+		.klen   = 32,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 },
+		.input  = { 0x00, 0xc8, 0x2b, 0xae, 0x95, 0xbb, 0xcd, 0xe5,
+			    0x27, 0x4f, 0x07, 0x69, 0xb2, 0x60, 0xe1, 0x36 },
+		.ilen   = 16,
+		.result  = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			     0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 3 */
+		.key    = { 0xd8, 0x2a, 0x91, 0x34, 0xb2, 0x6a, 0x56, 0x50,
+		  	    0x30, 0xfe, 0x69, 0xe2, 0x37, 0x7f, 0x98, 0x47,
+			    0xcd, 0xf9, 0x0b, 0x16, 0x0c, 0x64, 0x8f, 0xb6,
+			    0xb0, 0x0d, 0x0d, 0x1b, 0xae, 0x85, 0x87, 0x1f },
+		.klen   = 32,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00 },
+		.input  = { 0x76, 0x32, 0x21, 0x83, 0xed, 0x8f, 0xf1, 0x82,
+			    0xf9, 0x59, 0x62, 0x03, 0x69, 0x0e, 0x5e, 0x01 },
+		.ilen   = 16,
+		.result = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 4 */
+		.key    = { 0x0f, 0x6a, 0xef, 0xf8, 0xd3, 0xd2, 0xbb, 0x15,
+		  	    0x25, 0x83, 0xf7, 0x3c, 0x1f, 0x01, 0x28, 0x74,
+			    0xca, 0xc6, 0xbc, 0x35, 0x4d, 0x4a, 0x65, 0x54,
+			    0x90, 0xae, 0x61, 0xcf, 0x7b, 0xae, 0xbd, 0xcc,
+			    0xad, 0xe4, 0x94, 0xc5, 0x4a, 0x29, 0xae, 0x70 },
+		.klen   = 40,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input  = { 0x9c, 0x0f, 0x15, 0x2f, 0x55, 0xa2, 0xd8, 0xf0,
+			    0xd6, 0x7b, 0x8f, 0x9e, 0x28, 0x22, 0xbc, 0x41 },
+		.ilen   = 16,
+		.result = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 5 */
+		.key    = { 0x8a, 0xd4, 0xee, 0x10, 0x2f, 0xbd, 0x81, 0xff,
+		  	    0xf8, 0x86, 0xce, 0xac, 0x93, 0xc5, 0xad, 0xc6,
+			    0xa0, 0x19, 0x07, 0xc0, 0x9d, 0xf7, 0xbb, 0xdd,
+			    0x52, 0x13, 0xb2, 0xb7, 0xf0, 0xff, 0x11, 0xd8,
+			    0xd6, 0x08, 0xd0, 0xcd, 0x2e, 0xb1, 0x17, 0x6f },
+		.klen   = 40,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00 },
+		.input  = { 0xd4, 0x27, 0x6a, 0x7f, 0x14, 0x91, 0x3d, 0x65,
+			    0xc8, 0x60, 0x48, 0x02, 0x87, 0xe3, 0x34, 0x06 },
+		.ilen   = 16,
+		.result = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 6 */
+		.key    = { 0xf8, 0xd4, 0x76, 0xff, 0xd6, 0x46, 0xee, 0x6c,
+		  	    0x23, 0x84, 0xcb, 0x1c, 0x77, 0xd6, 0x19, 0x5d,
+			    0xfe, 0xf1, 0xa9, 0xf3, 0x7b, 0xbc, 0x8d, 0x21,
+			    0xa7, 0x9c, 0x21, 0xf8, 0xcb, 0x90, 0x02, 0x89,
+			    0xa8, 0x45, 0x34, 0x8e, 0xc8, 0xc5, 0xb5, 0xf1,
+			    0x26, 0xf5, 0x0e, 0x76, 0xfe, 0xfd, 0x1b, 0x1e },
+		.klen   = 48,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input  = { 0xbd, 0x06, 0xb8, 0xe1, 0xdb, 0x98, 0x89, 0x9e,
+			    0xc4, 0x98, 0xe4, 0x91, 0xcf, 0x1c, 0x70, 0x2b },
+		.ilen   = 16,
+		.result = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.rlen   = 16,
+	}, { /* LRW-32-AES 7 */
+		.key    = { 0xfb, 0x76, 0x15, 0xb2, 0x3d, 0x80, 0x89, 0x1d,
+		  	    0xd4, 0x70, 0x98, 0x0b, 0xc7, 0x95, 0x84, 0xc8,
+			    0xb2, 0xfb, 0x64, 0xce, 0x60, 0x97, 0x87, 0x8d,
+			    0x17, 0xfc, 0xe4, 0x5a, 0x49, 0xe8, 0x30, 0xb7,
+			    0x6e, 0x78, 0x17, 0xe7, 0x2d, 0x5e, 0x12, 0xd4,
+			    0x60, 0x64, 0x04, 0x7a, 0xf1, 0x2f, 0x9e, 0x0c },
+		.klen   = 48,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00 },
+		.input  = { 0x5b, 0x90, 0x8e, 0xc1, 0xab, 0xdd, 0x67, 0x5f,
+			    0x3d, 0x69, 0x8a, 0x95, 0x53, 0xc8, 0x9c, 0xe5 },
+		.ilen   = 16,
+		.result = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+			    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 },
+		.rlen   = 16,
+	}, {
+/* http://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html */
+		.key    = { 0xf8, 0xd4, 0x76, 0xff, 0xd6, 0x46, 0xee, 0x6c,
+			    0x23, 0x84, 0xcb, 0x1c, 0x77, 0xd6, 0x19, 0x5d,
+			    0xfe, 0xf1, 0xa9, 0xf3, 0x7b, 0xbc, 0x8d, 0x21,
+			    0xa7, 0x9c, 0x21, 0xf8, 0xcb, 0x90, 0x02, 0x89,
+			    0xa8, 0x45, 0x34, 0x8e, 0xc8, 0xc5, 0xb5, 0xf1,
+			    0x26, 0xf5, 0x0e, 0x76, 0xfe, 0xfd, 0x1b, 0x1e },
+		.klen   = 48,
+		.iv     = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+		.input	= { 0x1a, 0x1d, 0xa9, 0x30, 0xad, 0xf9, 0x2f, 0x9b,
+			    0xb6, 0x1d, 0xae, 0xef, 0xf0, 0x2f, 0xf8, 0x5a,
+			    0x39, 0x3c, 0xbf, 0x2a, 0xb2, 0x45, 0xb2, 0x23,
+			    0x1b, 0x63, 0x3c, 0xcf, 0xaa, 0xbe, 0xcf, 0x4e,
+			    0xfa, 0xe8, 0x29, 0xc2, 0x20, 0x68, 0x2b, 0x3c,
+			    0x2e, 0x8b, 0xf7, 0x6e, 0x25, 0xbd, 0xe3, 0x3d,
+			    0x66, 0x27, 0xd6, 0xaf, 0xd6, 0x64, 0x3e, 0xe3,
+			    0xe8, 0x58, 0x46, 0x97, 0x39, 0x51, 0x07, 0xde,
+			    0xcb, 0x37, 0xbc, 0xa9, 0xc0, 0x5f, 0x75, 0xc3,
+			    0x0e, 0x84, 0x23, 0x1d, 0x16, 0xd4, 0x1c, 0x59,
+			    0x9c, 0x1a, 0x02, 0x55, 0xab, 0x3a, 0x97, 0x1d,
+			    0xdf, 0xdd, 0xc7, 0x06, 0x51, 0xd7, 0x70, 0xae,
+			    0x23, 0xc6, 0x8c, 0xf5, 0x1e, 0xa0, 0xe5, 0x82,
+			    0xb8, 0xb2, 0xbf, 0x04, 0xa0, 0x32, 0x8e, 0x68,
+			    0xeb, 0xaf, 0x6e, 0x2d, 0x94, 0x22, 0x2f, 0xce,
+			    0x4c, 0xb5, 0x59, 0xe2, 0xa2, 0x2f, 0xa0, 0x98,
+			    0x1a, 0x97, 0xc6, 0xd4, 0xb5, 0x00, 0x59, 0xf2,
+			    0x84, 0x14, 0x72, 0xb1, 0x9a, 0x6e, 0xa3, 0x7f,
+			    0xea, 0x20, 0xe7, 0xcb, 0x65, 0x77, 0x3a, 0xdf,
+			    0xc8, 0x97, 0x67, 0x15, 0xc2, 0x2a, 0x27, 0xcc,
+			    0x18, 0x55, 0xa1, 0x24, 0x0b, 0x24, 0x24, 0xaf,
+			    0x5b, 0xec, 0x68, 0xb8, 0xc8, 0xf5, 0xba, 0x63,
+			    0xff, 0xed, 0x89, 0xce, 0xd5, 0x3d, 0x88, 0xf3,
+			    0x25, 0xef, 0x05, 0x7c, 0x3a, 0xef, 0xeb, 0xd8,
+			    0x7a, 0x32, 0x0d, 0xd1, 0x1e, 0x58, 0x59, 0x99,
+			    0x90, 0x25, 0xb5, 0x26, 0xb0, 0xe3, 0x2b, 0x6c,
+			    0x4c, 0xa9, 0x8b, 0x84, 0x4f, 0x5e, 0x01, 0x50,
+			    0x41, 0x30, 0x58, 0xc5, 0x62, 0x74, 0x52, 0x1d,
+			    0x45, 0x24, 0x6a, 0x42, 0x64, 0x4f, 0x97, 0x1c,
+			    0xa8, 0x66, 0xb5, 0x6d, 0x79, 0xd4, 0x0d, 0x48,
+			    0xc5, 0x5f, 0xf3, 0x90, 0x32, 0xdd, 0xdd, 0xe1,
+			    0xe4, 0xa9, 0x9f, 0xfc, 0xc3, 0x52, 0x5a, 0x46,
+			    0xe4, 0x81, 0x84, 0x95, 0x36, 0x59, 0x7a, 0x6b,
+			    0xaa, 0xb3, 0x60, 0xad, 0xce, 0x9f, 0x9f, 0x28,
+			    0xe0, 0x01, 0x75, 0x22, 0xc4, 0x4e, 0xa9, 0x62,
+			    0x5c, 0x62, 0x0d, 0x00, 0xcb, 0x13, 0xe8, 0x43,
+			    0x72, 0xd4, 0x2d, 0x53, 0x46, 0xb5, 0xd1, 0x16,
+			    0x22, 0x18, 0xdf, 0x34, 0x33, 0xf5, 0xd6, 0x1c,
+			    0xb8, 0x79, 0x78, 0x97, 0x94, 0xff, 0x72, 0x13,
+			    0x4c, 0x27, 0xfc, 0xcb, 0xbf, 0x01, 0x53, 0xa6,
+			    0xb4, 0x50, 0x6e, 0xde, 0xdf, 0xb5, 0x43, 0xa4,
+			    0x59, 0xdf, 0x52, 0xf9, 0x7c, 0xe0, 0x11, 0x6f,
+			    0x2d, 0x14, 0x8e, 0x24, 0x61, 0x2c, 0xe1, 0x17,
+			    0xcc, 0xce, 0x51, 0x0c, 0x19, 0x8a, 0x82, 0x30,
+			    0x94, 0xd5, 0x3d, 0x6a, 0x53, 0x06, 0x5e, 0xbd,
+			    0xb7, 0xeb, 0xfa, 0xfd, 0x27, 0x51, 0xde, 0x85,
+			    0x1e, 0x86, 0x53, 0x11, 0x53, 0x94, 0x00, 0xee,
+			    0x2b, 0x8c, 0x08, 0x2a, 0xbf, 0xdd, 0xae, 0x11,
+			    0xcb, 0x1e, 0xa2, 0x07, 0x9a, 0x80, 0xcf, 0x62,
+			    0x9b, 0x09, 0xdc, 0x95, 0x3c, 0x96, 0x8e, 0xb1,
+			    0x09, 0xbd, 0xe4, 0xeb, 0xdb, 0xca, 0x70, 0x7a,
+			    0x9e, 0xfa, 0x31, 0x18, 0x45, 0x3c, 0x21, 0x33,
+			    0xb0, 0xb3, 0x2b, 0xea, 0xf3, 0x71, 0x2d, 0xe1,
+			    0x03, 0xad, 0x1b, 0x48, 0xd4, 0x67, 0x27, 0xf0,
+			    0x62, 0xe4, 0x3d, 0xfb, 0x9b, 0x08, 0x76, 0xe7,
+			    0xdd, 0x2b, 0x01, 0x39, 0x04, 0x5a, 0x58, 0x7a,
+			    0xf7, 0x11, 0x90, 0xec, 0xbd, 0x51, 0x5c, 0x32,
+			    0x6b, 0xd7, 0x35, 0x39, 0x02, 0x6b, 0xf2, 0xa6,
+			    0xd0, 0x0d, 0x07, 0xe1, 0x06, 0xc4, 0x5b, 0x7d,
+			    0xe4, 0x6a, 0xd7, 0xee, 0x15, 0x1f, 0x83, 0xb4,
+			    0xa3, 0xa7, 0x5e, 0xc3, 0x90, 0xb7, 0xef, 0xd3,
+			    0xb7, 0x4f, 0xf8, 0x92, 0x4c, 0xb7, 0x3c, 0x29,
+			    0xcd, 0x7e, 0x2b, 0x5d, 0x43, 0xea, 0x42, 0xe7,
+			    0x74, 0x3f, 0x7d, 0x58, 0x88, 0x75, 0xde, 0x3e },
+		.ilen   = 512,
+		.result	= { 0x05, 0x11, 0xb7, 0x18, 0xab, 0xc6, 0x2d, 0xac,
+			    0x70, 0x5d, 0xf6, 0x22, 0x94, 0xcd, 0xe5, 0x6c,
+			    0x17, 0x6b, 0xf6, 0x1c, 0xf0, 0xf3, 0x6e, 0xf8,
+			    0x50, 0x38, 0x1f, 0x71, 0x49, 0xb6, 0x57, 0xd6,
+			    0x8f, 0xcb, 0x8d, 0x6b, 0xe3, 0xa6, 0x29, 0x90,
+			    0xfe, 0x2a, 0x62, 0x82, 0xae, 0x6d, 0x8b, 0xf6,
+			    0xad, 0x1e, 0x9e, 0x20, 0x5f, 0x38, 0xbe, 0x04,
+			    0xda, 0x10, 0x8e, 0xed, 0xa2, 0xa4, 0x87, 0xab,
+			    0xda, 0x6b, 0xb4, 0x0c, 0x75, 0xba, 0xd3, 0x7c,
+			    0xc9, 0xac, 0x42, 0x31, 0x95, 0x7c, 0xc9, 0x04,
+			    0xeb, 0xd5, 0x6e, 0x32, 0x69, 0x8a, 0xdb, 0xa6,
+			    0x15, 0xd7, 0x3f, 0x4f, 0x2f, 0x66, 0x69, 0x03,
+			    0x9c, 0x1f, 0x54, 0x0f, 0xde, 0x1f, 0xf3, 0x65,
+			    0x4c, 0x96, 0x12, 0xed, 0x7c, 0x92, 0x03, 0x01,
+			    0x6f, 0xbc, 0x35, 0x93, 0xac, 0xf1, 0x27, 0xf1,
+			    0xb4, 0x96, 0x82, 0x5a, 0x5f, 0xb0, 0xa0, 0x50,
+			    0x89, 0xa4, 0x8e, 0x66, 0x44, 0x85, 0xcc, 0xfd,
+			    0x33, 0x14, 0x70, 0xe3, 0x96, 0xb2, 0xc3, 0xd3,
+			    0xbb, 0x54, 0x5a, 0x1a, 0xf9, 0x74, 0xa2, 0xc5,
+			    0x2d, 0x64, 0x75, 0xdd, 0xb4, 0x54, 0xe6, 0x74,
+			    0x8c, 0xd3, 0x9d, 0x9e, 0x86, 0xab, 0x51, 0x53,
+			    0xb7, 0x93, 0x3e, 0x6f, 0xd0, 0x4e, 0x2c, 0x40,
+			    0xf6, 0xa8, 0x2e, 0x3e, 0x9d, 0xf4, 0x66, 0xa5,
+			    0x76, 0x12, 0x73, 0x44, 0x1a, 0x56, 0xd7, 0x72,
+			    0x88, 0xcd, 0x21, 0x8c, 0x4c, 0x0f, 0xfe, 0xda,
+			    0x95, 0xe0, 0x3a, 0xa6, 0xa5, 0x84, 0x46, 0xcd,
+			    0xd5, 0x3e, 0x9d, 0x3a, 0xe2, 0x67, 0xe6, 0x60,
+			    0x1a, 0xe2, 0x70, 0x85, 0x58, 0xc2, 0x1b, 0x09,
+			    0xe1, 0xd7, 0x2c, 0xca, 0xad, 0xa8, 0x8f, 0xf9,
+			    0xac, 0xb3, 0x0e, 0xdb, 0xca, 0x2e, 0xe2, 0xb8,
+			    0x51, 0x71, 0xd9, 0x3c, 0x6c, 0xf1, 0x56, 0xf8,
+			    0xea, 0x9c, 0xf1, 0xfb, 0x0c, 0xe6, 0xb7, 0x10,
+			    0x1c, 0xf8, 0xa9, 0x7c, 0xe8, 0x53, 0x35, 0xc1,
+			    0x90, 0x3e, 0x76, 0x4a, 0x74, 0xa4, 0x21, 0x2c,
+			    0xf6, 0x2c, 0x4e, 0x0f, 0x94, 0x3a, 0x88, 0x2e,
+			    0x41, 0x09, 0x6a, 0x33, 0x7d, 0xf6, 0xdd, 0x3f,
+			    0x8d, 0x23, 0x31, 0x74, 0x84, 0xeb, 0x88, 0x6e,
+			    0xcc, 0xb9, 0xbc, 0x22, 0x83, 0x19, 0x07, 0x22,
+			    0xa5, 0x2d, 0xdf, 0xa5, 0xf3, 0x80, 0x85, 0x78,
+			    0x84, 0x39, 0x6a, 0x6d, 0x6a, 0x99, 0x4f, 0xa5,
+			    0x15, 0xfe, 0x46, 0xb0, 0xe4, 0x6c, 0xa5, 0x41,
+			    0x3c, 0xce, 0x8f, 0x42, 0x60, 0x71, 0xa7, 0x75,
+			    0x08, 0x40, 0x65, 0x8a, 0x82, 0xbf, 0xf5, 0x43,
+			    0x71, 0x96, 0xa9, 0x4d, 0x44, 0x8a, 0x20, 0xbe,
+			    0xfa, 0x4d, 0xbb, 0xc0, 0x7d, 0x31, 0x96, 0x65,
+			    0xe7, 0x75, 0xe5, 0x3e, 0xfd, 0x92, 0x3b, 0xc9,
+			    0x55, 0xbb, 0x16, 0x7e, 0xf7, 0xc2, 0x8c, 0xa4,
+			    0x40, 0x1d, 0xe5, 0xef, 0x0e, 0xdf, 0xe4, 0x9a,
+			    0x62, 0x73, 0x65, 0xfd, 0x46, 0x63, 0x25, 0x3d,
+			    0x2b, 0xaf, 0xe5, 0x64, 0xfe, 0xa5, 0x5c, 0xcf,
+			    0x24, 0xf3, 0xb4, 0xac, 0x64, 0xba, 0xdf, 0x4b,
+			    0xc6, 0x96, 0x7d, 0x81, 0x2d, 0x8d, 0x97, 0xf7,
+			    0xc5, 0x68, 0x77, 0x84, 0x32, 0x2b, 0xcc, 0x85,
+			    0x74, 0x96, 0xf0, 0x12, 0x77, 0x61, 0xb9, 0xeb,
+			    0x71, 0xaa, 0x82, 0xcb, 0x1c, 0xdb, 0x89, 0xc8,
+			    0xc6, 0xb5, 0xe3, 0x5c, 0x7d, 0x39, 0x07, 0x24,
+			    0xda, 0x39, 0x87, 0x45, 0xc0, 0x2b, 0xbb, 0x01,
+			    0xac, 0xbc, 0x2a, 0x5c, 0x7f, 0xfc, 0xe8, 0xce,
+			    0x6d, 0x9c, 0x6f, 0xed, 0xd3, 0xc1, 0xa1, 0xd6,
+			    0xc5, 0x55, 0xa9, 0x66, 0x2f, 0xe1, 0xc8, 0x32,
+			    0xa6, 0x5d, 0xa4, 0x3a, 0x98, 0x73, 0xe8, 0x45,
+			    0xa4, 0xc7, 0xa8, 0xb4, 0xf6, 0x13, 0x03, 0xf6,
+			    0xe9, 0x2e, 0xc4, 0x29, 0x0f, 0x84, 0xdb, 0xc4,
+			    0x21, 0xc4, 0xc2, 0x75, 0x67, 0x89, 0x37, 0x0a },
+		.rlen   = 512,
+	}
+};
+
 /* Cast5 test vectors from RFC 2144 */
 #define CAST5_ENC_TEST_VECTORS	3
 #define CAST5_DEC_TEST_VECTORS	3
@@ -3152,6 +3657,27 @@ static struct cipher_speed aes_speed_template[] = {
 	{  .klen = 0, .blen = 0, }
 };
 
+static struct cipher_speed aes_lrw_speed_template[] = {
+	{ .klen = 32, .blen = 16, },
+	{ .klen = 32, .blen = 64, },
+	{ .klen = 32, .blen = 256, },
+	{ .klen = 32, .blen = 1024, },
+	{ .klen = 32, .blen = 8192, },
+	{ .klen = 40, .blen = 16, },
+	{ .klen = 40, .blen = 64, },
+	{ .klen = 40, .blen = 256, },
+	{ .klen = 40, .blen = 1024, },
+	{ .klen = 40, .blen = 8192, },
+	{ .klen = 48, .blen = 16, },
+	{ .klen = 48, .blen = 64, },
+	{ .klen = 48, .blen = 256, },
+	{ .klen = 48, .blen = 1024, },
+	{ .klen = 48, .blen = 8192, },
+
+	/* End marker */
+	{  .klen = 0, .blen = 0, }
+};
+
 static struct cipher_speed des3_ede_speed_template[] = {
 	{ .klen = 24, .blen = 16, },
 	{ .klen = 24, .blen = 64, },
-- 
cgit v0.10.2


From 8df3b0a219967080d9dc4b604b5fecacb6967af0 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 2 Dec 2006 14:36:03 +1100
Subject: [CRYPTO] cryptoloop: Select CRYPTO_CBC

As CBC is the default chaining method for cryptoloop, we should select
it from cryptoloop to ease the transition.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 17dc222..e00568e 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -305,6 +305,7 @@ config BLK_DEV_LOOP
 config BLK_DEV_CRYPTOLOOP
 	tristate "Cryptoloop Support"
 	select CRYPTO
+	select CRYPTO_CBC
 	depends on BLK_DEV_LOOP
 	---help---
 	  Say Y here if you want to be able to use the ciphers that are 
-- 
cgit v0.10.2


From 9ebed9d182e03d12d39915b72e4b960046bc4039 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 4 Dec 2006 20:20:05 -0800
Subject: [CRYPTO] lrw: round --> lrw_round
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes:

crypto/lrw.c:99: warning: conflicting types for built-in function ‘round’

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 5d04315..5664258 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -95,7 +95,7 @@ static inline void inc(be128 *iv)
 		iv->a = cpu_to_be64(be64_to_cpu(iv->a) + 1);
 }
 
-static inline void round(struct sinfo *s, void *dst, const void *src)
+static inline void lrw_round(struct sinfo *s, void *dst, const void *src)
 {
 	be128_xor(dst, &s->t, src);		/* PP <- T xor P */
 	s->fn(s->tfm, dst, dst);		/* CC <- E(Key2,PP) */
@@ -160,7 +160,7 @@ static int crypt(struct blkcipher_desc *d,
 			inc(iv);
 
 first:
-			round(&s, wdst, wsrc);
+			lrw_round(&s, wdst, wsrc);
 
 			wsrc += bs;
 			wdst += bs;
-- 
cgit v0.10.2


From 79066ad32be5bb2edf16733aec36acf2af03fc99 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 5 Dec 2006 13:41:52 -0800
Subject: [CRYPTO] dm-crypt: Make iv_gen_private a union

Rather than stuffing integers into pointers with casts, let's use
a union.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index facf859..0d86865 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -86,7 +86,10 @@ struct crypt_config {
 	 */
 	struct crypt_iv_operations *iv_gen_ops;
 	char *iv_mode;
-	struct crypto_cipher *iv_gen_private;
+	union {
+		struct crypto_cipher *essiv_tfm;
+		int benbi_shift;
+	} iv_gen_private;
 	sector_t iv_offset;
 	unsigned int iv_size;
 
@@ -195,21 +198,21 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	}
 	kfree(salt);
 
-	cc->iv_gen_private = essiv_tfm;
+	cc->iv_gen_private.essiv_tfm = essiv_tfm;
 	return 0;
 }
 
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
-	crypto_free_cipher(cc->iv_gen_private);
-	cc->iv_gen_private = NULL;
+	crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
+	cc->iv_gen_private.essiv_tfm = NULL;
 }
 
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
 	memset(iv, 0, cc->iv_size);
 	*(u64 *)iv = cpu_to_le64(sector);
-	crypto_cipher_encrypt_one(cc->iv_gen_private, iv, iv);
+	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
 	return 0;
 }
 
@@ -232,21 +235,23 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	cc->iv_gen_private = (void *)(9 - log);
+	cc->iv_gen_private.benbi_shift = 9 - log;
 
 	return 0;
 }
 
 static void crypt_iv_benbi_dtr(struct crypt_config *cc)
 {
-	cc->iv_gen_private = NULL;
 }
 
 static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
+	__be64 val;
+
 	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
-	put_unaligned(cpu_to_be64(((u64)sector << (u32)cc->iv_gen_private) + 1),
-		      (__be64 *)(iv + cc->iv_size - sizeof(u64)));
+
+	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
+	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
 
 	return 0;
 }
-- 
cgit v0.10.2


From 74c9c0c17dea729d6089c0c82762babd02e65f84 Mon Sep 17 00:00:00 2001
From: Dmitry Mishin <dim@openvz.org>
Date: Tue, 5 Dec 2006 13:43:50 -0800
Subject: [NETFILTER]: Fix {ip,ip6,arp}_tables hook validation

Commit 590bdf7fd2292b47c428111cb1360e312eff207e introduced a regression
in match/target hook validation. mark_source_chains builds a bitmask
for each rule representing the hooks it can be reached from, which is
then used by the matches and targets to make sure they are only called
from valid hooks. The patch moved the match/target specific validation
before the mark_source_chains call, at which point the mask is always zero.

This patch returns back to the old order and moves the standard checks
to mark_source_chains. This allows to get rid of a special case for
standard targets as a nice side-effect.

Signed-off-by: Dmitry Mishin <dim@openvz.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 413c2d0a..71b76ad 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -375,6 +375,13 @@ static int mark_source_chains(struct xt_table_info *newinfo,
 			    && unconditional(&e->arp)) {
 				unsigned int oldpos, size;
 
+				if (t->verdict < -NF_MAX_VERDICT - 1) {
+					duprintf("mark_source_chains: bad "
+						"negative verdict (%i)\n",
+								t->verdict);
+					return 0;
+				}
+
 				/* Return: backtrack through the last
 				 * big jump.
 				 */
@@ -404,6 +411,14 @@ static int mark_source_chains(struct xt_table_info *newinfo,
 				if (strcmp(t->target.u.user.name,
 					   ARPT_STANDARD_TARGET) == 0
 				    && newpos >= 0) {
+					if (newpos > newinfo->size -
+						sizeof(struct arpt_entry)) {
+						duprintf("mark_source_chains: "
+							"bad verdict (%i)\n",
+								newpos);
+						return 0;
+					}
+
 					/* This a jump; chase it. */
 					duprintf("Jump rule %u -> %u\n",
 						 pos, newpos);
@@ -426,8 +441,6 @@ static int mark_source_chains(struct xt_table_info *newinfo,
 static inline int standard_check(const struct arpt_entry_target *t,
 				 unsigned int max_offset)
 {
-	struct arpt_standard_target *targ = (void *)t;
-
 	/* Check standard info. */
 	if (t->u.target_size
 	    != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
@@ -437,18 +450,6 @@ static inline int standard_check(const struct arpt_entry_target *t,
 		return 0;
 	}
 
-	if (targ->verdict >= 0
-	    && targ->verdict > max_offset - sizeof(struct arpt_entry)) {
-		duprintf("arpt_standard_check: bad verdict (%i)\n",
-			 targ->verdict);
-		return 0;
-	}
-
-	if (targ->verdict < -NF_MAX_VERDICT - 1) {
-		duprintf("arpt_standard_check: bad negative verdict (%i)\n",
-			 targ->verdict);
-		return 0;
-	}
 	return 1;
 }
 
@@ -627,18 +628,20 @@ static int translate_table(const char *name,
 		}
 	}
 
+	if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
+		duprintf("Looping hook\n");
+		return -ELOOP;
+	}
+
 	/* Finally, each sanity check must pass */
 	i = 0;
 	ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
 				 check_entry, name, size, &i);
 
-	if (ret != 0)
-		goto cleanup;
-
-	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
-		duprintf("Looping hook\n");
-		goto cleanup;
+	if (ret != 0) {
+		ARPT_ENTRY_ITERATE(entry0, newinfo->size,
+				cleanup_entry, &i);
+		return ret;
 	}
 
 	/* And one copy for every other CPU */
@@ -647,9 +650,6 @@ static int translate_table(const char *name,
 			memcpy(newinfo->entries[i], entry0, newinfo->size);
 	}
 
-	return 0;
-cleanup:
-	ARPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a45543..2bddf84 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -401,6 +401,13 @@ mark_source_chains(struct xt_table_info *newinfo,
 			    && unconditional(&e->ip)) {
 				unsigned int oldpos, size;
 
+				if (t->verdict < -NF_MAX_VERDICT - 1) {
+					duprintf("mark_source_chains: bad "
+						"negative verdict (%i)\n",
+								t->verdict);
+					return 0;
+				}
+
 				/* Return: backtrack through the last
 				   big jump. */
 				do {
@@ -438,6 +445,13 @@ mark_source_chains(struct xt_table_info *newinfo,
 				if (strcmp(t->target.u.user.name,
 					   IPT_STANDARD_TARGET) == 0
 				    && newpos >= 0) {
+					if (newpos > newinfo->size -
+						sizeof(struct ipt_entry)) {
+						duprintf("mark_source_chains: "
+							"bad verdict (%i)\n",
+								newpos);
+						return 0;
+					}
 					/* This a jump; chase it. */
 					duprintf("Jump rule %u -> %u\n",
 						 pos, newpos);
@@ -470,27 +484,6 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i)
 }
 
 static inline int
-standard_check(const struct ipt_entry_target *t,
-	       unsigned int max_offset)
-{
-	struct ipt_standard_target *targ = (void *)t;
-
-	/* Check standard info. */
-	if (targ->verdict >= 0
-	    && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
-		duprintf("ipt_standard_check: bad verdict (%i)\n",
-			 targ->verdict);
-		return 0;
-	}
-	if (targ->verdict < -NF_MAX_VERDICT - 1) {
-		duprintf("ipt_standard_check: bad negative verdict (%i)\n",
-			 targ->verdict);
-		return 0;
-	}
-	return 1;
-}
-
-static inline int
 check_match(struct ipt_entry_match *m,
 	    const char *name,
 	    const struct ipt_ip *ip,
@@ -576,12 +569,7 @@ check_entry(struct ipt_entry *e, const char *name, unsigned int size,
 	if (ret)
 		goto err;
 
-	if (t->u.kernel.target == &ipt_standard_target) {
-		if (!standard_check(t, size)) {
-			ret = -EINVAL;
-			goto err;
-		}
-	} else if (t->u.kernel.target->checkentry
+	if (t->u.kernel.target->checkentry
 		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
 						      e->comefrom)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -718,17 +706,19 @@ translate_table(const char *name,
 		}
 	}
 
+	if (!mark_source_chains(newinfo, valid_hooks, entry0))
+		return -ELOOP;
+
 	/* Finally, each sanity check must pass */
 	i = 0;
 	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry, name, size, &i);
 
-	if (ret != 0)
-		goto cleanup;
-
-	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, valid_hooks, entry0))
-		goto cleanup;
+	if (ret != 0) {
+		IPT_ENTRY_ITERATE(entry0, newinfo->size,
+				cleanup_entry, &i);
+		return ret;
+	}
 
 	/* And one copy for every other CPU */
 	for_each_possible_cpu(i) {
@@ -736,9 +726,6 @@ translate_table(const char *name,
 			memcpy(newinfo->entries[i], entry0, newinfo->size);
 	}
 
-	return 0;
-cleanup:
-	IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i);
 	return ret;
 }
 
@@ -1591,18 +1578,13 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
 	if (ret)
 		goto err;
 
-	ret = -EINVAL;
-	if (t->u.kernel.target == &ipt_standard_target) {
-		if (!standard_check(t, *size))
-			goto err;
-	} else if (t->u.kernel.target->checkentry
+	if (t->u.kernel.target->checkentry
 		   && !t->u.kernel.target->checkentry(name, de, target,
 						      t->data, de->comefrom)) {
 		duprintf("ip_tables: compat: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
-		goto err;
+		ret = -EINVAL;
 	}
-	ret = 0;
 err:
 	return ret;
 }
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index f63fb86..4eec4b3 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -440,6 +440,13 @@ mark_source_chains(struct xt_table_info *newinfo,
 			    && unconditional(&e->ipv6)) {
 				unsigned int oldpos, size;
 
+				if (t->verdict < -NF_MAX_VERDICT - 1) {
+					duprintf("mark_source_chains: bad "
+						"negative verdict (%i)\n",
+								t->verdict);
+					return 0;
+				}
+
 				/* Return: backtrack through the last
 				   big jump. */
 				do {
@@ -477,6 +484,13 @@ mark_source_chains(struct xt_table_info *newinfo,
 				if (strcmp(t->target.u.user.name,
 					   IP6T_STANDARD_TARGET) == 0
 				    && newpos >= 0) {
+					if (newpos > newinfo->size -
+						sizeof(struct ip6t_entry)) {
+						duprintf("mark_source_chains: "
+							"bad verdict (%i)\n",
+								newpos);
+						return 0;
+					}
 					/* This a jump; chase it. */
 					duprintf("Jump rule %u -> %u\n",
 						 pos, newpos);
@@ -509,27 +523,6 @@ cleanup_match(struct ip6t_entry_match *m, unsigned int *i)
 }
 
 static inline int
-standard_check(const struct ip6t_entry_target *t,
-	       unsigned int max_offset)
-{
-	struct ip6t_standard_target *targ = (void *)t;
-
-	/* Check standard info. */
-	if (targ->verdict >= 0
-	    && targ->verdict > max_offset - sizeof(struct ip6t_entry)) {
-		duprintf("ip6t_standard_check: bad verdict (%i)\n",
-			 targ->verdict);
-		return 0;
-	}
-	if (targ->verdict < -NF_MAX_VERDICT - 1) {
-		duprintf("ip6t_standard_check: bad negative verdict (%i)\n",
-			 targ->verdict);
-		return 0;
-	}
-	return 1;
-}
-
-static inline int
 check_match(struct ip6t_entry_match *m,
 	    const char *name,
 	    const struct ip6t_ip6 *ipv6,
@@ -616,12 +609,7 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
 	if (ret)
 		goto err;
 
-	if (t->u.kernel.target == &ip6t_standard_target) {
-		if (!standard_check(t, size)) {
-			ret = -EINVAL;
-			goto err;
-		}
-	} else if (t->u.kernel.target->checkentry
+	if (t->u.kernel.target->checkentry
 		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
 						      e->comefrom)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -758,17 +746,19 @@ translate_table(const char *name,
 		}
 	}
 
+	if (!mark_source_chains(newinfo, valid_hooks, entry0))
+		return -ELOOP;
+
 	/* Finally, each sanity check must pass */
 	i = 0;
 	ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry, name, size, &i);
 
-	if (ret != 0)
-		goto cleanup;
-
-	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, valid_hooks, entry0))
-		goto cleanup;
+	if (ret != 0) {
+		IP6T_ENTRY_ITERATE(entry0, newinfo->size,
+				   cleanup_entry, &i);
+		return ret;
+	}
 
 	/* And one copy for every other CPU */
 	for_each_possible_cpu(i) {
@@ -777,9 +767,6 @@ translate_table(const char *name,
 	}
 
 	return 0;
-cleanup:
-	IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i);
-	return ret;
 }
 
 /* Gets counters. */
-- 
cgit v0.10.2


From f6677f4312ee74f8ca68c4cc4060465607b72b41 Mon Sep 17 00:00:00 2001
From: Dmitry Mishin <dim@openvz.org>
Date: Tue, 5 Dec 2006 13:44:07 -0800
Subject: [NETFILTER]: Fix iptables compat hook validation

In compat mode, matches and targets valid hooks checks always successful due
to not initialized e->comefrom field yet. This patch separates this checks from
translation code and moves them after mark_source_chains() call, where these
marks are initialized.

Signed-off-by: Dmitry Mishin <dim@openvz.org>
Signed-off-by; Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 2bddf84..0ff2956 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1516,25 +1516,8 @@ static inline int compat_copy_match_from_user(struct ipt_entry_match *m,
 	void **dstptr, compat_uint_t *size, const char *name,
 	const struct ipt_ip *ip, unsigned int hookmask)
 {
-	struct ipt_entry_match *dm;
-	struct ipt_match *match;
-	int ret;
-
-	dm = (struct ipt_entry_match *)*dstptr;
-	match = m->u.kernel.match;
 	xt_compat_match_from_user(m, dstptr, size);
-
-	ret = xt_check_match(match, AF_INET, dm->u.match_size - sizeof(*dm),
-			     name, hookmask, ip->proto,
-			     ip->invflags & IPT_INV_PROTO);
-	if (!ret && m->u.kernel.match->checkentry
-	    && !m->u.kernel.match->checkentry(name, ip, match, dm->data,
-					      hookmask)) {
-		duprintf("ip_tables: check failed for `%s'.\n",
-			 m->u.kernel.match->name);
-		ret = -EINVAL;
-	}
-	return ret;
+	return 0;
 }
 
 static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
@@ -1556,7 +1539,7 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
 	ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size,
 			name, &de->ip, de->comefrom);
 	if (ret)
-		goto err;
+		return ret;
 	de->target_offset = e->target_offset - (origsize - *size);
 	t = ipt_get_target(e);
 	target = t->u.kernel.target;
@@ -1569,26 +1552,62 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
 		if ((unsigned char *)de - base < newinfo->underflow[h])
 			newinfo->underflow[h] -= origsize - *size;
 	}
+	return ret;
+}
+
+static inline int compat_check_match(struct ipt_entry_match *m, const char *name,
+				const struct ipt_ip *ip, unsigned int hookmask)
+{
+	struct ipt_match *match;
+	int ret;
+
+	match = m->u.kernel.match;
+	ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m),
+			     name, hookmask, ip->proto,
+			     ip->invflags & IPT_INV_PROTO);
+	if (!ret && m->u.kernel.match->checkentry
+	    && !m->u.kernel.match->checkentry(name, ip, match, m->data,
+					      hookmask)) {
+		duprintf("ip_tables: compat: check failed for `%s'.\n",
+			 m->u.kernel.match->name);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static inline int compat_check_target(struct ipt_entry *e, const char *name)
+{
+ 	struct ipt_entry_target *t;
+ 	struct ipt_target *target;
+ 	int ret;
 
-	t = ipt_get_target(de);
+	t = ipt_get_target(e);
 	target = t->u.kernel.target;
 	ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
 			      name, e->comefrom, e->ip.proto,
 			      e->ip.invflags & IPT_INV_PROTO);
-	if (ret)
-		goto err;
-
-	if (t->u.kernel.target->checkentry
-		   && !t->u.kernel.target->checkentry(name, de, target,
-						      t->data, de->comefrom)) {
+	if (!ret && t->u.kernel.target->checkentry
+		   && !t->u.kernel.target->checkentry(name, e, target,
+						      t->data, e->comefrom)) {
 		duprintf("ip_tables: compat: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
 		ret = -EINVAL;
 	}
-err:
 	return ret;
 }
 
+static inline int compat_check_entry(struct ipt_entry *e, const char *name)
+{
+	int ret;
+
+	ret = IPT_MATCH_ITERATE(e, compat_check_match, name, &e->ip,
+								e->comefrom);
+	if (ret)
+		return ret;
+
+	return compat_check_target(e, name);
+}
+
 static int
 translate_compat_table(const char *name,
 		unsigned int valid_hooks,
@@ -1677,6 +1696,11 @@ translate_compat_table(const char *name,
 	if (!mark_source_chains(newinfo, valid_hooks, entry1))
 		goto free_newinfo;
 
+	ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry,
+									name);
+	if (ret)
+		goto free_newinfo;
+
 	/* And one copy for every other CPU */
 	for_each_possible_cpu(i)
 		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-- 
cgit v0.10.2


From 9ee0779e994c6916863045297b831212e285da3b Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Tue, 5 Dec 2006 13:44:31 -0800
Subject: [NETFILTER]: nf_conntrack: fix warning in PPTP helper

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/netfilter/nf_conntrack_pptp.h b/include/linux/netfilter/nf_conntrack_pptp.h
index fb049ec..9d8144a 100644
--- a/include/linux/netfilter/nf_conntrack_pptp.h
+++ b/include/linux/netfilter/nf_conntrack_pptp.h
@@ -2,6 +2,8 @@
 #ifndef _NF_CONNTRACK_PPTP_H
 #define _NF_CONNTRACK_PPTP_H
 
+#include <linux/netfilter/nf_conntrack_common.h>
+
 /* state of the control session */
 enum pptp_ctrlsess_state {
 	PPTP_SESSION_NONE,			/* no session present */
@@ -295,7 +297,6 @@ union pptp_ctrl_union {
 /* crap needed for nf_conntrack_compat.h */
 struct nf_conn;
 struct nf_conntrack_expect;
-enum ip_conntrack_info;
 
 extern int
 (*nf_nat_pptp_hook_outbound)(struct sk_buff **pskb,
-- 
cgit v0.10.2


From ece006416d4fb472f4d2114feede5665cff971b2 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Tue, 5 Dec 2006 13:44:57 -0800
Subject: [NETFILTER]: nf_conntrack: Don't try to find clashed expectation

The original code continues loop to find expectation in list if the master
conntrack of the found expectation is unconfirmed. But it never success
in that case, because nf_conntrack_expect_related() never insert
clashed expectation to the list.

This stops loop in that case.

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 588d379..7df8f9a 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -91,25 +91,28 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_find_get);
 struct nf_conntrack_expect *
 find_expectation(const struct nf_conntrack_tuple *tuple)
 {
-	struct nf_conntrack_expect *i;
+	struct nf_conntrack_expect *exp;
+
+	exp = __nf_conntrack_expect_find(tuple);
+	if (!exp)
+		return NULL;
 
-	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
 	/* If master is not in hash table yet (ie. packet hasn't left
 	   this machine yet), how can other end know about expected?
 	   Hence these are not the droids you are looking for (if
 	   master ct never got confirmed, we'd hold a reference to it
 	   and weird things would happen to future packets). */
-		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
-		    && nf_ct_is_confirmed(i->master)) {
-			if (i->flags & NF_CT_EXPECT_PERMANENT) {
-				atomic_inc(&i->use);
-				return i;
-			} else if (del_timer(&i->timeout)) {
-				nf_ct_unlink_expect(i);
-				return i;
-			}
-		}
+	if (!nf_ct_is_confirmed(exp->master))
+		return NULL;
+
+	if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+		atomic_inc(&exp->use);
+		return exp;
+	} else if (del_timer(&exp->timeout)) {
+		nf_ct_unlink_expect(exp);
+		return exp;
 	}
+
 	return NULL;
 }
 
-- 
cgit v0.10.2


From f216f082b2b37c4943f1e7c393e2786648d48f6f Mon Sep 17 00:00:00 2001
From: Bart De Schuymer <bdschuym@pandora.be>
Date: Tue, 5 Dec 2006 13:45:21 -0800
Subject: [NETFILTER]: bridge netfilter: deal with martians correctly

The attached patch resolves an issue where a IP DNATed packet with a
martian source is forwarded while it's better to drop it. It also
resolves messages complaining about ip forwarding being disabled while
it's actually enabled. Thanks to lepton <ytht.net@gmail.com> for
reporting this problem.

This is probably a candidate for the -stable release.

Signed-off-by: Bart De Schuymer <bdschuym@pandora.be>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index ac47ba2..bd221ad 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -34,6 +34,7 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_arp.h>
 #include <linux/in_route.h>
+#include <linux/inetdevice.h>
 
 #include <net/ip.h>
 #include <net/ipv6.h>
@@ -221,10 +222,14 @@ static void __br_dnat_complain(void)
  *
  * Otherwise, the packet is considered to be routed and we just
  * change the destination MAC address so that the packet will
- * later be passed up to the IP stack to be routed.
+ * later be passed up to the IP stack to be routed. For a redirected
+ * packet, ip_route_input() will give back the localhost as output device,
+ * which differs from the bridge device.
  *
  * Let us now consider the case that ip_route_input() fails:
  *
+ * This can be because the destination address is martian, in which case
+ * the packet will be dropped.
  * After a "echo '0' > /proc/sys/net/ipv4/ip_forward" ip_route_input()
  * will fail, while __ip_route_output_key() will return success. The source
  * address for __ip_route_output_key() is set to zero, so __ip_route_output_key
@@ -237,7 +242,8 @@ static void __br_dnat_complain(void)
  *
  * --Lennert, 20020411
  * --Bart, 20020416 (updated)
- * --Bart, 20021007 (updated) */
+ * --Bart, 20021007 (updated)
+ * --Bart, 20062711 (updated) */
 static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
 {
 	if (skb->pkt_type == PACKET_OTHERHOST) {
@@ -264,15 +270,15 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct iphdr *iph = skb->nh.iph;
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	int err;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
 	}
 	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
-
 	if (dnat_took_place(skb)) {
-		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) {
+		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
 			struct rtable *rt;
 			struct flowi fl = {
 				.nl_u = {
@@ -283,19 +289,33 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
 				},
 				.proto = 0,
 			};
+			struct in_device *in_dev = in_dev_get(dev);
+
+			/* If err equals -EHOSTUNREACH the error is due to a
+			 * martian destination or due to the fact that
+			 * forwarding is disabled. For most martian packets,
+			 * ip_route_output_key() will fail. It won't fail for 2 types of
+			 * martian destinations: loopback destinations and destination
+			 * 0.0.0.0. In both cases the packet will be dropped because the
+			 * destination is the loopback device and not the bridge. */
+			if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
+				goto free_skb;
 
 			if (!ip_route_output_key(&rt, &fl)) {
 				/* - Bridged-and-DNAT'ed traffic doesn't
-				 *   require ip_forwarding.
-				 * - Deal with redirected traffic. */
-				if (((struct dst_entry *)rt)->dev == dev ||
-				    rt->rt_type == RTN_LOCAL) {
+				 *   require ip_forwarding. */
+				if (((struct dst_entry *)rt)->dev == dev) {
 					skb->dst = (struct dst_entry *)rt;
 					goto bridged_dnat;
 				}
+				/* we are sure that forwarding is disabled, so printing
+				 * this message is no problem. Note that the packet could
+				 * still have a martian destination address, in which case
+				 * the packet could be dropped even if forwarding were enabled */
 				__br_dnat_complain();
 				dst_release((struct dst_entry *)rt);
 			}
+free_skb:
 			kfree_skb(skb);
 			return 0;
 		} else {
-- 
cgit v0.10.2


From 5c804bfdcca2593422dd6edc2d7db4dba645543c Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 5 Dec 2006 13:46:13 -0800
Subject: [NET_SCHED]: cls_fw: fix NULL pointer dereference

When the first fw classifier is initialized, there is a small window
between the ->init() and ->change() calls, during which the classifier
is active but not entirely set up and tp->root is still NULL (->init()
does nothing).

When a packet is queued during this window a NULL pointer dereference
occurs in fw_classify() when trying to dereference head->mask;

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index f59a2c4..c797d6a 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -101,9 +101,10 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
 	struct fw_head *head = (struct fw_head*)tp->root;
 	struct fw_filter *f;
 	int r;
-	u32 id = skb->mark & head->mask;
+	u32 id = skb->mark;
 
 	if (head != NULL) {
+		id &= head->mask;
 		for (f=head->ht[fw_hash(id)]; f; f=f->next) {
 			if (f->id == id) {
 				*res = f->res;
-- 
cgit v0.10.2


From 9a217a1c7e7f36ec4996314d64267dd711dbd9bf Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 5 Dec 2006 13:47:21 -0800
Subject: [IPV6]: Repair IPv6 Fragments

The commit "[IPV6]: Use kmemdup" (commit-id:
af879cc704372ef762584e916129d19ffb39e844) broke IPv6 fragments.

Bug was spotted by Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e05ecbb..e9212c7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -624,13 +624,13 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		skb_shinfo(skb)->frag_list = NULL;
 		/* BUILD HEADER */
 
+		*prevhdr = NEXTHDR_FRAGMENT;
 		tmp_hdr = kmemdup(skb->nh.raw, hlen, GFP_ATOMIC);
 		if (!tmp_hdr) {
 			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 			return -ENOMEM;
 		}
 
-		*prevhdr = NEXTHDR_FRAGMENT;
 		__skb_pull(skb, hlen);
 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 		skb->nh.raw = __skb_push(skb, hlen);
-- 
cgit v0.10.2


From 4e33fa14faecc150e97c0e4f2320745bdc7b7112 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Tue, 5 Dec 2006 13:48:27 -0800
Subject: [IPV6] RAW: Don't release unlocked sock.

When user builds IPv6 header and send it through raw socket, kernel
tries to release unlocked sock. (Kernel log shows
"BUG: bad unlock balance detected" with enabled debug option.)

The lock is held only for non-hdrincl sock in this function
then this patch fix to do nothing about lock for hdrincl one.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index c2e629d..4ae1b19a 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -854,7 +854,8 @@ back_from_confirm:
 	}
 done:
 	dst_release(dst);
-	release_sock(sk);
+	if (!inet->hdrincl)
+		release_sock(sk);
 out:	
 	fl6_sock_release(flowlabel);
 	return err<0?err:len;
-- 
cgit v0.10.2


From b9e3dd0ea167df1d7c118170551e08e8e28f27b6 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Tue, 5 Dec 2006 13:49:56 -0800
Subject: [NETLIK]: Add a pointer to the Generic Netlink wiki page.

Add a pointer to the OSDL wiki page on Generic Netlink.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index b1181ce..e06b6e3 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -58,6 +58,8 @@ fore200e.txt
 	- FORE Systems PCA-200E/SBA-200E ATM NIC driver info.
 framerelay.txt
 	- info on using Frame Relay/Data Link Connection Identifier (DLCI).
+generic_netlink.txt
+	- info on Generic Netlink
 ip-sysctl.txt
 	- /proc/sys/net/ipv4/* variables
 ip_dynaddr.txt
diff --git a/Documentation/networking/generic_netlink.txt b/Documentation/networking/generic_netlink.txt
new file mode 100644
index 0000000..d4f8b8b
--- /dev/null
+++ b/Documentation/networking/generic_netlink.txt
@@ -0,0 +1,3 @@
+A wiki document on how to use Generic Netlink can be found here:
+
+ * http://linux-net.osdl.org/index.php/Generic_Netlink_HOWTO
-- 
cgit v0.10.2


From 48d4ed7a86d70a7e381cc8e48a97312182093ce2 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Wed, 6 Dec 2006 20:06:25 -0800
Subject: [GENETLINK]: Fix misplaced command flags.

The command flags for dump and do were swapped..

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index b5df749..548e4e6 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -144,9 +144,9 @@ int genl_register_ops(struct genl_family *family, struct genl_ops *ops)
 	}
 
 	if (ops->dumpit)
-		ops->flags |= GENL_CMD_CAP_DO;
-	if (ops->doit)
 		ops->flags |= GENL_CMD_CAP_DUMP;
+	if (ops->doit)
+		ops->flags |= GENL_CMD_CAP_DO;
 	if (ops->policy)
 		ops->flags |= GENL_CMD_CAP_HASPOL;
 
-- 
cgit v0.10.2


From b259e7d250e15d45b3c8362917931aaff1c88d73 Mon Sep 17 00:00:00 2001
From: Paul Sokolovsky <pmiscml@gmail.com>
Date: Wed, 6 Dec 2006 20:07:59 -0800
Subject: [IrDA]: PXA FIR code device model conversion

pxaficp_ir.c was not converted to the device model framework.

Signed-off-by: Paul Sokolovsky <pmiscml@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Samuel Ortiz <samuel@sortiz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/irda/pxaficp_ir.c b/drivers/net/irda/pxaficp_ir.c
index f9a1c88..9137e23 100644
--- a/drivers/net/irda/pxaficp_ir.c
+++ b/drivers/net/irda/pxaficp_ir.c
@@ -704,9 +704,9 @@ static int pxa_irda_stop(struct net_device *dev)
 	return 0;
 }
 
-static int pxa_irda_suspend(struct device *_dev, pm_message_t state)
+static int pxa_irda_suspend(struct platform_device *_dev, pm_message_t state)
 {
-	struct net_device *dev = dev_get_drvdata(_dev);
+	struct net_device *dev = platform_get_drvdata(_dev);
 	struct pxa_irda *si;
 
 	if (dev && netif_running(dev)) {
@@ -718,9 +718,9 @@ static int pxa_irda_suspend(struct device *_dev, pm_message_t state)
 	return 0;
 }
 
-static int pxa_irda_resume(struct device *_dev)
+static int pxa_irda_resume(struct platform_device *_dev)
 {
-	struct net_device *dev = dev_get_drvdata(_dev);
+	struct net_device *dev = platform_get_drvdata(_dev);
 	struct pxa_irda *si;
 
 	if (dev && netif_running(dev)) {
@@ -746,9 +746,8 @@ static int pxa_irda_init_iobuf(iobuff_t *io, int size)
 	return io->head ? 0 : -ENOMEM;
 }
 
-static int pxa_irda_probe(struct device *_dev)
+static int pxa_irda_probe(struct platform_device *pdev)
 {
-	struct platform_device *pdev = to_platform_device(_dev);
 	struct net_device *dev;
 	struct pxa_irda *si;
 	unsigned int baudrate_mask;
@@ -822,9 +821,9 @@ err_mem_1:
 	return err;
 }
 
-static int pxa_irda_remove(struct device *_dev)
+static int pxa_irda_remove(struct platform_device *_dev)
 {
-	struct net_device *dev = dev_get_drvdata(_dev);
+	struct net_device *dev = platform_get_drvdata(_dev);
 
 	if (dev) {
 		struct pxa_irda *si = netdev_priv(dev);
@@ -840,9 +839,10 @@ static int pxa_irda_remove(struct device *_dev)
 	return 0;
 }
 
-static struct device_driver pxa_ir_driver = {
-	.name		= "pxa2xx-ir",
-	.bus		= &platform_bus_type,
+static struct platform_driver pxa_ir_driver = {
+	.driver         = {
+		.name   = "pxa2xx-ir",
+	},
 	.probe		= pxa_irda_probe,
 	.remove		= pxa_irda_remove,
 	.suspend	= pxa_irda_suspend,
@@ -851,12 +851,12 @@ static struct device_driver pxa_ir_driver = {
 
 static int __init pxa_irda_init(void)
 {
-	return driver_register(&pxa_ir_driver);
+	return platform_driver_register(&pxa_ir_driver);
 }
 
 static void __exit pxa_irda_exit(void)
 {
-	driver_unregister(&pxa_ir_driver);
+	platform_driver_unregister(&pxa_ir_driver);
 }
 
 module_init(pxa_irda_init);
-- 
cgit v0.10.2


From e694ba4428d53882489c07fd7d60c39b3e164dd5 Mon Sep 17 00:00:00 2001
From: Jeet Chaudhuri <jeetlinux@yahoo.co.in>
Date: Wed, 6 Dec 2006 20:08:45 -0800
Subject: [IrDA]: Incorrect TTP header reservation

We must reserve SAR + MAX_HEADER bytes for IrLMP to fit in.

Patch from Jeet Chaudhuri <jeetlinux@yahoo.co.in>

Signed-off-by: Samuel Ortiz <samuel@sortiz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index 252f110..03504f3 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1100,7 +1100,7 @@ int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel,
 			return -ENOMEM;
 
 		/* Reserve space for MUX_CONTROL and LAP header */
-		skb_reserve(tx_skb, TTP_MAX_HEADER);
+		skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER);
 	} else {
 		tx_skb = userdata;
 		/*
@@ -1349,7 +1349,7 @@ int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size,
 			return -ENOMEM;
 
 		/* Reserve space for MUX_CONTROL and LAP header */
-		skb_reserve(tx_skb, TTP_MAX_HEADER);
+		skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER);
 	} else {
 		tx_skb = userdata;
 		/*
-- 
cgit v0.10.2


From 95b99a670df31ca5271f503f378e5cac3aee8f5e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:10:07 -0800
Subject: [IRDA] irlan: Fix compile warning when CONFIG_PROC_FS=n

include/net/irda/irlan_filter.h:31: warning: 'struct seq_file' declared inside parameter list
include/net/irda/irlan_filter.h:31: warning: its scope is only this definition or declaration, which is probably not what you want

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/net/irda/irlan_filter.h b/include/net/irda/irlan_filter.h
index 492deda..1720539 100644
--- a/include/net/irda/irlan_filter.h
+++ b/include/net/irda/irlan_filter.h
@@ -28,6 +28,8 @@
 void irlan_check_command_param(struct irlan_cb *self, char *param, 
 			       char *value);
 void irlan_filter_request(struct irlan_cb *self, struct sk_buff *skb);
+#ifdef CONFIG_PROC_FS
 void irlan_print_filter(struct seq_file *seq, int filter_type);
+#endif
 
 #endif /* IRLAN_FILTER_H */
-- 
cgit v0.10.2


From 161a09e737f0761ca064ee6a907313402f7a54b6 Mon Sep 17 00:00:00 2001
From: Joy Latten <latten@austin.ibm.com>
Date: Mon, 27 Nov 2006 13:11:54 -0600
Subject: audit: Add auditing to ipsec

An audit message occurs when an ipsec SA
or ipsec policy is created/deleted.

Signed-off-by: Joy Latten <latten@austin.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b2ca666..0e07db6 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -101,6 +101,10 @@
 #define AUDIT_MAC_CIPSOV4_DEL	1408	/* NetLabel: del CIPSOv4 DOI entry */
 #define AUDIT_MAC_MAP_ADD	1409	/* NetLabel: add LSM domain mapping */
 #define AUDIT_MAC_MAP_DEL	1410	/* NetLabel: del LSM domain mapping */
+#define AUDIT_MAC_IPSEC_ADDSA	1411	/* Add a XFRM state */
+#define AUDIT_MAC_IPSEC_DELSA	1412	/* Delete a XFRM state */
+#define AUDIT_MAC_IPSEC_ADDSPD	1413	/* Add a XFRM policy */
+#define AUDIT_MAC_IPSEC_DELSPD	1414	/* Delete a XFRM policy */
 
 #define AUDIT_FIRST_KERN_ANOM_MSG   1700
 #define AUDIT_LAST_KERN_ANOM_MSG    1799
@@ -377,6 +381,7 @@ extern void auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
+extern void audit_log_task_context(struct audit_buffer *ab);
 extern int __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
 extern int audit_bprm(struct linux_binprm *bprm);
@@ -449,6 +454,7 @@ extern int audit_n_rules;
 #define audit_inode_update(i) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
+#define audit_log_task_context(b) do { ; } while (0)
 #define audit_ipc_obj(i) ({ 0; })
 #define audit_ipc_set_perm(q,u,g,m) ({ 0; })
 #define audit_bprm(p) ({ 0; })
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 15ec19d..f699cdc 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -392,6 +392,15 @@ extern int xfrm_unregister_km(struct xfrm_mgr *km);
 
 extern unsigned int xfrm_policy_count[XFRM_POLICY_MAX*2];
 
+/* Audit Information */
+struct xfrm_audit
+{
+	uid_t	loginuid;
+	u32	secid;
+};
+void xfrm_audit_log(uid_t auid, u32 secid, int type, int result,
+		    struct xfrm_policy *xp, struct xfrm_state *x);
+
 static inline void xfrm_pol_hold(struct xfrm_policy *policy)
 {
 	if (likely(policy != NULL))
@@ -906,7 +915,7 @@ static inline int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **s
 #endif
 extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq);
 extern int xfrm_state_delete(struct xfrm_state *x);
-extern void xfrm_state_flush(u8 proto);
+extern void xfrm_state_flush(u8 proto, struct xfrm_audit *audit_info);
 extern int xfrm_replay_check(struct xfrm_state *x, __be32 seq);
 extern void xfrm_replay_advance(struct xfrm_state *x, __be32 seq);
 extern void xfrm_replay_notify(struct xfrm_state *x, int event);
@@ -959,13 +968,13 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
 					  struct xfrm_selector *sel,
 					  struct xfrm_sec_ctx *ctx, int delete);
 struct xfrm_policy *xfrm_policy_byid(u8, int dir, u32 id, int delete);
-void xfrm_policy_flush(u8 type);
+void xfrm_policy_flush(u8 type, struct xfrm_audit *audit_info);
 u32 xfrm_get_acqseq(void);
 void xfrm_alloc_spi(struct xfrm_state *x, __be32 minspi, __be32 maxspi);
-struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto, 
-				  xfrm_address_t *daddr, xfrm_address_t *saddr, 
+struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
+				  xfrm_address_t *daddr, xfrm_address_t *saddr,
 				  int create, unsigned short family);
-extern void xfrm_policy_flush(u8 type);
+extern void xfrm_policy_flush(u8 type, struct xfrm_audit *audit_info);
 extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
 extern int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *xdst,
 			  struct flowi *fl, int family, int strict);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ab97e51..40722e2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -731,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
 		printk(KERN_ERR "audit: freed %d contexts\n", count);
 }
 
-static void audit_log_task_context(struct audit_buffer *ab)
+void audit_log_task_context(struct audit_buffer *ab)
 {
 	char *ctx = NULL;
 	ssize_t len = 0;
@@ -760,6 +760,8 @@ error_path:
 	return;
 }
 
+EXPORT_SYMBOL(audit_log_task_context);
+
 static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	char name[sizeof(tsk->comm)];
@@ -1488,6 +1490,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
 	return ctx ? ctx->loginuid : -1;
 }
 
+EXPORT_SYMBOL(audit_get_loginuid);
+
 /**
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 0e1dbfb..5dd5094 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -27,6 +27,7 @@
 #include <linux/proc_fs.h>
 #include <linux/init.h>
 #include <net/xfrm.h>
+#include <linux/audit.h>
 
 #include <net/sock.h>
 
@@ -1420,6 +1421,9 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 	else
 		err = xfrm_state_update(x);
 
+	xfrm_audit_log(audit_get_loginuid(current->audit_context), 0,
+		       AUDIT_MAC_IPSEC_ADDSA, err ? 0 : 1, NULL, x);
+
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
 		__xfrm_state_put(x);
@@ -1460,8 +1464,12 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 		err = -EPERM;
 		goto out;
 	}
-	
+
 	err = xfrm_state_delete(x);
+
+	xfrm_audit_log(audit_get_loginuid(current->audit_context), 0,
+		       AUDIT_MAC_IPSEC_DELSA, err ? 0 : 1, NULL, x);
+
 	if (err < 0)
 		goto out;
 
@@ -1637,12 +1645,15 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hd
 {
 	unsigned proto;
 	struct km_event c;
+	struct xfrm_audit audit_info;
 
 	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
 	if (proto == 0)
 		return -EINVAL;
 
-	xfrm_state_flush(proto);
+	audit_info.loginuid = audit_get_loginuid(current->audit_context);
+	audit_info.secid = 0;
+	xfrm_state_flush(proto, &audit_info);
 	c.data.proto = proto;
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
@@ -2205,6 +2216,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
 				 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
 
+	xfrm_audit_log(audit_get_loginuid(current->audit_context), 0,
+		       AUDIT_MAC_IPSEC_ADDSPD, err ? 0 : 1, xp, NULL);
+
 	if (err)
 		goto out;
 
@@ -2282,6 +2296,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	xp = xfrm_policy_bysel_ctx(XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir-1,
 				   &sel, tmp.security, 1);
 	security_xfrm_policy_free(&tmp);
+
+	xfrm_audit_log(audit_get_loginuid(current->audit_context), 0,
+		       AUDIT_MAC_IPSEC_DELSPD, (xp) ? 1 : 0, xp, NULL);
+
 	if (xp == NULL)
 		return -ENOENT;
 
@@ -2416,8 +2434,11 @@ static int key_notify_policy_flush(struct km_event *c)
 static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
 {
 	struct km_event c;
+	struct xfrm_audit audit_info;
 
-	xfrm_policy_flush(XFRM_POLICY_TYPE_MAIN);
+	audit_info.loginuid = audit_get_loginuid(current->audit_context);
+	audit_info.secid = 0;
+	xfrm_policy_flush(XFRM_POLICY_TYPE_MAIN, &audit_info);
 	c.data.type = XFRM_POLICY_TYPE_MAIN;
 	c.event = XFRM_MSG_FLUSHPOLICY;
 	c.pid = hdr->sadb_msg_pid;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4f04222..47c1364 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -25,6 +25,7 @@
 #include <linux/cache.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
+#include <linux/audit.h>
 
 #include "xfrm_hash.h"
 
@@ -804,7 +805,7 @@ struct xfrm_policy *xfrm_policy_byid(u8 type, int dir, u32 id, int delete)
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
 
-void xfrm_policy_flush(u8 type)
+void xfrm_policy_flush(u8 type, struct xfrm_audit *audit_info)
 {
 	int dir;
 
@@ -824,6 +825,9 @@ void xfrm_policy_flush(u8 type)
 			hlist_del(&pol->byidx);
 			write_unlock_bh(&xfrm_policy_lock);
 
+			xfrm_audit_log(audit_info->loginuid, audit_info->secid,
+				       AUDIT_MAC_IPSEC_DELSPD, 1, pol, NULL);
+
 			xfrm_policy_kill(pol);
 			killed++;
 
@@ -842,6 +846,11 @@ void xfrm_policy_flush(u8 type)
 				hlist_del(&pol->byidx);
 				write_unlock_bh(&xfrm_policy_lock);
 
+				xfrm_audit_log(audit_info->loginuid,
+					       audit_info->secid,
+					       AUDIT_MAC_IPSEC_DELSPD, 1,
+					       pol, NULL);
+
 				xfrm_policy_kill(pol);
 				killed++;
 
@@ -1977,6 +1986,115 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 
 EXPORT_SYMBOL(xfrm_bundle_ok);
 
+/* Audit addition and deletion of SAs and ipsec policy */
+
+void xfrm_audit_log(uid_t auid, u32 sid, int type, int result,
+		    struct xfrm_policy *xp, struct xfrm_state *x)
+{
+
+	char *secctx;
+	u32 secctx_len;
+	struct xfrm_sec_ctx *sctx = NULL;
+	struct audit_buffer *audit_buf;
+	int family;
+	extern int audit_enabled;
+
+	if (audit_enabled == 0)
+		return;
+
+	audit_buf = audit_log_start(current->audit_context, GFP_ATOMIC, type);
+	if (audit_buf == NULL)
+	return;
+
+	switch(type) {
+	case AUDIT_MAC_IPSEC_ADDSA:
+		audit_log_format(audit_buf, "SAD add: auid=%u", auid);
+		break;
+	case AUDIT_MAC_IPSEC_DELSA:
+		audit_log_format(audit_buf, "SAD delete: auid=%u", auid);
+		break;
+	case AUDIT_MAC_IPSEC_ADDSPD:
+		audit_log_format(audit_buf, "SPD add: auid=%u", auid);
+		break;
+	case AUDIT_MAC_IPSEC_DELSPD:
+		audit_log_format(audit_buf, "SPD delete: auid=%u", auid);
+		break;
+	default:
+		return;
+	}
+
+	if (sid != 0 &&
+		security_secid_to_secctx(sid, &secctx, &secctx_len) == 0)
+		audit_log_format(audit_buf, " subj=%s", secctx);
+	else
+		audit_log_task_context(audit_buf);
+
+	if (xp) {
+		family = xp->selector.family;
+		if (xp->security)
+			sctx = xp->security;
+	} else {
+		family = x->props.family;
+		if (x->security)
+			sctx = x->security;
+	}
+
+	if (sctx)
+		audit_log_format(audit_buf,
+				" sec_alg=%u sec_doi=%u sec_obj=%s",
+				sctx->ctx_alg, sctx->ctx_doi, sctx->ctx_str);
+
+	switch(family) {
+	case AF_INET:
+		{
+			struct in_addr saddr, daddr;
+			if (xp) {
+				saddr.s_addr = xp->selector.saddr.a4;
+				daddr.s_addr = xp->selector.daddr.a4;
+			} else {
+				saddr.s_addr = x->props.saddr.a4;
+				daddr.s_addr = x->id.daddr.a4;
+			}
+			audit_log_format(audit_buf,
+					 " src=%u.%u.%u.%u dst=%u.%u.%u.%u",
+					 NIPQUAD(saddr), NIPQUAD(daddr));
+		}
+			break;
+	case AF_INET6:
+		{
+			struct in6_addr saddr6, daddr6;
+			if (xp) {
+				memcpy(&saddr6, xp->selector.saddr.a6,
+					sizeof(struct in6_addr));
+				memcpy(&daddr6, xp->selector.daddr.a6,
+					sizeof(struct in6_addr));
+			} else {
+				memcpy(&saddr6, x->props.saddr.a6,
+					sizeof(struct in6_addr));
+				memcpy(&daddr6, x->id.daddr.a6,
+					sizeof(struct in6_addr));
+			}
+			audit_log_format(audit_buf,
+					 " src=" NIP6_FMT "dst=" NIP6_FMT,
+					 NIP6(saddr6), NIP6(daddr6));
+		}
+		break;
+	}
+
+	if (x)
+		audit_log_format(audit_buf, " spi=%lu(0x%lx) protocol=%s",
+				(unsigned long)ntohl(x->id.spi),
+				(unsigned long)ntohl(x->id.spi),
+				x->id.proto == IPPROTO_AH ? "AH" :
+				(x->id.proto == IPPROTO_ESP ?
+				"ESP" : "IPCOMP"));
+
+	audit_log_format(audit_buf, " res=%u", result);
+	audit_log_end(audit_buf);
+}
+
+EXPORT_SYMBOL(xfrm_audit_log);
+
 int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
 	int err = 0;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index a14c88b..d5d3a6f 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/cache.h>
 #include <asm/uaccess.h>
+#include <linux/audit.h>
 
 #include "xfrm_hash.h"
 
@@ -238,6 +239,7 @@ static void xfrm_timer_handler(unsigned long data)
 	unsigned long now = (unsigned long)xtime.tv_sec;
 	long next = LONG_MAX;
 	int warn = 0;
+	int err = 0;
 
 	spin_lock(&x->lock);
 	if (x->km.state == XFRM_STATE_DEAD)
@@ -295,9 +297,14 @@ expired:
 		next = 2;
 		goto resched;
 	}
-	if (!__xfrm_state_delete(x) && x->id.spi)
+
+	err = __xfrm_state_delete(x);
+	if (!err && x->id.spi)
 		km_state_expired(x, 1, 0);
 
+	xfrm_audit_log(audit_get_loginuid(current->audit_context), 0,
+		       AUDIT_MAC_IPSEC_DELSA, err ? 0 : 1, NULL, x);
+
 out:
 	spin_unlock(&x->lock);
 }
@@ -384,9 +391,10 @@ int xfrm_state_delete(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_delete);
 
-void xfrm_state_flush(u8 proto)
+void xfrm_state_flush(u8 proto, struct xfrm_audit *audit_info)
 {
 	int i;
+	int err = 0;
 
 	spin_lock_bh(&xfrm_state_lock);
 	for (i = 0; i <= xfrm_state_hmask; i++) {
@@ -400,6 +408,11 @@ restart:
 				spin_unlock_bh(&xfrm_state_lock);
 
 				xfrm_state_delete(x);
+				err = xfrm_state_delete(x);
+				xfrm_audit_log(audit_info->loginuid,
+					       audit_info->secid,
+					       AUDIT_MAC_IPSEC_DELSA,
+					       err ? 0 : 1, NULL, x);
 				xfrm_state_put(x);
 
 				spin_lock_bh(&xfrm_state_lock);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 311205f..e5372b1 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -31,6 +31,7 @@
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 #include <linux/in6.h>
 #endif
+#include <linux/audit.h>
 
 static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type)
 {
@@ -454,6 +455,9 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	else
 		err = xfrm_state_update(x);
 
+	xfrm_audit_log(NETLINK_CB(skb).loginuid, NETLINK_CB(skb).sid,
+		       AUDIT_MAC_IPSEC_ADDSA, err ? 0 : 1, NULL, x);
+
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
 		__xfrm_state_put(x);
@@ -523,6 +527,10 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	}
 
 	err = xfrm_state_delete(x);
+
+	xfrm_audit_log(NETLINK_CB(skb).loginuid, NETLINK_CB(skb).sid,
+		       AUDIT_MAC_IPSEC_DELSA, err ? 0 : 1, NULL, x);
+
 	if (err < 0)
 		goto out;
 
@@ -1030,6 +1038,9 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	 * a type XFRM_MSG_UPDPOLICY - JHS */
 	excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
 	err = xfrm_policy_insert(p->dir, xp, excl);
+	xfrm_audit_log(NETLINK_CB(skb).loginuid, NETLINK_CB(skb).sid,
+		       AUDIT_MAC_IPSEC_DELSPD, err ? 0 : 1, xp, NULL);
+
 	if (err) {
 		security_xfrm_policy_free(xp);
 		kfree(xp);
@@ -1257,6 +1268,10 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 		xp = xfrm_policy_bysel_ctx(type, p->dir, &p->sel, tmp.security, delete);
 		security_xfrm_policy_free(&tmp);
 	}
+	if (delete)
+		xfrm_audit_log(NETLINK_CB(skb).loginuid, NETLINK_CB(skb).sid,
+			       AUDIT_MAC_IPSEC_DELSPD, (xp) ? 1 : 0, xp, NULL);
+
 	if (xp == NULL)
 		return -ENOENT;
 
@@ -1291,8 +1306,11 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma
 {
 	struct km_event c;
 	struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
+	struct xfrm_audit audit_info;
 
-	xfrm_state_flush(p->proto);
+	audit_info.loginuid = NETLINK_CB(skb).loginuid;
+	audit_info.secid = NETLINK_CB(skb).sid;
+	xfrm_state_flush(p->proto, &audit_info);
 	c.data.proto = p->proto;
 	c.event = nlh->nlmsg_type;
 	c.seq = nlh->nlmsg_seq;
@@ -1442,12 +1460,15 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **x
 	struct km_event c;
 	u8 type = XFRM_POLICY_TYPE_MAIN;
 	int err;
+	struct xfrm_audit audit_info;
 
 	err = copy_from_user_policy_type(&type, (struct rtattr **)xfrma);
 	if (err)
 		return err;
 
-	xfrm_policy_flush(type);
+	audit_info.loginuid = NETLINK_CB(skb).loginuid;
+	audit_info.secid = NETLINK_CB(skb).sid;
+	xfrm_policy_flush(type, &audit_info);
 	c.data.type = type;
 	c.event = nlh->nlmsg_type;
 	c.seq = nlh->nlmsg_seq;
@@ -1502,6 +1523,9 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, void *
 	err = 0;
 	if (up->hard) {
 		xfrm_policy_delete(xp, p->dir);
+		xfrm_audit_log(NETLINK_CB(skb).loginuid, NETLINK_CB(skb).sid,
+				AUDIT_MAC_IPSEC_DELSPD, 1, xp, NULL);
+
 	} else {
 		// reset the timers here?
 		printk("Dont know what to do with soft policy expire\n");
@@ -1533,8 +1557,11 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, void **
 		goto out;
 	km_state_expired(x, ue->hard, current->pid);
 
-	if (ue->hard)
+	if (ue->hard) {
 		__xfrm_state_delete(x);
+		xfrm_audit_log(NETLINK_CB(skb).loginuid, NETLINK_CB(skb).sid,
+			       AUDIT_MAC_IPSEC_DELSA, 1, NULL, x);
+	}
 out:
 	spin_unlock_bh(&x->lock);
 	xfrm_state_put(x);
-- 
cgit v0.10.2


From c9204d9ca79baac564b49d36d0228a69d7ded084 Mon Sep 17 00:00:00 2001
From: Joy Latten <latten@austin.ibm.com>
Date: Thu, 30 Nov 2006 15:50:43 -0600
Subject: audit: disable ipsec auditing when CONFIG_AUDITSYSCALL=n

Disables auditing in ipsec when CONFIG_AUDITSYSCALL is
disabled in the kernel.

Also includes a bug fix for xfrm_state.c as a result of
original ipsec audit patch.

Signed-off-by: Joy Latten <latten@austin.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index f699cdc..e476541 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -398,8 +398,13 @@ struct xfrm_audit
 	uid_t	loginuid;
 	u32	secid;
 };
-void xfrm_audit_log(uid_t auid, u32 secid, int type, int result,
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void xfrm_audit_log(uid_t auid, u32 secid, int type, int result,
 		    struct xfrm_policy *xp, struct xfrm_state *x);
+#else
+#define xfrm_audit_log(a,s,t,r,p,x) do { ; } while (0)
+#endif /* CONFIG_AUDITSYSCALL */
 
 static inline void xfrm_pol_hold(struct xfrm_policy *policy)
 {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 47c1364..140bb9b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1986,6 +1986,7 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 
 EXPORT_SYMBOL(xfrm_bundle_ok);
 
+#ifdef CONFIG_AUDITSYSCALL
 /* Audit addition and deletion of SAs and ipsec policy */
 
 void xfrm_audit_log(uid_t auid, u32 sid, int type, int result,
@@ -2094,6 +2095,7 @@ void xfrm_audit_log(uid_t auid, u32 sid, int type, int result,
 }
 
 EXPORT_SYMBOL(xfrm_audit_log);
+#endif /* CONFIG_AUDITSYSCALL */
 
 int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d5d3a6f..fdb08d9 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -407,7 +407,6 @@ restart:
 				xfrm_state_hold(x);
 				spin_unlock_bh(&xfrm_state_lock);
 
-				xfrm_state_delete(x);
 				err = xfrm_state_delete(x);
 				xfrm_audit_log(audit_info->loginuid,
 					       audit_info->secid,
-- 
cgit v0.10.2


From 26db167702756d0022f8ea5f1f30cad3018cfe31 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 6 Dec 2006 23:45:15 -0800
Subject: [IPSEC]: Fix inetpeer leak in ipv4 xfrm dst entries.

We grab a reference to the route's inetpeer entry but
forget to release it in xfrm4_dst_destroy().

Bug discovered by Kazunori MIYAZAWA <kazunori@miyazawa.org>

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index d4107bb..fb9f69c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -274,6 +274,8 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 
 	if (likely(xdst->u.rt.idev))
 		in_dev_put(xdst->u.rt.idev);
+	if (likely(xdst->u.rt.peer))
+		inet_putpeer(xdst->u.rt.peer);
 	xfrm_dst_destroy(xdst);
 }
 
-- 
cgit v0.10.2


From e16aa207ccb61c5111525c462eeeba1f3f5fd370 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Thu, 7 Dec 2006 00:11:33 -0800
Subject: [NET]: Memory barrier cleanups

I believe all the below memory barriers only matter on SMP so
therefore the smp_* variant of the barrier should be used.

I'm wondering if the barrier in net/ipv4/inet_timewait_sock.c should be
dropped entirely.  schedule_work's implementation currently implies a
memory barrier and I think sane semantics of schedule_work() should imply
a memory barrier, as needed so the caller shouldn't have to worry.
It's not quite obvious why the barrier in net/packet/af_packet.c is
needed; maybe it should be implied through flush_dcache_page?

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/wireless.c b/net/core/wireless.c
index cb1b872..f69ab7b 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -2130,7 +2130,7 @@ int iw_handler_set_spy(struct net_device *	dev,
 	 * The rtnl_lock() make sure we don't race with the other iw_handlers.
 	 * This make sure wireless_spy_update() "see" that the spy list
 	 * is temporarily disabled. */
-	wmb();
+	smp_wmb();
 
 	/* Are there are addresses to copy? */
 	if(wrqu->data.length > 0) {
@@ -2159,7 +2159,7 @@ int iw_handler_set_spy(struct net_device *	dev,
 	}
 
 	/* Make sure above is updated before re-enabling */
-	wmb();
+	smp_wmb();
 
 	/* Enable addresses */
 	spydata->spy_number = wrqu->data.length;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 8c74f91..75373f3 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -178,7 +178,7 @@ void inet_twdr_hangman(unsigned long data)
 	need_timer = 0;
 	if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
 		twdr->thread_slots |= (1 << twdr->slot);
-		mb();
+		smp_mb();
 		schedule_work(&twdr->twkill_work);
 		need_timer = 1;
 	} else {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9304034..c701f6a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4235,7 +4235,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * Change state from SYN-SENT only after copied_seq
 		 * is initialized. */
 		tp->copied_seq = tp->rcv_nxt;
-		mb();
+		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
 
 		security_inet_conn_established(sk, skb);
@@ -4483,7 +4483,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		case TCP_SYN_RECV:
 			if (acceptable) {
 				tp->copied_seq = tp->rcv_nxt;
-				mb();
+				smp_mb();
 				tcp_set_state(sk, TCP_ESTABLISHED);
 				sk->sk_state_change(sk);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 08e68b6..da73e8a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -660,7 +660,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
 	sll->sll_ifindex = dev->ifindex;
 
 	h->tp_status = status;
-	mb();
+	smp_mb();
 
 	{
 		struct page *p_start, *p_end;
-- 
cgit v0.10.2


From 905eee008b5440e30186ab72c238ec8cb2886f74 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 7 Dec 2006 00:12:30 -0800
Subject: [TCP] inet_twdr_hangman: Delete unnecessary memory barrier().

As per Ralf Baechle's observations, the schedule_work() call
should give enough of a memory barrier, so the explicit one
here is totally unnecessary.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 75373f3..061fd7a 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -178,7 +178,6 @@ void inet_twdr_hangman(unsigned long data)
 	need_timer = 0;
 	if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
 		twdr->thread_slots |= (1 << twdr->slot);
-		smp_mb();
 		schedule_work(&twdr->twkill_work);
 		need_timer = 1;
 	} else {
-- 
cgit v0.10.2


From 456c38f9682645cbb0537cb4e16848a28c1d3a76 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 7 Dec 2006 00:18:22 -0800
Subject: [WANROUTER]: Kill kmalloc debugging code.

It duplicates what SLAB debug can do already.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 316211d..769cdd6 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -62,63 +62,6 @@
 
 #define KMEM_SAFETYZONE 8
 
-/***********FOR DEBUGGING PURPOSES*********************************************
-static void * dbg_kmalloc(unsigned int size, int prio, int line) {
-	int i = 0;
-	void * v = kmalloc(size+sizeof(unsigned int)+2*KMEM_SAFETYZONE*8,prio);
-	char * c1 = v;
-	c1 += sizeof(unsigned int);
-	*((unsigned int *)v) = size;
-
-	for (i = 0; i < KMEM_SAFETYZONE; i++) {
-		c1[0] = 'D'; c1[1] = 'E'; c1[2] = 'A'; c1[3] = 'D';
-		c1[4] = 'B'; c1[5] = 'E'; c1[6] = 'E'; c1[7] = 'F';
-		c1 += 8;
-	}
-	c1 += size;
-	for (i = 0; i < KMEM_SAFETYZONE; i++) {
-		c1[0] = 'M'; c1[1] = 'U'; c1[2] = 'N'; c1[3] = 'G';
-		c1[4] = 'W'; c1[5] = 'A'; c1[6] = 'L'; c1[7] = 'L';
-		c1 += 8;
-	}
-	v = ((char *)v) + sizeof(unsigned int) + KMEM_SAFETYZONE*8;
-	printk(KERN_INFO "line %d  kmalloc(%d,%d) = %p\n",line,size,prio,v);
-	return v;
-}
-static void dbg_kfree(void * v, int line) {
-	unsigned int * sp = (unsigned int *)(((char *)v) - (sizeof(unsigned int) + KMEM_SAFETYZONE*8));
-	unsigned int size = *sp;
-	char * c1 = ((char *)v) - KMEM_SAFETYZONE*8;
-	int i = 0;
-	for (i = 0; i < KMEM_SAFETYZONE; i++) {
-		if (   c1[0] != 'D' || c1[1] != 'E' || c1[2] != 'A' || c1[3] != 'D'
-		    || c1[4] != 'B' || c1[5] != 'E' || c1[6] != 'E' || c1[7] != 'F') {
-			printk(KERN_INFO "kmalloced block at %p has been corrupted (underrun)!\n",v);
-			printk(KERN_INFO " %4x: %2x %2x %2x %2x %2x %2x %2x %2x\n", i*8,
-			                c1[0],c1[1],c1[2],c1[3],c1[4],c1[5],c1[6],c1[7] );
-		}
-		c1 += 8;
-	}
-	c1 += size;
-	for (i = 0; i < KMEM_SAFETYZONE; i++) {
-		if (   c1[0] != 'M' || c1[1] != 'U' || c1[2] != 'N' || c1[3] != 'G'
-		    || c1[4] != 'W' || c1[5] != 'A' || c1[6] != 'L' || c1[7] != 'L'
-		   ) {
-			printk(KERN_INFO "kmalloced block at %p has been corrupted (overrun):\n",v);
-			printk(KERN_INFO " %4x: %2x %2x %2x %2x %2x %2x %2x %2x\n", i*8,
-			                c1[0],c1[1],c1[2],c1[3],c1[4],c1[5],c1[6],c1[7] );
-		}
-		c1 += 8;
-	}
-	printk(KERN_INFO "line %d  kfree(%p)\n",line,v);
-	v = ((char *)v) - (sizeof(unsigned int) + KMEM_SAFETYZONE*8);
-	kfree(v);
-}
-
-#define kmalloc(x,y) dbg_kmalloc(x,y,__LINE__)
-#define kfree(x) dbg_kfree(x,__LINE__)
-*****************************************************************************/
-
 /*
  * 	Function Prototypes
  */
-- 
cgit v0.10.2


From 5d64ad34f468278ce66f9eb4d876dd221490e94c Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 7 Dec 2006 00:19:40 -0800
Subject: [TG3]: Fix Phy loopback.

Phy loopback on most 10/100 devices need to be run in 1Gbps mode in
GMII mode.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index d9123c9..e6561c1 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -8781,17 +8781,20 @@ static int tg3_run_loopback(struct tg3 *tp, int loopback_mode)
 					tg3_writephy(tp, 0x10, phy & ~0x4000);
 				tg3_writephy(tp, MII_TG3_EPHY_TEST, phytest);
 			}
-		}
-		val = BMCR_LOOPBACK | BMCR_FULLDPLX;
-		if (tp->tg3_flags & TG3_FLAG_10_100_ONLY)
-			val |= BMCR_SPEED100;
-		else
-			val |= BMCR_SPEED1000;
+			val = BMCR_LOOPBACK | BMCR_FULLDPLX | BMCR_SPEED100;
+		} else
+			val = BMCR_LOOPBACK | BMCR_FULLDPLX | BMCR_SPEED1000;
 
 		tg3_writephy(tp, MII_BMCR, val);
 		udelay(40);
-		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906)
+
+		mac_mode = (tp->mac_mode & ~MAC_MODE_PORT_MODE_MASK) |
+			   MAC_MODE_LINK_POLARITY;
+		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906) {
 			tg3_writephy(tp, MII_TG3_EPHY_PTEST, 0x1800);
+			mac_mode |= MAC_MODE_PORT_MODE_MII;
+		} else
+			mac_mode |= MAC_MODE_PORT_MODE_GMII;
 
 		/* reset to prevent losing 1st rx packet intermittently */
 		if (tp->tg3_flags2 & TG3_FLG2_MII_SERDES) {
@@ -8799,12 +8802,6 @@ static int tg3_run_loopback(struct tg3 *tp, int loopback_mode)
 			udelay(10);
 			tw32_f(MAC_RX_MODE, tp->rx_mode);
 		}
-		mac_mode = (tp->mac_mode & ~MAC_MODE_PORT_MODE_MASK) |
-			   MAC_MODE_LINK_POLARITY;
-		if (tp->tg3_flags & TG3_FLAG_10_100_ONLY)
-			mac_mode |= MAC_MODE_PORT_MODE_MII;
-		else
-			mac_mode |= MAC_MODE_PORT_MODE_GMII;
 		if ((tp->phy_id & PHY_ID_MASK) == PHY_ID_BCM5401) {
 			mac_mode &= ~MAC_MODE_LINK_POLARITY;
 			tg3_writephy(tp, MII_TG3_EXT_CTRL,
-- 
cgit v0.10.2


From 676917d488212303ce4a7d033815ce8799201010 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 7 Dec 2006 00:20:22 -0800
Subject: [TG3]: Add 5787F device ID.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index e6561c1..5514828 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -192,6 +192,7 @@ static struct pci_device_id tg3_pci_tbl[] = {
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5786)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787M)},
+	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787F)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5714)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5714S)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5715)},
@@ -10859,7 +10860,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	      tp->pdev->device == PCI_DEVICE_ID_TIGON3_5705F)) ||
 	    (tp->pdev->vendor == PCI_VENDOR_ID_BROADCOM &&
 	     (tp->pdev->device == PCI_DEVICE_ID_TIGON3_5751F ||
-	      tp->pdev->device == PCI_DEVICE_ID_TIGON3_5753F)) ||
+	      tp->pdev->device == PCI_DEVICE_ID_TIGON3_5753F ||
+	      tp->pdev->device == PCI_DEVICE_ID_TIGON3_5787F)) ||
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906)
 		tp->tg3_flags |= TG3_FLAG_10_100_ONLY;
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c09da1e..edddcce 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1931,6 +1931,7 @@
 #define PCI_DEVICE_ID_TIGON3_5750M	0x167c
 #define PCI_DEVICE_ID_TIGON3_5751M	0x167d
 #define PCI_DEVICE_ID_TIGON3_5751F	0x167e
+#define PCI_DEVICE_ID_TIGON3_5787F	0x167f
 #define PCI_DEVICE_ID_TIGON3_5787M	0x1693
 #define PCI_DEVICE_ID_TIGON3_5782	0x1696
 #define PCI_DEVICE_ID_TIGON3_5786	0x169a
-- 
cgit v0.10.2


From 9d26e213423923c9e033ccd373705118131827c9 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 7 Dec 2006 00:21:14 -0800
Subject: [TG3]: Add TG3_FLG2_IS_NIC flag.

Add Tg3_FLG2_IS_NIC flag to unambiguously determine whether the
device is NIC or onboard.  Previously, the EEPROM_WRITE_PROT flag was
overloaded to also mean onboard.  With the separation, we can
support some devices that are onboard but do not use eeprom write
protect.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 5514828..16bc05f 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1062,7 +1062,7 @@ static void tg3_frob_aux_power(struct tg3 *tp)
 {
 	struct tg3 *tp_peer = tp;
 
-	if ((tp->tg3_flags & TG3_FLAG_EEPROM_WRITE_PROT) != 0)
+	if ((tp->tg3_flags2 & TG3_FLG2_IS_NIC) == 0)
 		return;
 
 	if ((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704) ||
@@ -1213,8 +1213,8 @@ static int tg3_set_power_state(struct tg3 *tp, pci_power_t state)
 				      power_control);
 		udelay(100);	/* Delay after power state change */
 
-		/* Switch out of Vaux if it is not a LOM */
-		if (!(tp->tg3_flags & TG3_FLAG_EEPROM_WRITE_PROT))
+		/* Switch out of Vaux if it is a NIC */
+		if (tp->tg3_flags2 & TG3_FLG2_IS_NIC)
 			tw32_wait_f(GRC_LOCAL_CTRL, tp->grc_local_ctrl, 100);
 
 		return 0;
@@ -6397,16 +6397,17 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy)
 	udelay(40);
 
 	/* tp->grc_local_ctrl is partially set up during tg3_get_invariants().
-	 * If TG3_FLAG_EEPROM_WRITE_PROT is set, we should read the
+	 * If TG3_FLG2_IS_NIC is zero, we should read the
 	 * register to preserve the GPIO settings for LOMs. The GPIOs,
 	 * whether used as inputs or outputs, are set by boot code after
 	 * reset.
 	 */
-	if (tp->tg3_flags & TG3_FLAG_EEPROM_WRITE_PROT) {
+	if (!(tp->tg3_flags2 & TG3_FLG2_IS_NIC)) {
 		u32 gpio_mask;
 
-		gpio_mask = GRC_LCLCTRL_GPIO_OE0 | GRC_LCLCTRL_GPIO_OE2 |
-			    GRC_LCLCTRL_GPIO_OUTPUT0 | GRC_LCLCTRL_GPIO_OUTPUT2;
+		gpio_mask = GRC_LCLCTRL_GPIO_OE0 | GRC_LCLCTRL_GPIO_OE1 |
+			    GRC_LCLCTRL_GPIO_OE2 | GRC_LCLCTRL_GPIO_OUTPUT0 |
+			    GRC_LCLCTRL_GPIO_OUTPUT1 | GRC_LCLCTRL_GPIO_OUTPUT2;
 
 		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752)
 			gpio_mask |= GRC_LCLCTRL_GPIO_OE3 |
@@ -6418,8 +6419,9 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy)
 		tp->grc_local_ctrl |= tr32(GRC_LOCAL_CTRL) & gpio_mask;
 
 		/* GPIO1 must be driven high for eeprom write protect */
-		tp->grc_local_ctrl |= (GRC_LCLCTRL_GPIO_OE1 |
-				       GRC_LCLCTRL_GPIO_OUTPUT1);
+		if (tp->tg3_flags & TG3_FLAG_EEPROM_WRITE_PROT)
+			tp->grc_local_ctrl |= (GRC_LCLCTRL_GPIO_OE1 |
+					       GRC_LCLCTRL_GPIO_OUTPUT1);
 	}
 	tw32_f(GRC_LOCAL_CTRL, tp->grc_local_ctrl);
 	udelay(100);
@@ -9963,8 +9965,10 @@ static void __devinit tg3_get_eeprom_hw_cfg(struct tg3 *tp)
 	tp->tg3_flags |= TG3_FLAG_EEPROM_WRITE_PROT;
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906) {
-		if (!(tr32(PCIE_TRANSACTION_CFG) & PCIE_TRANS_CFG_LOM))
+		if (!(tr32(PCIE_TRANSACTION_CFG) & PCIE_TRANS_CFG_LOM)) {
 			tp->tg3_flags &= ~TG3_FLAG_EEPROM_WRITE_PROT;
+			tp->tg3_flags2 |= TG3_FLG2_IS_NIC;
+		}
 		return;
 	}
 
@@ -10064,10 +10068,17 @@ static void __devinit tg3_get_eeprom_hw_cfg(struct tg3 *tp)
 		    tp->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
 			tp->led_ctrl = LED_CTRL_MODE_PHY_2;
 
-		if (nic_cfg & NIC_SRAM_DATA_CFG_EEPROM_WP)
+		if (nic_cfg & NIC_SRAM_DATA_CFG_EEPROM_WP) {
 			tp->tg3_flags |= TG3_FLAG_EEPROM_WRITE_PROT;
-		else
+			if ((tp->pdev->subsystem_vendor ==
+			     PCI_VENDOR_ID_ARIMA) &&
+			    (tp->pdev->subsystem_device == 0x205a ||
+			     tp->pdev->subsystem_device == 0x2063))
+				tp->tg3_flags &= ~TG3_FLAG_EEPROM_WRITE_PROT;
+		} else {
 			tp->tg3_flags &= ~TG3_FLAG_EEPROM_WRITE_PROT;
+			tp->tg3_flags2 |= TG3_FLG2_IS_NIC;
+		}
 
 		if (nic_cfg & NIC_SRAM_DATA_CFG_ASF_ENABLE) {
 			tp->tg3_flags |= TG3_FLAG_ENABLE_ASF;
@@ -10693,7 +10704,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		tp->tg3_flags |= TG3_FLAG_SRAM_USE_CONFIG;
 
 	/* Get eeprom hw config before calling tg3_set_power_state().
-	 * In particular, the TG3_FLAG_EEPROM_WRITE_PROT flag must be
+	 * In particular, the TG3_FLG2_IS_NIC flag must be
 	 * determined before calling tg3_set_power_state() so that
 	 * we know whether or not to switch out of Vaux power.
 	 * When the flag is set, it means that GPIO1 is used for eeprom
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 92f5300..dfaf4ed 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2233,6 +2233,7 @@ struct tg3 {
 #define TG3_FLG2_PCI_EXPRESS		0x00000200
 #define TG3_FLG2_ASF_NEW_HANDSHAKE	0x00000400
 #define TG3_FLG2_HW_AUTONEG		0x00000800
+#define TG3_FLG2_IS_NIC			0x00001000
 #define TG3_FLG2_PHY_SERDES		0x00002000
 #define TG3_FLG2_CAPACITIVE_COUPLING	0x00004000
 #define TG3_FLG2_FLASH			0x00008000
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index edddcce..ebc597d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2003,6 +2003,8 @@
 #define PCI_DEVICE_ID_FARSITE_TE1       0x1610
 #define PCI_DEVICE_ID_FARSITE_TE1C      0x1612
 
+#define PCI_VENDOR_ID_ARIMA		0x161f
+
 #define PCI_VENDOR_ID_SIBYTE		0x166d
 #define PCI_DEVICE_ID_BCM1250_PCI	0x0001
 #define PCI_DEVICE_ID_BCM1250_HT	0x0002
-- 
cgit v0.10.2


From 3600d918d870456ea8e7bb9d47f327de5c20f3d6 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 7 Dec 2006 00:21:48 -0800
Subject: [TG3]: Allow partial speed advertisement.

Honor the advertisement bitmask from ethtool.  We used to always
advertise the full capability when autoneg was set to on.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 16bc05f..576e9ea 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1558,12 +1558,6 @@ static void tg3_phy_copper_begin(struct tg3 *tp)
 
 		tg3_writephy(tp, MII_ADVERTISE, new_adv);
 	} else if (tp->link_config.speed == SPEED_INVALID) {
-		tp->link_config.advertising =
-			(ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full |
-			 ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full |
-			 ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full |
-			 ADVERTISED_Autoneg | ADVERTISED_MII);
-
 		if (tp->tg3_flags & TG3_FLAG_10_100_ONLY)
 			tp->link_config.advertising &=
 				~(ADVERTISED_1000baseT_Half |
@@ -1707,25 +1701,36 @@ static int tg3_init_5401phy_dsp(struct tg3 *tp)
 	return err;
 }
 
-static int tg3_copper_is_advertising_all(struct tg3 *tp)
+static int tg3_copper_is_advertising_all(struct tg3 *tp, u32 mask)
 {
-	u32 adv_reg, all_mask;
+	u32 adv_reg, all_mask = 0;
+
+	if (mask & ADVERTISED_10baseT_Half)
+		all_mask |= ADVERTISE_10HALF;
+	if (mask & ADVERTISED_10baseT_Full)
+		all_mask |= ADVERTISE_10FULL;
+	if (mask & ADVERTISED_100baseT_Half)
+		all_mask |= ADVERTISE_100HALF;
+	if (mask & ADVERTISED_100baseT_Full)
+		all_mask |= ADVERTISE_100FULL;
 
 	if (tg3_readphy(tp, MII_ADVERTISE, &adv_reg))
 		return 0;
 
-	all_mask = (ADVERTISE_10HALF | ADVERTISE_10FULL |
-		    ADVERTISE_100HALF | ADVERTISE_100FULL);
 	if ((adv_reg & all_mask) != all_mask)
 		return 0;
 	if (!(tp->tg3_flags & TG3_FLAG_10_100_ONLY)) {
 		u32 tg3_ctrl;
 
+		all_mask = 0;
+		if (mask & ADVERTISED_1000baseT_Half)
+			all_mask |= ADVERTISE_1000HALF;
+		if (mask & ADVERTISED_1000baseT_Full)
+			all_mask |= ADVERTISE_1000FULL;
+
 		if (tg3_readphy(tp, MII_TG3_CTRL, &tg3_ctrl))
 			return 0;
 
-		all_mask = (MII_TG3_CTRL_ADV_1000_HALF |
-			    MII_TG3_CTRL_ADV_1000_FULL);
 		if ((tg3_ctrl & all_mask) != all_mask)
 			return 0;
 	}
@@ -1885,7 +1890,8 @@ static int tg3_setup_copper_phy(struct tg3 *tp, int force_reset)
 				/* Force autoneg restart if we are exiting
 				 * low power mode.
 				 */
-				if (!tg3_copper_is_advertising_all(tp))
+				if (!tg3_copper_is_advertising_all(tp,
+						tp->link_config.advertising))
 					current_link_up = 0;
 			} else {
 				current_link_up = 0;
@@ -10156,7 +10162,7 @@ static int __devinit tg3_phy_probe(struct tg3 *tp)
 
 	if (!(tp->tg3_flags2 & TG3_FLG2_ANY_SERDES) &&
 	    !(tp->tg3_flags & TG3_FLAG_ENABLE_ASF)) {
-		u32 bmsr, adv_reg, tg3_ctrl;
+		u32 bmsr, adv_reg, tg3_ctrl, mask;
 
 		tg3_readphy(tp, MII_BMSR, &bmsr);
 		if (!tg3_readphy(tp, MII_BMSR, &bmsr) &&
@@ -10180,7 +10186,10 @@ static int __devinit tg3_phy_probe(struct tg3 *tp)
 					     MII_TG3_CTRL_ENABLE_AS_MASTER);
 		}
 
-		if (!tg3_copper_is_advertising_all(tp)) {
+		mask = (ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full |
+			ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full |
+			ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full);
+		if (!tg3_copper_is_advertising_all(tp, mask)) {
 			tg3_writephy(tp, MII_ADVERTISE, adv_reg);
 
 			if (!(tp->tg3_flags & TG3_FLAG_10_100_ONLY))
-- 
cgit v0.10.2


From 9f88f29fc502192824aba092e90af1297a87eb82 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 7 Dec 2006 00:22:54 -0800
Subject: [TG3]: Use netif_msg_*.

Use netif_msg_* to turn on or off some messages.

Based on Stephen Hemminger's initial patch.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 576e9ea..0b50f1f 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1402,8 +1402,10 @@ static int tg3_set_power_state(struct tg3 *tp, pci_power_t state)
 static void tg3_link_report(struct tg3 *tp)
 {
 	if (!netif_carrier_ok(tp->dev)) {
-		printk(KERN_INFO PFX "%s: Link is down.\n", tp->dev->name);
-	} else {
+		if (netif_msg_link(tp))
+			printk(KERN_INFO PFX "%s: Link is down.\n",
+			       tp->dev->name);
+	} else if (netif_msg_link(tp)) {
 		printk(KERN_INFO PFX "%s: Link is up at %d Mbps, %s duplex.\n",
 		       tp->dev->name,
 		       (tp->link_config.active_speed == SPEED_1000 ?
@@ -3710,8 +3712,9 @@ static void tg3_tx_timeout(struct net_device *dev)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
-	printk(KERN_ERR PFX "%s: transmit timed out, resetting\n",
-	       dev->name);
+	if (netif_msg_tx_err(tp))
+		printk(KERN_ERR PFX "%s: transmit timed out, resetting\n",
+		       dev->name);
 
 	schedule_work(&tp->reset_task);
 }
@@ -8665,7 +8668,9 @@ static int tg3_test_registers(struct tg3 *tp)
 	return 0;
 
 out:
-	printk(KERN_ERR PFX "Register test failed at offset %x\n", offset);
+	if (netif_msg_hw(tp))
+		printk(KERN_ERR PFX "Register test failed at offset %x\n",
+		       offset);
 	tw32(offset, save_val);
 	return -EIO;
 }
-- 
cgit v0.10.2


From 9d57f01c1331cb7bfd0a9d4f7723da5b9329394f Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 7 Dec 2006 00:23:25 -0800
Subject: [TG3]: Use msleep.

Change some udelay() in some eeprom functions to msleep().  Eeprom
related functions are always called from sleepable context.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 0b50f1f..cfb9098 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -9467,16 +9467,12 @@ static void __devinit tg3_get_5906_nvram_info(struct tg3 *tp)
 /* Chips other than 5700/5701 use the NVRAM for fetching info. */
 static void __devinit tg3_nvram_init(struct tg3 *tp)
 {
-	int j;
-
 	tw32_f(GRC_EEPROM_ADDR,
 	     (EEPROM_ADDR_FSM_RESET |
 	      (EEPROM_DEFAULT_CLOCK_PERIOD <<
 	       EEPROM_ADDR_CLKPERD_SHIFT)));
 
-	/* XXX schedule_timeout() ... */
-	for (j = 0; j < 100; j++)
-		udelay(10);
+	msleep(1);
 
 	/* Enable seeprom accesses. */
 	tw32_f(GRC_LOCAL_CTRL,
@@ -9537,12 +9533,12 @@ static int tg3_nvram_read_using_eeprom(struct tg3 *tp,
 	      EEPROM_ADDR_ADDR_MASK) |
 	     EEPROM_ADDR_READ | EEPROM_ADDR_START);
 
-	for (i = 0; i < 10000; i++) {
+	for (i = 0; i < 1000; i++) {
 		tmp = tr32(GRC_EEPROM_ADDR);
 
 		if (tmp & EEPROM_ADDR_COMPLETE)
 			break;
-		udelay(100);
+		msleep(1);
 	}
 	if (!(tmp & EEPROM_ADDR_COMPLETE))
 		return -EBUSY;
@@ -9667,12 +9663,12 @@ static int tg3_nvram_write_block_using_eeprom(struct tg3 *tp,
 			EEPROM_ADDR_START |
 			EEPROM_ADDR_WRITE);
 
-		for (j = 0; j < 10000; j++) {
+		for (j = 0; j < 1000; j++) {
 			val = tr32(GRC_EEPROM_ADDR);
 
 			if (val & EEPROM_ADDR_COMPLETE)
 				break;
-			udelay(100);
+			msleep(1);
 		}
 		if (!(val & EEPROM_ADDR_COMPLETE)) {
 			rc = -EBUSY;
-- 
cgit v0.10.2


From cbb45d21fb2fcbcafc19ea859350f564252a0878 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 7 Dec 2006 00:24:09 -0800
Subject: [TG3]: Identify Serdes devices more clearly.

Change the message to more clearly identify Serdes devices.

Update version to 3.70.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index cfb9098..571320ae 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -68,8 +68,8 @@
 
 #define DRV_MODULE_NAME		"tg3"
 #define PFX DRV_MODULE_NAME	": "
-#define DRV_MODULE_VERSION	"3.69"
-#define DRV_MODULE_RELDATE	"November 15, 2006"
+#define DRV_MODULE_VERSION	"3.70"
+#define DRV_MODULE_RELDATE	"December 1, 2006"
 
 #define TG3_DEF_MAC_MODE	0
 #define TG3_DEF_RX_MODE		0
@@ -11932,13 +11932,15 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 
 	pci_set_drvdata(pdev, dev);
 
-	printk(KERN_INFO "%s: Tigon3 [partno(%s) rev %04x PHY(%s)] (%s) %sBaseT Ethernet ",
+	printk(KERN_INFO "%s: Tigon3 [partno(%s) rev %04x PHY(%s)] (%s) %s Ethernet ",
 	       dev->name,
 	       tp->board_part_number,
 	       tp->pci_chip_rev_id,
 	       tg3_phy_string(tp),
 	       tg3_bus_string(tp, str),
-	       (tp->tg3_flags & TG3_FLAG_10_100_ONLY) ? "10/100" : "10/100/1000");
+	       ((tp->tg3_flags & TG3_FLAG_10_100_ONLY) ? "10/100Base-TX" :
+		((tp->tg3_flags2 & TG3_FLG2_ANY_SERDES) ? "1000Base-SX" :
+		 "10/100/1000Base-T")));
 
 	for (i = 0; i < 6; i++)
 		printk("%2.2x%c", dev->dev_addr[i],
-- 
cgit v0.10.2


From 272491ef423b6976a230a998b10f46976aa91342 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 7 Dec 2006 01:17:24 -0800
Subject: [NETFILTER]: Fix non-ANSI func. decl.

Fix non-ANSI function declaration:
net/netfilter/nf_conntrack_core.c:1096:25: warning: non-ANSI function declaration of function 'nf_conntrack_flush'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index eaa0f8a..9b3158c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1093,7 +1093,7 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
 			   get_order(sizeof(struct list_head) * size));
 }
 
-void nf_conntrack_flush()
+void nf_conntrack_flush(void)
 {
 	nf_ct_iterate_cleanup(kill_all, NULL);
 }
-- 
cgit v0.10.2


From c57ee096b6caf8f7e17abe46185d24f2b649b9f9 Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Tue, 5 Dec 2006 15:09:16 +0200
Subject: [PATCH] AT91RM9200 Ethernet: Remove 'at91_dev' and use netdev_priv()

Remove the global 'at91_dev' variable.
Use netdev_priv() instead of casting dev->priv directly.

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/arm/at91_ether.c b/drivers/net/arm/at91_ether.c
index b54b857..58da5b7 100644
--- a/drivers/net/arm/at91_ether.c
+++ b/drivers/net/arm/at91_ether.c
@@ -41,8 +41,6 @@
 #define DRV_NAME	"at91_ether"
 #define DRV_VERSION	"1.0"
 
-static struct net_device *at91_dev;
-
 static struct timer_list check_timer;
 #define LINK_POLL_INTERVAL	(HZ)
 
@@ -146,7 +144,7 @@ static void read_phy(unsigned char phy_addr, unsigned char address, unsigned int
  */
 static void update_linkspeed(struct net_device *dev, int silent)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned int bmsr, bmcr, lpa, mac_cfg;
 	unsigned int speed, duplex;
 
@@ -199,7 +197,7 @@ static void update_linkspeed(struct net_device *dev, int silent)
 static irqreturn_t at91ether_phy_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = (struct net_device *) dev_id;
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned int phy;
 
 	/*
@@ -242,7 +240,7 @@ done:
  */
 static void enable_phyirq(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned int dsintr, irq_number;
 	int status;
 
@@ -294,7 +292,7 @@ static void enable_phyirq(struct net_device *dev)
  */
 static void disable_phyirq(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned int dsintr;
 	unsigned int irq_number;
 
@@ -340,7 +338,7 @@ static void disable_phyirq(struct net_device *dev)
 #if 0
 static void reset_phy(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned int bmcr;
 
 	spin_lock_irq(&lp->lock);
@@ -590,7 +588,7 @@ static void mdio_write(struct net_device *dev, int phy_id, int location, int val
 
 static int at91ether_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	int ret;
 
 	spin_lock_irq(&lp->lock);
@@ -611,7 +609,7 @@ static int at91ether_get_settings(struct net_device *dev, struct ethtool_cmd *cm
 
 static int at91ether_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	int ret;
 
 	spin_lock_irq(&lp->lock);
@@ -627,7 +625,7 @@ static int at91ether_set_settings(struct net_device *dev, struct ethtool_cmd *cm
 
 static int at91ether_nwayreset(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	int ret;
 
 	spin_lock_irq(&lp->lock);
@@ -658,7 +656,7 @@ static const struct ethtool_ops at91ether_ethtool_ops = {
 
 static int at91ether_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	int res;
 
 	if (!netif_running(dev))
@@ -680,7 +678,7 @@ static int at91ether_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
  */
 static void at91ether_start(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	struct recv_desc_bufs *dlist, *dlist_phys;
 	int i;
 	unsigned long ctl;
@@ -712,7 +710,7 @@ static void at91ether_start(struct net_device *dev)
  */
 static int at91ether_open(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned long ctl;
 
 	if (!is_valid_ether_addr(dev->dev_addr))
@@ -752,7 +750,7 @@ static int at91ether_open(struct net_device *dev)
  */
 static int at91ether_close(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned long ctl;
 
 	/* Disable Receiver and Transmitter */
@@ -779,7 +777,7 @@ static int at91ether_close(struct net_device *dev)
  */
 static int at91ether_tx(struct sk_buff *skb, struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 
 	if (at91_emac_read(AT91_EMAC_TSR) & AT91_EMAC_TSR_BNQ) {
 		netif_stop_queue(dev);
@@ -811,7 +809,7 @@ static int at91ether_tx(struct sk_buff *skb, struct net_device *dev)
  */
 static struct net_device_stats *at91ether_stats(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	int ale, lenerr, seqe, lcol, ecol;
 
 	if (netif_running(dev)) {
@@ -847,7 +845,7 @@ static struct net_device_stats *at91ether_stats(struct net_device *dev)
  */
 static void at91ether_rx(struct net_device *dev)
 {
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	struct recv_desc_bufs *dlist;
 	unsigned char *p_recv;
 	struct sk_buff *skb;
@@ -891,7 +889,7 @@ static void at91ether_rx(struct net_device *dev)
 static irqreturn_t at91ether_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = (struct net_device *) dev_id;
-	struct at91_private *lp = (struct at91_private *) dev->priv;
+	struct at91_private *lp = netdev_priv(dev);
 	unsigned long intstatus, ctl;
 
 	/* MAC Interrupt Status register indicates what interrupts are pending.
@@ -939,9 +937,6 @@ static int __init at91ether_setup(unsigned long phy_type, unsigned short phy_add
 	unsigned int val;
 	int res;
 
-	if (at91_dev)			/* already initialized */
-		return 0;
-
 	dev = alloc_etherdev(sizeof(struct at91_private));
 	if (!dev)
 		return -ENOMEM;
@@ -957,7 +952,7 @@ static int __init at91ether_setup(unsigned long phy_type, unsigned short phy_add
 	}
 
 	/* Allocate memory for DMA Receive descriptors */
-	lp = (struct at91_private *)dev->priv;
+	lp = netdev_priv(dev);
 	lp->dlist = (struct recv_desc_bufs *) dma_alloc_coherent(NULL, sizeof(struct recv_desc_bufs), (dma_addr_t *) &lp->dlist_phys, GFP_KERNEL);
 	if (lp->dlist == NULL) {
 		free_irq(dev->irq, dev);
@@ -1024,7 +1019,6 @@ static int __init at91ether_setup(unsigned long phy_type, unsigned short phy_add
 		dma_free_coherent(NULL, sizeof(struct recv_desc_bufs), lp->dlist, (dma_addr_t)lp->dlist_phys);
 		return res;
 	}
-	at91_dev = dev;
 
 	/* Determine current link speed */
 	spin_lock_irq(&lp->lock);
@@ -1115,15 +1109,16 @@ static int __init at91ether_probe(struct platform_device *pdev)
 
 static int __devexit at91ether_remove(struct platform_device *pdev)
 {
-	struct at91_private *lp = (struct at91_private *) at91_dev->priv;
+	struct net_device *dev = platform_get_drvdata(pdev);
+	struct at91_private *lp = netdev_priv(dev);
 
-	unregister_netdev(at91_dev);
-	free_irq(at91_dev->irq, at91_dev);
+	unregister_netdev(dev);
+	free_irq(dev->irq, dev);
 	dma_free_coherent(NULL, sizeof(struct recv_desc_bufs), lp->dlist, (dma_addr_t)lp->dlist_phys);
 	clk_put(lp->ether_clk);
 
-	free_netdev(at91_dev);
-	at91_dev = NULL;
+	platform_set_drvdata(pdev, NULL);
+	free_netdev(dev);
 	return 0;
 }
 
@@ -1131,8 +1126,8 @@ static int __devexit at91ether_remove(struct platform_device *pdev)
 
 static int at91ether_suspend(struct platform_device *pdev, pm_message_t mesg)
 {
-	struct at91_private *lp = (struct at91_private *) at91_dev->priv;
 	struct net_device *net_dev = platform_get_drvdata(pdev);
+	struct at91_private *lp = netdev_priv(net_dev);
 	int phy_irq = lp->board_data.phy_irq_pin;
 
 	if (netif_running(net_dev)) {
@@ -1149,8 +1144,8 @@ static int at91ether_suspend(struct platform_device *pdev, pm_message_t mesg)
 
 static int at91ether_resume(struct platform_device *pdev)
 {
-	struct at91_private *lp = (struct at91_private *) at91_dev->priv;
 	struct net_device *net_dev = platform_get_drvdata(pdev);
+	struct at91_private *lp = netdev_priv(net_dev);
 	int phy_irq = lp->board_data.phy_irq_pin;
 
 	if (netif_running(net_dev)) {
-- 
cgit v0.10.2


From cf42553ab43e102bc98eca05523d2390a1eedde9 Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Tue, 5 Dec 2006 15:21:19 +0200
Subject: [PATCH] AT91RM9200 Ethernet: Move check_timer variable and use
 mod_timer()

Move the global 'check_timer' variable into the private data structure.
Also now use mod_timer().

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/arm/at91_ether.c b/drivers/net/arm/at91_ether.c
index 58da5b7..918368c 100644
--- a/drivers/net/arm/at91_ether.c
+++ b/drivers/net/arm/at91_ether.c
@@ -41,7 +41,6 @@
 #define DRV_NAME	"at91_ether"
 #define DRV_VERSION	"1.0"
 
-static struct timer_list check_timer;
 #define LINK_POLL_INTERVAL	(HZ)
 
 /* ..................................................................... */
@@ -250,8 +249,7 @@ static void enable_phyirq(struct net_device *dev)
 		 * PHY doesn't have an IRQ pin (RTL8201, DP83847, AC101L),
 		 * or board does not have it connected.
 		 */
-		check_timer.expires = jiffies + LINK_POLL_INTERVAL;
-		add_timer(&check_timer);
+		mod_timer(&lp->check_timer, jiffies + LINK_POLL_INTERVAL);
 		return;
 	}
 
@@ -298,7 +296,7 @@ static void disable_phyirq(struct net_device *dev)
 
 	irq_number = lp->board_data.phy_irq_pin;
 	if (!irq_number) {
-		del_timer_sync(&check_timer);
+		del_timer_sync(&lp->check_timer);
 		return;
 	}
 
@@ -360,13 +358,13 @@ static void reset_phy(struct net_device *dev)
 static void at91ether_check_link(unsigned long dev_id)
 {
 	struct net_device *dev = (struct net_device *) dev_id;
+	struct at91_private *lp = netdev_priv(dev);
 
 	enable_mdi();
 	update_linkspeed(dev, 1);
 	disable_mdi();
 
-	check_timer.expires = jiffies + LINK_POLL_INTERVAL;
-	add_timer(&check_timer);
+	mod_timer(&lp->check_timer, jiffies + LINK_POLL_INTERVAL);
 }
 
 /* ......................... ADDRESS MANAGEMENT ........................ */
@@ -1030,9 +1028,9 @@ static int __init at91ether_setup(unsigned long phy_type, unsigned short phy_add
 
 	/* If board has no PHY IRQ, use a timer to poll the PHY */
 	if (!lp->board_data.phy_irq_pin) {
-		init_timer(&check_timer);
-		check_timer.data = (unsigned long)dev;
-		check_timer.function = at91ether_check_link;
+		init_timer(&lp->check_timer);
+		lp->check_timer.data = (unsigned long)dev;
+		lp->check_timer.function = at91ether_check_link;
 	}
 
 	/* Display ethernet banner */
diff --git a/drivers/net/arm/at91_ether.h b/drivers/net/arm/at91_ether.h
index d1e72e0..b6b665d 100644
--- a/drivers/net/arm/at91_ether.h
+++ b/drivers/net/arm/at91_ether.h
@@ -87,6 +87,7 @@ struct at91_private
 	spinlock_t lock;			/* lock for MDI interface */
 	short phy_media;			/* media interface type */
 	unsigned short phy_address;		/* 5-bit MDI address of PHY (0..31) */
+	struct timer_list check_timer;		/* Poll link status */
 
 	/* Transmit */
 	struct sk_buff *skb;			/* holds skb until xmit interrupt completes */
-- 
cgit v0.10.2


From 51cc21045714cc9f48eb6901d95eb4e552ef2ca4 Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Tue, 5 Dec 2006 15:33:05 +0200
Subject: [PATCH] AT91RM9200 Ethernet: Add netpoll / netconsole support

Adds netpoll / netconsole support.

Original patch from Bill Gatliff.

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/arm/at91_ether.c b/drivers/net/arm/at91_ether.c
index 918368c..f33d957 100644
--- a/drivers/net/arm/at91_ether.c
+++ b/drivers/net/arm/at91_ether.c
@@ -923,6 +923,17 @@ static irqreturn_t at91ether_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void at91ether_poll_controller(struct net_device *dev)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	at91ether_interrupt(dev->irq, dev);
+	local_irq_restore(flags);
+}
+#endif
+
 /*
  * Initialize the ethernet interface
  */
@@ -972,6 +983,9 @@ static int __init at91ether_setup(unsigned long phy_type, unsigned short phy_add
 	dev->set_mac_address = set_mac_address;
 	dev->ethtool_ops = &at91ether_ethtool_ops;
 	dev->do_ioctl = at91ether_ioctl;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	dev->poll_controller = at91ether_poll_controller;
+#endif
 
 	SET_NETDEV_DEV(dev, &pdev->dev);
 
-- 
cgit v0.10.2


From a3f63e4f4be0da938771d754e846ff0019f9d42e Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Tue, 5 Dec 2006 15:37:02 +0200
Subject: [PATCH] AT91RM9200 Ethernet: Use dev_alloc_skb()

Use dev_alloc_skb() instead of alloc_skb().

It is also not necessary to adjust skb->len manually since that's
already done by skb_put().

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/arm/at91_ether.c b/drivers/net/arm/at91_ether.c
index f33d957..fada15d 100644
--- a/drivers/net/arm/at91_ether.c
+++ b/drivers/net/arm/at91_ether.c
@@ -853,14 +853,13 @@ static void at91ether_rx(struct net_device *dev)
 	while (dlist->descriptors[lp->rxBuffIndex].addr & EMAC_DESC_DONE) {
 		p_recv = dlist->recv_buf[lp->rxBuffIndex];
 		pktlen = dlist->descriptors[lp->rxBuffIndex].size & 0x7ff;	/* Length of frame including FCS */
-		skb = alloc_skb(pktlen + 2, GFP_ATOMIC);
+		skb = dev_alloc_skb(pktlen + 2);
 		if (skb != NULL) {
 			skb_reserve(skb, 2);
 			memcpy(skb_put(skb, pktlen), p_recv, pktlen);
 
 			skb->dev = dev;
 			skb->protocol = eth_type_trans(skb, dev);
-			skb->len = pktlen;
 			dev->last_rx = jiffies;
 			lp->stats.rx_bytes += pktlen;
 			netif_rx(skb);
-- 
cgit v0.10.2


From 2a45b49c30c422c83c9227cb8ca99f129a5cf5d0 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 4 Dec 2006 15:53:16 -0800
Subject: [PATCH] sky2: add PCI for 88ec033

Add another new/missing pci id for 88ec033 chip.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 842abd9..a0563e0 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -117,6 +117,7 @@ static const struct pci_device_id sky2_id_table[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4351) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4352) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4353) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4356) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4360) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4361) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4362) },
-- 
cgit v0.10.2


From e5b74c7ddd46d1779bea21d7c8efb39bbcc3df21 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 4 Dec 2006 15:53:36 -0800
Subject: [PATCH] sky2: add comments to PCI ids

Add comments to sky2 driver to show relationship between PCI id and
hardware.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index a0563e0..b1df594 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -100,34 +100,32 @@ module_param(idle_timeout, int, 0);
 MODULE_PARM_DESC(idle_timeout, "Watchdog timer for lost interrupts (ms)");
 
 static const struct pci_device_id sky2_id_table[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, /* SK-9Sxx */
+	{ PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, /* SK-9Exx */
 	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4b00) },	/* DGE-560T */
 	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4001) }, 	/* DGE-550SX */
 	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4B02) },	/* DGE-560SX */
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4340) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4341) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4342) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4343) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4344) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4345) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4346) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4347) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4350) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4351) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4352) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4353) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4356) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4360) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4361) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4362) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4363) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4364) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4365) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4366) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4367) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4368) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4369) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4340) }, /* 88E8021 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4341) }, /* 88E8022 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4342) }, /* 88E8061 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4343) }, /* 88E8062 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4344) }, /* 88E8021 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4345) }, /* 88E8022 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4346) }, /* 88E8061 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4347) }, /* 88E8062 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4350) }, /* 88E8035 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4351) }, /* 88E8036 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4352) }, /* 88E8038 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4353) }, /* 88E8039 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4356) }, /* 88EC033 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4360) }, /* 88E8052 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4361) }, /* 88E8050 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4362) }, /* 88E8053 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4363) }, /* 88E8055 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4364) }, /* 88E8056 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4366) }, /* 88EC036 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4367) }, /* 88EC032 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4368) }, /* 88EC034 */
 	{ 0 }
 };
 
-- 
cgit v0.10.2


From 6771290102c4703dae56bc3e121deb63530e206c Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 4 Dec 2006 15:53:45 -0800
Subject: [PATCH] sky2: beter ram buffer partitioning

Different chips have different sizes of ram buffers, and some versions have
no ram buffer at all!.  Be more careful about sizing the ram usage because
it maybe a problem if vendor keeps changing sizes.

There is the (unlikely) possibility that some of the errors on some of the
chips have been caused by partitioning not on a 1K boundary.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index b1df594..b9f7eb5 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -696,10 +696,15 @@ static void sky2_mac_init(struct sky2_hw *hw, unsigned port)
 
 }
 
-/* Assign Ram Buffer allocation in units of 64bit (8 bytes) */
-static void sky2_ramset(struct sky2_hw *hw, u16 q, u32 start, u32 end)
+/* Assign Ram Buffer allocation to queue */
+static void sky2_ramset(struct sky2_hw *hw, u16 q, u32 start, u32 space)
 {
-	pr_debug(PFX "q %d %#x %#x\n", q, start, end);
+	u32 end;
+
+	/* convert from K bytes to qwords used for hw register */
+	start *= 1024/8;
+	space *= 1024/8;
+	end = start + space - 1;
 
 	sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_RST_CLR);
 	sky2_write32(hw, RB_ADDR(q, RB_START), start);
@@ -708,7 +713,6 @@ static void sky2_ramset(struct sky2_hw *hw, u16 q, u32 start, u32 end)
 	sky2_write32(hw, RB_ADDR(q, RB_RP), start);
 
 	if (q == Q_R1 || q == Q_R2) {
-		u32 space = end - start + 1;
 		u32 tp = space - space/4;
 
 		/* On receive queue's set the thresholds
@@ -1138,7 +1142,7 @@ static int sky2_up(struct net_device *dev)
 	struct sky2_port *sky2 = netdev_priv(dev);
 	struct sky2_hw *hw = sky2->hw;
 	unsigned port = sky2->port;
-	u32 ramsize, rxspace, imask;
+	u32 ramsize, imask;
 	int cap, err = -ENOMEM;
 	struct net_device *otherdev = hw->dev[sky2->port^1];
 
@@ -1191,20 +1195,25 @@ static int sky2_up(struct net_device *dev)
 
 	sky2_mac_init(hw, port);
 
-	/* Determine available ram buffer space in qwords.  */
-	ramsize = sky2_read8(hw, B2_E_0) * 4096/8;
+	/* Register is number of 4K blocks on internal RAM buffer. */
+	ramsize = sky2_read8(hw, B2_E_0) * 4;
+	printk(KERN_INFO PFX "%s: ram buffer %dK\n", dev->name, ramsize);
 
-	if (ramsize > 6*1024/8)
-		rxspace = ramsize - (ramsize + 2) / 3;
-	else
-		rxspace = ramsize / 2;
+	if (ramsize > 0) {
+		u32 rxspace;
 
-	sky2_ramset(hw, rxqaddr[port], 0, rxspace-1);
-	sky2_ramset(hw, txqaddr[port], rxspace, ramsize-1);
+		if (ramsize < 16)
+			rxspace = ramsize / 2;
+		else
+			rxspace = 8 + (2*(ramsize - 16))/3;
 
-	/* Make sure SyncQ is disabled */
-	sky2_write8(hw, RB_ADDR(port == 0 ? Q_XS1 : Q_XS2, RB_CTRL),
-		    RB_RST_SET);
+		sky2_ramset(hw, rxqaddr[port], 0, rxspace);
+		sky2_ramset(hw, txqaddr[port], rxspace, ramsize - rxspace);
+
+		/* Make sure SyncQ is disabled */
+		sky2_write8(hw, RB_ADDR(port == 0 ? Q_XS1 : Q_XS2, RB_CTRL),
+			    RB_RST_SET);
+	}
 
 	sky2_qset(hw, txqaddr[port]);
 
-- 
cgit v0.10.2


From c3905bc4b71ab562acf69765e8c4778bd263b9db Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 4 Dec 2006 17:08:19 -0800
Subject: [PATCH] sky2: receive queue watermark tweak

This patch makes the receive performance on some systems go from
714MB/s to 941MB/s. It adjusts the watermark of the receive queue
to be lower, thereby avoiding excess hardware flow control. This is
most important on the systems which have little/no additional buffering.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index b9f7eb5..a8e0963 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1062,11 +1062,16 @@ static int sky2_rx_start(struct sky2_port *sky2)
 	sky2->rx_put = sky2->rx_next = 0;
 	sky2_qset(hw, rxq);
 
+	/* On PCI express lowering the watermark gives better performance */
+	if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP))
+		sky2_write32(hw, Q_ADDR(rxq, Q_WM), BMU_WM_PEX);
+
+	/* These chips have no ram buffer?
+	 * MAC Rx RAM Read is controlled by hardware */
 	if (hw->chip_id == CHIP_ID_YUKON_EC_U &&
-	    (hw->chip_rev == CHIP_REV_YU_EC_U_A1 || hw->chip_rev == CHIP_REV_YU_EC_U_B0)) {
-		/* MAC Rx RAM Read is controlled by hardware */
+	    (hw->chip_rev == CHIP_REV_YU_EC_U_A1
+	     || hw->chip_rev == CHIP_REV_YU_EC_U_B0))
 		sky2_write32(hw, Q_ADDR(rxq, Q_F), F_M_RX_RAM_DIS);
-	}
 
 	sky2_prefetch_init(hw, rxq, sky2->rx_le_map, RX_LE_SIZE - 1);
 
diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h
index 7760545..a63f605 100644
--- a/drivers/net/sky2.h
+++ b/drivers/net/sky2.h
@@ -680,6 +680,7 @@ enum {
 			  BMU_FIFO_ENA | BMU_OP_ON,
 
 	BMU_WM_DEFAULT = 0x600,
+	BMU_WM_PEX     = 0x80,
 };
 
 /* Tx BMU Control / Status Registers (Yukon-2) */
-- 
cgit v0.10.2


From e67bda55e27d3308ba0b4ce8cf2da51850ef1453 Mon Sep 17 00:00:00 2001
From: Brice Goglin <brice@myri.com>
Date: Tue, 5 Dec 2006 17:26:27 +0100
Subject: [PATCH] myri10ge: write as 2 32-byte blocks in myri10ge_submit_8rx

In the myri10ge_submit_8rx() routine, write the 64 byte request block as
2 32-byte blocks so that it is handled by the hardware pio write handler
if write-combining is enabled.

Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index b8f57df..81f127a 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -795,7 +795,9 @@ myri10ge_submit_8rx(struct mcp_kreq_ether_recv __iomem * dst,
 
 	low = src->addr_low;
 	src->addr_low = htonl(DMA_32BIT_MASK);
-	myri10ge_pio_copy(dst, src, 8 * sizeof(*src));
+	myri10ge_pio_copy(dst, src, 4 * sizeof(*src));
+	mb();
+	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof(*src));
 	mb();
 	src->addr_low = low;
 	put_be32(low, &dst->addr_low);
-- 
cgit v0.10.2


From 7f4b45c5269049e223eda31c7e3879c226039e4a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 5 Dec 2006 12:02:50 -0800
Subject: [PATCH] skge: fix sparse warnings

Fix sparse warnings from using enum as part of arithmetic
expression, and comment indentation fixes

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/skge.h b/drivers/net/skge.h
index 23e5275..f6223c5 100644
--- a/drivers/net/skge.h
+++ b/drivers/net/skge.h
@@ -389,10 +389,10 @@ enum {
 /* Packet Arbiter Registers */
 /*	B3_PA_CTRL		16 bit	Packet Arbiter Ctrl Register */
 enum {
-	PA_CLR_TO_TX2	= 1<<13,	/* Clear IRQ Packet Timeout TX2 */
-	PA_CLR_TO_TX1	= 1<<12,	/* Clear IRQ Packet Timeout TX1 */
-	PA_CLR_TO_RX2	= 1<<11,	/* Clear IRQ Packet Timeout RX2 */
-	PA_CLR_TO_RX1	= 1<<10,	/* Clear IRQ Packet Timeout RX1 */
+	PA_CLR_TO_TX2	= 1<<13,/* Clear IRQ Packet Timeout TX2 */
+	PA_CLR_TO_TX1	= 1<<12,/* Clear IRQ Packet Timeout TX1 */
+	PA_CLR_TO_RX2	= 1<<11,/* Clear IRQ Packet Timeout RX2 */
+	PA_CLR_TO_RX1	= 1<<10,/* Clear IRQ Packet Timeout RX1 */
 	PA_ENA_TO_TX2	= 1<<9,	/* Enable  Timeout Timer TX2 */
 	PA_DIS_TO_TX2	= 1<<8,	/* Disable Timeout Timer TX2 */
 	PA_ENA_TO_TX1	= 1<<7,	/* Enable  Timeout Timer TX1 */
@@ -481,14 +481,14 @@ enum {
 /* RAM Buffer Register Offsets */
 enum {
 
-	RB_START	= 0x00,/* 32 bit	RAM Buffer Start Address */
+	RB_START= 0x00,/* 32 bit	RAM Buffer Start Address */
 	RB_END	= 0x04,/* 32 bit	RAM Buffer End Address */
 	RB_WP	= 0x08,/* 32 bit	RAM Buffer Write Pointer */
 	RB_RP	= 0x0c,/* 32 bit	RAM Buffer Read Pointer */
-	RB_RX_UTPP	= 0x10,/* 32 bit	Rx Upper Threshold, Pause Packet */
-	RB_RX_LTPP	= 0x14,/* 32 bit	Rx Lower Threshold, Pause Packet */
-	RB_RX_UTHP	= 0x18,/* 32 bit	Rx Upper Threshold, High Prio */
-	RB_RX_LTHP	= 0x1c,/* 32 bit	Rx Lower Threshold, High Prio */
+	RB_RX_UTPP= 0x10,/* 32 bit	Rx Upper Threshold, Pause Packet */
+	RB_RX_LTPP= 0x14,/* 32 bit	Rx Lower Threshold, Pause Packet */
+	RB_RX_UTHP= 0x18,/* 32 bit	Rx Upper Threshold, High Prio */
+	RB_RX_LTHP= 0x1c,/* 32 bit	Rx Lower Threshold, High Prio */
 	/* 0x10 - 0x1f:	reserved at Tx RAM Buffer Registers */
 	RB_PC	= 0x20,/* 32 bit	RAM Buffer Packet Counter */
 	RB_LEV	= 0x24,/* 32 bit	RAM Buffer Level Register */
@@ -532,7 +532,7 @@ enum {
 	PHY_ADDR_MARV	= 0,
 };
 
-#define RB_ADDR(offs, queue) (B16_RAM_REGS + (queue) + (offs))
+#define RB_ADDR(offs, queue) ((u16)B16_RAM_REGS + (u16)(queue) + (offs))
 
 /* Receive MAC FIFO, Receive LED, and Link_Sync regs (GENESIS only) */
 enum {
@@ -578,15 +578,15 @@ enum {
 	MFF_DIS_TIST	= 1<<2,	/* Disable Time Stamp Gener */
 	MFF_CLR_INTIST	= 1<<1,	/* Clear IRQ No Time Stamp */
 	MFF_CLR_INSTAT	= 1<<0,	/* Clear IRQ No Status */
-#define MFF_RX_CTRL_DEF MFF_ENA_TIM_PAT
+	MFF_RX_CTRL_DEF = MFF_ENA_TIM_PAT,
 };
 
 /*	TX_MFF_CTRL1	16 bit	Transmit MAC FIFO Control Reg 1 */
 enum {
-	MFF_CLR_PERR	= 1<<15,	/* Clear Parity Error IRQ */
-								/* Bit 14:	reserved */
-	MFF_ENA_PKT_REC	= 1<<13,	/* Enable  Packet Recovery */
-	MFF_DIS_PKT_REC	= 1<<12,	/* Disable Packet Recovery */
+	MFF_CLR_PERR	= 1<<15, /* Clear Parity Error IRQ */
+
+	MFF_ENA_PKT_REC	= 1<<13, /* Enable  Packet Recovery */
+	MFF_DIS_PKT_REC	= 1<<12, /* Disable Packet Recovery */
 
 	MFF_ENA_W4E	= 1<<7,	/* Enable  Wait for Empty */
 	MFF_DIS_W4E	= 1<<6,	/* Disable Wait for Empty */
@@ -595,9 +595,10 @@ enum {
 	MFF_DIS_LOOPB	= 1<<2,	/* Disable Loopback */
 	MFF_CLR_MAC_RST	= 1<<1,	/* Clear XMAC Reset */
 	MFF_SET_MAC_RST	= 1<<0,	/* Set   XMAC Reset */
+
+	MFF_TX_CTRL_DEF	 = MFF_ENA_PKT_REC | (u16) MFF_ENA_TIM_PAT | MFF_ENA_FLUSH,
 };
 
-#define MFF_TX_CTRL_DEF	(MFF_ENA_PKT_REC | MFF_ENA_TIM_PAT | MFF_ENA_FLUSH)
 
 /*	RX_MFF_TST2	 	 8 bit	Receive MAC FIFO Test Register 2 */
 /*	TX_MFF_TST2	 	 8 bit	Transmit MAC FIFO Test Register 2 */
@@ -1304,8 +1305,8 @@ enum {
 
 /* special defines for FIBER (88E1011S only) */
 enum {
-	PHY_M_AN_ASP_X	= 1<<8, /* Asymmetric Pause */
-	PHY_M_AN_PC_X	= 1<<7, /* MAC Pause implemented */
+	PHY_M_AN_ASP_X		= 1<<8, /* Asymmetric Pause */
+	PHY_M_AN_PC_X		= 1<<7, /* MAC Pause implemented */
 	PHY_M_AN_1000X_AHD	= 1<<6, /* Advertise 10000Base-X Half Duplex */
 	PHY_M_AN_1000X_AFD	= 1<<5, /* Advertise 10000Base-X Full Duplex */
 };
@@ -1320,7 +1321,7 @@ enum {
 
 /*****  PHY_MARV_1000T_CTRL	16 bit r/w	1000Base-T Control Reg *****/
 enum {
-	PHY_M_1000C_TEST	= 7<<13,/* Bit 15..13:	Test Modes */
+	PHY_M_1000C_TEST= 7<<13,/* Bit 15..13:	Test Modes */
 	PHY_M_1000C_MSE	= 1<<12, /* Manual Master/Slave Enable */
 	PHY_M_1000C_MSC	= 1<<11, /* M/S Configuration (1=Master) */
 	PHY_M_1000C_MPD	= 1<<10, /* Multi-Port Device */
@@ -1349,7 +1350,7 @@ enum {
 	PHY_M_PC_EN_DET_PLUS	= 3<<8, /* Energy Detect Plus (Mode 2) */
 };
 
-#define PHY_M_PC_MDI_XMODE(x)	(((x)<<5) & PHY_M_PC_MDIX_MSK)
+#define PHY_M_PC_MDI_XMODE(x)	((((u16)(x)<<5) & PHY_M_PC_MDIX_MSK)
 
 enum {
 	PHY_M_PC_MAN_MDI	= 0, /* 00 = Manual MDI configuration */
@@ -1432,24 +1433,24 @@ enum {
 	PHY_M_EC_DIS_LINK_P = 1<<12, /* Disable Link Pulses (88E1111 only) */
 	PHY_M_EC_M_DSC_MSK  = 3<<10, /* Bit 11..10:	Master Downshift Counter */
 					/* (88E1011 only) */
-	PHY_M_EC_S_DSC_MSK	= 3<<8,/* Bit  9.. 8:	Slave  Downshift Counter */
+	PHY_M_EC_S_DSC_MSK  = 3<<8,  /* Bit  9.. 8:	Slave  Downshift Counter */
 				       /* (88E1011 only) */
-	PHY_M_EC_M_DSC_MSK2	= 7<<9,/* Bit 11.. 9:	Master Downshift Counter */
+	PHY_M_EC_M_DSC_MSK2  = 7<<9, /* Bit 11.. 9:	Master Downshift Counter */
 					/* (88E1111 only) */
-	PHY_M_EC_DOWN_S_ENA	= 1<<8, /* Downshift Enable (88E1111 only) */
+	PHY_M_EC_DOWN_S_ENA  = 1<<8, /* Downshift Enable (88E1111 only) */
 					/* !!! Errata in spec. (1 = disable) */
-	PHY_M_EC_RX_TIM_CT	= 1<<7, /* RGMII Rx Timing Control*/
-	PHY_M_EC_MAC_S_MSK	= 7<<4,/* Bit  6.. 4:	Def. MAC interface speed */
-	PHY_M_EC_FIB_AN_ENA	= 1<<3, /* Fiber Auto-Neg. Enable (88E1011S only) */
-	PHY_M_EC_DTE_D_ENA	= 1<<2, /* DTE Detect Enable (88E1111 only) */
-	PHY_M_EC_TX_TIM_CT	= 1<<1, /* RGMII Tx Timing Control */
-	PHY_M_EC_TRANS_DIS	= 1<<0, /* Transmitter Disable (88E1111 only) */};
-
-#define PHY_M_EC_M_DSC(x)	((x)<<10) /* 00=1x; 01=2x; 10=3x; 11=4x */
-#define PHY_M_EC_S_DSC(x)	((x)<<8) /* 00=dis; 01=1x; 10=2x; 11=3x */
-#define PHY_M_EC_MAC_S(x)	((x)<<4) /* 01X=0; 110=2.5; 111=25 (MHz) */
-
-#define PHY_M_EC_M_DSC_2(x)	((x)<<9) /* 000=1x; 001=2x; 010=3x; 011=4x */
+	PHY_M_EC_RX_TIM_CT   = 1<<7, /* RGMII Rx Timing Control*/
+	PHY_M_EC_MAC_S_MSK   = 7<<4, /* Bit  6.. 4:	Def. MAC interface speed */
+	PHY_M_EC_FIB_AN_ENA  = 1<<3, /* Fiber Auto-Neg. Enable (88E1011S only) */
+	PHY_M_EC_DTE_D_ENA   = 1<<2, /* DTE Detect Enable (88E1111 only) */
+	PHY_M_EC_TX_TIM_CT   = 1<<1, /* RGMII Tx Timing Control */
+	PHY_M_EC_TRANS_DIS   = 1<<0, /* Transmitter Disable (88E1111 only) */};
+
+#define PHY_M_EC_M_DSC(x)	((u16)(x)<<10) /* 00=1x; 01=2x; 10=3x; 11=4x */
+#define PHY_M_EC_S_DSC(x)	((u16)(x)<<8) /* 00=dis; 01=1x; 10=2x; 11=3x */
+#define PHY_M_EC_MAC_S(x)	((u16)(x)<<4) /* 01X=0; 110=2.5; 111=25 (MHz) */
+
+#define PHY_M_EC_M_DSC_2(x)	((u16)(x)<<9) /* 000=1x; 001=2x; 010=3x; 011=4x */
 											/* 100=5x; 101=6x; 110=7x; 111=8x */
 enum {
 	MAC_TX_CLK_0_MHZ	= 2,
@@ -1468,10 +1469,12 @@ enum {
 	PHY_M_LEDC_LK_C_MSK	= 7<<3,/* Bit  5.. 3: Link Control Mask */
 					/* (88E1111 only) */
 };
+#define PHY_M_LED_PULS_DUR(x)	(((u16)(x)<<12) & PHY_M_LEDC_PULS_MSK)
+#define PHY_M_LED_BLINK_RT(x)	(((u16)(x)<<8) & PHY_M_LEDC_BL_R_MSK)
 
 enum {
-	PHY_M_LEDC_LINK_MSK	= 3<<3,/* Bit  4.. 3: Link Control Mask */
-									/* (88E1011 only) */
+	PHY_M_LEDC_LINK_MSK	= 3<<3, /* Bit  4.. 3: Link Control Mask */
+					/* (88E1011 only) */
 	PHY_M_LEDC_DP_CTRL	= 1<<2, /* Duplex Control */
 	PHY_M_LEDC_DP_C_MSB	= 1<<2, /* Duplex Control (MSB, 88E1111 only) */
 	PHY_M_LEDC_RX_CTRL	= 1<<1, /* Rx Activity / Link */
@@ -1479,27 +1482,24 @@ enum {
 	PHY_M_LEDC_TX_C_MSB	= 1<<0, /* Tx Control (MSB, 88E1111 only) */
 };
 
-#define PHY_M_LED_PULS_DUR(x)	(((x)<<12) & PHY_M_LEDC_PULS_MSK)
-
 enum {
-	PULS_NO_STR	= 0,/* no pulse stretching */
-	PULS_21MS	= 1,/* 21 ms to 42 ms */
-	PULS_42MS	= 2,/* 42 ms to 84 ms */
-	PULS_84MS	= 3,/* 84 ms to 170 ms */
-	PULS_170MS	= 4,/* 170 ms to 340 ms */
-	PULS_340MS	= 5,/* 340 ms to 670 ms */
-	PULS_670MS	= 6,/* 670 ms to 1.3 s */
-	PULS_1300MS	= 7,/* 1.3 s to 2.7 s */
+	PULS_NO_STR	= 0, /* no pulse stretching */
+	PULS_21MS	= 1, /* 21 ms to 42 ms */
+	PULS_42MS	= 2, /* 42 ms to 84 ms */
+	PULS_84MS	= 3, /* 84 ms to 170 ms */
+	PULS_170MS	= 4, /* 170 ms to 340 ms */
+	PULS_340MS	= 5, /* 340 ms to 670 ms */
+	PULS_670MS	= 6, /* 670 ms to 1.3 s */
+	PULS_1300MS	= 7, /* 1.3 s to 2.7 s */
 };
 
-#define PHY_M_LED_BLINK_RT(x)	(((x)<<8) & PHY_M_LEDC_BL_R_MSK)
 
 enum {
-	BLINK_42MS	= 0,/* 42 ms */
-	BLINK_84MS	= 1,/* 84 ms */
-	BLINK_170MS	= 2,/* 170 ms */
-	BLINK_340MS	= 3,/* 340 ms */
-	BLINK_670MS	= 4,/* 670 ms */
+	BLINK_42MS	= 0, /* 42 ms */
+	BLINK_84MS	= 1, /* 84 ms */
+	BLINK_170MS	= 2, /* 170 ms */
+	BLINK_340MS	= 3, /* 340 ms */
+	BLINK_670MS	= 4, /* 670 ms */
 };
 
 /*****  PHY_MARV_LED_OVER	16 bit r/w	Manual LED Override Reg *****/
@@ -1525,7 +1525,7 @@ enum {
 	PHY_M_EC2_FO_IMPED	= 1<<5, /* Fiber Output Impedance */
 	PHY_M_EC2_FO_M_CLK	= 1<<4, /* Fiber Mode Clock Enable */
 	PHY_M_EC2_FO_BOOST	= 1<<3, /* Fiber Output Boost */
-	PHY_M_EC2_FO_AM_MSK	= 7,/* Bit  2.. 0:	Fiber Output Amplitude */
+	PHY_M_EC2_FO_AM_MSK	= 7, /* Bit  2.. 0:	Fiber Output Amplitude */
 };
 
 /*****  PHY_MARV_EXT_P_STAT 16 bit r/w	Ext. PHY Specific Status *****/
@@ -1550,7 +1550,7 @@ enum {
 	PHY_M_CABD_DIS_WAIT	= 1<<15, /* Disable Waiting Period (Page 1) */
 					/* (88E1111 only) */
 	PHY_M_CABD_STAT_MSK	= 3<<13, /* Bit 14..13: Status Mask */
-	PHY_M_CABD_AMPL_MSK	= 0x1f<<8,/* Bit 12.. 8: Amplitude Mask */
+	PHY_M_CABD_AMPL_MSK	= 0x1f<<8, /* Bit 12.. 8: Amplitude Mask */
 					/* (88E1111 only) */
 	PHY_M_CABD_DIST_MSK	= 0xff, /* Bit  7.. 0: Distance Mask */
 };
@@ -1605,9 +1605,9 @@ enum {
 
 /*****  PHY_MARV_PHY_CTRL (page 3)		16 bit r/w	LED Control Reg. *****/
 enum {
-	PHY_M_LEDC_LOS_MSK	= 0xf<<12,/* Bit 15..12: LOS LED Ctrl. Mask */
+	PHY_M_LEDC_LOS_MSK	= 0xf<<12, /* Bit 15..12: LOS LED Ctrl. Mask */
 	PHY_M_LEDC_INIT_MSK	= 0xf<<8, /* Bit 11.. 8: INIT LED Ctrl. Mask */
-	PHY_M_LEDC_STA1_MSK	= 0xf<<4,/* Bit  7.. 4: STAT1 LED Ctrl. Mask */
+	PHY_M_LEDC_STA1_MSK	= 0xf<<4, /* Bit  7.. 4: STAT1 LED Ctrl. Mask */
 	PHY_M_LEDC_STA0_MSK	= 0xf, /* Bit  3.. 0: STAT0 LED Ctrl. Mask */
 };
 
@@ -1804,8 +1804,8 @@ enum {
 
 /*	GM_SMI_CTRL			16 bit r/w	SMI Control Register */
 enum {
-	GM_SMI_CT_PHY_A_MSK	= 0x1f<<11,/* Bit 15..11:	PHY Device Address */
-	GM_SMI_CT_REG_A_MSK	= 0x1f<<6,/* Bit 10.. 6:	PHY Register Address */
+	GM_SMI_CT_PHY_A_MSK	= 0x1f<<11, /* Bit 15..11:	PHY Device Address */
+	GM_SMI_CT_REG_A_MSK	= 0x1f<<6, /* Bit 10.. 6:	PHY Register Address */
 	GM_SMI_CT_OP_RD		= 1<<5,	/* Bit  5:	OpCode Read (0=Write)*/
 	GM_SMI_CT_RD_VAL	= 1<<4,	/* Bit  4:	Read Valid (Read completed) */
 	GM_SMI_CT_BUSY		= 1<<3,	/* Bit  3:	Busy (Operation in progress) */
@@ -1875,9 +1875,9 @@ enum {
 
 /*	TX_GMF_CTRL_T	32 bit	Tx GMAC FIFO Control/Test */
 enum {
-	GMF_WSP_TST_ON	= 1<<18,/* Write Shadow Pointer Test On */
-	GMF_WSP_TST_OFF	= 1<<17,/* Write Shadow Pointer Test Off */
-	GMF_WSP_STEP	= 1<<16,/* Write Shadow Pointer Step/Increment */
+	GMF_WSP_TST_ON	= 1<<18, /* Write Shadow Pointer Test On */
+	GMF_WSP_TST_OFF	= 1<<17, /* Write Shadow Pointer Test Off */
+	GMF_WSP_STEP	= 1<<16, /* Write Shadow Pointer Step/Increment */
 
 	GMF_CLI_TX_FU	= 1<<6,	/* Clear IRQ Tx FIFO Underrun */
 	GMF_CLI_TX_FC	= 1<<5,	/* Clear IRQ Tx Frame Complete */
@@ -2111,18 +2111,18 @@ enum {
 
 /*	XM_MMU_CMD	16 bit r/w	MMU Command Register */
 enum {
-	XM_MMU_PHY_RDY	= 1<<12,/* Bit 12:	PHY Read Ready */
-	XM_MMU_PHY_BUSY	= 1<<11,/* Bit 11:	PHY Busy */
-	XM_MMU_IGN_PF	= 1<<10,/* Bit 10:	Ignore Pause Frame */
-	XM_MMU_MAC_LB	= 1<<9,	/* Bit  9:	Enable MAC Loopback */
-	XM_MMU_FRC_COL	= 1<<7,	/* Bit  7:	Force Collision */
-	XM_MMU_SIM_COL	= 1<<6,	/* Bit  6:	Simulate Collision */
-	XM_MMU_NO_PRE	= 1<<5,	/* Bit  5:	No MDIO Preamble */
-	XM_MMU_GMII_FD	= 1<<4,	/* Bit  4:	GMII uses Full Duplex */
-	XM_MMU_RAT_CTRL	= 1<<3,	/* Bit  3:	Enable Rate Control */
-	XM_MMU_GMII_LOOP= 1<<2,	/* Bit  2:	PHY is in Loopback Mode */
-	XM_MMU_ENA_RX	= 1<<1,	/* Bit  1:	Enable Receiver */
-	XM_MMU_ENA_TX	= 1<<0,	/* Bit  0:	Enable Transmitter */
+	XM_MMU_PHY_RDY	= 1<<12, /* Bit 12:	PHY Read Ready */
+	XM_MMU_PHY_BUSY	= 1<<11, /* Bit 11:	PHY Busy */
+	XM_MMU_IGN_PF	= 1<<10, /* Bit 10:	Ignore Pause Frame */
+	XM_MMU_MAC_LB	= 1<<9,	 /* Bit  9:	Enable MAC Loopback */
+	XM_MMU_FRC_COL	= 1<<7,	 /* Bit  7:	Force Collision */
+	XM_MMU_SIM_COL	= 1<<6,	 /* Bit  6:	Simulate Collision */
+	XM_MMU_NO_PRE	= 1<<5,	 /* Bit  5:	No MDIO Preamble */
+	XM_MMU_GMII_FD	= 1<<4,	 /* Bit  4:	GMII uses Full Duplex */
+	XM_MMU_RAT_CTRL	= 1<<3,	 /* Bit  3:	Enable Rate Control */
+	XM_MMU_GMII_LOOP= 1<<2,	 /* Bit  2:	PHY is in Loopback Mode */
+	XM_MMU_ENA_RX	= 1<<1,	 /* Bit  1:	Enable Receiver */
+	XM_MMU_ENA_TX	= 1<<0,	 /* Bit  0:	Enable Transmitter */
 };
 
 
@@ -2506,7 +2506,7 @@ static inline void skge_write8(const struct skge_hw *hw, int reg, u8 val)
 }
 
 /* MAC Related Registers inside the device. */
-#define SK_REG(port,reg)	(((port)<<7)+(reg))
+#define SK_REG(port,reg)	(((port)<<7)+(u16)(reg))
 #define SK_XMAC_REG(port, reg) \
 	((BASE_XMAC_1 + (port) * (BASE_XMAC_2 - BASE_XMAC_1)) | (reg) << 1)
 
-- 
cgit v0.10.2


From 0efdf2626676db4b30d343ff88f8461ad09130da Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 5 Dec 2006 12:03:41 -0800
Subject: [PATCH] sky2: sparse warnings

Get rid of sparse warnings in sky2 driver because of mixed enum
usage.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index a8e0963..fb1d2c3 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -521,7 +521,7 @@ static void sky2_phy_init(struct sky2_hw *hw, unsigned port)
 		/* set Tx LED (LED_TX) to blink mode on Rx OR Tx activity */
 		ledctrl |= PHY_M_LED_BLINK_RT(BLINK_84MS) | PHY_M_LEDC_TX_CTRL;
 		/* turn off the Rx LED (LED_RX) */
-		ledover |= PHY_M_LED_MO_RX(MO_LED_OFF);
+		ledover &= ~PHY_M_LED_MO_RX;
 	}
 
 	if (hw->chip_id == CHIP_ID_YUKON_EC_U && hw->chip_rev == CHIP_REV_YU_EC_A1) {
@@ -544,7 +544,7 @@ static void sky2_phy_init(struct sky2_hw *hw, unsigned port)
 
 		if (sky2->autoneg == AUTONEG_DISABLE || sky2->speed == SPEED_100) {
 			/* turn on 100 Mbps LED (LED_LINK100) */
-			ledover |= PHY_M_LED_MO_100(MO_LED_ON);
+			ledover |= PHY_M_LED_MO_100;
 		}
 
 		if (ledover)
@@ -2930,18 +2930,8 @@ static void sky2_led(struct sky2_hw *hw, unsigned port, int on)
 
 	default:
 		gm_phy_write(hw, port, PHY_MARV_LED_CTRL, 0);
-		gm_phy_write(hw, port, PHY_MARV_LED_OVER,
-			     on ? PHY_M_LED_MO_DUP(MO_LED_ON) |
-			     PHY_M_LED_MO_10(MO_LED_ON) |
-			     PHY_M_LED_MO_100(MO_LED_ON) |
-			     PHY_M_LED_MO_1000(MO_LED_ON) |
-			     PHY_M_LED_MO_RX(MO_LED_ON)
-			     : PHY_M_LED_MO_DUP(MO_LED_OFF) |
-			     PHY_M_LED_MO_10(MO_LED_OFF) |
-			     PHY_M_LED_MO_100(MO_LED_OFF) |
-			     PHY_M_LED_MO_1000(MO_LED_OFF) |
-			     PHY_M_LED_MO_RX(MO_LED_OFF));
-
+		gm_phy_write(hw, port, PHY_MARV_LED_OVER, 
+			     on ? PHY_M_LED_ALL : 0);
 	}
 }
 
diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h
index a63f605..6ed1d47d 100644
--- a/drivers/net/sky2.h
+++ b/drivers/net/sky2.h
@@ -608,7 +608,7 @@ enum {
 	PHY_ADDR_MARV	= 0,
 };
 
-#define RB_ADDR(offs, queue) (B16_RAM_REGS + (queue) + (offs))
+#define RB_ADDR(offs, queue) ((u16) B16_RAM_REGS + (queue) + (offs))
 
 
 enum {
@@ -1061,7 +1061,7 @@ enum {
 	PHY_M_PC_EN_DET_PLUS	= 3<<8, /* Energy Detect Plus (Mode 2) */
 };
 
-#define PHY_M_PC_MDI_XMODE(x)	(((x)<<5) & PHY_M_PC_MDIX_MSK)
+#define PHY_M_PC_MDI_XMODE(x)	(((u16)(x)<<5) & PHY_M_PC_MDIX_MSK)
 
 enum {
 	PHY_M_PC_MAN_MDI	= 0, /* 00 = Manual MDI configuration */
@@ -1157,13 +1157,13 @@ enum {
 	PHY_M_EC_TX_TIM_CT  = 1<<1, /* RGMII Tx Timing Control */
 	PHY_M_EC_TRANS_DIS  = 1<<0, /* Transmitter Disable (88E1111 only) */};
 
-#define PHY_M_EC_M_DSC(x)	((x)<<10 & PHY_M_EC_M_DSC_MSK)
+#define PHY_M_EC_M_DSC(x)	((u16)(x)<<10 & PHY_M_EC_M_DSC_MSK)
 					/* 00=1x; 01=2x; 10=3x; 11=4x */
-#define PHY_M_EC_S_DSC(x)	((x)<<8 & PHY_M_EC_S_DSC_MSK)
+#define PHY_M_EC_S_DSC(x)	((u16)(x)<<8 & PHY_M_EC_S_DSC_MSK)
 					/* 00=dis; 01=1x; 10=2x; 11=3x */
-#define PHY_M_EC_DSC_2(x)	((x)<<9 & PHY_M_EC_M_DSC_MSK2)
+#define PHY_M_EC_DSC_2(x)	((u16)(x)<<9 & PHY_M_EC_M_DSC_MSK2)
 					/* 000=1x; 001=2x; 010=3x; 011=4x */
-#define PHY_M_EC_MAC_S(x)	((x)<<4 & PHY_M_EC_MAC_S_MSK)
+#define PHY_M_EC_MAC_S(x)	((u16)(x)<<4 & PHY_M_EC_MAC_S_MSK)
 					/* 01X=0; 110=2.5; 111=25 (MHz) */
 
 /* for Yukon-2 Gigabit Ethernet PHY (88E1112 only) */
@@ -1174,7 +1174,7 @@ enum {
 };
 /* !!! Errata in spec. (1 = disable) */
 
-#define PHY_M_PC_DSC(x)			(((x)<<12) & PHY_M_PC_DSC_MSK)
+#define PHY_M_PC_DSC(x)			(((u16)(x)<<12) & PHY_M_PC_DSC_MSK)
 											/* 100=5x; 101=6x; 110=7x; 111=8x */
 enum {
 	MAC_TX_CLK_0_MHZ	= 2,
@@ -1204,7 +1204,7 @@ enum {
 	PHY_M_LEDC_TX_C_MSB	= 1<<0, /* Tx Control (MSB, 88E1111 only) */
 };
 
-#define PHY_M_LED_PULS_DUR(x)	(((x)<<12) & PHY_M_LEDC_PULS_MSK)
+#define PHY_M_LED_PULS_DUR(x)	(((u16)(x)<<12) & PHY_M_LEDC_PULS_MSK)
 
 /*****  PHY_MARV_PHY_STAT (page 3)16 bit r/w	Polarity Control Reg. *****/
 enum {
@@ -1234,7 +1234,7 @@ enum {
 	PULS_1300MS	= 7,/* 1.3 s to 2.7 s */
 };
 
-#define PHY_M_LED_BLINK_RT(x)	(((x)<<8) & PHY_M_LEDC_BL_R_MSK)
+#define PHY_M_LED_BLINK_RT(x)	(((u16)(x)<<8) & PHY_M_LEDC_BL_R_MSK)
 
 enum {
 	BLINK_42MS	= 0,/* 42 ms */
@@ -1244,21 +1244,18 @@ enum {
 	BLINK_670MS	= 4,/* 670 ms */
 };
 
-/*****  PHY_MARV_LED_OVER	16 bit r/w	Manual LED Override Reg *****/
-#define PHY_M_LED_MO_SGMII(x)	((x)<<14) /* Bit 15..14:  SGMII AN Timer */
-										/* Bit 13..12:	reserved */
-#define PHY_M_LED_MO_DUP(x)	((x)<<10) /* Bit 11..10:  Duplex */
-#define PHY_M_LED_MO_10(x)	((x)<<8) /* Bit  9.. 8:  Link 10 */
-#define PHY_M_LED_MO_100(x)	((x)<<6) /* Bit  7.. 6:  Link 100 */
-#define PHY_M_LED_MO_1000(x)	((x)<<4) /* Bit  5.. 4:  Link 1000 */
-#define PHY_M_LED_MO_RX(x)	((x)<<2) /* Bit  3.. 2:  Rx */
-#define PHY_M_LED_MO_TX(x)	((x)<<0) /* Bit  1.. 0:  Tx */
-
+/**** PHY_MARV_LED_OVER    16 bit r/w LED control */
 enum {
-	MO_LED_NORM	= 0,
-	MO_LED_BLINK	= 1,
-	MO_LED_OFF	= 2,
-	MO_LED_ON	= 3,
+	PHY_M_LED_MO_DUP  = 3<<10,/* Bit 11..10:  Duplex */
+	PHY_M_LED_MO_10	  = 3<<8, /* Bit  9.. 8:  Link 10 */
+	PHY_M_LED_MO_100  = 3<<6, /* Bit  7.. 6:  Link 100 */
+	PHY_M_LED_MO_1000 = 3<<4, /* Bit  5.. 4:  Link 1000 */
+	PHY_M_LED_MO_RX	  = 3<<2, /* Bit  3.. 2:  Rx */
+	PHY_M_LED_MO_TX	  = 3<<0, /* Bit  1.. 0:  Tx */
+
+	PHY_M_LED_ALL	  = PHY_M_LED_MO_DUP | PHY_M_LED_MO_10 
+			    | PHY_M_LED_MO_100 | PHY_M_LED_MO_1000
+			    | PHY_M_LED_MO_RX,
 };
 
 /*****  PHY_MARV_EXT_CTRL_2	16 bit r/w	Ext. PHY Specific Ctrl 2 *****/
@@ -1295,9 +1292,9 @@ enum {
 	PHY_M_FELP_LED0_MSK = 0xf, /* Bit  3.. 0: LED0 Mask (SPEED) */
 };
 
-#define PHY_M_FELP_LED2_CTRL(x)	(((x)<<8) & PHY_M_FELP_LED2_MSK)
-#define PHY_M_FELP_LED1_CTRL(x)	(((x)<<4) & PHY_M_FELP_LED1_MSK)
-#define PHY_M_FELP_LED0_CTRL(x)	(((x)<<0) & PHY_M_FELP_LED0_MSK)
+#define PHY_M_FELP_LED2_CTRL(x)	(((u16)(x)<<8) & PHY_M_FELP_LED2_MSK)
+#define PHY_M_FELP_LED1_CTRL(x)	(((u16)(x)<<4) & PHY_M_FELP_LED1_MSK)
+#define PHY_M_FELP_LED0_CTRL(x)	(((u16)(x)<<0) & PHY_M_FELP_LED0_MSK)
 
 enum {
 	LED_PAR_CTRL_COLX	= 0x00,
@@ -1553,8 +1550,8 @@ enum {
 	GM_SMI_CT_BUSY		= 1<<3,	/* Bit  3:	Busy (Operation in progress) */
 };
 
-#define GM_SMI_CT_PHY_AD(x)	(((x)<<11) & GM_SMI_CT_PHY_A_MSK)
-#define GM_SMI_CT_REG_AD(x)	(((x)<<6) & GM_SMI_CT_REG_A_MSK)
+#define GM_SMI_CT_PHY_AD(x)	(((u16)(x)<<11) & GM_SMI_CT_PHY_A_MSK)
+#define GM_SMI_CT_REG_AD(x)	(((u16)(x)<<6) & GM_SMI_CT_REG_A_MSK)
 
 /*	GM_PHY_ADDR				16 bit r/w	GPHY Address Register */
 enum {
-- 
cgit v0.10.2


From 0bfdcc88df969af8de087d0fdddf8c0efa76b4b0 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Thu, 7 Dec 2006 06:30:07 -0500
Subject: [netdrvr] netxen: workqueue-related build fixes


diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c
index 290145e..869725f 100644
--- a/drivers/net/netxen/netxen_nic_init.c
+++ b/drivers/net/netxen/netxen_nic_init.c
@@ -1023,8 +1023,7 @@ int netxen_process_cmd_ring(unsigned long data)
 			     && netif_carrier_ok(port->netdev))
 		    && ((jiffies - port->netdev->trans_start) >
 			port->netdev->watchdog_timeo)) {
-			SCHEDULE_WORK(port->adapter->tx_timeout_task
-				      + port->portnum);
+			SCHEDULE_WORK(&port->adapter->tx_timeout_task);
 		}
 
 		last_consumer = get_next_index(last_consumer,
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index 913e814..575b71b 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -969,7 +969,7 @@ static void netxen_tx_timeout(struct net_device *netdev)
 {
 	struct netxen_port *port = (struct netxen_port *)netdev_priv(netdev);
 
-	SCHEDULE_WORK(port->adapter->tx_timeout_task + port->portnum);
+	SCHEDULE_WORK(&port->adapter->tx_timeout_task);
 }
 
 static void netxen_tx_timeout_task(struct work_struct *work)
-- 
cgit v0.10.2


From 0ae851352a87db3f829511816a2da227860bf585 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Thu, 7 Dec 2006 06:30:30 -0500
Subject: [wireless] zd1211rw: workqueue-related build fixes

Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index 61c7916..00ca704 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -39,7 +39,7 @@ static void housekeeping_init(struct zd_mac *mac);
 static void housekeeping_enable(struct zd_mac *mac);
 static void housekeeping_disable(struct zd_mac *mac);
 
-static void set_multicast_hash_handler(void *mac_ptr);
+static void set_multicast_hash_handler(struct work_struct *work);
 
 int zd_mac_init(struct zd_mac *mac,
 	        struct net_device *netdev,
@@ -57,8 +57,7 @@ int zd_mac_init(struct zd_mac *mac,
 	softmac_init(ieee80211_priv(netdev));
 	zd_chip_init(&mac->chip, netdev, intf);
 	housekeeping_init(mac);
-	INIT_WORK(&mac->set_multicast_hash_work, set_multicast_hash_handler,
-		  mac);
+	INIT_WORK(&mac->set_multicast_hash_work, set_multicast_hash_handler);
 	return 0;
 }
 
@@ -261,9 +260,10 @@ int zd_mac_set_mac_address(struct net_device *netdev, void *p)
 	return 0;
 }
 
-static void set_multicast_hash_handler(void *mac_ptr)
+static void set_multicast_hash_handler(struct work_struct *work)
 {
-	struct zd_mac *mac = mac_ptr;
+	struct zd_mac *mac = container_of(work, struct zd_mac,
+					  set_multicast_hash_work);
 	struct zd_mc_hash hash;
 
 	spin_lock_irq(&mac->lock);
-- 
cgit v0.10.2


From d324d4627d7442d9a74b0b93fc40f71ce194632a Mon Sep 17 00:00:00 2001
From: Mikael Pettersson <mikpe@it.uu.se>
Date: Wed, 6 Dec 2006 09:55:43 +0100
Subject: [PATCH] sata_promise: cleanups, take 2

This patch performs two simple cleanups of sata_promise.

* Remove board_20771 and map device id 0x3577 to board_2057x.
  After the recent corrections for SATAII chips, board_20771 and
  board_2057x were equivalent in the driver.

* Remove hp->hotplug_offset and use hp->flags & PDC_FLAG_GEN_II
  to compute hotplug_offset in pdc_host_init(). hp->hotplug_offset
  was used to distinguish 1st and 2nd generation chips in one
  particular case, but now we have that information in a more
  general form in hp->flags, so hp->hotplug_offset is redundant.

Changes since previous submission: rebased on libata-dev #upstream,
cleaned up hotplug_offset computation based on Tejun's comments,
expanded hotplug_offset removal rationale.

This patch does not depend on the pending new EH conversion patch.

Signed-off-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index a2778cf..6f7bc5b 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -66,9 +66,8 @@ enum {
 	board_2037x		= 0,	/* FastTrak S150 TX2plus */
 	board_20319		= 1,	/* FastTrak S150 TX4 */
 	board_20619		= 2,	/* FastTrak TX4000 */
-	board_20771		= 3,	/* FastTrak TX2300 */
-	board_2057x		= 4,	/* SATAII150 Tx2plus */
-	board_40518		= 5,	/* SATAII150 Tx4 */
+	board_2057x		= 3,	/* SATAII150 Tx2plus */
+	board_40518		= 4,	/* SATAII150 Tx4 */
 
 	PDC_HAS_PATA		= (1 << 1), /* PDC20375/20575 has PATA */
 
@@ -90,7 +89,6 @@ struct pdc_port_priv {
 
 struct pdc_host_priv {
 	unsigned long		flags;
-	int			hotplug_offset;
 };
 
 static u32 pdc_sata_scr_read (struct ata_port *ap, unsigned int sc_reg);
@@ -205,16 +203,6 @@ static const struct ata_port_info pdc_port_info[] = {
 		.port_ops	= &pdc_pata_ops,
 	},
 
-	/* board_20771 */
-	{
-		.sht		= &pdc_ata_sht,
-		.flags		= PDC_COMMON_FLAGS | ATA_FLAG_SATA,
-		.pio_mask	= 0x1f, /* pio0-4 */
-		.mwdma_mask	= 0x07, /* mwdma0-2 */
-		.udma_mask	= 0x7f, /* udma0-6 ; FIXME */
-		.port_ops	= &pdc_sata_ops,
-	},
-
 	/* board_2057x */
 	{
 		.sht		= &pdc_ata_sht,
@@ -244,6 +232,7 @@ static const struct pci_device_id pdc_ata_pci_tbl[] = {
 	{ PCI_VDEVICE(PROMISE, 0x3570), board_2057x },
 	{ PCI_VDEVICE(PROMISE, 0x3571), board_2057x },
 	{ PCI_VDEVICE(PROMISE, 0x3574), board_2057x },
+	{ PCI_VDEVICE(PROMISE, 0x3577), board_2057x },
 	{ PCI_VDEVICE(PROMISE, 0x3d73), board_2057x },
 	{ PCI_VDEVICE(PROMISE, 0x3d75), board_2057x },
 
@@ -256,15 +245,6 @@ static const struct pci_device_id pdc_ata_pci_tbl[] = {
 
 	{ PCI_VDEVICE(PROMISE, 0x6629), board_20619 },
 
-/* TODO: remove all associated board_20771 code, as it completely
- * duplicates board_2037x code, unless reason for separation can be
- * divined.
- */
-#if 0
-	{ PCI_VDEVICE(PROMISE, 0x3570), board_20771 },
-#endif
-	{ PCI_VDEVICE(PROMISE, 0x3577), board_20771 },
-
 	{ }	/* terminate list */
 };
 
@@ -645,9 +625,14 @@ static void pdc_host_init(unsigned int chip_id, struct ata_probe_ent *pe)
 {
 	void __iomem *mmio = pe->mmio_base;
 	struct pdc_host_priv *hp = pe->private_data;
-	int hotplug_offset = hp->hotplug_offset;
+	int hotplug_offset;
 	u32 tmp;
 
+	if (hp->flags & PDC_FLAG_GEN_II)
+		hotplug_offset = PDC2_SATA_PLUG_CSR;
+	else
+		hotplug_offset = PDC_SATA_PLUG_CSR;
+
 	/*
 	 * Except for the hotplug stuff, this is voodoo from the
 	 * Promise driver.  Label this entire section
@@ -742,8 +727,6 @@ static int pdc_ata_init_one (struct pci_dev *pdev, const struct pci_device_id *e
 		goto err_out_free_ent;
 	}
 
-	/* Set default hotplug offset */
-	hp->hotplug_offset = PDC_SATA_PLUG_CSR;
 	probe_ent->private_data = hp;
 
 	probe_ent->sht		= pdc_port_info[board_idx].sht;
@@ -767,8 +750,6 @@ static int pdc_ata_init_one (struct pci_dev *pdev, const struct pci_device_id *e
 	switch (board_idx) {
 	case board_40518:
 		hp->flags |= PDC_FLAG_GEN_II;
-		/* Override hotplug offset for SATAII150 */
-		hp->hotplug_offset = PDC2_SATA_PLUG_CSR;
 		/* Fall through */
 	case board_20319:
        		probe_ent->n_ports = 4;
@@ -780,10 +761,7 @@ static int pdc_ata_init_one (struct pci_dev *pdev, const struct pci_device_id *e
 		probe_ent->port[3].scr_addr = base + 0x700;
 		break;
 	case board_2057x:
-	case board_20771:
 		hp->flags |= PDC_FLAG_GEN_II;
-		/* Override hotplug offset for SATAII150 */
-		hp->hotplug_offset = PDC2_SATA_PLUG_CSR;
 		/* Fall through */
 	case board_2037x:
 		probe_ent->n_ports = 2;
-- 
cgit v0.10.2


From e3472cbe5c10a91c737405cd706142787736392c Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Thu, 7 Dec 2006 11:37:58 +0800
Subject: [PATCH] libata: let ATA_FLAG_PIO_POLLING use polling pio for
 ATA_PROT_NODATA

Even if ATA_FLAG_PIO_POLLING is set, libata uses irq pio for the ATA_PROT_NODATA protocol.
This patch let ATA_FLAG_PIO_POLLING use polling pio for the ATA_PROT_NODATA protocol.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 8816e30..d2e6863 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4960,6 +4960,7 @@ unsigned int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 	if (ap->flags & ATA_FLAG_PIO_POLLING) {
 		switch (qc->tf.protocol) {
 		case ATA_PROT_PIO:
+		case ATA_PROT_NODATA:
 		case ATA_PROT_ATAPI:
 		case ATA_PROT_ATAPI_NODATA:
 			qc->tf.flags |= ATA_TFLAG_POLLING;
-- 
cgit v0.10.2


From 25b93d81b9abe3b786bdf3396d728bb13b0a911c Mon Sep 17 00:00:00 2001
From: Mikael Pettersson <mikpe@it.uu.se>
Date: Thu, 7 Dec 2006 00:06:51 +0100
Subject: [PATCH] sata_promise: new EH conversion, take 2

This patch converts sata_promise to use new-style libata error
handling on Promise SATA chips, for both SATA and PATA ports.

* ATA_FLAG_SRST is no longer set
* ->phy_reset is no longer set as it is unused when ->error_handler
   is present, and pdc_sata_phy_reset() has been removed
* pdc_freeze() masks interrupts and halts DMA via PDC_CTLSTAT
* pdc_thaw() clears interrupt status in PDC_INT_SEQMASK and then
  unmasks interrupts in PDC_CTLSTAT
* pdc_error_handler() reinitialises the port if it isn't frozen,
  and then invokes ata_do_eh() with standard {s,}ata reset methods
* pdc_post_internal_cmd() resets the port in case of errors
* the PATA-only 20619 chip continues to use old-style EH:
  not by necessity but simply because I don't have documentation
  for it or any way to test it

Since the previous version pdc_error_handler() has been rewritten
and it now mostly matches ahci and sata_sil24. In case anyone
wonders: the call to pdc_reset_port() isn't a heavy-duty reset,
it's a light-weight reset to quickly put a port into a sane state.

The discussion about the PCI flushes in pdc_freeze() and pdc_thaw()
seemed to end with a consensus that the flushes are OK and not
obviously redundant, so I decided to keep them for now.

This patch was prepared against 2.6.19-git7, but it also applies
to 2.6.19 + libata #upstream, with or without the revised sata_promise
cleanup patch I recently submitted.

This patch does conflict with the #promise-sata-pata patch:
this patch removes pdc_sata_phy_reset() while #promise-sata-pata
modifies it. The correct patch resolution is to remove the function.

Tested on 2037x and 2057x chips, with PATA patches on top and disks
on both SATA and PATA ports.

Signed-off-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index 6f7bc5b..f055874 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -71,9 +71,12 @@ enum {
 
 	PDC_HAS_PATA		= (1 << 1), /* PDC20375/20575 has PATA */
 
+	/* PDC_CTLSTAT bit definitions */
+	PDC_DMA_ENABLE		= (1 << 7),
+	PDC_IRQ_DISABLE		= (1 << 10),
 	PDC_RESET		= (1 << 11), /* HDMA reset */
 
-	PDC_COMMON_FLAGS	= ATA_FLAG_NO_LEGACY | ATA_FLAG_SRST |
+	PDC_COMMON_FLAGS	= ATA_FLAG_NO_LEGACY |
 				  ATA_FLAG_MMIO | ATA_FLAG_NO_ATAPI |
 				  ATA_FLAG_PIO_POLLING,
 
@@ -99,13 +102,16 @@ static void pdc_eng_timeout(struct ata_port *ap);
 static int pdc_port_start(struct ata_port *ap);
 static void pdc_port_stop(struct ata_port *ap);
 static void pdc_pata_phy_reset(struct ata_port *ap);
-static void pdc_sata_phy_reset(struct ata_port *ap);
 static void pdc_qc_prep(struct ata_queued_cmd *qc);
 static void pdc_tf_load_mmio(struct ata_port *ap, const struct ata_taskfile *tf);
 static void pdc_exec_command_mmio(struct ata_port *ap, const struct ata_taskfile *tf);
 static void pdc_irq_clear(struct ata_port *ap);
 static unsigned int pdc_qc_issue_prot(struct ata_queued_cmd *qc);
 static void pdc_host_stop(struct ata_host *host);
+static void pdc_freeze(struct ata_port *ap);
+static void pdc_thaw(struct ata_port *ap);
+static void pdc_error_handler(struct ata_port *ap);
+static void pdc_post_internal_cmd(struct ata_queued_cmd *qc);
 
 
 static struct scsi_host_template pdc_ata_sht = {
@@ -134,11 +140,12 @@ static const struct ata_port_operations pdc_sata_ops = {
 	.exec_command		= pdc_exec_command_mmio,
 	.dev_select		= ata_std_dev_select,
 
-	.phy_reset		= pdc_sata_phy_reset,
-
 	.qc_prep		= pdc_qc_prep,
 	.qc_issue		= pdc_qc_issue_prot,
-	.eng_timeout		= pdc_eng_timeout,
+	.freeze			= pdc_freeze,
+	.thaw			= pdc_thaw,
+	.error_handler		= pdc_error_handler,
+	.post_internal_cmd	= pdc_post_internal_cmd,
 	.data_xfer		= ata_mmio_data_xfer,
 	.irq_handler		= pdc_interrupt,
 	.irq_clear		= pdc_irq_clear,
@@ -196,7 +203,7 @@ static const struct ata_port_info pdc_port_info[] = {
 	/* board_20619 */
 	{
 		.sht		= &pdc_ata_sht,
-		.flags		= PDC_COMMON_FLAGS | ATA_FLAG_SLAVE_POSS,
+		.flags		= PDC_COMMON_FLAGS | ATA_FLAG_SRST | ATA_FLAG_SLAVE_POSS,
 		.pio_mask	= 0x1f, /* pio0-4 */
 		.mwdma_mask	= 0x07, /* mwdma0-2 */
 		.udma_mask	= 0x7f, /* udma0-6 ; FIXME */
@@ -346,12 +353,6 @@ static void pdc_reset_port(struct ata_port *ap)
 	readl(mmio);	/* flush */
 }
 
-static void pdc_sata_phy_reset(struct ata_port *ap)
-{
-	pdc_reset_port(ap);
-	sata_phy_reset(ap);
-}
-
 static void pdc_pata_cbl_detect(struct ata_port *ap)
 {
 	u8 tmp;
@@ -419,6 +420,61 @@ static void pdc_qc_prep(struct ata_queued_cmd *qc)
 	}
 }
 
+static void pdc_freeze(struct ata_port *ap)
+{
+	void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr;
+	u32 tmp;
+
+	tmp = readl(mmio + PDC_CTLSTAT);
+	tmp |= PDC_IRQ_DISABLE;
+	tmp &= ~PDC_DMA_ENABLE;
+	writel(tmp, mmio + PDC_CTLSTAT);
+	readl(mmio + PDC_CTLSTAT); /* flush */
+}
+
+static void pdc_thaw(struct ata_port *ap)
+{
+	void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr;
+	u32 tmp;
+
+	/* clear IRQ */
+	readl(mmio + PDC_INT_SEQMASK);
+
+	/* turn IRQ back on */
+	tmp = readl(mmio + PDC_CTLSTAT);
+	tmp &= ~PDC_IRQ_DISABLE;
+	writel(tmp, mmio + PDC_CTLSTAT);
+	readl(mmio + PDC_CTLSTAT); /* flush */
+}
+
+static void pdc_error_handler(struct ata_port *ap)
+{
+	ata_reset_fn_t hardreset;
+
+	if (!(ap->pflags & ATA_PFLAG_FROZEN))
+		pdc_reset_port(ap);
+
+	hardreset = NULL;
+	if (sata_scr_valid(ap))
+		hardreset = sata_std_hardreset;
+
+	/* perform recovery */
+	ata_do_eh(ap, ata_std_prereset, ata_std_softreset, hardreset,
+		  ata_std_postreset);
+}
+
+static void pdc_post_internal_cmd(struct ata_queued_cmd *qc)
+{
+	struct ata_port *ap = qc->ap;
+
+	if (qc->flags & ATA_QCFLAG_FAILED)
+		qc->err_mask |= AC_ERR_OTHER;
+
+	/* make DMA engine forget about the failed command */
+	if (qc->err_mask)
+		pdc_reset_port(ap);
+}
+
 static void pdc_eng_timeout(struct ata_port *ap)
 {
 	struct ata_host *host = ap->host;
-- 
cgit v0.10.2


From fd3367af3d1212f645094c4b5c4d458bdd061475 Mon Sep 17 00:00:00 2001
From: Alan <alan@lxorguk.ukuu.org.uk>
Date: Thu, 7 Dec 2006 12:41:18 +0000
Subject: [PATCH] libata: Incorrect timing computation for PIO5/6

The ata timing computation code makes some mistakes in PIO5/6 because a
check was not updated correctly when I put this support into the kernel.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index d2e6863..011c0a8 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2303,7 +2303,7 @@ int ata_timing_compute(struct ata_device *adev, unsigned short speed,
 	 * DMA cycle timing is slower/equal than the fastest PIO timing.
 	 */
 
-	if (speed > XFER_PIO_4) {
+	if (speed > XFER_PIO_6) {
 		ata_timing_compute(adev, adev->pio_mode, &p, T, UT);
 		ata_timing_merge(&p, t, t, ATA_TIMING_ALL);
 	}
-- 
cgit v0.10.2


From 34126f9f41901ca9d7d0031c2b11fc0d6c07b72d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 7 Dec 2006 09:13:14 -0500
Subject: [GFS2] Change gfs2_fsync() to use write_inode_now()

This is a bit better than the previous version of gfs2_fsync()
although it would be better still if we were able to call a
function which only wrote the inode & metadata. Its no big deal
though that this will potentially write the data as well since
the VFS has already done that before calling gfs2_fsync(). I've
also added a comment to explain whats going on here.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Andrew Morton <akpm@osdl.org>

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 7bd971b..b3f1e03 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -510,6 +510,11 @@ static int gfs2_close(struct inode *inode, struct file *file)
  * is set. For stuffed inodes we must flush the log in order to
  * ensure that all data is on disk.
  *
+ * The call to write_inode_now() is there to write back metadata and
+ * the inode itself. It does also try and write the data, but thats
+ * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite()
+ * for us.
+ *
  * Returns: errno
  */
 
@@ -518,10 +523,6 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
 	struct inode *inode = dentry->d_inode;
 	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
 	int ret = 0;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0,
-	};
 
 	if (gfs2_is_jdata(GFS2_I(inode))) {
 		gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
@@ -530,7 +531,7 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 	if (sync_state != 0) {
 		if (!datasync)
-			ret = sync_inode(inode, &wbc);
+			ret = write_inode_now(inode, 0);
 
 		if (gfs2_is_stuffed(GFS2_I(inode)))
 			gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
-- 
cgit v0.10.2


From ac33d0710595579e3cfca42dde2257eb0b123f6d Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Wed, 6 Dec 2006 15:10:37 +0000
Subject: [DLM] Clean up lowcomms

This fixes up most of the things pointed out by akpm and Pavel Machek
with comments below indicating why some things have been left:

Andrew Morton wrote:
>
>> +static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
>> +{
>> +	struct nodeinfo *ni;
>> +	int r;
>> +	int n;
>> +
>> +	down_read(&nodeinfo_lock);
>
> Given that this function can sleep, I wonder if `alloc' is useful.
>
> I see lots of callers passing in a literal "0" for `alloc'.  That's in fact
> a secret (GFP_ATOMIC & ~__GFP_HIGH).  I doubt if that's what you really
> meant.  Particularly as the code could at least have used __GFP_WAIT (aka
> GFP_NOIO) which is much, much more reliable than "0".  In fact "0" is the
> least reliable mode possible.
>
> IOW, this is all bollixed up.

When 0 is passed into nodeid2nodeinfo the function does not try to allocate a
new structure at all. it's an indication that the caller only wants the nodeinfo
struct for that nodeid if there actually is one in existance.
I've tidied the function itself so it's more obvious, (and tidier!)

>> +/* Data received from remote end */
>> +static int receive_from_sock(void)
>> +{
>> +	int ret = 0;
>> +	struct msghdr msg;
>> +	struct kvec iov[2];
>> +	unsigned len;
>> +	int r;
>> +	struct sctp_sndrcvinfo *sinfo;
>> +	struct cmsghdr *cmsg;
>> +	struct nodeinfo *ni;
>> +
>> +	/* These two are marginally too big for stack allocation, but this
>> +	 * function is (currently) only called by dlm_recvd so static should be
>> +	 * OK.
>> +	 */
>> +	static struct sockaddr_storage msgname;
>> +	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
>
> whoa.  This is globally singly-threaded code??

Yes. it is only ever run in the context of dlm_recvd.
>>
>> +static void initiate_association(int nodeid)
>> +{
>> +	struct sockaddr_storage rem_addr;
>> +	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
>
> Another static buffer to worry about.  Globally singly-threaded code?

Yes. Only ever called by dlm_sendd.

>> +
>> +/* Send a message */
>> +static int send_to_sock(struct nodeinfo *ni)
>> +{
>> +	int ret = 0;
>> +	struct writequeue_entry *e;
>> +	int len, offset;
>> +	struct msghdr outmsg;
>> +	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
>
> Singly-threaded?

Yep.

>>
>> +static void dealloc_nodeinfo(void)
>> +{
>> +	int i;
>> +
>> +	for (i=1; i<=max_nodeid; i++) {
>> +		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
>> +		if (ni) {
>> +			idr_remove(&nodeinfo_idr, i);
>
> Didn't that need locking?

Not. it's only ever called at DLM shutdown after all the other threads
have been stopped.

>>
>> +static int write_list_empty(void)
>> +{
>> +	int status;
>> +
>> +	spin_lock_bh(&write_nodes_lock);
>> +	status = list_empty(&write_nodes);
>> +	spin_unlock_bh(&write_nodes_lock);
>> +
>> +	return status;
>> +}
>
> This function's return value is meaningless.  As soon as the lock gets
> dropped, the return value can get out of sync with reality.
>
> Looking at the caller, this _might_ happen to be OK, but it's a nasty and
> dangerous thing.  Really the locking should be moved into the caller.

It's just an optimisation to allow the caller to schedule if there is no work
to do. if something arrives immediately afterwards then it will get picked up
when the process re-awakes (and it will be woken by that arrival).

The 'accepting' atomic has gone completely. as Andrew pointed out it didn't
really achieve much anyway. I suspect it was a plaster over some other
startup or shutdown bug to be honest.


Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Pavel Machek <pavel@ucw.cz>

diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
index 6da6b14..fe158d7 100644
--- a/fs/dlm/lowcomms-sctp.c
+++ b/fs/dlm/lowcomms-sctp.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -75,13 +75,13 @@ struct nodeinfo {
 };
 
 static DEFINE_IDR(nodeinfo_idr);
-static struct rw_semaphore	nodeinfo_lock;
-static int			max_nodeid;
+static DECLARE_RWSEM(nodeinfo_lock);
+static int max_nodeid;
 
 struct cbuf {
-	unsigned		base;
-	unsigned		len;
-	unsigned		mask;
+	unsigned int base;
+	unsigned int len;
+	unsigned int mask;
 };
 
 /* Just the one of these, now. But this struct keeps
@@ -90,9 +90,9 @@ struct cbuf {
 #define CF_READ_PENDING 1
 
 struct connection {
-	struct socket          *sock;
+	struct socket           *sock;
 	unsigned long		flags;
-	struct page            *rx_page;
+	struct page             *rx_page;
 	atomic_t		waiting_requests;
 	struct cbuf		cb;
 	int                     eagain_flag;
@@ -102,36 +102,40 @@ struct connection {
 
 struct writequeue_entry {
 	struct list_head	list;
-	struct page            *page;
+	struct page             *page;
 	int			offset;
 	int			len;
 	int			end;
 	int			users;
-	struct nodeinfo        *ni;
+	struct nodeinfo         *ni;
 };
 
-#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
-#define CBUF_EMPTY(cb) ((cb)->len == 0)
-#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
-#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+static void cbuf_add(struct cbuf *cb, int n)
+{
+	cb->len += n;
+}
 
-#define CBUF_INIT(cb, size) \
-do { \
-	(cb)->base = (cb)->len = 0; \
-	(cb)->mask = ((size)-1); \
-} while(0)
+static int cbuf_data(struct cbuf *cb)
+{
+	return ((cb->base + cb->len) & cb->mask);
+}
 
-#define CBUF_EAT(cb, n) \
-do { \
-	(cb)->len  -= (n); \
-	(cb)->base += (n); \
-	(cb)->base &= (cb)->mask; \
-} while(0)
+static void cbuf_init(struct cbuf *cb, int size)
+{
+	cb->base = cb->len = 0;
+	cb->mask = size-1;
+}
 
+static void cbuf_eat(struct cbuf *cb, int n)
+{
+	cb->len  -= n;
+	cb->base += n;
+	cb->base &= cb->mask;
+}
 
 /* List of nodes which have writes pending */
-static struct list_head write_nodes;
-static spinlock_t write_nodes_lock;
+static LIST_HEAD(write_nodes);
+static DEFINE_SPINLOCK(write_nodes_lock);
 
 /* Maximum number of incoming messages to process before
  * doing a schedule()
@@ -141,8 +145,7 @@ static spinlock_t write_nodes_lock;
 /* Manage daemons */
 static struct task_struct *recv_task;
 static struct task_struct *send_task;
-static wait_queue_head_t lowcomms_recv_wait;
-static atomic_t accepting;
+static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_wait);
 
 /* The SCTP connection */
 static struct connection sctp_con;
@@ -161,11 +164,11 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
 		return error;
 
 	if (dlm_local_addr[0]->ss_family == AF_INET) {
-	        struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
+		struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
 		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
 		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
 	} else {
-	        struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
+		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
 		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
 		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
 		       sizeof(in6->sin6_addr));
@@ -174,6 +177,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
 	return 0;
 }
 
+/* If alloc is 0 here we will not attempt to allocate a new
+   nodeinfo struct */
 static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
 {
 	struct nodeinfo *ni;
@@ -184,44 +189,45 @@ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
 	ni = idr_find(&nodeinfo_idr, nodeid);
 	up_read(&nodeinfo_lock);
 
-	if (!ni && alloc) {
-		down_write(&nodeinfo_lock);
+	if (ni || !alloc)
+		return ni;
 
-		ni = idr_find(&nodeinfo_idr, nodeid);
-		if (ni)
-			goto out_up;
+	down_write(&nodeinfo_lock);
 
-		r = idr_pre_get(&nodeinfo_idr, alloc);
-		if (!r)
-			goto out_up;
+	ni = idr_find(&nodeinfo_idr, nodeid);
+	if (ni)
+		goto out_up;
 
-		ni = kmalloc(sizeof(struct nodeinfo), alloc);
-		if (!ni)
-			goto out_up;
+	r = idr_pre_get(&nodeinfo_idr, alloc);
+	if (!r)
+		goto out_up;
 
-		r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
-		if (r) {
-			kfree(ni);
-			ni = NULL;
-			goto out_up;
-		}
-		if (n != nodeid) {
-			idr_remove(&nodeinfo_idr, n);
-			kfree(ni);
-			ni = NULL;
-			goto out_up;
-		}
-		memset(ni, 0, sizeof(struct nodeinfo));
-		spin_lock_init(&ni->lock);
-		INIT_LIST_HEAD(&ni->writequeue);
-		spin_lock_init(&ni->writequeue_lock);
-		ni->nodeid = nodeid;
-
-		if (nodeid > max_nodeid)
-			max_nodeid = nodeid;
-	out_up:
-		up_write(&nodeinfo_lock);
+	ni = kmalloc(sizeof(struct nodeinfo), alloc);
+	if (!ni)
+		goto out_up;
+
+	r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
+	if (r) {
+		kfree(ni);
+		ni = NULL;
+		goto out_up;
 	}
+	if (n != nodeid) {
+		idr_remove(&nodeinfo_idr, n);
+		kfree(ni);
+		ni = NULL;
+		goto out_up;
+	}
+	memset(ni, 0, sizeof(struct nodeinfo));
+	spin_lock_init(&ni->lock);
+	INIT_LIST_HEAD(&ni->writequeue);
+	spin_lock_init(&ni->writequeue_lock);
+	ni->nodeid = nodeid;
+
+	if (nodeid > max_nodeid)
+		max_nodeid = nodeid;
+out_up:
+	up_write(&nodeinfo_lock);
 
 	return ni;
 }
@@ -279,13 +285,13 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 		in4_addr->sin_port = cpu_to_be16(port);
 		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
 		memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
-				      sizeof(struct sockaddr_in));
+		       sizeof(struct sockaddr_in));
 		*addr_len = sizeof(struct sockaddr_in);
 	} else {
 		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
 		in6_addr->sin6_port = cpu_to_be16(port);
 		memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
-				      sizeof(struct sockaddr_in6));
+		       sizeof(struct sockaddr_in6));
 		*addr_len = sizeof(struct sockaddr_in6);
 	}
 }
@@ -324,7 +330,7 @@ static void send_shutdown(sctp_assoc_t associd)
 	cmsg->cmsg_type = SCTP_SNDRCV;
 	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
 	outmessage.msg_controllen = cmsg->cmsg_len;
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	sinfo = CMSG_DATA(cmsg);
 	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
 
 	sinfo->sinfo_flags |= MSG_EOF;
@@ -387,7 +393,7 @@ static void process_sctp_notification(struct msghdr *msg, char *buf)
 
 			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
 				log_print("COMM_UP for invalid assoc ID %d",
-					 (int)sn->sn_assoc_change.sac_assoc_id);
+					  (int)sn->sn_assoc_change.sac_assoc_id);
 				init_failed();
 				return;
 			}
@@ -398,15 +404,18 @@ static void process_sctp_notification(struct msghdr *msg, char *buf)
 			fs = get_fs();
 			set_fs(get_ds());
 			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
-						IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
-						(char*)&prim, &prim_len);
+							     IPPROTO_SCTP,
+							     SCTP_PRIMARY_ADDR,
+							     (char*)&prim,
+							     &prim_len);
 			set_fs(fs);
 			if (ret < 0) {
 				struct nodeinfo *ni;
 
 				log_print("getsockopt/sctp_primary_addr on "
 					  "new assoc %d failed : %d",
-				    (int)sn->sn_assoc_change.sac_assoc_id, ret);
+					  (int)sn->sn_assoc_change.sac_assoc_id,
+					  ret);
 
 				/* Retry INIT later */
 				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
@@ -426,12 +435,10 @@ static void process_sctp_notification(struct msghdr *msg, char *buf)
 				return;
 
 			/* Save the assoc ID */
-			spin_lock(&ni->lock);
 			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
-			spin_unlock(&ni->lock);
 
 			log_print("got new/restarted association %d nodeid %d",
-			       (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+				  (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
 
 			/* Send any pending writes */
 			clear_bit(NI_INIT_PENDING, &ni->flags);
@@ -507,13 +514,12 @@ static int receive_from_sock(void)
 		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
 		if (sctp_con.rx_page == NULL)
 			goto out_resched;
-		CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
+		cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE);
 	}
 
 	memset(&incmsg, 0, sizeof(incmsg));
 	memset(&msgname, 0, sizeof(msgname));
 
-	memset(incmsg, 0, sizeof(incmsg));
 	msg.msg_name = &msgname;
 	msg.msg_namelen = sizeof(msgname);
 	msg.msg_flags = 0;
@@ -532,17 +538,17 @@ static int receive_from_sock(void)
 	 * iov[0] is the bit of the circular buffer between the current end
 	 * point (cb.base + cb.len) and the end of the buffer.
 	 */
-	iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
+	iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb);
 	iov[0].iov_base = page_address(sctp_con.rx_page) +
-			  CBUF_DATA(&sctp_con.cb);
+		cbuf_data(&sctp_con.cb);
 	iov[1].iov_len = 0;
 
 	/*
 	 * iov[1] is the bit of the circular buffer between the start of the
 	 * buffer and the start of the currently used section (cb.base)
 	 */
-	if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
-		iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
+	if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) {
+		iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb);
 		iov[1].iov_len = sctp_con.cb.base;
 		iov[1].iov_base = page_address(sctp_con.rx_page);
 		msg.msg_iovlen = 2;
@@ -557,7 +563,7 @@ static int receive_from_sock(void)
 	msg.msg_control = incmsg;
 	msg.msg_controllen = sizeof(incmsg);
 	cmsg = CMSG_FIRSTHDR(&msg);
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	sinfo = CMSG_DATA(cmsg);
 
 	if (msg.msg_flags & MSG_NOTIFICATION) {
 		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
@@ -583,29 +589,29 @@ static int receive_from_sock(void)
 	if (r == 1)
 		return 0;
 
-	CBUF_ADD(&sctp_con.cb, ret);
+	cbuf_add(&sctp_con.cb, ret);
 	ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
 					  page_address(sctp_con.rx_page),
 					  sctp_con.cb.base, sctp_con.cb.len,
 					  PAGE_CACHE_SIZE);
 	if (ret < 0)
 		goto out_close;
-	CBUF_EAT(&sctp_con.cb, ret);
+	cbuf_eat(&sctp_con.cb, ret);
 
-      out:
+out:
 	ret = 0;
 	goto out_ret;
 
-      out_resched:
+out_resched:
 	lowcomms_data_ready(sctp_con.sock->sk, 0);
 	ret = 0;
-	schedule();
+	cond_resched();
 	goto out_ret;
 
-      out_close:
+out_close:
 	if (ret != -EAGAIN)
 		log_print("error reading from sctp socket: %d", ret);
-      out_ret:
+out_ret:
 	return ret;
 }
 
@@ -619,10 +625,12 @@ static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
 	set_fs(get_ds());
 	if (num == 1)
 		result = sctp_con.sock->ops->bind(sctp_con.sock,
-					(struct sockaddr *) addr, addr_len);
+						  (struct sockaddr *) addr,
+						  addr_len);
 	else
 		result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
-				SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
+							SCTP_SOCKOPT_BINDX_ADD,
+							(char *)addr, addr_len);
 	set_fs(fs);
 
 	if (result < 0)
@@ -719,10 +727,10 @@ static int init_sock(void)
 
 	return 0;
 
- create_delsock:
+create_delsock:
 	sock_release(sock);
 	sctp_con.sock = NULL;
- out:
+out:
 	return result;
 }
 
@@ -756,16 +764,13 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 	int users = 0;
 	struct nodeinfo *ni;
 
-	if (!atomic_read(&accepting))
-		return NULL;
-
 	ni = nodeid2nodeinfo(nodeid, allocation);
 	if (!ni)
 		return NULL;
 
 	spin_lock(&ni->writequeue_lock);
 	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
-	if (((struct list_head *) e == &ni->writequeue) ||
+	if ((&e->list == &ni->writequeue) ||
 	    (PAGE_CACHE_SIZE - e->end < len)) {
 		e = NULL;
 	} else {
@@ -776,7 +781,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 	spin_unlock(&ni->writequeue_lock);
 
 	if (e) {
-	      got_one:
+	got_one:
 		if (users == 0)
 			kmap(e->page);
 		*ppc = page_address(e->page) + offset;
@@ -803,9 +808,6 @@ void dlm_lowcomms_commit_buffer(void *arg)
 	int users;
 	struct nodeinfo *ni = e->ni;
 
-	if (!atomic_read(&accepting))
-		return;
-
 	spin_lock(&ni->writequeue_lock);
 	users = --e->users;
 	if (users)
@@ -822,7 +824,7 @@ void dlm_lowcomms_commit_buffer(void *arg)
 	}
 	return;
 
-      out:
+out:
 	spin_unlock(&ni->writequeue_lock);
 	return;
 }
@@ -878,7 +880,7 @@ static void initiate_association(int nodeid)
 	cmsg->cmsg_level = IPPROTO_SCTP;
 	cmsg->cmsg_type = SCTP_SNDRCV;
 	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	sinfo = CMSG_DATA(cmsg);
 	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
 	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
 
@@ -892,7 +894,7 @@ static void initiate_association(int nodeid)
 }
 
 /* Send a message */
-static int send_to_sock(struct nodeinfo *ni)
+static void send_to_sock(struct nodeinfo *ni)
 {
 	int ret = 0;
 	struct writequeue_entry *e;
@@ -903,13 +905,13 @@ static int send_to_sock(struct nodeinfo *ni)
 	struct sctp_sndrcvinfo *sinfo;
 	struct kvec iov;
 
-        /* See if we need to init an association before we start
+	/* See if we need to init an association before we start
 	   sending precious messages */
 	spin_lock(&ni->lock);
 	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
 		spin_unlock(&ni->lock);
 		initiate_association(ni->nodeid);
-		return 0;
+		return;
 	}
 	spin_unlock(&ni->lock);
 
@@ -923,7 +925,7 @@ static int send_to_sock(struct nodeinfo *ni)
 	cmsg->cmsg_level = IPPROTO_SCTP;
 	cmsg->cmsg_type = SCTP_SNDRCV;
 	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	sinfo = CMSG_DATA(cmsg);
 	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
 	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
 	sinfo->sinfo_assoc_id = ni->assoc_id;
@@ -955,7 +957,7 @@ static int send_to_sock(struct nodeinfo *ni)
 				goto send_error;
 		} else {
 			/* Don't starve people filling buffers */
-			schedule();
+			cond_resched();
 		}
 
 		spin_lock(&ni->writequeue_lock);
@@ -964,15 +966,16 @@ static int send_to_sock(struct nodeinfo *ni)
 
 		if (e->len == 0 && e->users == 0) {
 			list_del(&e->list);
+			kunmap(e->page);
 			free_entry(e);
 			continue;
 		}
 	}
 	spin_unlock(&ni->writequeue_lock);
- out:
-	return ret;
+out:
+	return;
 
- send_error:
+send_error:
 	log_print("Error sending to node %d %d", ni->nodeid, ret);
 	spin_lock(&ni->lock);
 	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
@@ -982,7 +985,7 @@ static int send_to_sock(struct nodeinfo *ni)
 	} else
 		spin_unlock(&ni->lock);
 
-	return ret;
+	return;
 }
 
 /* Try to send any messages that are pending */
@@ -994,7 +997,7 @@ static void process_output_queue(void)
 	spin_lock_bh(&write_nodes_lock);
 	list_for_each_safe(list, temp, &write_nodes) {
 		struct nodeinfo *ni =
-		    list_entry(list, struct nodeinfo, write_list);
+			list_entry(list, struct nodeinfo, write_list);
 		clear_bit(NI_WRITE_PENDING, &ni->flags);
 		list_del(&ni->write_list);
 
@@ -1106,7 +1109,7 @@ static int dlm_recvd(void *data)
 		set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&lowcomms_recv_wait, &wait);
 		if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
-			schedule();
+			cond_resched();
 		remove_wait_queue(&lowcomms_recv_wait, &wait);
 		set_current_state(TASK_RUNNING);
 
@@ -1118,12 +1121,12 @@ static int dlm_recvd(void *data)
 
 				/* Don't starve out everyone else */
 				if (++count >= MAX_RX_MSG_COUNT) {
-					schedule();
+					cond_resched();
 					count = 0;
 				}
 			} while (!kthread_should_stop() && ret >=0);
 		}
-		schedule();
+		cond_resched();
 	}
 
 	return 0;
@@ -1138,7 +1141,7 @@ static int dlm_sendd(void *data)
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (write_list_empty())
-			schedule();
+			cond_resched();
 		set_current_state(TASK_RUNNING);
 
 		if (sctp_con.eagain_flag) {
@@ -1166,7 +1169,7 @@ static int daemons_start(void)
 
 	p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
 	error = IS_ERR(p);
-       	if (error) {
+	if (error) {
 		log_print("can't start dlm_recvd %d", error);
 		return error;
 	}
@@ -1174,7 +1177,7 @@ static int daemons_start(void)
 
 	p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
 	error = IS_ERR(p);
-       	if (error) {
+	if (error) {
 		log_print("can't start dlm_sendd %d", error);
 		kthread_stop(recv_task);
 		return error;
@@ -1197,43 +1200,28 @@ int dlm_lowcomms_start(void)
 	error = daemons_start();
 	if (error)
 		goto fail_sock;
-	atomic_set(&accepting, 1);
 	return 0;
 
- fail_sock:
+fail_sock:
 	close_connection();
 	return error;
 }
 
-/* Set all the activity flags to prevent any socket activity. */
-
 void dlm_lowcomms_stop(void)
 {
-	atomic_set(&accepting, 0);
+	int i;
+
 	sctp_con.flags = 0x7;
 	daemons_stop();
 	clean_writequeues();
 	close_connection();
 	dealloc_nodeinfo();
 	max_nodeid = 0;
-}
 
-int dlm_lowcomms_init(void)
-{
-	init_waitqueue_head(&lowcomms_recv_wait);
-	spin_lock_init(&write_nodes_lock);
-	INIT_LIST_HEAD(&write_nodes);
-	init_rwsem(&nodeinfo_lock);
-	return 0;
-}
-
-void dlm_lowcomms_exit(void)
-{
-	int i;
+	dlm_local_count = 0;
+	dlm_local_nodeid = 0;
 
 	for (i = 0; i < dlm_local_count; i++)
 		kfree(dlm_local_addr[i]);
-	dlm_local_count = 0;
-	dlm_local_nodeid = 0;
 }
 
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c
index 7289e59..8f2791f 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms-tcp.c
@@ -54,44 +54,59 @@
 #include "config.h"
 
 struct cbuf {
-	unsigned base;
-	unsigned len;
-	unsigned mask;
+	unsigned int base;
+	unsigned int len;
+	unsigned int mask;
 };
 
-#ifndef FALSE
-#define FALSE 0
-#define TRUE 1
-#endif
 #define NODE_INCREMENT 32
+static void cbuf_add(struct cbuf *cb, int n)
+{
+	cb->len += n;
+}
 
-#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
-#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
-#define CBUF_EMPTY(cb) ((cb)->len == 0)
-#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
-#define CBUF_EAT(cb, n) do { (cb)->len  -= (n); \
-                             (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
-#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+static int cbuf_data(struct cbuf *cb)
+{
+	return ((cb->base + cb->len) & cb->mask);
+}
+
+static void cbuf_init(struct cbuf *cb, int size)
+{
+	cb->base = cb->len = 0;
+	cb->mask = size-1;
+}
+
+static void cbuf_eat(struct cbuf *cb, int n)
+{
+	cb->len  -= n;
+	cb->base += n;
+	cb->base &= cb->mask;
+}
+
+static bool cbuf_empty(struct cbuf *cb)
+{
+	return cb->len == 0;
+}
 
 /* Maximum number of incoming messages to process before
-   doing a schedule()
+   doing a cond_resched()
 */
 #define MAX_RX_MSG_COUNT 25
 
 struct connection {
 	struct socket *sock;	/* NULL if not connected */
 	uint32_t nodeid;	/* So we know who we are in the list */
-	struct rw_semaphore sock_sem;	/* Stop connect races */
-	struct list_head read_list;	/* On this list when ready for reading */
-	struct list_head write_list;	/* On this list when ready for writing */
-	struct list_head state_list;	/* On this list when ready to connect */
+	struct rw_semaphore sock_sem; /* Stop connect races */
+	struct list_head read_list;   /* On this list when ready for reading */
+	struct list_head write_list;  /* On this list when ready for writing */
+	struct list_head state_list;  /* On this list when ready to connect */
 	unsigned long flags;	/* bit 1,2 = We are on the read/write lists */
 #define CF_READ_PENDING 1
 #define CF_WRITE_PENDING 2
 #define CF_CONNECT_PENDING 3
 #define CF_IS_OTHERCON 4
-	struct list_head writequeue;	/* List of outgoing writequeue_entries */
-	struct list_head listenlist;    /* List of allocated listening sockets */
+	struct list_head writequeue;  /* List of outgoing writequeue_entries */
+	struct list_head listenlist;  /* List of allocated listening sockets */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
 	struct page *rx_page;
@@ -121,28 +136,27 @@ static struct task_struct *recv_task;
 static struct task_struct *send_task;
 
 static wait_queue_t lowcomms_send_waitq_head;
-static wait_queue_head_t lowcomms_send_waitq;
+static DECLARE_WAIT_QUEUE_HEAD(lowcomms_send_waitq);
 static wait_queue_t lowcomms_recv_waitq_head;
-static wait_queue_head_t lowcomms_recv_waitq;
+static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_waitq);
 
 /* An array of pointers to connections, indexed by NODEID */
 static struct connection **connections;
-static struct semaphore connections_lock;
+static DECLARE_MUTEX(connections_lock);
 static kmem_cache_t *con_cache;
 static int conn_array_size;
-static atomic_t accepting;
 
 /* List of sockets that have reads pending */
-static struct list_head read_sockets;
-static spinlock_t read_sockets_lock;
+static LIST_HEAD(read_sockets);
+static DEFINE_SPINLOCK(read_sockets_lock);
 
 /* List of sockets which have writes pending */
-static struct list_head write_sockets;
-static spinlock_t write_sockets_lock;
+static LIST_HEAD(write_sockets);
+static DEFINE_SPINLOCK(write_sockets_lock);
 
 /* List of sockets which have connects pending */
-static struct list_head state_sockets;
-static spinlock_t state_sockets_lock;
+static LIST_HEAD(state_sockets);
+static DEFINE_SPINLOCK(state_sockets_lock);
 
 static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 {
@@ -153,12 +167,11 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 		int new_size = nodeid + NODE_INCREMENT;
 		struct connection **new_conns;
 
-		new_conns = kmalloc(sizeof(struct connection *) *
+		new_conns = kzalloc(sizeof(struct connection *) *
 				    new_size, allocation);
 		if (!new_conns)
 			goto finish;
 
-		memset(new_conns, 0, sizeof(struct connection *) * new_size);
 		memcpy(new_conns, connections,  sizeof(struct connection *) * conn_array_size);
 		conn_array_size = new_size;
 		kfree(connections);
@@ -168,11 +181,10 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 
 	con = connections[nodeid];
 	if (con == NULL && allocation) {
-		con = kmem_cache_alloc(con_cache, allocation);
+		con = kmem_cache_zalloc(con_cache, allocation);
 		if (!con)
 			goto finish;
 
-		memset(con, 0, sizeof(*con));
 		con->nodeid = nodeid;
 		init_rwsem(&con->sock_sem);
 		INIT_LIST_HEAD(&con->writequeue);
@@ -181,7 +193,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 		connections[nodeid] = con;
 	}
 
- finish:
+finish:
 	up(&connections_lock);
 	return con;
 }
@@ -220,8 +232,6 @@ static inline void lowcomms_connect_sock(struct connection *con)
 {
 	if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
 		return;
-	if (!atomic_read(&accepting))
-		return;
 
 	spin_lock_bh(&state_sockets_lock);
 	list_add_tail(&con->state_list, &state_sockets);
@@ -232,31 +242,8 @@ static inline void lowcomms_connect_sock(struct connection *con)
 
 static void lowcomms_state_change(struct sock *sk)
 {
-/*	struct connection *con = sock2con(sk); */
-
-	switch (sk->sk_state) {
-	case TCP_ESTABLISHED:
+	if (sk->sk_state == TCP_ESTABLISHED)
 		lowcomms_write_space(sk);
-		break;
-
-	case TCP_FIN_WAIT1:
-	case TCP_FIN_WAIT2:
-	case TCP_TIME_WAIT:
-	case TCP_CLOSE:
-	case TCP_CLOSE_WAIT:
-	case TCP_LAST_ACK:
-	case TCP_CLOSING:
-		/* FIXME: I think this causes more trouble than it solves.
-		   lowcomms wil reconnect anyway when there is something to
-		   send. This just attempts reconnection if a node goes down!
-		*/
-		/* lowcomms_connect_sock(con); */
-		break;
-
-	default:
-		printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
-		break;
-	}
 }
 
 /* Make a socket active */
@@ -277,13 +264,12 @@ static int add_sock(struct socket *sock, struct connection *con)
 static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 			  int *addr_len)
 {
-        saddr->ss_family =  dlm_local_addr.ss_family;
-        if (saddr->ss_family == AF_INET) {
+	saddr->ss_family =  dlm_local_addr.ss_family;
+	if (saddr->ss_family == AF_INET) {
 		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
 		in4_addr->sin_port = cpu_to_be16(port);
 		*addr_len = sizeof(struct sockaddr_in);
-	}
-	else {
+	} else {
 		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
 		in6_addr->sin6_port = cpu_to_be16(port);
 		*addr_len = sizeof(struct sockaddr_in6);
@@ -291,7 +277,7 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 }
 
 /* Close a remote connection and tidy up */
-static void close_connection(struct connection *con, int and_other)
+static void close_connection(struct connection *con, bool and_other)
 {
 	down_write(&con->sock_sem);
 
@@ -300,11 +286,8 @@ static void close_connection(struct connection *con, int and_other)
 		con->sock = NULL;
 	}
 	if (con->othercon && and_other) {
-		/* Argh! recursion in kernel code!
-		   Actually, this isn't a list so it
-		   will only re-enter once.
-		*/
-		close_connection(con->othercon, FALSE);
+		/* Will only re-enter once. */
+		close_connection(con->othercon, false);
 	}
 	if (con->rx_page) {
 		__free_page(con->rx_page);
@@ -337,7 +320,7 @@ static int receive_from_sock(struct connection *con)
 		con->rx_page = alloc_page(GFP_ATOMIC);
 		if (con->rx_page == NULL)
 			goto out_resched;
-		CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
+		cbuf_init(&con->cb, PAGE_CACHE_SIZE);
 	}
 
 	msg.msg_control = NULL;
@@ -352,16 +335,16 @@ static int receive_from_sock(struct connection *con)
 	 * iov[0] is the bit of the circular buffer between the current end
 	 * point (cb.base + cb.len) and the end of the buffer.
 	 */
-	iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
-	iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
+	iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
+	iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
 	iov[1].iov_len = 0;
 
 	/*
 	 * iov[1] is the bit of the circular buffer between the start of the
 	 * buffer and the start of the currently used section (cb.base)
 	 */
-	if (CBUF_DATA(&con->cb) >= con->cb.base) {
-		iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
+	if (cbuf_data(&con->cb) >= con->cb.base) {
+		iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
 		iov[1].iov_len = con->cb.base;
 		iov[1].iov_base = page_address(con->rx_page);
 		msg.msg_iovlen = 2;
@@ -378,7 +361,7 @@ static int receive_from_sock(struct connection *con)
 		goto out_close;
 	if (ret == len)
 		call_again_soon = 1;
-	CBUF_ADD(&con->cb, ret);
+	cbuf_add(&con->cb, ret);
 	ret = dlm_process_incoming_buffer(con->nodeid,
 					  page_address(con->rx_page),
 					  con->cb.base, con->cb.len,
@@ -391,35 +374,32 @@ static int receive_from_sock(struct connection *con)
 	}
 	if (ret < 0)
 		goto out_close;
-	CBUF_EAT(&con->cb, ret);
+	cbuf_eat(&con->cb, ret);
 
-	if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
+	if (cbuf_empty(&con->cb) && !call_again_soon) {
 		__free_page(con->rx_page);
 		con->rx_page = NULL;
 	}
 
-      out:
+out:
 	if (call_again_soon)
 		goto out_resched;
 	up_read(&con->sock_sem);
-	ret = 0;
-	goto out_ret;
+	return 0;
 
-      out_resched:
+out_resched:
 	lowcomms_data_ready(con->sock->sk, 0);
 	up_read(&con->sock_sem);
-	ret = 0;
-	schedule();
-	goto out_ret;
+	cond_resched();
+	return 0;
 
-      out_close:
+out_close:
 	up_read(&con->sock_sem);
 	if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
-		close_connection(con, FALSE);
+		close_connection(con, false);
 		/* Reconnect when there is something to send */
 	}
 
-      out_ret:
 	return ret;
 }
 
@@ -434,7 +414,8 @@ static int accept_from_sock(struct connection *con)
 	struct connection *newcon;
 
 	memset(&peeraddr, 0, sizeof(peeraddr));
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock);
+	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
+				  IPPROTO_TCP, &newsock);
 	if (result < 0)
 		return -ENOMEM;
 
@@ -462,7 +443,7 @@ static int accept_from_sock(struct connection *con)
 	/* Get the new node's NODEID */
 	make_sockaddr(&peeraddr, 0, &len);
 	if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
-	    	printk("dlm: connect from non cluster node\n");
+		printk("dlm: connect from non cluster node\n");
 		sock_release(newsock);
 		up_read(&con->sock_sem);
 		return -1;
@@ -483,17 +464,16 @@ static int accept_from_sock(struct connection *con)
 	}
 	down_write(&newcon->sock_sem);
 	if (newcon->sock) {
-	        struct connection *othercon = newcon->othercon;
+		struct connection *othercon = newcon->othercon;
 
 		if (!othercon) {
-			othercon = kmem_cache_alloc(con_cache, GFP_KERNEL);
+			othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
 			if (!othercon) {
 				printk("dlm: failed to allocate incoming socket\n");
 				up_write(&newcon->sock_sem);
 				result = -ENOMEM;
 				goto accept_err;
 			}
-			memset(othercon, 0, sizeof(*othercon));
 			othercon->nodeid = nodeid;
 			othercon->rx_action = receive_from_sock;
 			init_rwsem(&othercon->sock_sem);
@@ -523,7 +503,7 @@ static int accept_from_sock(struct connection *con)
 
 	return 0;
 
-      accept_err:
+accept_err:
 	up_read(&con->sock_sem);
 	sock_release(newsock);
 
@@ -533,7 +513,7 @@ static int accept_from_sock(struct connection *con)
 }
 
 /* Connect a new socket to its peer */
-static int connect_to_sock(struct connection *con)
+static void connect_to_sock(struct connection *con)
 {
 	int result = -EHOSTUNREACH;
 	struct sockaddr_storage saddr;
@@ -542,7 +522,7 @@ static int connect_to_sock(struct connection *con)
 
 	if (con->nodeid == 0) {
 		log_print("attempt to connect sock 0 foiled");
-		return 0;
+		return;
 	}
 
 	down_write(&con->sock_sem);
@@ -556,13 +536,14 @@ static int connect_to_sock(struct connection *con)
 	}
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
+	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
+				  IPPROTO_TCP, &sock);
 	if (result < 0)
 		goto out_err;
 
 	memset(&saddr, 0, sizeof(saddr));
 	if (dlm_nodeid_to_addr(con->nodeid, &saddr))
-	        goto out_err;
+		goto out_err;
 
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
@@ -574,22 +555,13 @@ static int connect_to_sock(struct connection *con)
 	log_print("connecting to %d", con->nodeid);
 	result =
 		sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
-			       O_NONBLOCK);
+				   O_NONBLOCK);
 	if (result == -EINPROGRESS)
 		result = 0;
-	if (result != 0)
-		goto out_err;
-
-      out:
-	up_write(&con->sock_sem);
-	/*
-	 * Returning an error here means we've given up trying to connect to
-	 * a remote node, otherwise we return 0 and reschedule the connetion
-	 * attempt
-	 */
-	return result;
+	if (result == 0)
+		goto out;
 
-      out_err:
+out_err:
 	if (con->sock) {
 		sock_release(con->sock);
 		con->sock = NULL;
@@ -604,12 +576,15 @@ static int connect_to_sock(struct connection *con)
 		lowcomms_connect_sock(con);
 		result = 0;
 	}
-	goto out;
+out:
+	up_write(&con->sock_sem);
+	return;
 }
 
-static struct socket *create_listen_sock(struct connection *con, struct sockaddr_storage *saddr)
+static struct socket *create_listen_sock(struct connection *con,
+					 struct sockaddr_storage *saddr)
 {
-        struct socket *sock = NULL;
+	struct socket *sock = NULL;
 	mm_segment_t fs;
 	int result = 0;
 	int one = 1;
@@ -629,10 +604,12 @@ static struct socket *create_listen_sock(struct connection *con, struct sockaddr
 
 	fs = get_fs();
 	set_fs(get_ds());
-	result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
+	result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+				 (char *)&one, sizeof(one));
 	set_fs(fs);
 	if (result < 0) {
-		printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
+		printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",
+		       result);
 	}
 	sock->sk->sk_user_data = con;
 	con->rx_action = accept_from_sock;
@@ -652,7 +629,8 @@ static struct socket *create_listen_sock(struct connection *con, struct sockaddr
 	fs = get_fs();
 	set_fs(get_ds());
 
-	result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
+	result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+				 (char *)&one, sizeof(one));
 	set_fs(fs);
 	if (result < 0) {
 		printk("dlm: Set keepalive failed: %d\n", result);
@@ -666,7 +644,7 @@ static struct socket *create_listen_sock(struct connection *con, struct sockaddr
 		goto create_out;
 	}
 
-      create_out:
+create_out:
 	return sock;
 }
 
@@ -679,10 +657,6 @@ static int listen_for_all(void)
 	int result = -EINVAL;
 
 	/* We don't support multi-homed hosts */
-	memset(con, 0, sizeof(*con));
-	init_rwsem(&con->sock_sem);
-	spin_lock_init(&con->writequeue_lock);
-	INIT_LIST_HEAD(&con->writequeue);
 	set_bit(CF_IS_OTHERCON, &con->flags);
 
 	sock = create_listen_sock(con, &dlm_local_addr);
@@ -731,16 +705,12 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len,
 	int offset = 0;
 	int users = 0;
 
-	if (!atomic_read(&accepting))
-		return NULL;
-
 	con = nodeid2con(nodeid, allocation);
 	if (!con)
 		return NULL;
 
-	spin_lock(&con->writequeue_lock);
 	e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
-	if (((struct list_head *) e == &con->writequeue) ||
+	if ((&e->list == &con->writequeue) ||
 	    (PAGE_CACHE_SIZE - e->end < len)) {
 		e = NULL;
 	} else {
@@ -751,7 +721,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len,
 	spin_unlock(&con->writequeue_lock);
 
 	if (e) {
-	      got_one:
+	got_one:
 		if (users == 0)
 			kmap(e->page);
 		*ppc = page_address(e->page) + offset;
@@ -777,10 +747,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
 	struct connection *con = e->con;
 	int users;
 
-	if (!atomic_read(&accepting))
-		return;
-
-	spin_lock(&con->writequeue_lock);
 	users = --e->users;
 	if (users)
 		goto out;
@@ -797,7 +763,7 @@ void dlm_lowcomms_commit_buffer(void *mh)
 	}
 	return;
 
-      out:
+out:
 	spin_unlock(&con->writequeue_lock);
 	return;
 }
@@ -809,7 +775,7 @@ static void free_entry(struct writequeue_entry *e)
 }
 
 /* Send a message */
-static int send_to_sock(struct connection *con)
+static void send_to_sock(struct connection *con)
 {
 	int ret = 0;
 	ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
@@ -846,7 +812,7 @@ static int send_to_sock(struct connection *con)
 		}
 		else {
 			/* Don't starve people filling buffers */
-			schedule();
+			cond_resched();
 		}
 
 		spin_lock(&con->writequeue_lock);
@@ -855,25 +821,26 @@ static int send_to_sock(struct connection *con)
 
 		if (e->len == 0 && e->users == 0) {
 			list_del(&e->list);
+			kunmap(e->page);
 			free_entry(e);
 			continue;
 		}
 	}
 	spin_unlock(&con->writequeue_lock);
-      out:
+out:
 	up_read(&con->sock_sem);
-	return ret;
+	return;
 
-      send_error:
+send_error:
 	up_read(&con->sock_sem);
-	close_connection(con, FALSE);
+	close_connection(con, false);
 	lowcomms_connect_sock(con);
-	return ret;
+	return;
 
-      out_connect:
+out_connect:
 	up_read(&con->sock_sem);
 	lowcomms_connect_sock(con);
-	return 0;
+	return;
 }
 
 static void clean_one_writequeue(struct connection *con)
@@ -904,12 +871,12 @@ int dlm_lowcomms_close(int nodeid)
 	con = nodeid2con(nodeid, 0);
 	if (con) {
 		clean_one_writequeue(con);
-		close_connection(con, TRUE);
+		close_connection(con, true);
 		atomic_set(&con->waiting_requests, 0);
 	}
 	return 0;
 
-      out:
+out:
 	return -1;
 }
 
@@ -940,7 +907,7 @@ static void process_sockets(void)
 	list_for_each_safe(list, temp, &read_sockets) {
 
 		struct connection *con =
-		    list_entry(list, struct connection, read_list);
+			list_entry(list, struct connection, read_list);
 		list_del(&con->read_list);
 		clear_bit(CF_READ_PENDING, &con->flags);
 
@@ -959,7 +926,7 @@ static void process_sockets(void)
 
 			/* Don't starve out everyone else */
 			if (++count >= MAX_RX_MSG_COUNT) {
-				schedule();
+				cond_resched();
 				count = 0;
 			}
 
@@ -977,20 +944,16 @@ static void process_output_queue(void)
 {
 	struct list_head *list;
 	struct list_head *temp;
-	int ret;
 
 	spin_lock_bh(&write_sockets_lock);
 	list_for_each_safe(list, temp, &write_sockets) {
 		struct connection *con =
-		    list_entry(list, struct connection, write_list);
+			list_entry(list, struct connection, write_list);
 		clear_bit(CF_WRITE_PENDING, &con->flags);
 		list_del(&con->write_list);
 
 		spin_unlock_bh(&write_sockets_lock);
-
-		ret = send_to_sock(con);
-		if (ret < 0) {
-		}
+		send_to_sock(con);
 		spin_lock_bh(&write_sockets_lock);
 	}
 	spin_unlock_bh(&write_sockets_lock);
@@ -1000,19 +963,16 @@ static void process_state_queue(void)
 {
 	struct list_head *list;
 	struct list_head *temp;
-	int ret;
 
 	spin_lock_bh(&state_sockets_lock);
 	list_for_each_safe(list, temp, &state_sockets) {
 		struct connection *con =
-		    list_entry(list, struct connection, state_list);
+			list_entry(list, struct connection, state_list);
 		list_del(&con->state_list);
 		clear_bit(CF_CONNECT_PENDING, &con->flags);
 		spin_unlock_bh(&state_sockets_lock);
 
-		ret = connect_to_sock(con);
-		if (ret < 0) {
-		}
+		connect_to_sock(con);
 		spin_lock_bh(&state_sockets_lock);
 	}
 	spin_unlock_bh(&state_sockets_lock);
@@ -1046,14 +1006,13 @@ static int read_list_empty(void)
 /* DLM Transport comms receive daemon */
 static int dlm_recvd(void *data)
 {
-	init_waitqueue_head(&lowcomms_recv_waitq);
 	init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
 	add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
 
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (read_list_empty())
-			schedule();
+			cond_resched();
 		set_current_state(TASK_RUNNING);
 
 		process_sockets();
@@ -1081,14 +1040,13 @@ static int write_and_state_lists_empty(void)
 /* DLM Transport send daemon */
 static int dlm_sendd(void *data)
 {
-	init_waitqueue_head(&lowcomms_send_waitq);
 	init_waitqueue_entry(&lowcomms_send_waitq_head, current);
 	add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
 
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (write_and_state_lists_empty())
-			schedule();
+			cond_resched();
 		set_current_state(TASK_RUNNING);
 
 		process_state_queue();
@@ -1111,7 +1069,7 @@ static int daemons_start(void)
 
 	p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
 	error = IS_ERR(p);
-       	if (error) {
+	if (error) {
 		log_print("can't start dlm_recvd %d", error);
 		return error;
 	}
@@ -1119,7 +1077,7 @@ static int daemons_start(void)
 
 	p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
 	error = IS_ERR(p);
-       	if (error) {
+	if (error) {
 		log_print("can't start dlm_sendd %d", error);
 		kthread_stop(recv_task);
 		return error;
@@ -1141,21 +1099,20 @@ void dlm_lowcomms_stop(void)
 {
 	int i;
 
-	atomic_set(&accepting, 0);
-
-	/* Set all the activity flags to prevent any
+	/* Set all the flags to prevent any
 	   socket activity.
 	*/
 	for (i = 0; i < conn_array_size; i++) {
 		if (connections[i])
-			connections[i]->flags |= 0x7;
+			connections[i]->flags |= 0xFF;
 	}
+
 	daemons_stop();
 	clean_writequeues();
 
 	for (i = 0; i < conn_array_size; i++) {
 		if (connections[i]) {
-			close_connection(connections[i], TRUE);
+			close_connection(connections[i], true);
 			if (connections[i]->othercon)
 				kmem_cache_free(con_cache, connections[i]->othercon);
 			kmem_cache_free(con_cache, connections[i]);
@@ -1173,24 +1130,12 @@ int dlm_lowcomms_start(void)
 {
 	int error = 0;
 
-	error = -ENOTCONN;
-
-	/*
-	 * Temporarily initialise the waitq head so that lowcomms_send_message
-	 * doesn't crash if it gets called before the thread is fully
-	 * initialised
-	 */
-	init_waitqueue_head(&lowcomms_send_waitq);
-
 	error = -ENOMEM;
-	connections = kmalloc(sizeof(struct connection *) *
+	connections = kzalloc(sizeof(struct connection *) *
 			      NODE_INCREMENT, GFP_KERNEL);
 	if (!connections)
 		goto out;
 
-	memset(connections, 0,
-	       sizeof(struct connection *) * NODE_INCREMENT);
-
 	conn_array_size = NODE_INCREMENT;
 
 	if (dlm_our_addr(&dlm_local_addr, 0)) {
@@ -1203,7 +1148,8 @@ int dlm_lowcomms_start(void)
 	}
 
 	con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
-				      __alignof__(struct connection), 0, NULL, NULL);
+				      __alignof__(struct connection), 0,
+				      NULL, NULL);
 	if (!con_cache)
 		goto fail_free_conn;
 
@@ -1217,40 +1163,20 @@ int dlm_lowcomms_start(void)
 	if (error)
 		goto fail_unlisten;
 
-	atomic_set(&accepting, 1);
-
 	return 0;
 
-      fail_unlisten:
-	close_connection(connections[0], 0);
+fail_unlisten:
+	close_connection(connections[0], false);
 	kmem_cache_free(con_cache, connections[0]);
 	kmem_cache_destroy(con_cache);
 
-      fail_free_conn:
+fail_free_conn:
 	kfree(connections);
 
-      out:
+out:
 	return error;
 }
 
-int dlm_lowcomms_init(void)
-{
-	INIT_LIST_HEAD(&read_sockets);
-	INIT_LIST_HEAD(&write_sockets);
-	INIT_LIST_HEAD(&state_sockets);
-
-	spin_lock_init(&read_sockets_lock);
-	spin_lock_init(&write_sockets_lock);
-	spin_lock_init(&state_sockets_lock);
-	init_MUTEX(&connections_lock);
-
-	return 0;
-}
-
-void dlm_lowcomms_exit(void)
-{
-}
-
 /*
  * Overrides for Emacs so that we follow Linus's tabbing style.
  * Emacs will notice this stuff at the end of the file and automatically
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 2d045e0..a9a9618 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -14,8 +14,6 @@
 #ifndef __LOWCOMMS_DOT_H__
 #define __LOWCOMMS_DOT_H__
 
-int dlm_lowcomms_init(void);
-void dlm_lowcomms_exit(void);
 int dlm_lowcomms_start(void);
 void dlm_lowcomms_stop(void);
 int dlm_lowcomms_close(int nodeid);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index a8da8dc..162fbae 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -16,7 +16,6 @@
 #include "lock.h"
 #include "user.h"
 #include "memory.h"
-#include "lowcomms.h"
 #include "config.h"
 
 #ifdef CONFIG_DLM_DEBUG
@@ -47,20 +46,14 @@ static int __init init_dlm(void)
 	if (error)
 		goto out_config;
 
-	error = dlm_lowcomms_init();
-	if (error)
-		goto out_debug;
-
 	error = dlm_user_init();
 	if (error)
-		goto out_lowcomms;
+		goto out_debug;
 
 	printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
 
 	return 0;
 
- out_lowcomms:
-	dlm_lowcomms_exit();
  out_debug:
 	dlm_unregister_debugfs();
  out_config:
@@ -76,7 +69,6 @@ static int __init init_dlm(void)
 static void __exit exit_dlm(void)
 {
 	dlm_user_exit();
-	dlm_lowcomms_exit();
 	dlm_config_exit();
 	dlm_memory_exit();
 	dlm_lockspace_exit();
-- 
cgit v0.10.2


From 676dcb8bc2ec78d80091037773598d6ec8c673d6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:31:30 -0800
Subject: [PATCH] add bottom_half.h

With CONFIG_SMP=n:

  drivers/input/ff-memless.c:384: warning: implicit declaration of function 'local_bh_disable'
  drivers/input/ff-memless.c:393: warning: implicit declaration of function 'local_bh_enable'

Really linux/spinlock.h should include linux/interrupt.h.  But interrupt.h
includes sched.h which will need spinlock.h.

So the patch breaks the _bh declarations out into a separate header and
includes it in both interrupt.h and spinlock.h.

Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Cc: Andi Kleen <ak@suse.de>
Cc: <stable@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
new file mode 100644
index 0000000..777dbf6
--- /dev/null
+++ b/include/linux/bottom_half.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_BH_H
+#define _LINUX_BH_H
+
+extern void local_bh_disable(void);
+extern void __local_bh_enable(void);
+extern void _local_bh_enable(void);
+extern void local_bh_enable(void);
+extern void local_bh_enable_ip(unsigned long ip);
+
+#endif /* _LINUX_BH_H */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5b83e7b..de7593f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -11,6 +11,7 @@
 #include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <linux/irqflags.h>
+#include <linux/bottom_half.h>
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
@@ -217,12 +218,6 @@ static inline void __deprecated save_and_cli(unsigned long *x)
 #define save_and_cli(x)	save_and_cli(&x)
 #endif /* CONFIG_SMP */
 
-extern void local_bh_disable(void);
-extern void __local_bh_enable(void);
-extern void _local_bh_enable(void);
-extern void local_bh_enable(void);
-extern void local_bh_enable_ip(unsigned long ip);
-
 /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
    frequency threaded job scheduling. For almost all the purposes
    tasklets are more than enough. F.e. all serial device BHs et
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 8451052..94b767d 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -52,6 +52,7 @@
 #include <linux/thread_info.h>
 #include <linux/kernel.h>
 #include <linux/stringify.h>
+#include <linux/bottom_half.h>
 
 #include <asm/system.h>
 
-- 
cgit v0.10.2


From a1e85378ba50694cf8f27b190c7e16d7c8dad276 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:31:33 -0800
Subject: [PATCH] drm-sis linkage fix

Fix http://bugzilla.kernel.org/show_bug.cgi?id=7606

WARNING: "drm_sman_set_manager" [drivers/char/drm/sis.ko] undefined!

Cc: <daniel-silveira@gee.inatel.br>
Cc: Dave Airlie <airlied@linux.ie>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/drm/drm_sman.c b/drivers/char/drm/drm_sman.c
index 425c823..19c81d2 100644
--- a/drivers/char/drm/drm_sman.c
+++ b/drivers/char/drm/drm_sman.c
@@ -162,6 +162,7 @@ drm_sman_set_manager(drm_sman_t * sman, unsigned int manager,
 
 	return 0;
 }
+EXPORT_SYMBOL(drm_sman_set_manager);
 
 static drm_owner_item_t *drm_sman_get_owner_item(drm_sman_t * sman,
 						 unsigned long owner)
-- 
cgit v0.10.2


From 822191a2fa1584a29c3224ab328507adcaeac1ab Mon Sep 17 00:00:00 2001
From: Andrey Mirkin <amirkin@openvz.org>
Date: Wed, 6 Dec 2006 20:31:35 -0800
Subject: [PATCH] skip data conversion in compat_sys_mount when data_page is
 NULL

OpenVZ Linux kernel team has found a problem with mounting in compat mode.

Simple command "mount -t smbfs ..." on Fedora Core 5 distro in 32-bit mode
leads to oops:

  Unable to handle kernel NULL pointer dereference at 0000000000000000 RIP: compat_sys_mount+0xd6/0x290
  Process mount (pid: 14656, veid=300, threadinfo ffff810034d30000, task ffff810034c86bc0)
  Call Trace: ia32_sysret+0x0/0xa

The problem is that data_page pointer can be NULL, so we should skip data
conversion in this case.

Signed-off-by: Andrey Mirkin <amirkin@openvz.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/compat.c b/fs/compat.c
index 06dad66..7aef541 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -871,7 +871,7 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 
 	retval = -EINVAL;
 
-	if (type_page) {
+	if (type_page && data_page) {
 		if (!strcmp((char *)type_page, SMBFS_NAME)) {
 			do_smb_super_data_conv((void *)data_page);
 		} else if (!strcmp((char *)type_page, NCPFS_NAME)) {
-- 
cgit v0.10.2


From a2ce774096110ccc5c02cbdc05897d005fcd3db8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:31:36 -0800
Subject: [PATCH] uml: workqueue build fix

  arch/um/drivers/chan_kern.c:643: error: conflicting types for 'chan_interrupt'
  arch/um/include/chan_kern.h:31: error: previous declaration of 'chan_interrupt'

Cc: David Howells <dhowells@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 426633e..aa3090d 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -31,9 +31,9 @@ static irqreturn_t line_interrupt(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static void line_timer_cb(void *arg)
+static void line_timer_cb(struct work_struct *work)
 {
-	struct line *line = arg;
+	struct line *line = container_of(work, struct line, task.work);
 
 	if(!line->throttled)
 		chan_interrupt(&line->chan_list, &line->task, line->tty,
@@ -443,7 +443,7 @@ int line_open(struct line *lines, struct tty_struct *tty)
 		 * is registered.
 		 */
 		enable_chan(line);
-		INIT_WORK(&line->task, line_timer_cb, line);
+		INIT_DELAYED_WORK(&line->task, line_timer_cb);
 
 		if(!line->sigio){
 			chan_enable_winch(&line->chan_list, tty);
diff --git a/arch/um/include/chan_kern.h b/arch/um/include/chan_kern.h
index 572d286..9003a34 100644
--- a/arch/um/include/chan_kern.h
+++ b/arch/um/include/chan_kern.h
@@ -27,7 +27,7 @@ struct chan {
 	void *data;
 };
 
-extern void chan_interrupt(struct list_head *chans, struct work_struct *task,
+extern void chan_interrupt(struct list_head *chans, struct delayed_work *task,
 			   struct tty_struct *tty, int irq);
 extern int parse_chan_pair(char *str, struct line *line, int device,
 			   const struct chan_opts *opts);
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index 7be2481..214ee76 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -51,7 +51,7 @@ struct line {
 	char *tail;
 
 	int sigio;
-	struct work_struct task;
+	struct delayed_work task;
 	const struct line_driver *driver;
 	int have_irq;
 };
-- 
cgit v0.10.2


From 0798e5193cd70f6c867ec176d7730589f944c627 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 6 Dec 2006 20:31:38 -0800
Subject: [PATCH] memory page alloc minor cleanups

- s/freeliest/freelist/ spelling fix

- Check for NULL *z zone seems useless - even if it could happen, so
  what?  Perhaps we should have a check later on if we are faced with an
  allocation request that is not allowed to fail - shouldn't that be a
  serious kernel error, passing an empty zonelist with a mandate to not
  fail?

- Initializing 'z' to zonelist->zones can wait until after the first
  get_page_from_freelist() fails; we only use 'z' in the wakeup_kswapd()
  loop, so let's initialize 'z' there, in a 'for' loop.  Seems clearer.

- Remove superfluous braces around a break

- Fix a couple errant spaces

- Adjust indentation on the cpuset_zone_allowed() check, to match the
  lines just before it -- seems easier to read in this case.

- Add another set of braces to the zone_watermark_ok logic

From: Paul Jackson <pj@sgi.com>

  Backout one item from a previous "memory page_alloc minor cleanups" patch.
   Until and unless we are certain that no one can ever pass an empty zonelist
  to __alloc_pages(), this check for an empty zonelist (or some BUG
  equivalent) is essential.  The code in get_page_from_freelist() blow ups if
  passed an empty zonelist.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aa6fcc7..08360aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -486,7 +486,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
 	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
-	__free_one_page(page, zone ,order);
+	__free_one_page(page, zone, order);
 	spin_unlock(&zone->lock);
 }
 
@@ -926,7 +926,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 }
 
 /*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
  */
 static struct page *
@@ -948,8 +948,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-				!cpuset_zone_allowed(zone, gfp_mask))
-			continue;
+			!cpuset_zone_allowed(zone, gfp_mask))
+				continue;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
@@ -959,17 +959,18 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = zone->pages_low;
 			else
 				mark = zone->pages_high;
-			if (!zone_watermark_ok(zone , order, mark,
-				    classzone_idx, alloc_flags))
+			if (!zone_watermark_ok(zone, order, mark,
+				    classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 				    !zone_reclaim(zone, gfp_mask, order))
 					continue;
+			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
-		if (page) {
+		if (page)
 			break;
-		}
+
 	} while (*(++z) != NULL);
 	return page;
 }
@@ -1005,9 +1006,8 @@ restart:
 	if (page)
 		goto got_pg;
 
-	do {
+	for (z = zonelist->zones; *z; z++)
 		wakeup_kswapd(*z, order);
-	} while (*(++z));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
-- 
cgit v0.10.2


From c0a499c2c42992cff097b38be29d2ba60d2fd99a Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 6 Dec 2006 20:31:39 -0800
Subject: [PATCH] __unmap_hugepage_range(): add comment

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a088f59..f7355bf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -365,6 +365,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	pte_t pte;
 	struct page *page;
 	struct page *tmp;
+	/*
+	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * lock is used to avoid list corruption from multiple unmapping
+	 * of the same page since we are using page->lru.
+	 */
 	LIST_HEAD(page_list);
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
-- 
cgit v0.10.2


From 89689ae7f95995723fbcd5c116c47933a3bb8b13 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:31:45 -0800
Subject: [PATCH] Get rid of zone_table[]

The zone table is mostly not needed.  If we have a node in the page flags
then we can get to the zone via NODE_DATA() which is much more likely to be
already in the cpu cache.

In case of SMP and UP NODE_DATA() is a constant pointer which allows us to
access an exact replica of zonetable in the node_zones field.  In all of
the above cases there will be no need at all for the zone table.

The only remaining case is if in a NUMA system the node numbers do not fit
into the page flags.  In that case we make sparse generate a table that
maps sections to nodes and use that table to to figure out the node number.
 This table is sized to fit in a single cache line for the known 32 bit
NUMA platform which makes it very likely that the information can be
obtained without a cache miss.

For sparsemem the zone table seems to be have been fairly large based on
the maximum possible number of sections and the number of zones per node.
There is some memory saving by removing zone_table.  The main benefit is to
reduce the cache foootprint of the VM from the frequent lookups of zones.
Plus it simplifies the page allocator.

[akpm@osdl.org: build fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d538de9..ab6e497 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -396,7 +396,9 @@ void split_page(struct page *page, unsigned int order);
  * We are going to use the flags for the page to node mapping if its in
  * there.  This includes the case where there is no node, so it is implicit.
  */
-#define FLAGS_HAS_NODE		(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+#define NODE_NOT_IN_PAGE_FLAGS
+#endif
 
 #ifndef PFN_SECTION_SHIFT
 #define PFN_SECTION_SHIFT 0
@@ -411,13 +413,18 @@ void split_page(struct page *page, unsigned int order);
 #define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
 #define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
 
-/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
-#if FLAGS_HAS_NODE
-#define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allcator */
+#ifdef NODE_NOT_IN_PAGEFLAGS
+#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
 #else
-#define ZONETABLE_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
+#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#endif
+
+#if ZONES_WIDTH > 0
+#define ZONEID_PGSHIFT		ZONES_PGSHIFT
+#else
+#define ZONEID_PGSHIFT		NODES_PGOFF
 #endif
-#define ZONETABLE_PGSHIFT	ZONES_PGSHIFT
 
 #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
 #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
@@ -426,23 +433,25 @@ void split_page(struct page *page, unsigned int order);
 #define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
 #define NODES_MASK		((1UL << NODES_WIDTH) - 1)
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
-#define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
+#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
 
 static inline enum zone_type page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
-struct zone;
-extern struct zone *zone_table[];
-
+/*
+ * The identification function is only used by the buddy allocator for
+ * determining if two pages could be buddies. We are not really
+ * identifying a zone since we could be using a the section number
+ * id if we have not node id available in page flags.
+ * We guarantee only that it will return the same value for two
+ * combinable pages in a zone.
+ */
 static inline int page_zone_id(struct page *page)
 {
-	return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
-}
-static inline struct zone *page_zone(struct page *page)
-{
-	return zone_table[page_zone_id(page)];
+	BUILD_BUG_ON(ZONEID_PGSHIFT == 0 && ZONEID_MASK);
+	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
 }
 
 static inline unsigned long zone_to_nid(struct zone *zone)
@@ -454,13 +463,20 @@ static inline unsigned long zone_to_nid(struct zone *zone)
 #endif
 }
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+extern unsigned long page_to_nid(struct page *page);
+#else
 static inline unsigned long page_to_nid(struct page *page)
 {
-	if (FLAGS_HAS_NODE)
-		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
-	else
-		return zone_to_nid(page_zone(page));
+	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 }
+#endif
+
+static inline struct zone *page_zone(struct page *page)
+{
+	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
+}
+
 static inline unsigned long page_to_section(struct page *page)
 {
 	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
@@ -477,6 +493,7 @@ static inline void set_page_node(struct page *page, unsigned long node)
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
 	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
 }
+
 static inline void set_page_section(struct page *page, unsigned long section)
 {
 	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
@@ -947,8 +964,6 @@ extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
-extern void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
-					unsigned long pfn, unsigned long size);
 
 #ifdef CONFIG_NUMA
 extern void setup_per_cpu_pageset(void);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a6..0c055a09 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -72,7 +72,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 			return ret;
 	}
 	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
-	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
 	return 0;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 08360aa..23bc5bc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,13 +83,6 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 
 EXPORT_SYMBOL(totalram_pages);
 
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
 static char *zone_names[MAX_NR_ZONES] = {
 	 "DMA",
 #ifdef CONFIG_ZONE_DMA32
@@ -1715,20 +1708,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	}
 }
 
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
-		unsigned long pfn, unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -2421,7 +2400,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
 		BUG_ON(ret);
 		zone_start_pfn += size;
diff --git a/mm/sparse.c b/mm/sparse.c
index b3c82ba..158d6a2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
 #endif
 EXPORT_SYMBOL(mem_section);
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+unsigned long page_to_nid(struct page *page)
+{
+	return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+#endif
+
 #ifdef CONFIG_SPARSEMEM_EXTREME
 static struct mem_section *sparse_index_alloc(int nid)
 {
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid)
 	struct mem_section *section;
 	int ret = 0;
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	section_to_node_table[section_nr] = nid;
+#endif
+
 	if (mem_section[root])
 		return -EEXIST;
 
-- 
cgit v0.10.2


From 9276b1bc96a132f4068fdee00983c532f43d3a26 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 6 Dec 2006 20:31:48 -0800
Subject: [PATCH] memory page_alloc zonelist caching speedup

Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.

Remembers the zones in a zonelist that were short of free memory in the
last second.  And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)

Recent changes:

    This differs in a significant way from a similar patch that I
    posted a week ago.  Now, instead of having a nodemask_t of
    recently full nodes, I have a bitmask of recently full zones.
    This solves a problem that last weeks patch had, which on
    systems with multiple zones per node (such as DMA zone) would
    take seeing any of these zones full as meaning that all zones
    on that node were full.

    Also I changed names - from "zonelist faster" to "zonelist cache",
    as that seemed to better convey what we're doing here - caching
    some of the key zonelist state (for faster access.)

    See below for some performance benchmark results.  After all that
    discussion with David on why I didn't need them, I went and got
    some ;).  I wanted to verify that I had not hurt the normal case
    of memory allocation noticeably.  At least for my one little
    microbenchmark, I found (1) the normal case wasn't affected, and
    (2) workloads that forced scanning across multiple nodes for
    memory improved up to 10% fewer System CPU cycles and lower
    elapsed clock time ('sys' and 'real').  Good.  See details, below.

    I didn't have the logic in get_page_from_freelist() for various
    full nodes and zone reclaim failures correct.  That should be
    fixed up now - notice the new goto labels zonelist_scan,
    this_zone_full, and try_next_zone, in get_page_from_freelist().

There are two reasons I persued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:

 1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
    have seen real customer loads where the cost to scan the zonelist
    was a problem, due to many nodes being full of memory before
    we got to a node we could use.  Or at least, I think we have.
    This was related to me by another engineer, based on experiences
    from some time past.  So this is not guaranteed.  Most likely, though.

    The following approach should help such real numa systems just as
    much as it helps fake numa systems, or any combination thereof.

 2) The effort to distinguish fake from real numa, using node_distance,
    so that we could cache a fake numa node and optimize choosing
    it over equivalent distance fake nodes, while continuing to
    properly scan all real nodes in distance order, was going to
    require a nasty blob of zonelist and node distance munging.

    The following approach has no new dependency on node distances or
    zone sorting.

See comment in the patch below for a description of what it actually does.

Technical details of note (or controversy):

 - See the use of "zlc_active" and "did_zlc_setup" below, to delay
   adding any work for this new mechanism until we've looked at the
   first zone in zonelist.  I figured the odds of the first zone
   having the memory we needed were high enough that we should just
   look there, first, then get fancy only if we need to keep looking.

 - Some odd hackery was needed to add items to struct zonelist, while
   not tripping up the custom zonelists built by the mm/mempolicy.c
   code for MPOL_BIND.  My usual wordy comments below explain this.
   Search for "MPOL_BIND".

 - Some per-node data in the struct zonelist is now modified frequently,
   with no locking.  Multiple CPU cores on a node could hit and mangle
   this data.  The theory is that this is just performance hint data,
   and the memory allocator will work just fine despite any such mangling.
   The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
   (a bitmask) and 'last_full_zap' (unsigned long jiffies).  It should
   all be self correcting after at most a one second delay.

 - This still does a linear scan of the same lengths as before.  All
   I've optimized is making the scan faster, not algorithmically
   shorter.  It is now able to scan a compact array of 'unsigned
   short' in the case of many full nodes, so one cache line should
   cover quite a few nodes, rather than each node hitting another
   one or two new and distinct cache lines.

 - If both Andi and Nick don't find this too complicated, I will be
   (pleasantly) flabbergasted.

 - I removed the comment claiming we only use one cachline's worth of
   zonelist.  We seem, at least in the fake numa case, to have put the
   lie to that claim.

 - I pay no attention to the various watermarks and such in this performance
   hint.  A node could be marked full for one watermark, and then skipped
   over when searching for a page using a different watermark.  I think
   that's actually quite ok, as it will tend to slightly increase the
   spreading of memory over other nodes, away from a memory stressed node.

===============

Performance - some benchmark results and analysis:

This benchmark runs a memory hog program that uses multiple
threads to touch alot of memory as quickly as it can.

Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.)  System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.

Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added.  The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.

      number     2.6.18-mm3	   zonelist-cache    delta (< 0 good)	percent
 GBs    N  	------------	   --------------    ----------------	systime
 mem threads   sys user  real	  sys  user  real     sys  user  real	 better
  12	 1     153   24   177	  151	 24   176      -2     0    -1	   1%
  12	19	99   22     8	   99	 22	8	0     0     0	   0%
  12	37     111   25     6	  112	 25	6	1     0     0	  -0%
  12	55     115   25     5	  110	 23	5      -5    -2     0	   4%
  38	 1     502   74   576	  497	 73   570      -5    -1    -6	   0%
  38	19     426   78    48	  373	 76    39     -53    -2    -9	  12%
  38	37     544   83    36	  547	 82    36	3    -1     0	  -0%
  38	55     501   77    23	  511	 80    24      10     3     1	  -1%
  64	 1     917  125  1042	  890	124  1014     -27    -1   -28	   2%
  64	19    1118  138   119	  965	141   103    -153     3   -16	  13%
  64	37    1202  151    94	 1136	150    81     -66    -1   -13	   5%
  64	55    1118  141    61	 1072	140    58     -46    -1    -3	   4%
  90	 1    1342  177  1519	 1275	174  1450     -67    -3   -69	   4%
  90	19    2392  199   192	 2116	189   176    -276   -10   -16	  11%
  90	37    3313  238   175	 2972	225   145    -341   -13   -30	  10%
  90	55    1948  210   104	 1843	213   100    -105     3    -4	   5%

Notes:
 1) This test ran a memory hog program that started a specified number N of
    threads, and had each thread allocate and touch 1/N'th of
    the total memory to be used in the test run in a single loop,
    writing a constant word to memory, one store every 4096 bytes.
    Watching this test during some earlier trial runs, I would see
    each of these threads sit down on one CPU and stay there, for
    the remainder of the pass, a different CPU for each thread.

 2) The 'real' column is not comparable to the 'sys' or 'user' columns.
    The 'real' column is seconds wall clock time elapsed, from beginning
    to end of that test pass.  The 'sys' and 'user' columns are total
    CPU seconds spent on that test pass.  For a 19 thread test run,
    for example, the sum of 'sys' and 'user' could be up to 19 times the
    number of 'real' elapsed wall clock seconds.

 3) Tests were run on a fresh, single-user boot, to minimize the amount
    of memory already in use at the start of the test, and to minimize
    the amount of background activity that might interfere.

 4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.

 5) Notice that the 'real' time gets large for the single thread runs, even
    though the measured 'sys' and 'user' times are modest.  I'm not sure what
    that means - probably something to do with it being slow for one thread to
    be accessing memory along ways away.  Perhaps the fake numa system, running
    ostensibly the same workload, would not show this substantial degradation
    of 'real' time for one thread on many nodes -- lets hope not.

 6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
    ran quite efficiently, as one might expect.  Each pair of threads needed
    to allocate and touch the memory on the node the two threads shared, a
    pleasantly parallizable workload.

 7) The intermediate thread count passes, when asking for alot of memory forcing
    them to go to a few neighboring nodes, improved the most with this zonelist
    caching patch.

Conclusions:
 * This zonelist cache patch probably makes little difference one way or the
   other for most workloads on real numa hardware, if those workloads avoid
   heavy off node allocations.
 * For memory intensive workloads requiring substantial off-node allocations
   on real numa hardware, this patch improves both kernel and elapsed timings
   up to ten per-cent.
 * For fake numa systems, I'm optimistic, but will have to leave that up to
   Rohit Seth to actually test (once I get him a 2.6.18 backport.)

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 4d8adf6..748d2c9 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,6 +23,7 @@ extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
+#define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
@@ -83,6 +84,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 	return node_possible_map;
 }
 
+#define cpuset_current_mems_allowed (node_online_map)
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e06683e..09bf9d8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -288,19 +288,94 @@ struct zone {
  */
 #define DEF_PRIORITY 12
 
+/* Maximum number of zones on a zonelist */
+#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+
+#ifdef CONFIG_NUMA
+/*
+ * We cache key information from each zonelist for smaller cache
+ * footprint when scanning for free pages in get_page_from_freelist().
+ *
+ * 1) The BITMAP fullzones tracks which zones in a zonelist have come
+ *    up short of free memory since the last time (last_fullzone_zap)
+ *    we zero'd fullzones.
+ * 2) The array z_to_n[] maps each zone in the zonelist to its node
+ *    id, so that we can efficiently evaluate whether that node is
+ *    set in the current tasks mems_allowed.
+ *
+ * Both fullzones and z_to_n[] are one-to-one with the zonelist,
+ * indexed by a zones offset in the zonelist zones[] array.
+ *
+ * The get_page_from_freelist() routine does two scans.  During the
+ * first scan, we skip zones whose corresponding bit in 'fullzones'
+ * is set or whose corresponding node in current->mems_allowed (which
+ * comes from cpusets) is not set.  During the second scan, we bypass
+ * this zonelist_cache, to ensure we look methodically at each zone.
+ *
+ * Once per second, we zero out (zap) fullzones, forcing us to
+ * reconsider nodes that might have regained more free memory.
+ * The field last_full_zap is the time we last zapped fullzones.
+ *
+ * This mechanism reduces the amount of time we waste repeatedly
+ * reexaming zones for free memory when they just came up low on
+ * memory momentarilly ago.
+ *
+ * The zonelist_cache struct members logically belong in struct
+ * zonelist.  However, the mempolicy zonelists constructed for
+ * MPOL_BIND are intentionally variable length (and usually much
+ * shorter).  A general purpose mechanism for handling structs with
+ * multiple variable length members is more mechanism than we want
+ * here.  We resort to some special case hackery instead.
+ *
+ * The MPOL_BIND zonelists don't need this zonelist_cache (in good
+ * part because they are shorter), so we put the fixed length stuff
+ * at the front of the zonelist struct, ending in a variable length
+ * zones[], as is needed by MPOL_BIND.
+ *
+ * Then we put the optional zonelist cache on the end of the zonelist
+ * struct.  This optional stuff is found by a 'zlcache_ptr' pointer in
+ * the fixed length portion at the front of the struct.  This pointer
+ * both enables us to find the zonelist cache, and in the case of
+ * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL)
+ * to know that the zonelist cache is not there.
+ *
+ * The end result is that struct zonelists come in two flavors:
+ *  1) The full, fixed length version, shown below, and
+ *  2) The custom zonelists for MPOL_BIND.
+ * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
+ *
+ * Even though there may be multiple CPU cores on a node modifying
+ * fullzones or last_full_zap in the same zonelist_cache at the same
+ * time, we don't lock it.  This is just hint data - if it is wrong now
+ * and then, the allocator will still function, perhaps a bit slower.
+ */
+
+
+struct zonelist_cache {
+	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
+	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
+	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
+};
+#else
+struct zonelist_cache;
+#endif
+
 /*
  * One allocation request operates on a zonelist. A zonelist
  * is a list of zones, the first one is the 'goal' of the
  * allocation, the other zones are fallback zones, in decreasing
  * priority.
  *
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * If zlcache_ptr is not NULL, then it is just the address of zlcache,
+ * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
  */
+
 struct zonelist {
-	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+	struct zonelist_cache *zlcache_ptr;		     // NULL or &zlcache
+	struct zone *zones[MAX_ZONES_PER_ZONELIST + 1];      // NULL delimited
+#ifdef CONFIG_NUMA
+	struct zonelist_cache zlcache;			     // optional ...
+#endif
 };
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31..fb90723 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max++;			/* space for zlcache_ptr (see mmzone.h) */
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
+	zl->zlcache_ptr = NULL;
 	num = 0;
 	/* First put in the highest zones from all nodes, then all the next 
 	   lower zones etc. Avoid empty zones because the memory allocator
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23bc5bc..230771d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -918,6 +918,126 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full.  See further
+ * comments in mmzone.h.  Reduces cache footprint of zonelist scans
+ * that have to skip over alot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * tasks mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ *  1) Check that the zone isn't thought to be full (doesn't have its
+ *     bit set in the zonelist_cache fullzones BITMAP).
+ *  2) Check that the zones node (obtained from the zonelist_cache
+ *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -926,23 +1046,32 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-				continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
@@ -956,15 +1085,30 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				    classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 				    !zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
 			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
 		if (page)
 			break;
-
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
+		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
@@ -1535,6 +1679,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1572,14 +1734,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From 7253f4ef04b1cd138baf2b29a95473743ac0a307 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 6 Dec 2006 20:31:49 -0800
Subject: [PATCH] memory page_alloc zonelist caching reorder structure

Rearrange the struct members in the 'struct zonelist_cache' structure, so
as to put the readonly (once initialized) z_to_n[] array first, where it
will come right after the zones[] array in struct zonelist.

This pretty much eliminates the chance that the two frequently written
elements of 'struct zonelist_cache', the fullzones bitmap and last_full_zap
times, will end up on the same cache line as the performance sensitive,
frequently read, never (after init) written zones[] array.

Keeping frequently written data off frequently read cache lines is good for
performance.

Thanks to Rohit Seth for the suggestion.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 09bf9d8..da6002d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -352,8 +352,8 @@ struct zone {
 
 
 struct zonelist_cache {
-	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
 	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
+	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
 	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
 };
 #else
-- 
cgit v0.10.2


From c33e0fca3508f0aa387b1c10d0ef158102deb140 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 6 Dec 2006 20:31:50 -0800
Subject: [PATCH] oom: don't kill unkillable children or siblings

Abort the kill if any of our threads have OOM_DISABLE set.  Having this
test here also prevents any OOM_DISABLE child of the "selected" process
from being killed.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2e3ce3a..bc2627d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -313,15 +313,24 @@ static int oom_kill_task(struct task_struct *p, const char *message)
 	if (mm == NULL)
 		return 1;
 
+	/*
+	 * Don't kill the process if any threads are set to OOM_DISABLE
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(g, q);
+
 	__oom_kill_task(p, message);
+
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
 	 * but are in a different thread group
 	 */
-	do_each_thread(g, q)
+	do_each_thread(g, q) {
 		if (q->mm == mm && q->tgid != p->tgid)
 			__oom_kill_task(q, message);
-	while_each_thread(g, q);
+	} while_each_thread(g, q);
 
 	return 0;
 }
-- 
cgit v0.10.2


From f3af38d30c18538d069a95e624a3db7c3d486a1e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 6 Dec 2006 20:31:51 -0800
Subject: [PATCH] oom: cleanup messages

Clean up the OOM killer messages to be more consistent.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bc2627d..a6879f2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -264,7 +264,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
  * flag though it's unlikely that  we select a process with CAP_SYS_RAW_IO
  * set.
  */
-static void __oom_kill_task(struct task_struct *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, int verbose)
 {
 	if (is_init(p)) {
 		WARN_ON(1);
@@ -278,10 +278,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 		return;
 	}
 
-	if (message) {
-		printk(KERN_ERR "%s: Killed process %d (%s).\n",
-				message, p->pid, p->comm);
-	}
+	if (verbose)
+		printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -294,7 +292,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 	force_sig(SIGKILL, p);
 }
 
-static int oom_kill_task(struct task_struct *p, const char *message)
+static int oom_kill_task(struct task_struct *p)
 {
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
@@ -321,15 +319,15 @@ static int oom_kill_task(struct task_struct *p, const char *message)
 			return 1;
 	} while_each_thread(g, q);
 
-	__oom_kill_task(p, message);
+	__oom_kill_task(p, 1);
 
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
+	 * but are in a different thread group.
 	 */
 	do_each_thread(g, q) {
 		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q, message);
+			__oom_kill_task(q, 1);
 	} while_each_thread(g, q);
 
 	return 0;
@@ -346,21 +344,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
 	if (p->flags & PF_EXITING) {
-		__oom_kill_task(p, NULL);
+		__oom_kill_task(p, 0);
 		return 0;
 	}
 
-	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
-			" and children.\n", p->pid, p->comm, points);
+	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+					message, p->pid, p->comm, points);
+
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
 		if (c->mm == p->mm)
 			continue;
-		if (!oom_kill_task(c, message))
+		if (!oom_kill_task(c))
 			return 0;
 	}
-	return oom_kill_task(p, message);
+	return oom_kill_task(p);
 }
 
 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
-- 
cgit v0.10.2


From f2a2a7108aa0039ba7a5fe7a0d2ecef2219a7584 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 6 Dec 2006 20:31:52 -0800
Subject: [PATCH] oom: less memdie

Don't cause all threads in all other thread groups to gain TIF_MEMDIE
otherwise we'll get a thundering herd eating our memory reserve.  This may not
be the optimal scheme, but it fits our policy of allowing just one TIF_MEMDIE
in the system at once.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a6879f2..223d9cc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -323,11 +323,12 @@ static int oom_kill_task(struct task_struct *p)
 
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group.
+	 * but are in a different thread group. Don't let them have access
+	 * to memory reserves though, otherwise we might deplete all memory.
 	 */
 	do_each_thread(g, q) {
 		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q, 1);
+			force_sig(SIGKILL, p);
 	} while_each_thread(g, q);
 
 	return 0;
-- 
cgit v0.10.2


From cd54e7e54318d333227b13186f9a464bf1f68d27 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 6 Dec 2006 20:31:53 -0800
Subject: [PATCH] mm: incorrect VM_FAULT_OOM returns from drivers

Some drivers are returning OOM when it is not in response to a memory
shortage.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Dave Airlie <airlied@linux.ie>
Cc: Jaroslav Kysela <perex@suse.cz>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/drm/drm_vm.c b/drivers/char/drm/drm_vm.c
index b40ae43..ae26919 100644
--- a/drivers/char/drm/drm_vm.c
+++ b/drivers/char/drm/drm_vm.c
@@ -147,14 +147,14 @@ static __inline__ struct page *drm_do_vm_shm_nopage(struct vm_area_struct *vma,
 	if (address > vma->vm_end)
 		return NOPAGE_SIGBUS;	/* Disallow mremap */
 	if (!map)
-		return NOPAGE_OOM;	/* Nothing allocated */
+		return NOPAGE_SIGBUS;	/* Nothing allocated */
 
 	offset = address - vma->vm_start;
 	i = (unsigned long)map->handle + offset;
 	page = (map->type == _DRM_CONSISTENT) ?
 		virt_to_page((void *)i) : vmalloc_to_page((void *)i);
 	if (!page)
-		return NOPAGE_OOM;
+		return NOPAGE_SIGBUS;
 	get_page(page);
 
 	DRM_DEBUG("shm_nopage 0x%lx\n", address);
@@ -272,7 +272,7 @@ static __inline__ struct page *drm_do_vm_dma_nopage(struct vm_area_struct *vma,
 	if (address > vma->vm_end)
 		return NOPAGE_SIGBUS;	/* Disallow mremap */
 	if (!dma->pagelist)
-		return NOPAGE_OOM;	/* Nothing allocated */
+		return NOPAGE_SIGBUS;	/* Nothing allocated */
 
 	offset = address - vma->vm_start;	/* vm_[pg]off[set] should be 0 */
 	page_nr = offset >> PAGE_SHIFT;
@@ -310,7 +310,7 @@ static __inline__ struct page *drm_do_vm_sg_nopage(struct vm_area_struct *vma,
 	if (address > vma->vm_end)
 		return NOPAGE_SIGBUS;	/* Disallow mremap */
 	if (!entry->pagelist)
-		return NOPAGE_OOM;	/* Nothing allocated */
+		return NOPAGE_SIGBUS;	/* Nothing allocated */
 
 	offset = address - vma->vm_start;
 	map_offset = map->offset - (unsigned long)dev->sg->virtual;
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 66e24b5..6ea67b1 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3027,7 +3027,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area,
 	struct page * page;
 	
 	if (substream == NULL)
-		return NOPAGE_OOM;
+		return NOPAGE_SIGBUS;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->status);
 	get_page(page);
@@ -3070,7 +3070,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area,
 	struct page * page;
 	
 	if (substream == NULL)
-		return NOPAGE_OOM;
+		return NOPAGE_SIGBUS;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->control);
 	get_page(page);
@@ -3131,18 +3131,18 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area,
 	size_t dma_bytes;
 	
 	if (substream == NULL)
-		return NOPAGE_OOM;
+		return NOPAGE_SIGBUS;
 	runtime = substream->runtime;
 	offset = area->vm_pgoff << PAGE_SHIFT;
 	offset += address - area->vm_start;
-	snd_assert((offset % PAGE_SIZE) == 0, return NOPAGE_OOM);
+	snd_assert((offset % PAGE_SIZE) == 0, return NOPAGE_SIGBUS);
 	dma_bytes = PAGE_ALIGN(runtime->dma_bytes);
 	if (offset > dma_bytes - PAGE_SIZE)
 		return NOPAGE_SIGBUS;
 	if (substream->ops->page) {
 		page = substream->ops->page(substream, offset);
 		if (! page)
-			return NOPAGE_OOM;
+			return NOPAGE_OOM; /* XXX: is this really due to OOM? */
 	} else {
 		vaddr = runtime->dma_area + offset;
 		page = virt_to_page(vaddr);
diff --git a/sound/oss/via82cxxx_audio.c b/sound/oss/via82cxxx_audio.c
index 17837d4..c96cc8c 100644
--- a/sound/oss/via82cxxx_audio.c
+++ b/sound/oss/via82cxxx_audio.c
@@ -2120,8 +2120,8 @@ static struct page * via_mm_nopage (struct vm_area_struct * vma,
 		return NOPAGE_SIGBUS; /* Disallow mremap */
 	}
         if (!card) {
-		DPRINTK ("EXIT, returning NOPAGE_OOM\n");
-		return NOPAGE_OOM;	/* Nothing allocated */
+		DPRINTK ("EXIT, returning NOPAGE_SIGBUS\n");
+		return NOPAGE_SIGBUS;	/* Nothing allocated */
 	}
 
 	pgoff = vma->vm_pgoff + ((address - vma->vm_start) >> PAGE_SHIFT);
diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c
index 4b52d18..b76b3dd 100644
--- a/sound/usb/usx2y/usX2Yhwdep.c
+++ b/sound/usb/usx2y/usX2Yhwdep.c
@@ -48,7 +48,7 @@ static struct page * snd_us428ctls_vm_nopage(struct vm_area_struct *area, unsign
 	
 	offset = area->vm_pgoff << PAGE_SHIFT;
 	offset += address - area->vm_start;
-	snd_assert((offset % PAGE_SIZE) == 0, return NOPAGE_OOM);
+	snd_assert((offset % PAGE_SIZE) == 0, return NOPAGE_SIGBUS);
 	vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->us428ctls_sharedmem + offset;
 	page = virt_to_page(vaddr);
 	get_page(page);
-- 
cgit v0.10.2


From 098fe651f7e9d759d1117c78c1a642b9b3945922 Mon Sep 17 00:00:00 2001
From: Ashwin Chaugule <ashwin.chaugule@celunite.com>
Date: Wed, 6 Dec 2006 20:31:54 -0800
Subject: [PATCH] grab swap token reordered

Make sure the contention for the token happens _before_ any read-in and
kicks the swap-token algo only when the VM is under pressure.

Signed-off-by: Ashwin Chaugule <ashwin.chaugule@celunite.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/filemap.c b/mm/filemap.c
index 13df01c..af7e2f5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1445,7 +1445,6 @@ no_cached_page:
 	 * effect.
 	 */
 	error = page_cache_read(file, pgoff);
-	grab_swap_token();
 
 	/*
 	 * The page we want has now been added to the page cache.
diff --git a/mm/memory.c b/mm/memory.c
index 156861f..a07120d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1991,6 +1991,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
+		grab_swap_token(); /* Contend for token _before_ read-in */
  		swapin_readahead(entry, address, vma);
  		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
@@ -2008,7 +2009,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
-		grab_swap_token();
 	}
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-- 
cgit v0.10.2


From 7602bdf2fd14a40dd9b104e516fdc05e1bd17952 Mon Sep 17 00:00:00 2001
From: Ashwin Chaugule <ashwin.chaugule@celunite.com>
Date: Wed, 6 Dec 2006 20:31:57 -0800
Subject: [PATCH] new scheme to preempt swap token

The new swap token patches replace the current token traversal algo.  The old
algo had a crude timeout parameter that was used to handover the token from
one task to another.  This algo, transfers the token to the tasks that are in
need of the token.  The urgency for the token is based on the number of times
a task is required to swap-in pages.  Accordingly, the priority of a task is
incremented if it has been badly affected due to swap-outs.  To ensure that
the token doesnt bounce around rapidly, the token holders are given a priority
boost.  The priority of tasks is also decremented, if their rate of swap-in's
keeps reducing.  This way, the condition to check whether to pre-empt the swap
token, is a matter of comparing two task's priority fields.

[akpm@osdl.org: cleanups]
Signed-off-by: Ashwin Chaugule <ashwin.chaugule@celunite.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eafe4a7..cad6a16 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -344,9 +344,16 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
-	/* Token based thrashing protection. */
-	unsigned long swap_token_time;
-	char recent_pagein;
+	/* Swap token stuff */
+	/*
+	 * Last value of global fault stamp as seen by this process.
+	 * In other words, this value gives an indication of how long
+	 * it has been since this task got the token.
+	 * Look at mm/thrash.c
+	 */
+	unsigned int faultstamp;
+	unsigned int token_priority;
+	unsigned int last_interval;
 
 	/* coredumping support */
 	int core_waiters;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e7c36ba..89f8a39 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -259,7 +259,6 @@ extern spinlock_t swap_lock;
 
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
-extern unsigned long swap_token_default_timeout;
 extern void grab_swap_token(void);
 extern void __put_swap_token(struct mm_struct *);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e7..5678e6c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -479,6 +479,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
 
 	memcpy(mm, oldmm, sizeof(*mm));
 
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	if (!mm_init(mm))
 		goto fail_nomem;
 
@@ -542,6 +546,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 		goto fail_nomem;
 
 good_mm:
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09e569f4..7abe970 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -977,17 +977,6 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
-#ifdef CONFIG_SWAP
-	{
-		.ctl_name	= VM_SWAP_TOKEN_TIMEOUT,
-		.procname	= "swap_token_timeout",
-		.data		= &swap_token_default_timeout,
-		.maxlen		= sizeof(swap_token_default_timeout),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-#endif
 #ifdef CONFIG_NUMA
 	{
 		.ctl_name	= VM_ZONE_RECLAIM_MODE,
diff --git a/mm/thrash.c b/mm/thrash.c
index f4c560b..19e428c 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -7,100 +7,74 @@
  *
  * Simple token based thrashing protection, using the algorithm
  * described in:  http://www.cs.wm.edu/~sjiang/token.pdf
+ *
+ * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
+ * Improved algorithm to pass token:
+ * Each task has a priority which is incremented if it contended
+ * for the token in an interval less than its previous attempt.
+ * If the token is acquired, that task's priority is boosted to prevent
+ * the token from bouncing around too often and to let the task make
+ * some progress in its execution.
  */
+
 #include <linux/jiffies.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
 
 static DEFINE_SPINLOCK(swap_token_lock);
-static unsigned long swap_token_timeout;
-static unsigned long swap_token_check;
-struct mm_struct * swap_token_mm = &init_mm;
-
-#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT	(300 * HZ)
-/*
- * Currently disabled; Needs further code to work at HZ * 300.
- */
-unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
-
-/*
- * Take the token away if the process had no page faults
- * in the last interval, or if it has held the token for
- * too long.
- */
-#define SWAP_TOKEN_ENOUGH_RSS 1
-#define SWAP_TOKEN_TIMED_OUT 2
-static int should_release_swap_token(struct mm_struct *mm)
-{
-	int ret = 0;
-	if (!mm->recent_pagein)
-		ret = SWAP_TOKEN_ENOUGH_RSS;
-	else if (time_after(jiffies, swap_token_timeout))
-		ret = SWAP_TOKEN_TIMED_OUT;
-	mm->recent_pagein = 0;
-	return ret;
-}
+struct mm_struct *swap_token_mm;
+unsigned int global_faults;
 
-/*
- * Try to grab the swapout protection token.  We only try to
- * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
- * SMP lock contention and to check that the process that held
- * the token before is no longer thrashing.
- */
 void grab_swap_token(void)
 {
-	struct mm_struct *mm;
-	int reason;
+	int current_interval;
 
-	/* We have the token. Let others know we still need it. */
-	if (has_swap_token(current->mm)) {
-		current->mm->recent_pagein = 1;
-		if (unlikely(!swap_token_default_timeout))
-			disable_swap_token();
-		return;
-	}
-
-	if (time_after(jiffies, swap_token_check)) {
+	global_faults++;
 
-		if (!swap_token_default_timeout) {
-			swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
-			return;
-		}
-
-		/* ... or if we recently held the token. */
-		if (time_before(jiffies, current->mm->swap_token_time))
-			return;
+	current_interval = global_faults - current->mm->faultstamp;
 
-		if (!spin_trylock(&swap_token_lock))
-			return;
+	if (!spin_trylock(&swap_token_lock))
+		return;
 
-		swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+	/* First come first served */
+	if (swap_token_mm == NULL) {
+		current->mm->token_priority = current->mm->token_priority + 2;
+		swap_token_mm = current->mm;
+		goto out;
+	}
 
-		mm = swap_token_mm;
-		if ((reason = should_release_swap_token(mm))) {
-			unsigned long eligible = jiffies;
-			if (reason == SWAP_TOKEN_TIMED_OUT) {
-				eligible += swap_token_default_timeout;
-			}
-			mm->swap_token_time = eligible;
-			swap_token_timeout = jiffies + swap_token_default_timeout;
+	if (current->mm != swap_token_mm) {
+		if (current_interval < current->mm->last_interval)
+			current->mm->token_priority++;
+		else {
+			current->mm->token_priority--;
+			if (unlikely(current->mm->token_priority < 0))
+				current->mm->token_priority = 0;
+		}
+		/* Check if we deserve the token */
+		if (current->mm->token_priority >
+				swap_token_mm->token_priority) {
+			current->mm->token_priority += 2;
 			swap_token_mm = current->mm;
 		}
-		spin_unlock(&swap_token_lock);
+	} else {
+		/* Token holder came in again! */
+		current->mm->token_priority += 2;
 	}
-	return;
+
+out:
+	current->mm->faultstamp = global_faults;
+	current->mm->last_interval = current_interval;
+	spin_unlock(&swap_token_lock);
+return;
 }
 
 /* Called on process exit. */
 void __put_swap_token(struct mm_struct *mm)
 {
 	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm)) {
-		mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
-		swap_token_mm = &init_mm;
-		swap_token_check = jiffies;
-	}
+	if (likely(mm == swap_token_mm))
+		swap_token_mm = NULL;
 	spin_unlock(&swap_token_lock);
 }
-- 
cgit v0.10.2


From cc102509074bba0316f2b5deebd7ef4447da295e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 6 Dec 2006 20:32:00 -0800
Subject: [PATCH] mm: add arch_alloc_page

Add an arch_alloc_page to match arch_free_page.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index bf2b6bc..00c314a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -116,6 +116,9 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 #ifndef HAVE_ARCH_FREE_PAGE
 static inline void arch_free_page(struct page *page, int order) { }
 #endif
+#ifndef HAVE_ARCH_ALLOC_PAGE
+static inline void arch_alloc_page(struct page *page, int order) { }
+#endif
 
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 230771d..cd47e8f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -598,6 +598,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
 	set_page_refcounted(page);
+
+	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 
 	if (gfp_flags & __GFP_ZERO)
-- 
cgit v0.10.2


From e1dbeda60a7ea9e82a908d93c07308d104d50d79 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:32:01 -0800
Subject: [PATCH] balance_pdgat() cleanup

Despaghettify balance_pdgat() a bit.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 518540a..2e97baa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1172,11 +1172,12 @@ loop_again:
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
 					       0, 0)) {
 				end_zone = i;
-				goto scan;
+				break;
 			}
 		}
-		goto out;
-scan:
+		if (i < 0)
+			goto out;
+
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
-- 
cgit v0.10.2


From 39dde65c9940c97fcd178a3d2b1c57ed8b7b68aa Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 6 Dec 2006 20:32:03 -0800
Subject: [PATCH] shared page table for hugetlb page

Following up with the work on shared page table done by Dave McCracken.  This
set of patch target shared page table for hugetlb memory only.

The shared page table is particular useful in the situation of large number of
independent processes sharing large shared memory segments.  In the normal
page case, the amount of memory saved from process' page table is quite
significant.  For hugetlb, the saving on page table memory is not the primary
objective (as hugetlb itself already cuts down page table overhead
significantly), instead, the purpose of using shared page table on hugetlb is
to allow faster TLB refill and smaller cache pollution upon TLB miss.

With PT sharing, pte entries are shared among hundreds of processes, the cache
consumption used by all the page table is smaller and in return, application
gets much higher cache hit ratio.  One other effect is that cache hit ratio
with hardware page walker hitting on pte in cache will be higher and this
helps to reduce tlb miss latency.  These two effects contribute to higher
application performance.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Dave McCracken <dmccr@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 1719a81..34728e4 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -17,6 +17,113 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+				struct vm_area_struct *vma,
+				unsigned long addr, pgoff_t idx)
+{
+	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+				svma->vm_start;
+	unsigned long sbase = saddr & PUD_MASK;
+	unsigned long s_end = sbase + PUD_SIZE;
+
+	/*
+	 * match the virtual addresses, permission and the alignment of the
+	 * page table page.
+	 */
+	if (pmd_index(addr) != pmd_index(saddr) ||
+	    vma->vm_flags != svma->vm_flags ||
+	    sbase < svma->vm_start || svma->vm_end < s_end)
+		return 0;
+
+	return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long base = addr & PUD_MASK;
+	unsigned long end = base + PUD_SIZE;
+
+	/*
+	 * check on proper vm_flags and page table alignment
+	 */
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    vma->vm_start <= base && end <= vma->vm_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
+	struct prio_tree_iter iter;
+	struct vm_area_struct *svma;
+	unsigned long saddr;
+	pte_t *spte = NULL;
+
+	if (!vma_shareable(vma, addr))
+		return;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+		if (svma == vma)
+			continue;
+
+		saddr = page_table_shareable(svma, vma, addr, idx);
+		if (saddr) {
+			spte = huge_pte_offset(svma->vm_mm, saddr);
+			if (spte) {
+				get_page(virt_to_page(spte));
+				break;
+			}
+		}
+	}
+
+	if (!spte)
+		goto out;
+
+	spin_lock(&mm->page_table_lock);
+	if (pud_none(*pud))
+		pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
+	else
+		put_page(virt_to_page(spte));
+	spin_unlock(&mm->page_table_lock);
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *	    0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	pgd_t *pgd = pgd_offset(mm, *addr);
+	pud_t *pud = pud_offset(pgd, *addr);
+
+	BUG_ON(page_count(virt_to_page(ptep)) == 0);
+	if (page_count(virt_to_page(ptep)) == 1)
+		return 0;
+
+	pud_clear(pud);
+	put_page(virt_to_page(ptep));
+	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+	return 1;
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -25,8 +132,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
-	if (pud)
+	if (pud) {
+		if (pud_none(*pud))
+			huge_pmd_share(mm, addr, pud);
 		pte = (pte_t *) pmd_alloc(mm, pud, addr);
+	}
 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
 	return pte;
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f3a9585..0c7e94e 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -64,6 +64,11 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
 
 /*
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 506d897..424a8f5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -146,6 +146,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return hugepte_offset(hpdp, addr);
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 329059d..cf2c2ee 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -63,6 +63,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index 187cf01..4b455f6 100644
--- a/arch/sh64/mm/hugetlbpage.c
+++ b/arch/sh64/mm/hugetlbpage.c
@@ -53,6 +53,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 53b9b1f..33fd0b2 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -235,6 +235,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ace64e5..a60995a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,7 @@ extern int sysctl_hugetlb_shm_group;
 
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7355bf..9244971 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -386,6 +386,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		if (!ptep)
 			continue;
 
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
@@ -658,11 +661,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
+	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -671,6 +677,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
 	flush_tlb_range(vma, start, end);
 }
-- 
cgit v0.10.2


From cace673d376d97b0c66ffa0a49b8d588a696d5d2 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 6 Dec 2006 20:32:07 -0800
Subject: [PATCH] htlb forget rss with pt sharing

Imprecise RSS accounting is an irritating ill effect with pt sharing.  After
consulted with several VM experts, I have tried various methods to solve that
problem: (1) iterate through all mm_structs that share the PT and increment
count; (2) keep RSS count in page table structure and then sum them up at
reporting time.  None of the above methods yield any satisfactory
implementation.

Since process RSS accounting is pure information only, I propose we don't
count them at all for hugetlb page.  rlimit has such field, though there is
absolutely no enforcement on limiting that resource.  One other method is to
account all RSS at hugetlb mmap time regardless they are faulted or not.  I
opt for the simplicity of no accounting at all.

Hugetlb page are special, they are reserved up front in global reservation
pool and is not reclaimable.  From physical memory resource point of view, it
is already consumed regardless whether there are users using them.

If the concern is that RSS can be used to control resource allocation, we
already can specify hugetlb fs size limit and sysadmin can enforce that at
mount time.  Combined with the two points mentioned above, I fail to see if
there is anything got affected because of this patch.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Dave McCracken <dmccr@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9244971..2911a36 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
-			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -377,10 +376,6 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(end & ~HPAGE_MASK);
 
 	spin_lock(&mm->page_table_lock);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
@@ -395,9 +390,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 		page = pte_page(pte);
 		list_add(&page->lru, &page_list);
-		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
-
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
@@ -523,7 +516,6 @@ retry:
 	if (!pte_none(*ptep))
 		goto backout;
 
-	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
-- 
cgit v0.10.2


From a44b56d354b49f9abb184e5a14f71889856283bb Mon Sep 17 00:00:00 2001
From: Kevin Hilman <khilman@mvista.com>
Date: Wed, 6 Dec 2006 20:32:11 -0800
Subject: [PATCH] slab debug and ARCH_SLAB_MINALIGN don't get along

When CONFIG_SLAB_DEBUG is used in combination with ARCH_SLAB_MINALIGN, some
debug flags should be disabled which depend on BYTES_PER_WORD alignment.

The disabling of these debug flags is not properly handled when
BYTES_PER_WORD < ARCH_SLAB_MEMALIGN < cache_line_size()

This patch fixes that and also adds an alignment check to
cache_alloc_debugcheck_after() when ARCH_SLAB_MINALIGN is used.

Signed-off-by: Kevin Hilman <khilman@mvista.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/slab.c b/mm/slab.c
index 5de8147..ff60a94 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2197,18 +2197,17 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
 		ralign = BYTES_PER_WORD;
 
-	/* 2) arch mandated alignment: disables debug if necessary */
+	/* 2) arch mandated alignment */
 	if (ralign < ARCH_SLAB_MINALIGN) {
 		ralign = ARCH_SLAB_MINALIGN;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
-	/* 3) caller mandated alignment: disables debug if necessary */
+	/* 3) caller mandated alignment */
 	if (ralign < align) {
 		ralign = align;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
+	/* disable debug if necessary */
+	if (ralign > BYTES_PER_WORD)
+		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	/*
 	 * 4) Store it.
 	 */
@@ -3063,6 +3062,12 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 
 		cachep->ctor(objp, cachep, ctor_flags);
 	}
+#if ARCH_SLAB_MINALIGN
+	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+		       objp, ARCH_SLAB_MINALIGN);
+	}
+#endif
 	return objp;
 }
 #else
-- 
cgit v0.10.2


From 8f5be20bf87da7c7c59c5cc84f630a1eca5cc99c Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Wed, 6 Dec 2006 20:32:14 -0800
Subject: [PATCH] mm: slab: eliminate lock_cpu_hotplug from slab

Here's an attempt towards doing away with lock_cpu_hotplug in the slab
subsystem.  This approach also fixes a bug which shows up when cpus are
being offlined/onlined and slab caches are being tuned simultaneously.

http://marc.theaimsgroup.com/?l=linux-kernel&m=116098888100481&w=2

The patch has been stress tested overnight on a 2 socket 4 core AMD box with
repeated cpu online and offline, while dbench and kernbench process are
running, and slab caches being tuned at the same time.
There were no lockdep warnings either.  (This test on 2,6.18 as 2.6.19-rc
crashes at __drain_pages
http://marc.theaimsgroup.com/?l=linux-kernel&m=116172164217678&w=2 )

The approach here is to hold cache_chain_mutex from CPU_UP_PREPARE until
CPU_ONLINE (similar in approach as worqueue_mutex) .  Slab code sensitive
to cpu_online_map (kmem_cache_create, kmem_cache_destroy, slabinfo_write,
__cache_shrink) is already serialized with cache_chain_mutex.  (This patch
lengthens cache_chain_mutex hold time at kmem_cache_destroy to cover this).
 This patch also takes the cache_chain_sem at kmem_cache_shrink to protect
sanity of cpu_online_map at __cache_shrink, as viewed by slab.
(kmem_cache_shrink->__cache_shrink->drain_cpu_caches).  But, really,
kmem_cache_shrink is used at just one place in the acpi subsystem!  Do we
really need to keep kmem_cache_shrink at all?

Another note.  Looks like a cpu hotplug event can send  CPU_UP_CANCELED to
a registered subsystem even if the subsystem did not receive CPU_UP_PREPARE.
This could be due to a subsystem registered for notification earlier than
the current subsystem crapping out with NOTIFY_BAD. Badness can occur with
in the CPU_UP_CANCELED code path at slab if this happens (The same would
apply for workqueue.c as well).  To overcome this, we might have to use either
a) a per subsystem flag and avoid handling of CPU_UP_CANCELED, or
b) Use a special notifier events like LOCK_ACQUIRE/RELEASE as Gautham was
   using in his experiments, or
c) Do not send CPU_UP_CANCELED to a subsystem which did not receive
   CPU_UP_PREPARE.

I would prefer c).

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/slab.c b/mm/slab.c
index ff60a94..3318252 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -730,7 +730,10 @@ static inline void init_lock_keys(void)
 }
 #endif
 
-/* Guard access to the cache-chain. */
+/*
+ * 1. Guard access to the cache-chain.
+ * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ */
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
@@ -1230,12 +1233,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 			kfree(shared);
 			free_alien_cache(alien);
 		}
-		mutex_unlock(&cache_chain_mutex);
 		break;
 	case CPU_ONLINE:
+		mutex_unlock(&cache_chain_mutex);
 		start_cpu_timer(cpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DOWN_PREPARE:
+		mutex_lock(&cache_chain_mutex);
+		break;
+	case CPU_DOWN_FAILED:
+		mutex_unlock(&cache_chain_mutex);
+		break;
 	case CPU_DEAD:
 		/*
 		 * Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1255,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		 * gets destroyed at kmem_cache_destroy().
 		 */
 		/* fall thru */
+#endif
 	case CPU_UP_CANCELED:
-		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
@@ -1308,11 +1317,9 @@ free_array_cache:
 		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
-#endif
 	}
 	return NOTIFY_OK;
 bad:
-	mutex_unlock(&cache_chain_mutex);
 	return NOTIFY_BAD;
 }
 
@@ -2098,11 +2105,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	}
 
 	/*
-	 * Prevent CPUs from coming and going.
-	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 * We use cache_chain_mutex to ensure a consistent view of
+	 * cpu_online_map as well.  Please see cpuup_callback
 	 */
-	lock_cpu_hotplug();
-
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each_entry(pc, &cache_chain, next) {
@@ -2325,7 +2330,6 @@ oops:
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
-	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2443,6 +2447,7 @@ out:
 	return nr_freed;
 }
 
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
 static int __cache_shrink(struct kmem_cache *cachep)
 {
 	int ret = 0, i = 0;
@@ -2473,9 +2478,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
  */
 int kmem_cache_shrink(struct kmem_cache *cachep)
 {
+	int ret;
 	BUG_ON(!cachep || in_interrupt());
 
-	return __cache_shrink(cachep);
+	mutex_lock(&cache_chain_mutex);
+	ret = __cache_shrink(cachep);
+	mutex_unlock(&cache_chain_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
@@ -2499,23 +2508,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 {
 	BUG_ON(!cachep || in_interrupt());
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
-
 	/* Find the cache in the chain of caches. */
 	mutex_lock(&cache_chain_mutex);
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
 	list_del(&cachep->next);
-	mutex_unlock(&cache_chain_mutex);
-
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
-		mutex_lock(&cache_chain_mutex);
 		list_add(&cachep->next, &cache_chain);
 		mutex_unlock(&cache_chain_mutex);
-		unlock_cpu_hotplug();
 		return;
 	}
 
@@ -2523,7 +2525,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 		synchronize_rcu();
 
 	__kmem_cache_destroy(cachep);
-	unlock_cpu_hotplug();
+	mutex_unlock(&cache_chain_mutex);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
-- 
cgit v0.10.2


From 3395ee0588795b0b3bd889c260e55959cf2b61f5 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Wed, 6 Dec 2006 20:32:16 -0800
Subject: [PATCH] mm: add noaliencache boot option to disable numa alien caches

When using numa=fake on non-NUMA hardware there is no benefit to having the
alien caches, and they consume much memory.

Add a kernel boot option to disable them.

Christoph sayeth "This is good to have even on large NUMA.  The problem is
that the alien caches grow by the square of the size of the system in terms of
nodes."

Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2e1898e..2ddc43e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1012,6 +1012,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			emulation library even if a 387 maths coprocessor
 			is present.
 
+	noaliencache	[MM, NUMA] Disables the allcoation of alien caches in
+			the slab allocator.  Saves per-node memory, but will
+			impact performance on real NUMA hardware.
+
 	noalign		[KNL,ARM]
 
 	noapic		[SMP,APIC] Tells the kernel to not make use of any
diff --git a/mm/slab.c b/mm/slab.c
index 3318252..bfd654c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -869,6 +869,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
 	dump_stack();
 }
 
+/*
+ * By default on NUMA we use alien caches to stage the freeing of
+ * objects allocated from other nodes. This causes massive memory
+ * inefficiencies when using fake NUMA setup to split memory into a
+ * large number of small nodes, so it can be disabled on the command
+ * line
+  */
+
+static int use_alien_caches __read_mostly = 1;
+static int __init noaliencache_setup(char *s)
+{
+	use_alien_caches = 0;
+	return 1;
+}
+__setup("noaliencache", noaliencache_setup);
+
 #ifdef CONFIG_NUMA
 /*
  * Special reaping functions for NUMA systems called from cache_reap().
@@ -1117,7 +1133,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 	 * Make sure we are not freeing a object from another node to the array
 	 * cache on this cpu.
 	 */
-	if (likely(slabp->nodeid == node))
+	if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
 		return 0;
 
 	l3 = cachep->nodelists[node];
@@ -1195,7 +1211,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
-			struct array_cache **alien;
+			struct array_cache **alien = NULL;
 
 			nc = alloc_arraycache(node, cachep->limit,
 						cachep->batchcount);
@@ -1207,9 +1223,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 			if (!shared)
 				goto bad;
 
-			alien = alloc_alien_cache(node, cachep->limit);
-			if (!alien)
-				goto bad;
+			if (use_alien_caches) {
+                                alien = alloc_alien_cache(node, cachep->limit);
+                                if (!alien)
+                                        goto bad;
+                        }
 			cachep->array[cpu] = nc;
 			l3 = cachep->nodelists[node];
 			BUG_ON(!l3);
@@ -3590,13 +3608,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	int node;
 	struct kmem_list3 *l3;
 	struct array_cache *new_shared;
-	struct array_cache **new_alien;
+	struct array_cache **new_alien = NULL;
 
 	for_each_online_node(node) {
 
-		new_alien = alloc_alien_cache(node, cachep->limit);
-		if (!new_alien)
-			goto fail;
+                if (use_alien_caches) {
+                        new_alien = alloc_alien_cache(node, cachep->limit);
+                        if (!new_alien)
+                                goto fail;
+                }
 
 		new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
-- 
cgit v0.10.2


From 6edaf68a87d17570790fd55f0c451a29ec1d6703 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:32:18 -0800
Subject: [PATCH] mm: arch do_page_fault() vs in_atomic()

In light of the recent pagefault and filemap_copy_from_user work I've gone
through all the arch pagefault handlers to make sure the inc_preempt_count()
'feature' works as expected.

Several sections of code (including the new filemap_copy_from_user) rely on
the fact that faults do not take locks under increased preempt count.

arch/x86_64 - good
arch/powerpc - good
arch/cris - fixed
arch/i386 - good
arch/parisc - fixed
arch/sh - good
arch/sparc - good
arch/s390 - good
arch/m68k - fixed
arch/ppc - good
arch/alpha - fixed
arch/mips - good
arch/sparc64 - good
arch/ia64 - good
arch/arm - fixed
arch/um - good
arch/avr32 - good
arch/h8300 - NA
arch/m32r - good
arch/v850 - good
arch/frv - fixed
arch/m68knommu - NA
arch/arm26 - fixed
arch/sh64 - fixed
arch/xtensa - good

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 8871529..8aa9db8 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -108,7 +108,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 
 	/* If we're in an interrupt context, or have no user context,
 	   we must not take the fault.  */
-	if (!mm || in_interrupt())
+	if (!mm || in_atomic())
 		goto no_context;
 
 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 5e658a8..9fd6d2e 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -230,7 +230,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_interrupt() || !mm)
+	if (in_atomic() || !mm)
 		goto no_context;
 
 	/*
diff --git a/arch/arm26/mm/fault.c b/arch/arm26/mm/fault.c
index a1f6d8a..93c0cee 100644
--- a/arch/arm26/mm/fault.c
+++ b/arch/arm26/mm/fault.c
@@ -215,7 +215,7 @@ int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_interrupt() || !mm)
+	if (in_atomic() || !mm)
 		goto no_context;
 
 	down_read(&mm->mmap_sem);
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 934c510..c73e91f 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -232,7 +232,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	 * context, we must not take the fault..
 	 */
 
-	if (in_interrupt() || !mm)
+	if (in_atomic() || !mm)
 		goto no_context;
 
 	down_read(&mm->mmap_sem);
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index 8b3eb50..3f12296 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_interrupt() || !mm)
+	if (in_atomic() || !mm)
 		goto no_context;
 
 	down_read(&mm->mmap_sem);
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 911f2ce..2adbeb1 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -99,7 +99,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_interrupt() || !mm)
+	if (in_atomic() || !mm)
 		goto no_context;
 
 	down_read(&mm->mmap_sem);
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 64785e4..641f9c9 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -152,7 +152,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	const struct exception_table_entry *fix;
 	unsigned long acc_type;
 
-	if (in_interrupt() || !mm)
+	if (in_atomic() || !mm)
 		goto no_context;
 
 	down_read(&mm->mmap_sem);
diff --git a/arch/sh64/mm/fault.c b/arch/sh64/mm/fault.c
index 8e2f6c2..4f72ab3 100644
--- a/arch/sh64/mm/fault.c
+++ b/arch/sh64/mm/fault.c
@@ -154,7 +154,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_interrupt() || !mm)
+	if (in_atomic() || !mm)
 		goto no_context;
 
 	/* TLB misses upon some cache flushes get done under cli() */
-- 
cgit v0.10.2


From a866374aecc90c7d90619727ccd851ac096b2fc7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:32:20 -0800
Subject: [PATCH] mm: pagefault_{disable,enable}()

Introduce pagefault_{disable,enable}() and use these where previously we did
manual preempt increments/decrements to make the pagefault handler do the
atomic thing.

Currently they still rely on the increased preempt count, but do not rely on
the disabled preemption, this might go away in the future.

(NOTE: the extra barrier() in pagefault_disable might fix some holes on
       machines which have too many registers for their own good)

[heiko.carstens@de.ibm.com: s390 fix]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <npiggin@suse.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index eae874a..53dc5ed 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -200,7 +200,7 @@ int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -223,7 +223,7 @@ int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 		break;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index f9f647c..178bbfe 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -32,7 +32,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
 	unsigned long vaddr;
 
 	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
-	inc_preempt_count();
+	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
 
@@ -52,8 +52,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 
 #ifdef CONFIG_DEBUG_HIGHMEM
 	if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
-		dec_preempt_count();
-		preempt_check_resched();
+		pagefault_enable();
 		return;
 	}
 
@@ -68,8 +67,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	 */
 	kpte_clear_flush(kmap_pte-idx, vaddr);
 
-	dec_preempt_count();
-	preempt_check_resched();
+	pagefault_enable();
 }
 
 /* This is the same as kmap_atomic() but can map memory that doesn't
@@ -80,7 +78,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index 99ebf3c..675502a 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -39,7 +39,7 @@ void *__kmap_atomic(struct page *page, enum km_type type)
 	unsigned long vaddr;
 
 	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
-	inc_preempt_count();
+	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
 
@@ -62,8 +62,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type)
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
 	if (vaddr < FIXADDR_START) { // FIXME
-		dec_preempt_count();
-		preempt_check_resched();
+		pagefault_enable();
 		return;
 	}
 
@@ -78,8 +77,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type)
 	local_flush_tlb_one(vaddr);
 #endif
 
-	dec_preempt_count();
-	preempt_check_resched();
+	pagefault_enable();
 }
 
 #ifndef CONFIG_LIMITED_DMA
@@ -92,7 +90,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
diff --git a/arch/s390/lib/uaccess_std.c b/arch/s390/lib/uaccess_std.c
index 2d549ed..bbaca66 100644
--- a/arch/s390/lib/uaccess_std.c
+++ b/arch/s390/lib/uaccess_std.c
@@ -11,7 +11,7 @@
 
 #include <linux/errno.h>
 #include <linux/mm.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/futex.h>
 
 #ifndef __s390x__
@@ -258,7 +258,7 @@ int futex_atomic_op(int op, int __user *uaddr, int oparg, int *old)
 {
 	int oldval = 0, newval, ret;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -284,7 +284,7 @@ int futex_atomic_op(int op, int __user *uaddr, int oparg, int *old)
 	default:
 		ret = -ENOSYS;
 	}
-	dec_preempt_count();
+	pagefault_enable();
 	*old = oldval;
 	return ret;
 }
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 4d8ed9c..01fc6c2 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -35,7 +35,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
 	unsigned long vaddr;
 
 	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
-	inc_preempt_count();
+	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
 
@@ -70,8 +70,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	unsigned long idx = type + KM_TYPE_NR*smp_processor_id();
 
 	if (vaddr < FIXADDR_START) { // FIXME
-		dec_preempt_count();
-		preempt_check_resched();
+		pagefault_enable();
 		return;
 	}
 
@@ -97,8 +96,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 #endif
 #endif
 
-	dec_preempt_count();
-	preempt_check_resched();
+	pagefault_enable();
 }
 
 /* We may be fed a pagetable here by ptep_to_xxx and others. */
diff --git a/include/asm-frv/highmem.h b/include/asm-frv/highmem.h
index 0f390f4..ff4d6cd 100644
--- a/include/asm-frv/highmem.h
+++ b/include/asm-frv/highmem.h
@@ -115,7 +115,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type type)
 {
 	unsigned long paddr;
 
-	inc_preempt_count();
+	pagefault_disable();
 	paddr = page_to_phys(page);
 
 	switch (type) {
@@ -170,8 +170,7 @@ static inline void kunmap_atomic(void *kvaddr, enum km_type type)
 	default:
 		BUG();
 	}
-	dec_preempt_count();
-	preempt_check_resched();
+	pagefault_enable();
 }
 
 #endif /* !__ASSEMBLY__ */
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index df893c1..f422df0 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -21,7 +21,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -33,7 +33,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/asm-i386/futex.h b/include/asm-i386/futex.h
index 946d97c..438ef0e 100644
--- a/include/asm-i386/futex.h
+++ b/include/asm-i386/futex.h
@@ -56,7 +56,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	if (op == FUTEX_OP_SET)
 		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
@@ -88,7 +88,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 		}
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/asm-ia64/futex.h b/include/asm-ia64/futex.h
index 07d77f3..8a98a26 100644
--- a/include/asm-ia64/futex.h
+++ b/include/asm-ia64/futex.h
@@ -59,7 +59,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -83,7 +83,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/asm-mips/futex.h b/include/asm-mips/futex.h
index 927a216..47e5679 100644
--- a/include/asm-mips/futex.h
+++ b/include/asm-mips/futex.h
@@ -88,7 +88,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -115,7 +115,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/asm-parisc/futex.h b/include/asm-parisc/futex.h
index d84bbb2..dbee6e6 100644
--- a/include/asm-parisc/futex.h
+++ b/include/asm-parisc/futex.h
@@ -21,7 +21,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -33,7 +33,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/asm-powerpc/futex.h b/include/asm-powerpc/futex.h
index 936422e..3f3673f 100644
--- a/include/asm-powerpc/futex.h
+++ b/include/asm-powerpc/futex.h
@@ -43,7 +43,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -65,7 +65,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/asm-ppc/highmem.h b/include/asm-ppc/highmem.h
index 1d2c4ef..f7b21ee 100644
--- a/include/asm-ppc/highmem.h
+++ b/include/asm-ppc/highmem.h
@@ -79,7 +79,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type type)
 	unsigned long vaddr;
 
 	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
-	inc_preempt_count();
+	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
 
@@ -101,8 +101,7 @@ static inline void kunmap_atomic(void *kvaddr, enum km_type type)
 	unsigned int idx = type + KM_TYPE_NR*smp_processor_id();
 
 	if (vaddr < KMAP_FIX_BEGIN) { // FIXME
-		dec_preempt_count();
-		preempt_check_resched();
+		pagefault_enable();
 		return;
 	}
 
@@ -115,8 +114,7 @@ static inline void kunmap_atomic(void *kvaddr, enum km_type type)
 	pte_clear(&init_mm, vaddr, kmap_pte+idx);
 	flush_tlb_page(NULL, vaddr);
 #endif
-	dec_preempt_count();
-	preempt_check_resched();
+	pagefault_enable();
 }
 
 static inline struct page *kmap_atomic_to_page(void *ptr)
diff --git a/include/asm-sparc64/futex.h b/include/asm-sparc64/futex.h
index 7392fc4..876312f 100644
--- a/include/asm-sparc64/futex.h
+++ b/include/asm-sparc64/futex.h
@@ -45,7 +45,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -67,7 +67,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/asm-x86_64/futex.h b/include/asm-x86_64/futex.h
index 9804bf0..5cdfb08 100644
--- a/include/asm-x86_64/futex.h
+++ b/include/asm-x86_64/futex.h
@@ -55,7 +55,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	inc_preempt_count();
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -78,7 +78,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index a48d7f1..67918c2 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -1,8 +1,43 @@
 #ifndef __LINUX_UACCESS_H__
 #define __LINUX_UACCESS_H__
 
+#include <linux/preempt.h>
 #include <asm/uaccess.h>
 
+/*
+ * These routines enable/disable the pagefault handler in that
+ * it will not take any locks and go straight to the fixup table.
+ *
+ * They have great resemblance to the preempt_disable/enable calls
+ * and in fact they are identical; this is because currently there is
+ * no other way to make the pagefault handlers do this. So we do
+ * disable preemption but we don't necessarily care about that.
+ */
+static inline void pagefault_disable(void)
+{
+	inc_preempt_count();
+	/*
+	 * make sure to have issued the store before a pagefault
+	 * can hit.
+	 */
+	barrier();
+}
+
+static inline void pagefault_enable(void)
+{
+	/*
+	 * make sure to issue those last loads/stores before enabling
+	 * the pagefault handler again.
+	 */
+	barrier();
+	dec_preempt_count();
+	/*
+	 * make sure we do..
+	 */
+	barrier();
+	preempt_check_resched();
+}
+
 #ifndef ARCH_HAS_NOCACHE_UACCESS
 
 static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
@@ -35,9 +70,9 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 	({						\
 		long ret;				\
 							\
-		inc_preempt_count();			\
+		pagefault_disable();			\
 		ret = __get_user(retval, addr);		\
-		dec_preempt_count();			\
+		pagefault_enable();			\
 		ret;					\
 	})
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 93ef30b..af7b81c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
 
-	inc_preempt_count();
+	pagefault_disable();
 	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
-	dec_preempt_count();
+	pagefault_enable();
 
 	return ret ? -EFAULT : 0;
 }
@@ -585,9 +585,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (!(uval & FUTEX_OWNER_DIED)) {
 		newval = FUTEX_WAITERS | new_owner->pid;
 
-		inc_preempt_count();
+		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-		dec_preempt_count();
+		pagefault_enable();
 		if (curval == -EFAULT)
 			return -EFAULT;
 		if (curval != uval)
@@ -618,9 +618,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 	 * There is no waiter, so we unlock the futex. The owner died
 	 * bit has not to be preserved here. We are the owner:
 	 */
-	inc_preempt_count();
+	pagefault_disable();
 	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (oldval == -EFAULT)
 		return oldval;
@@ -1158,9 +1158,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 */
 	newval = current->pid;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1183,9 +1183,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	uval = curval;
 	newval = uval | FUTEX_WAITERS;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1215,10 +1215,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 			newval = current->pid |
 				FUTEX_OWNER_DIED | FUTEX_WAITERS;
 
-			inc_preempt_count();
+			pagefault_disable();
 			curval = futex_atomic_cmpxchg_inatomic(uaddr,
 							       uval, newval);
-			dec_preempt_count();
+			pagefault_enable();
 
 			if (unlikely(curval == -EFAULT))
 				goto uaddr_faulted;
@@ -1390,9 +1390,9 @@ retry_locked:
 	 * anyone else up:
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
-		inc_preempt_count();
+		pagefault_disable();
 		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-		dec_preempt_count();
+		pagefault_enable();
 	}
 
 	if (unlikely(uval == -EFAULT))
-- 
cgit v0.10.2


From ad76fb6b5a5183255279e0ab5260715481770678 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:32:21 -0800
Subject: [PATCH] mm: k{,um}map_atomic() vs in_atomic()

Make kmap_atomic/kunmap_atomic denote a pagefault disabled scope.  All non
trivial implementations already do this anyway.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-mips/highmem.h b/include/asm-mips/highmem.h
index c976bfa..f8c8182 100644
--- a/include/asm-mips/highmem.h
+++ b/include/asm-mips/highmem.h
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/uaccess.h>
 #include <asm/kmap_types.h>
 
 /* undef for production */
@@ -70,11 +71,16 @@ static inline void *kmap(struct page *page)
 
 static inline void *kmap_atomic(struct page *page, enum km_type type)
 {
+	pagefault_disable();
 	return page_address(page);
 }
 
-static inline void kunmap_atomic(void *kvaddr, enum km_type type) { }
-#define kmap_atomic_pfn(pfn, idx)	page_address(pfn_to_page(pfn))
+static inline void kunmap_atomic(void *kvaddr, enum km_type type)
+{
+	pagefault_enable();
+}
+
+#define kmap_atomic_pfn(pfn, idx) kmap_atomic(pfn_to_page(pfn), (idx))
 
 #define kmap_atomic_to_page(ptr) virt_to_page(ptr)
 
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index fd7d12d..3d8768b 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -3,6 +3,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/uaccess.h>
 
 #include <asm/cacheflush.h>
 
@@ -41,9 +42,10 @@ static inline void *kmap(struct page *page)
 
 #define kunmap(page) do { (void) (page); } while (0)
 
-#define kmap_atomic(page, idx)		page_address(page)
-#define kunmap_atomic(addr, idx)	do { } while (0)
-#define kmap_atomic_pfn(pfn, idx)	page_address(pfn_to_page(pfn))
+#define kmap_atomic(page, idx) \
+	({ pagefault_disable(); page_address(page); })
+#define kunmap_atomic(addr, idx)	do { pagefault_enable(); } while (0)
+#define kmap_atomic_pfn(pfn, idx)	kmap_atomic(pfn_to_page(pfn), (idx))
 #define kmap_atomic_to_page(ptr)	virt_to_page(ptr)
 #endif
 
-- 
cgit v0.10.2


From 3b17979bda74493633364c2c263b452b7788e350 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 6 Dec 2006 20:32:22 -0800
Subject: [PATCH] Fix kunmap_atomic's use of kpte_clear_flush()

kunmap_atomic() will call kpte_clear_flush with vaddr/ptep arguments which
don't correspond if the vaddr is just a normal lowmem address (ie, not in
the KMAP area).  This patch makes sure that the pte is only cleared if kmap
area was actually used for the mapping.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index 178bbfe..e0fa6cb 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -50,22 +50,20 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
-#ifdef CONFIG_DEBUG_HIGHMEM
-	if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
-		pagefault_enable();
-		return;
-	}
-
-	if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
-		BUG();
-#endif
 	/*
 	 * Force other mappings to Oops if they'll try to access this pte
 	 * without first remap it.  Keeping stale mappings around is a bad idea
 	 * also, in case the page changes cacheability attributes or becomes
 	 * a protected page in a hypervisor.
 	 */
-	kpte_clear_flush(kmap_pte-idx, vaddr);
+	if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
+		kpte_clear_flush(kmap_pte-idx, vaddr);
+	else {
+#ifdef CONFIG_DEBUG_HIGHMEM
+		BUG_ON(vaddr < PAGE_OFFSET);
+		BUG_ON(vaddr >= (unsigned long)high_memory);
+#endif
+	}
 
 	pagefault_enable();
 }
-- 
cgit v0.10.2


From 8fb4fc68ca391862b061b3d358a288ccf6abed39 Mon Sep 17 00:00:00 2001
From: Guillem Jover <guillem.jover@nokia.com>
Date: Wed, 6 Dec 2006 20:32:24 -0800
Subject: [PATCH] Allow user processes to raise their oom_adj value

Currently a user process cannot rise its own oom_adj value (i.e.
unprotecting itself from the OOM killer).  As this value is stored in the
task structure it gets inherited and the unprivileged childs will be unable
to rise it.

The EPERM will be handled by the generic proc fs layer, as only processes
with the proper caps or the owner of the process will be able to write to
the file.  So we allow only the processes with CAP_SYS_RESOURCE to lower
the value, otherwise it will get an EACCES which seems more appropriate
than EPERM.

Signed-off-by: Guillem Jover <guillem.jover@nokia.com>
Acked-by: Andrea Arcangeli <andrea@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 795319c..05ace70 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -683,8 +683,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	char buffer[PROC_NUMBUF], *end;
 	int oom_adjust;
 
-	if (!capable(CAP_SYS_RESOURCE))
-		return -EPERM;
 	memset(buffer, 0, sizeof(buffer));
 	if (count > sizeof(buffer) - 1)
 		count = sizeof(buffer) - 1;
@@ -699,6 +697,10 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	task = get_proc_task(file->f_dentry->d_inode);
 	if (!task)
 		return -ESRCH;
+	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
+		put_task_struct(task);
+		return -EACCES;
+	}
 	task->oomkilladj = oom_adjust;
 	put_task_struct(task);
 	if (end - buffer == 0)
-- 
cgit v0.10.2


From a3eea484f7a1aadb70ed6665338026a09ad6ce85 Mon Sep 17 00:00:00 2001
From: Rik Bobbaers <Rik.Bobbaers@cc.kuleuven.be>
Date: Wed, 6 Dec 2006 20:32:25 -0800
Subject: [PATCH] mlock cleanup

mm is defined as vma->vm_mm, so use that.

Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/mlock.c b/mm/mlock.c
index b90c595..3446b7e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success:
 			ret = make_pages_present(start, end);
 	}
 
-	vma->vm_mm->locked_vm -= pages;
+	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
 		ret = -EAGAIN;
-- 
cgit v0.10.2


From b43a57bb4dae72e8f7232e7c821a8799eda30022 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Wed, 6 Dec 2006 20:32:27 -0800
Subject: [PATCH] OOM can panic due to processes stuck in __alloc_pages()

OOM can panic due to the processes stuck in __alloc_pages() doing infinite
rebalance loop while no memory can be reclaimed.  OOM killer tries to kill
some processes, but unfortunetaly, rebalance label was moved by someone
below the TIF_MEMDIE check, so buddy allocator doesn't see that process is
OOM-killed and it can simply fail the allocation :/

Observed in reality on RHEL4(2.6.9)+OpenVZ kernel when a user doing some
memory allocation tricks triggered OOM panic.

Signed-off-by: Denis Lunev <den@sw.ru>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cd47e8f..a840e70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1180,6 +1180,7 @@ restart:
 
 	/* This allocation should allow future memory freeing. */
 
+rebalance:
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1201,7 +1202,6 @@ nofail_alloc:
 	if (!wait)
 		goto nopage;
 
-rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
-- 
cgit v0.10.2


From 881e4aabe4c6b3fce93674b67c6adc646fe2683b Mon Sep 17 00:00:00 2001
From: Suleiman Souhlal <ssouhlal@FreeBSD.org>
Date: Wed, 6 Dec 2006 20:32:28 -0800
Subject: [PATCH] Always print out the header line in /proc/swaps

It would be possible for /proc/swaps to not always print out the header:

swapon /dev/hdc2
swapon /dev/hde2
swapoff /dev/hdc2

At this point /proc/swaps would not have a header.

Signed-off-by: Suleiman Souhlal <suleiman@google.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/swapfile.c b/mm/swapfile.c
index a15def6..8e206ce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1274,10 +1274,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 
 	mutex_lock(&swapon_mutex);
 
+	if (!l)
+		return SEQ_START_TOKEN;
+
 	for (i = 0; i < nr_swapfiles; i++, ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
 			continue;
-		if (!l--)
+		if (!--l)
 			return ptr;
 	}
 
@@ -1286,10 +1289,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 
 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 {
-	struct swap_info_struct *ptr = v;
+	struct swap_info_struct *ptr;
 	struct swap_info_struct *endptr = swap_info + nr_swapfiles;
 
-	for (++ptr; ptr < endptr; ptr++) {
+	if (v == SEQ_START_TOKEN)
+		ptr = swap_info;
+	else {
+		ptr = v;
+		ptr++;
+	}
+
+	for (; ptr < endptr; ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
 			continue;
 		++*pos;
@@ -1310,8 +1320,10 @@ static int swap_show(struct seq_file *swap, void *v)
 	struct file *file;
 	int len;
 
-	if (v == swap_info)
-		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+	if (ptr == SEQ_START_TOKEN) {
+		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+		return 0;
+	}
 
 	file = ptr->swap_file;
 	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
-- 
cgit v0.10.2


From 8b98c1699eba23cfd2e8b366625c50ff5fd1415b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Dec 2006 20:32:30 -0800
Subject: [PATCH] leak tracking for kmalloc_node

We have variants of kmalloc and kmem_cache_alloc that leave leak tracking to
the caller.  This is used for subsystem-specific allocators like skb_alloc.

To make skb_alloc node-aware we need similar routines for the node-aware slab
allocator, which this patch adds.

Note that the code is rather ugly, but it mirrors the non-node-aware code 1:1:

[akpm@osdl.org: add module export]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/slab.h b/include/linux/slab.h
index c4947b8..66c4640 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -236,7 +236,25 @@ found:
 	}
 	return __kmalloc_node(size, flags, node);
 }
+
+/*
+ * kmalloc_node_track_caller is a special version of kmalloc_node that
+ * records the calling function of the routine calling it for slab leak
+ * tracking instead of just the calling function (confusing, eh?).
+ * It's useful when the call to kmalloc_node comes from a widely-used
+ * standard allocator where we care about the real place the memory
+ * allocation request comes from.
+ */
+#ifndef CONFIG_DEBUG_SLAB
+#define kmalloc_node_track_caller(size, flags, node) \
+	__kmalloc_node(size, flags, node)
 #else
+extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
+#define kmalloc_node_track_caller(size, flags, node) \
+	__kmalloc_node_track_caller(size, flags, node, \
+			__builtin_return_address(0))
+#endif
+#else /* CONFIG_NUMA */
 static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int node)
 {
 	return kmem_cache_alloc(cachep, flags);
@@ -245,6 +263,9 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return kmalloc(size, flags);
 }
+
+#define kmalloc_node_track_caller(size, flags, node) \
+	kmalloc_track_caller(size, flags)
 #endif
 
 extern int FASTCALL(kmem_cache_reap(int));
@@ -283,6 +304,8 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 #define kzalloc(s, f) __kzalloc(s, f)
 #define kmalloc_track_caller kmalloc
 
+#define kmalloc_node_track_caller kmalloc_node
+
 #endif /* CONFIG_SLOB */
 
 /* System wide caches */
diff --git a/mm/slab.c b/mm/slab.c
index bfd654c..8f3f61c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1015,7 +1015,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
 	return NULL;
 }
 
-static inline void *__cache_alloc_node(struct kmem_cache *cachep,
+static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 		 gfp_t flags, int nodeid)
 {
 	return NULL;
@@ -1023,7 +1023,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
 
 #else	/* CONFIG_NUMA */
 
-static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -3130,10 +3130,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 		objp = ____cache_alloc(cachep, flags);
 	/*
 	 * We may just have run out of memory on the local node.
-	 * __cache_alloc_node() knows how to locate memory on other nodes
+	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
  	if (NUMA_BUILD && !objp)
- 		objp = __cache_alloc_node(cachep, flags, numa_node_id());
+ 		objp = ____cache_alloc_node(cachep, flags, numa_node_id());
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
 					    caller);
@@ -3160,7 +3160,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 	else if (current->mempolicy)
 		nid_alloc = slab_node(current->mempolicy);
 	if (nid_alloc != nid_here)
-		return __cache_alloc_node(cachep, flags, nid_alloc);
+		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
 }
 
@@ -3183,7 +3183,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 		if (zone_idx(*z) <= ZONE_NORMAL &&
 				cpuset_zone_allowed(*z, flags) &&
 				cache->nodelists[nid])
-			obj = __cache_alloc_node(cache,
+			obj = ____cache_alloc_node(cache,
 					flags | __GFP_THISNODE, nid);
 	}
 	return obj;
@@ -3192,7 +3192,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 /*
  * A interface to enable slab creation on nodeid
  */
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 				int nodeid)
 {
 	struct list_head *entry;
@@ -3465,7 +3465,9 @@ out:
  * New and improved: it will now make sure that the object gets
  * put on the correct node list so that there is no false sharing.
  */
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static __always_inline void *
+__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+		int nodeid, void *caller)
 {
 	unsigned long save_flags;
 	void *ptr;
@@ -3477,17 +3479,23 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 			!cachep->nodelists[nodeid])
 		ptr = ____cache_alloc(cachep, flags);
 	else
-		ptr = __cache_alloc_node(cachep, flags, nodeid);
+		ptr = ____cache_alloc_node(cachep, flags, nodeid);
 	local_irq_restore(save_flags);
 
-	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
-					   __builtin_return_address(0));
+	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
 
 	return ptr;
 }
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+	return __cache_alloc_node(cachep, flags, nodeid,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 {
 	struct kmem_cache *cachep;
 
@@ -3496,8 +3504,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 		return NULL;
 	return kmem_cache_alloc_node(cachep, flags, node);
 }
+
+#ifdef CONFIG_DEBUG_SLAB
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(__kmalloc_node);
-#endif
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+		int node, void *caller)
+{
+	return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node, NULL);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
 
 /**
  * __do_kmalloc - allocate memory
-- 
cgit v0.10.2


From 873481367edb18a7d0d7e5a285e6728c16bb44a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Dec 2006 20:32:33 -0800
Subject: [PATCH] add numa node information to struct device

For node-aware skb allocations we need information about the node in struct
net_device or struct device.  Davem suggested to put it into struct device
which this patch does.

In particular:

 - struct device gets a new int numa_node member if CONFIG_NUMA is set
 - there are two new helpers, dev_to_node and set_dev_node to
   transparently deal with the non-numa case
 - for pci devices the node-info is set to the value we get from
   pcibus_to_node.

Note that for some architectures pcibus_to_node doesn't work yet at the time
we call it currently.  This is harmless and will just mean skb allocations
aren't node-local on this architectures until the implementation of
pcibus_to_node on these architectures have been updated (There are patches for
x86 and x86_64 floating around)

[akpm@osdl.org: cleanup]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/base/core.c b/drivers/base/core.c
index e4b530e..67b79a7 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -386,6 +386,7 @@ void device_initialize(struct device *dev)
 	INIT_LIST_HEAD(&dev->node);
 	init_MUTEX(&dev->sem);
 	device_init_wakeup(dev, 0);
+	set_dev_node(dev, -1);
 }
 
 #ifdef CONFIG_SYSFS_DEPRECATED
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0eeac60..6a3c1e7 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -873,6 +873,7 @@ void __devinit pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 	dev->dev.release = pci_release_dev;
 	pci_dev_get(dev);
 
+	set_dev_node(&dev->dev, pcibus_to_node(bus));
 	dev->dev.dma_mask = &dev->dma_mask;
 	dev->dev.coherent_dma_mask = 0xffffffffull;
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 583a341..49ab53c 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -371,6 +371,9 @@ struct device {
 					   core doesn't touch it */
 	struct dev_pm_info	power;
 
+#ifdef CONFIG_NUMA
+	int		numa_node;	/* NUMA node this device is close to */
+#endif
 	u64		*dma_mask;	/* dma mask (if dma'able device) */
 	u64		coherent_dma_mask;/* Like dma_mask, but for
 					     alloc_coherent mappings as
@@ -394,6 +397,25 @@ struct device {
 	void	(*release)(struct device * dev);
 };
 
+#ifdef CONFIG_NUMA
+static inline int dev_to_node(struct device *dev)
+{
+	return dev->numa_node;
+}
+static inline void set_dev_node(struct device *dev, int node)
+{
+	dev->numa_node = node;
+}
+#else
+static inline int dev_to_node(struct device *dev)
+{
+	return -1;
+}
+static inline void set_dev_node(struct device *dev, int node)
+{
+}
+#endif
+
 static inline void *
 dev_get_drvdata (struct device *dev)
 {
-- 
cgit v0.10.2


From b30973f877fea1a3fb84e05599890fcc082a88e5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Dec 2006 20:32:36 -0800
Subject: [PATCH] node-aware skb allocation

Node-aware allocation of skbs for the receive path.

Details:

  - __alloc_skb gets a new node argument and cals the node-aware
    slab functions with it.
  - netdev_alloc_skb passed the node number it gets from dev_to_node
    to it, everyone else passes -1 (any node)

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a05a5f7..1d649f3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -332,17 +332,17 @@ struct sk_buff {
 extern void kfree_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone);
+				   gfp_t priority, int fclone, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
-	return __alloc_skb(size, priority, 0);
+	return __alloc_skb(size, priority, 0, -1);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1);
+	return __alloc_skb(size, priority, 1, -1);
 }
 
 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8e1c385..7217fb8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -132,6 +132,7 @@ EXPORT_SYMBOL(skb_truesize_bug);
  *	@gfp_mask: allocation mask
  *	@fclone: allocate from fclone cache instead of head cache
  *		and allocate a cloned (child) skb
+ *	@node: numa node to allocate memory on
  *
  *	Allocate a new &sk_buff. The returned buffer has no headroom and a
  *	tail room of size bytes. The object has a reference count of one.
@@ -141,7 +142,7 @@ EXPORT_SYMBOL(skb_truesize_bug);
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone)
+			    int fclone, int node)
 {
 	kmem_cache_t *cache;
 	struct skb_shared_info *shinfo;
@@ -151,14 +152,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
 	/* Get the HEAD */
-	skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
+	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 	if (!skb)
 		goto out;
 
 	/* Get the DATA. Size must match skb_add_mtu(). */
 	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask);
+	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
+			gfp_mask, node);
 	if (!data)
 		goto nodata;
 
@@ -267,9 +268,10 @@ nodata:
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 		unsigned int length, gfp_t gfp_mask)
 {
+	int node = dev->class_dev.dev ? dev_to_node(dev->class_dev.dev) : -1;
 	struct sk_buff *skb;
 
-	skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+ 	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
-- 
cgit v0.10.2


From a120586873d3d64de93bd6d593d237e131994e58 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 6 Dec 2006 20:32:37 -0800
Subject: [PATCH] Allow NULL pointers in percpu_free

The patch (as824b) makes percpu_free() ignore NULL arguments, as one would
expect for a deallocation routine.  (Note that free_percpu is #defined as
percpu_free in include/linux/percpu.h.) A few callers are updated to remove
now-unneeded tests for NULL.  A few other callers already seem to assume
that passing a NULL pointer to percpu_free() is okay!

The patch also removes an unnecessary NULL check in percpu_depopulate().

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c
index 4664b55..12e937c 100644
--- a/arch/i386/kernel/acpi/cstate.c
+++ b/arch/i386/kernel/acpi/cstate.c
@@ -156,10 +156,8 @@ static int __init ffh_cstate_init(void)
 
 static void __exit ffh_cstate_exit(void)
 {
-	if (cpu_cstate_entry) {
-		free_percpu(cpu_cstate_entry);
-		cpu_cstate_entry = NULL;
-	}
+	free_percpu(cpu_cstate_entry);
+	cpu_cstate_entry = NULL;
 }
 
 arch_initcall(ffh_cstate_init);
diff --git a/block/blktrace.c b/block/blktrace.c
index 74e02c0..d3679dd 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -394,8 +394,7 @@ err:
 	if (bt) {
 		if (bt->dropped_file)
 			debugfs_remove(bt->dropped_file);
-		if (bt->sequence)
-			free_percpu(bt->sequence);
+		free_percpu(bt->sequence);
 		if (bt->rchan)
 			relay_close(bt->rchan);
 		kfree(bt);
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abe..b2486cf 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@
 void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	if (pdata->ptrs[cpu]) {
-		kfree(pdata->ptrs[cpu]);
-		pdata->ptrs[cpu] = NULL;
-	}
+
+	kfree(pdata->ptrs[cpu]);
+	pdata->ptrs[cpu] = NULL;
 }
 EXPORT_SYMBOL_GPL(percpu_depopulate);
 
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
  */
 void percpu_free(void *__pdata)
 {
+	if (unlikely(!__pdata))
+		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 87c8f54..e5cd83b 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -720,10 +720,8 @@ snmp6_mib_free(void *ptr[2])
 {
 	if (ptr == NULL)
 		return;
-	if (ptr[0])
-		free_percpu(ptr[0]);
-	if (ptr[1])
-		free_percpu(ptr[1]);
+	free_percpu(ptr[0]);
+	free_percpu(ptr[1]);
 	ptr[0] = ptr[1] = NULL;
 }
 
-- 
cgit v0.10.2


From 7c309a64d6afa90a0a07813c836ba480aeaeca8c Mon Sep 17 00:00:00 2001
From: Christian Krafft <krafft@de.ibm.com>
Date: Wed, 6 Dec 2006 20:32:41 -0800
Subject: [PATCH] enable booting a NUMA system where some nodes have no memory

When booting a NUMA system with nodes that have no memory (eg by limiting
memory), bootmem_alloc_core tried to find pages in an uninitialized
bootmem_map.  This caused a null pointer access.  This fix adds a check, so
that NULL is returned.  That will enable the caller (bootmem_alloc_nopanic)
to alloc memory on other without a panic.

Signed-off-by: Christian Krafft <krafft@de.ibm.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Martin Bligh <mbligh@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112f..9425342 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -196,6 +196,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	if (limit && bdata->node_boot_start >= limit)
 		return NULL;
 
+	/* on nodes without memory - bootmem_map is NULL */
+	if (!bdata->node_bootmem_map)
+		return NULL;
+
 	end_pfn = bdata->node_low_pfn;
 	limit = PFN_DOWN(limit);
 	if (limit && end_pfn > limit)
-- 
cgit v0.10.2


From e30500557eca09ddd340806ce44abf84d9115ab1 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:32:43 -0800
Subject: [PATCH] make mm/thrash.c:global_faults static

This patch makes the needlessly global "global_faults" static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/thrash.c b/mm/thrash.c
index 19e428c..9ef9071 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -24,7 +24,7 @@
 
 static DEFINE_SPINLOCK(swap_token_lock);
 struct mm_struct *swap_token_mm;
-unsigned int global_faults;
+static unsigned int global_faults;
 
 void grab_swap_token(void)
 {
-- 
cgit v0.10.2


From 54cc211ce3fc73a9d21c6316886db0676beaca95 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:45 -0800
Subject: [PATCH] Remove bio_cachep from slab.h

Remove bio_cachep from slab.h - it no longer exists.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 66c4640..6b7d096 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -315,7 +315,6 @@ extern kmem_cache_t	*files_cachep;
 extern kmem_cache_t	*filp_cachep;
 extern kmem_cache_t	*fs_cachep;
 extern kmem_cache_t	*sighand_cachep;
-extern kmem_cache_t	*bio_cachep;
 
 #endif	/* __KERNEL__ */
 
-- 
cgit v0.10.2


From 298ec1e2ac85cecce3eddd167286359358c44d5d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:47 -0800
Subject: [PATCH] Move sighand_cachep to include/signal.h

Move sighand_cachep definitioni to linux/signal.h

The sighand cache is only used in fs/exec.c and kernel/fork.c.  It is defined
in kernel/fork.c but only used in fs/exec.c.

The sighand_cachep is related to signal processing.  So add the definition to
signal.h.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 117135e..1474905 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -241,6 +241,8 @@ extern int sigprocmask(int, sigset_t *, sigset_t *);
 struct pt_regs;
 extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
 
+extern struct kmem_cache *sighand_cachep;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SIGNAL_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6b7d096..467b297 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -314,7 +314,6 @@ extern kmem_cache_t	*names_cachep;
 extern kmem_cache_t	*files_cachep;
 extern kmem_cache_t	*filp_cachep;
 extern kmem_cache_t	*fs_cachep;
-extern kmem_cache_t	*sighand_cachep;
 
 #endif	/* __KERNEL__ */
 
-- 
cgit v0.10.2


From c43692e85f306667545b91194c748a6e46c1f8b4 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:48 -0800
Subject: [PATCH] Move vm_area_cachep to include/mm.h

vm_area_cachep is used to store vm_area_structs. So move to mm.h.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ab6e497..8403037 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -114,6 +114,8 @@ struct vm_area_struct {
 #endif
 };
 
+extern struct kmem_cache *vm_area_cachep;
+
 /*
  * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is
  * disabled, then there's a single shared list of VMAs maintained by the
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 467b297..a30a402 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -309,7 +309,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 #endif /* CONFIG_SLOB */
 
 /* System wide caches */
-extern kmem_cache_t	*vm_area_cachep;
 extern kmem_cache_t	*names_cachep;
 extern kmem_cache_t	*files_cachep;
 extern kmem_cache_t	*filp_cachep;
-- 
cgit v0.10.2


From 5d6538fcf231faccb2ac42f92851d259d00e62f9 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:50 -0800
Subject: [PATCH] Move files_cachep to include/file.h

Proper place is in file.h since files_cachep uses are rated to file I/O.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/file.h b/include/linux/file.h
index 74183e6..be37423 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -114,4 +114,6 @@ struct files_struct *get_files_struct(struct task_struct *);
 void FASTCALL(put_files_struct(struct files_struct *fs));
 void reset_files_struct(struct task_struct *, struct files_struct *);
 
+extern struct kmem_cache *files_cachep;
+
 #endif /* __LINUX_FILE_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a30a402..dd9efec 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -310,7 +310,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 
 /* System wide caches */
 extern kmem_cache_t	*names_cachep;
-extern kmem_cache_t	*files_cachep;
 extern kmem_cache_t	*filp_cachep;
 extern kmem_cache_t	*fs_cachep;
 
-- 
cgit v0.10.2


From 8b7d91eb7f6a3e8f0caaa613937bda5ab7dc7dc2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:52 -0800
Subject: [PATCH] Move filep_cachep to include/file.h

filp_cachep is only used in fs/file_table.c and in fs/dcache.c where
it is defined.

Move it to related definitions in linux/file.h.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/file.h b/include/linux/file.h
index be37423..6e77b91 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -64,6 +64,8 @@ struct files_struct {
 
 #define files_fdtable(files) (rcu_dereference((files)->fdt))
 
+extern struct kmem_cache *filp_cachep;
+
 extern void FASTCALL(__fput(struct file *));
 extern void FASTCALL(fput(struct file *));
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index dd9efec..f9202d6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -310,7 +310,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 
 /* System wide caches */
 extern kmem_cache_t	*names_cachep;
-extern kmem_cache_t	*filp_cachep;
 extern kmem_cache_t	*fs_cachep;
 
 #endif	/* __KERNEL__ */
-- 
cgit v0.10.2


From aa362a83e78d2e9320da588805cf2a0b53356bc3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:54 -0800
Subject: [PATCH] Move fs_cachep to linux/fs_struct.h

fs_cachep is only used in kernel/exit.c and in kernel/fork.c.

It is used to store fs_struct items so it should be placed in linux/fs_struct.h

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index c623d12..11a36ce 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -18,6 +18,8 @@ struct fs_struct {
 	.umask		= 0022, \
 }
 
+extern struct kmem_cache *fs_cachep;
+
 extern void exit_fs(struct task_struct *);
 extern void set_fs_altroot(void);
 extern void set_fs_root(struct fs_struct *, struct vfsmount *, struct dentry *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index f9202d6..bfc063e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -310,7 +310,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 
 /* System wide caches */
 extern kmem_cache_t	*names_cachep;
-extern kmem_cache_t	*fs_cachep;
 
 #endif	/* __KERNEL__ */
 
-- 
cgit v0.10.2


From b86c089b83b8ae2bc814db865057768a9ba787b5 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:57 -0800
Subject: [PATCH] Move names_cachep to linux/fs.h

The names_cachep is used for getname() and putname().  So lets put it into
fs.h near those two definitions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index cac7b1e..a8039c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1481,6 +1481,8 @@ extern char * getname(const char __user *);
 extern void __init vfs_caches_init_early(void);
 extern void __init vfs_caches_init(unsigned long);
 
+extern struct kmem_cache *names_cachep;
+
 #define __getname()	kmem_cache_alloc(names_cachep, SLAB_KERNEL)
 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
 #ifndef CONFIG_AUDITSYSCALL
diff --git a/include/linux/slab.h b/include/linux/slab.h
index bfc063e..e67314e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -308,9 +308,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 
 #endif /* CONFIG_SLOB */
 
-/* System wide caches */
-extern kmem_cache_t	*names_cachep;
-
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_SLAB_H */
-- 
cgit v0.10.2


From ebe29738f3934ad6a93c8bd76e30aa5d797a269d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:32:59 -0800
Subject: [PATCH] Remove uses of kmem_cache_t from mm/* and
 include/linux/slab.h

Remove all uses of kmem_cache_t (the most were left in slab.h).  The
typedef for kmem_cache_t is then only necessary for other kernel
subsystems.  Add a comment to that effect.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/slab.h b/include/linux/slab.h
index e67314e..b831776 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -9,6 +9,7 @@
 
 #if	defined(__KERNEL__)
 
+/* kmem_cache_t exists for legacy reasons and is not used by code in mm */
 typedef struct kmem_cache kmem_cache_t;
 
 #include	<linux/gfp.h>
@@ -57,22 +58,23 @@ typedef struct kmem_cache kmem_cache_t;
 /* prototypes */
 extern void __init kmem_cache_init(void);
 
-extern kmem_cache_t *kmem_cache_create(const char *, size_t, size_t, unsigned long,
-				       void (*)(void *, kmem_cache_t *, unsigned long),
-				       void (*)(void *, kmem_cache_t *, unsigned long));
-extern void kmem_cache_destroy(kmem_cache_t *);
-extern int kmem_cache_shrink(kmem_cache_t *);
-extern void *kmem_cache_alloc(kmem_cache_t *, gfp_t);
+extern struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
+			unsigned long,
+			void (*)(void *, struct kmem_cache *, unsigned long),
+			void (*)(void *, struct kmem_cache *, unsigned long));
+extern void kmem_cache_destroy(struct kmem_cache *);
+extern int kmem_cache_shrink(struct kmem_cache *);
+extern void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
 extern void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
-extern void kmem_cache_free(kmem_cache_t *, void *);
-extern unsigned int kmem_cache_size(kmem_cache_t *);
-extern const char *kmem_cache_name(kmem_cache_t *);
+extern void kmem_cache_free(struct kmem_cache *, void *);
+extern unsigned int kmem_cache_size(struct kmem_cache *);
+extern const char *kmem_cache_name(struct kmem_cache *);
 
 /* Size description struct for general caches. */
 struct cache_sizes {
-	size_t		 cs_size;
-	kmem_cache_t	*cs_cachep;
-	kmem_cache_t	*cs_dmacachep;
+	size_t		 	cs_size;
+	struct kmem_cache	*cs_cachep;
+	struct kmem_cache	*cs_dmacachep;
 };
 extern struct cache_sizes malloc_sizes[];
 
@@ -211,7 +213,7 @@ extern unsigned int ksize(const void *);
 extern int slab_is_available(void);
 
 #ifdef CONFIG_NUMA
-extern void *kmem_cache_alloc_node(kmem_cache_t *, gfp_t flags, int node);
+extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
 
 static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
@@ -255,7 +257,8 @@ extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
 			__builtin_return_address(0))
 #endif
 #else /* CONFIG_NUMA */
-static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int node)
+static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
+					gfp_t flags, int node)
 {
 	return kmem_cache_alloc(cachep, flags);
 }
@@ -269,7 +272,7 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 #endif
 
 extern int FASTCALL(kmem_cache_reap(int));
-extern int FASTCALL(kmem_ptr_validate(kmem_cache_t *cachep, void *ptr));
+extern int FASTCALL(kmem_ptr_validate(struct kmem_cache *cachep, void *ptr));
 
 #else /* CONFIG_SLOB */
 
-- 
cgit v0.10.2


From bc4ba393c007248f76c05945abb7b7b892cdd1cc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:02 -0800
Subject: [PATCH] drain_node_page(): Drain pages in batch units

drain_node_pages() currently drains the complete pageset of all pages.  If
there are a large number of pages in the queues then we may hold off
interrupts for too long.

Duplicate the method used in free_hot_cold_page.  Only drain pcp->batch
pages at one time.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a840e70..86f2984 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -685,9 +685,15 @@ void drain_node_pages(int nodeid)
 
 			pcp = &pset->pcp[i];
 			if (pcp->count) {
+				int to_drain;
+
 				local_irq_save(flags);
-				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-				pcp->count = 0;
+				if (pcp->count >= pcp->batch)
+					to_drain = pcp->batch;
+				else
+					to_drain = pcp->count;
+				free_pages_bulk(zone, to_drain, &pcp->list, 0);
+				pcp->count -= to_drain;
 				local_irq_restore(flags);
 			}
 		}
-- 
cgit v0.10.2


From 25ba77c141dbcd2602dd0171824d0d72aa023a01 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Wed, 6 Dec 2006 20:33:03 -0800
Subject: [PATCH] numa node ids are int, page_to_nid and zone_to_nid should
 return int

NUMA node ids are passed as either int or unsigned int almost exclusivly
page_to_nid and zone_to_nid both return unsigned long.  This is a throw
back to when page_to_nid was a #define and was thus exposing the real type
of the page flags field.

In addition to fixing up the definitions of page_to_nid and zone_to_nid I
audited the users of these functions identifying the following incorrect
uses:

1) mm/page_alloc.c show_node() -- printk dumping the node id,
2) include/asm-ia64/pgalloc.h pgtable_quicklist_free() -- comparison
   against numa_node_id() which returns an int from cpu_to_node(), and
3) mm/mpolicy.c check_pte_range -- used as an index in node_isset which
   uses bit_set which in generic code takes an int.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
index 9cb68e9..393e04c 100644
--- a/include/asm-ia64/pgalloc.h
+++ b/include/asm-ia64/pgalloc.h
@@ -60,7 +60,7 @@ static inline void *pgtable_quicklist_alloc(void)
 static inline void pgtable_quicklist_free(void *pgtable_entry)
 {
 #ifdef CONFIG_NUMA
-	unsigned long nid = page_to_nid(virt_to_page(pgtable_entry));
+	int nid = page_to_nid(virt_to_page(pgtable_entry));
 
 	if (unlikely(nid != numa_node_id())) {
 		free_page((unsigned long)pgtable_entry);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8403037..0e266fe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -456,7 +456,7 @@ static inline int page_zone_id(struct page *page)
 	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
 }
 
-static inline unsigned long zone_to_nid(struct zone *zone)
+static inline int zone_to_nid(struct zone *zone)
 {
 #ifdef CONFIG_NUMA
 	return zone->node;
@@ -466,9 +466,9 @@ static inline unsigned long zone_to_nid(struct zone *zone)
 }
 
 #ifdef NODE_NOT_IN_PAGE_FLAGS
-extern unsigned long page_to_nid(struct page *page);
+extern int page_to_nid(struct page *page);
 #else
-static inline unsigned long page_to_nid(struct page *page)
+static inline int page_to_nid(struct page *page)
 {
 	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index fb90723..e7b69c9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -221,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		struct page *page;
-		unsigned int nid;
+		int nid;
 
 		if (!pte_present(*pte))
 			continue;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 86f2984..614d427 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1407,7 +1407,7 @@ unsigned int nr_free_pagecache_pages(void)
 static inline void show_node(struct zone *zone)
 {
 	if (NUMA_BUILD)
-		printk("Node %ld ", zone_to_nid(zone));
+		printk("Node %d ", zone_to_nid(zone));
 }
 
 void si_meminfo(struct sysinfo *val)
diff --git a/mm/sparse.c b/mm/sparse.c
index 158d6a2..ac26eb0 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -36,7 +36,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
 static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
 #endif
 
-unsigned long page_to_nid(struct page *page)
+int page_to_nid(struct page *page)
 {
 	return section_to_node_table[page_to_section(page)];
 }
-- 
cgit v0.10.2


From 4af2bfc1202041006a0f01d0591a975f6c573f09 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Wed, 6 Dec 2006 20:33:04 -0800
Subject: [PATCH] silence unused pgdat warning from alloc_bootmem_node and
 friends

x86 NUMA systems only define bootmem for node 0.  alloc_bootmem_node() and
friends therefore ignore the passed pgdat and use NODE_DATA(0) in all
cases.  This leads to the following warnings as we are not using the passed
parameter:

  .../mm/page_alloc.c: In function 'zone_wait_table_init':
  .../mm/page_alloc.c:2259: warning: unused variable 'pgdat'

One option would be to define all variables used with these macros
__attribute__ ((unused)), but this would leave us exposed should these
become genuinely unused.

The key here is that we _are_ using the value, we ignore it but that is a
deliberate action.  This patch adds a nested local variable within the
alloc_bootmem_node helper to which the pgdat parameter is assigned making
it 'used'.  The nested local is marked __attribute__ ((unused)) to silence
this same warning for it.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 61b0733..3503ad6 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -120,13 +120,26 @@ static inline int pfn_valid(int pfn)
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(ignore, x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_node(ignore, x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages_node(ignore, x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-
+#define alloc_bootmem_node(pgdat, x)					\
+({									\
+	struct pglist_data  __attribute__ ((unused))			\
+				*__alloc_bootmem_node__pgdat = (pgdat);	\
+	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES,	\
+						__pa(MAX_DMA_ADDRESS));	\
+})
+#define alloc_bootmem_pages_node(pgdat, x)				\
+({									\
+	struct pglist_data  __attribute__ ((unused))			\
+				*__alloc_bootmem_node__pgdat = (pgdat);	\
+	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE,		\
+						__pa(MAX_DMA_ADDRESS))	\
+})
+#define alloc_bootmem_low_pages_node(pgdat, x)				\
+({									\
+	struct pglist_data  __attribute__ ((unused))			\
+				*__alloc_bootmem_node__pgdat = (pgdat);	\
+	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0);		\
+})
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
 #endif /* _ASM_MMZONE_H_ */
-- 
cgit v0.10.2


From 5d1854e15ee979f8e27330f0d3ce5e2703afa1dc Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 6 Dec 2006 20:33:06 -0800
Subject: [PATCH] reject corrupt swapfiles earlier

The fsfuzzer found this; with a corrupt small swapfile that claims to have
many pages:

  [root]# file swap.741.img
  swap.741.img: Linux/i386 swap file (new style) 1 (4K pages) size 1040191487 pages
  [root]# ls -l swap.741.img
  -rw-r--r-- 1 root root 16777216 Nov 22 05:18 swap.741.img

sys_swapon() will try to vmalloc all those pages, and -then- check to see if
the file is actually that large:

                if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
  <snip>
        if (swapfilesize && maxpages > swapfilesize) {
                printk(KERN_WARNING
                       "Swap area shorter than signature indicates\n");

It seems to me that it would make more sense to move this test up before
the vmalloc, with the other checks, to avoid the OOM-killer in this
situation...

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8e206ce..f315131 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1552,6 +1552,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = -EINVAL;
 		if (!maxpages)
 			goto bad_swap;
+		if (swapfilesize && maxpages > swapfilesize) {
+			printk(KERN_WARNING
+			       "Swap area shorter than signature indicates\n");
+			goto bad_swap;
+		}
 		if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
 			goto bad_swap;
 		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
@@ -1579,12 +1584,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 			goto bad_swap;
 	}
 
-	if (swapfilesize && maxpages > swapfilesize) {
-		printk(KERN_WARNING
-		       "Swap area shorter than signature indicates\n");
-		error = -EINVAL;
-		goto bad_swap;
-	}
 	if (nr_good_pages) {
 		p->swap_map[0] = SWAP_MAP_BAD;
 		p->max = maxpages;
-- 
cgit v0.10.2


From ce421c799b5bde77aa60776d6fb61036ae0aea11 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Wed, 6 Dec 2006 20:33:08 -0800
Subject: [PATCH] mm: cleanup indentation on switch for CPU operations

These patches introduced new switch statements which are indented contrary
to the concensus in mm/*.c.  Fix them up to match that concensus.

    [PATCH] node local per-cpu-pages
    [PATCH] ZVC: Scale thresholds depending on the size of the system
    commit e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a
    commit df9ecaba3f152d1ea79f2a5e0b87505e03f47590

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 614d427..5d123b3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2042,16 +2042,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 	int ret = NOTIFY_OK;
 
 	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+	default:
+		break;
 	}
 	return ret;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8614e8f..ef41768 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 		void *hcpu)
 {
 	switch (action) {
-		case CPU_UP_PREPARE:
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			refresh_zone_stat_thresholds();
-			break;
-		default:
-			break;
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		refresh_zone_stat_thresholds();
+		break;
+	default:
+		break;
 	}
 	return NOTIFY_OK;
 }
-- 
cgit v0.10.2


From 2d4d862f729f0cb1ad8027203aceff49dc9f63fa Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:33:09 -0800
Subject: [PATCH] kill install_file_pte's pte_val

David Binderman and his Intel C compiler rightly observe that
install_file_pte no longer has any use for its pte_val.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: d binderman <dcb314@hotmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5..b77a002 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	int err = -ENOMEM;
 	pte_t *pte;
-	pte_t pte_val;
 	spinlock_t *ptl;
 
 	pte = get_locked_pte(mm, addr, &ptl);
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
-	pte_val = *pte;
 	/*
 	 * We don't need to run update_mmu_cache() here because the "file pte"
 	 * being installed by install_file_pte() is not a real pte - it's a
-- 
cgit v0.10.2


From 6e0eaa4b05cf53ca5caa702fd2760a5b3376be69 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:10 -0800
Subject: [PATCH] slab: remove SLAB_NO_GROW

It is only used internally in the slab.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/slab.h b/include/linux/slab.h
index b831776..9ffd1c1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -28,8 +28,6 @@ typedef struct kmem_cache kmem_cache_t;
 
 #define SLAB_LEVEL_MASK		GFP_LEVEL_MASK
 
-#define	SLAB_NO_GROW		__GFP_NO_GROW	/* don't grow a cache */
-
 /* flags to pass to kmem_cache_create().
  * The first 3 are only valid when the allocator as been build
  * SLAB_DEBUG_SUPPORT.
diff --git a/mm/slab.c b/mm/slab.c
index 8f3f61c..e853dfe 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2721,8 +2721,8 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Be lazy and only check for valid flags here,  keeping it out of the
 	 * critical path in kmem_cache_alloc().
 	 */
-	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
-	if (flags & SLAB_NO_GROW)
+	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | __GFP_NO_GROW));
+	if (flags & __GFP_NO_GROW)
 		return 0;
 
 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
-- 
cgit v0.10.2


From a06d72c1dcbff015250df6ad9f0b1d18c02113bf Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:12 -0800
Subject: [PATCH] slab: remove SLAB_LEVEL_MASK

SLAB_LEVEL_MASK is only used internally to the slab and is
and alias of GFP_LEVEL_MASK.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9ffd1c1..6f7b9bb 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -26,8 +26,6 @@ typedef struct kmem_cache kmem_cache_t;
 #define	SLAB_KERNEL		GFP_KERNEL
 #define	SLAB_DMA		GFP_DMA
 
-#define SLAB_LEVEL_MASK		GFP_LEVEL_MASK
-
 /* flags to pass to kmem_cache_create().
  * The first 3 are only valid when the allocator as been build
  * SLAB_DEBUG_SUPPORT.
diff --git a/mm/slab.c b/mm/slab.c
index e853dfe..9f34b49 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2721,12 +2721,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Be lazy and only check for valid flags here,  keeping it out of the
 	 * critical path in kmem_cache_alloc().
 	 */
-	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | __GFP_NO_GROW));
+	BUG_ON(flags & ~(SLAB_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
 	if (flags & __GFP_NO_GROW)
 		return 0;
 
 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
-	local_flags = (flags & SLAB_LEVEL_MASK);
+	local_flags = (flags & GFP_LEVEL_MASK);
 	if (!(local_flags & __GFP_WAIT))
 		/*
 		 * Not allowed to sleep.  Need to tell a constructor about
-- 
cgit v0.10.2


From 55acbda0965ca0a29b0ca276e7d17a55edc11d1b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:13 -0800
Subject: [PATCH] slab: remove SLAB_NOIO

SLAB_NOIO is an alias of GFP_NOIO with a single instance of use.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c
index 47644b5..323293a 100644
--- a/drivers/usb/storage/transport.c
+++ b/drivers/usb/storage/transport.c
@@ -427,7 +427,7 @@ static int usb_stor_bulk_transfer_sglist(struct us_data *us, unsigned int pipe,
 	US_DEBUGP("%s: xfer %u bytes, %d entries\n", __FUNCTION__,
 			length, num_sg);
 	result = usb_sg_init(&us->current_sg, us->pusb_dev, pipe, 0,
-			sg, num_sg, length, SLAB_NOIO);
+			sg, num_sg, length, GFP_NOIO);
 	if (result) {
 		US_DEBUGP("usb_sg_init returned %d\n", result);
 		return USB_STOR_XFER_ERROR;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6f7b9bb..43ced80 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -20,7 +20,6 @@ typedef struct kmem_cache kmem_cache_t;
 
 /* flags for kmem_cache_alloc() */
 #define	SLAB_NOFS		GFP_NOFS
-#define	SLAB_NOIO		GFP_NOIO
 #define	SLAB_ATOMIC		GFP_ATOMIC
 #define	SLAB_USER		GFP_USER
 #define	SLAB_KERNEL		GFP_KERNEL
-- 
cgit v0.10.2


From e6b4f8da3a88457148038bc952043e99a7fdba64 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:14 -0800
Subject: [PATCH] slab: remove SLAB_NOFS

SLAB_NOFS is an alias of GFP_NOFS.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index bbc9cd3..8355daf 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -153,7 +153,7 @@ cifs_buf_get(void)
    albeit slightly larger than necessary and maxbuffersize 
    defaults to this and can not be bigger */
 	ret_buf =
-	    (struct smb_hdr *) mempool_alloc(cifs_req_poolp, SLAB_KERNEL | SLAB_NOFS);
+	    (struct smb_hdr *) mempool_alloc(cifs_req_poolp, SLAB_KERNEL | GFP_NOFS);
 
 	/* clear the first few header bytes */
 	/* for most paths, more is cleared in header_assemble */
@@ -192,7 +192,7 @@ cifs_small_buf_get(void)
    albeit slightly larger than necessary and maxbuffersize 
    defaults to this and can not be bigger */
 	ret_buf =
-	    (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, SLAB_KERNEL | SLAB_NOFS);
+	    (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, SLAB_KERNEL | GFP_NOFS);
 	if (ret_buf) {
 	/* No need to clear memory here, cleared in header assemble */
 	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 48d47b4..7514237 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -51,7 +51,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
 	}
 	
 	temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp,
-						    SLAB_KERNEL | SLAB_NOFS);
+						    SLAB_KERNEL | GFP_NOFS);
 	if (temp == NULL)
 		return temp;
 	else {
diff --git a/fs/dquot.c b/fs/dquot.c
index 9af7895..c6ae6c0 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -600,7 +600,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 {
 	struct dquot *dquot;
 
-	dquot = kmem_cache_alloc(dquot_cachep, SLAB_NOFS);
+	dquot = kmem_cache_alloc(dquot_cachep, GFP_NOFS);
 	if(!dquot)
 		return NODQUOT;
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index afc2d4f..0cf633f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -445,7 +445,7 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
 {
 	struct ext3_inode_info *ei;
 
-	ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+	ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b4b022a..c730cbc8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -495,7 +495,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 {
 	struct ext4_inode_info *ei;
 
-	ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
+	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 450b5e0..46ceadd 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -165,7 +165,7 @@ static kmem_cache_t * hpfs_inode_cachep;
 static struct inode *hpfs_alloc_inode(struct super_block *sb)
 {
 	struct hpfs_inode_info *ei;
-	ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, SLAB_NOFS);
+	ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
 	ei->vfs_inode.i_version = 1;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c2e49c3..56f66f0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -46,7 +46,7 @@ static mempool_t *nfs_rdata_mempool;
 struct nfs_read_data *nfs_readdata_alloc(size_t len)
 {
 	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
+	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 883dd4a..f7dd0d0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -93,7 +93,7 @@ static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
 
 struct nfs_write_data *nfs_commit_alloc(void)
 {
-	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
+	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -112,7 +112,7 @@ void nfs_commit_free(struct nfs_write_data *p)
 struct nfs_write_data *nfs_writedata_alloc(size_t len)
 {
 	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
+	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 9f08e85..c577d8e 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1272,7 +1272,7 @@ ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
 {
 	ntfs_attr_search_ctx *ctx;
 
-	ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, SLAB_NOFS);
+	ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
 	if (ctx)
 		ntfs_attr_init_search_ctx(ctx, ni, mrec);
 	return ctx;
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index e32cde4..2194eff 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -38,7 +38,7 @@ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
 {
 	ntfs_index_context *ictx;
 
-	ictx = kmem_cache_alloc(ntfs_index_ctx_cache, SLAB_NOFS);
+	ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
 	if (ictx)
 		*ictx = (ntfs_index_context){ .idx_ni = idx_ni };
 	return ictx;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 2d3de9c..2479898 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -324,7 +324,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
 	ntfs_inode *ni;
 
 	ntfs_debug("Entering.");
-	ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS);
+	ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS);
 	if (likely(ni != NULL)) {
 		ni->state = 0;
 		return VFS_I(ni);
@@ -349,7 +349,7 @@ static inline ntfs_inode *ntfs_alloc_extent_inode(void)
 	ntfs_inode *ni;
 
 	ntfs_debug("Entering.");
-	ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+	ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
 	if (likely(ni != NULL)) {
 		ni->state = 0;
 		return ni;
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index 6a495f7..005ca4b 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -266,7 +266,7 @@ int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
 
 	/* We do not trust outside sources. */
 	if (likely(ins)) {
-		ucs = kmem_cache_alloc(ntfs_name_cache, SLAB_NOFS);
+		ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
 		if (likely(ucs)) {
 			for (i = o = 0; i < ins_len; i += wc_len) {
 				wc_len = nls->char2uni(ins + i, ins_len - i,
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 16b8d1b..01f9150 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -276,7 +276,7 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
 {
 	struct dlmfs_inode_private *ip;
 
-	ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
+	ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
 	if (!ip)
 		return NULL;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d9b4214..7574d26 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -303,7 +303,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
 {
 	struct ocfs2_inode_info *oi;
 
-	oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
+	oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
 	if (!oi)
 		return NULL;
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 43ced80..5b70d52 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -19,7 +19,6 @@ typedef struct kmem_cache kmem_cache_t;
 #include	<asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 
 /* flags for kmem_cache_alloc() */
-#define	SLAB_NOFS		GFP_NOFS
 #define	SLAB_ATOMIC		GFP_ATOMIC
 #define	SLAB_USER		GFP_USER
 #define	SLAB_KERNEL		GFP_KERNEL
-- 
cgit v0.10.2


From f7267c0c0721fd02ad3dc37c3d6dd24ccd81d4d6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:15 -0800
Subject: [PATCH] slab: remove SLAB_USER

SLAB_USER is an alias of GFP_USER

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f63a775..776b2ee 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1334,7 +1334,7 @@ int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
 		goto out;
 	}
 	/* Released in this function */
-	page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, SLAB_USER);
+	page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, GFP_USER);
 	if (!page_virt) {
 		ecryptfs_printk(KERN_ERR, "Out of memory\n");
 		rc = -ENOMEM;
@@ -1493,7 +1493,7 @@ int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
 	    &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
 
 	/* Read the first page from the underlying file */
-	page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, SLAB_USER);
+	page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER);
 	if (!page_virt) {
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR, "Unable to allocate page_virt\n");
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index dfcc684..7091141 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -404,7 +404,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 	/* Released in this function */
 	page_virt =
 	    (char *)kmem_cache_alloc(ecryptfs_header_cache_2,
-				     SLAB_USER);
+				     GFP_USER);
 	if (!page_virt) {
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR,
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 5b70d52..d7ee28e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -20,7 +20,6 @@ typedef struct kmem_cache kmem_cache_t;
 
 /* flags for kmem_cache_alloc() */
 #define	SLAB_ATOMIC		GFP_ATOMIC
-#define	SLAB_USER		GFP_USER
 #define	SLAB_KERNEL		GFP_KERNEL
 #define	SLAB_DMA		GFP_DMA
 
-- 
cgit v0.10.2


From 54e6ecb23951b195d02433a741c7f7cb0b796c78 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:16 -0800
Subject: [PATCH] slab: remove SLAB_ATOMIC

SLAB_ATOMIC is an alias of GFP_ATOMIC

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index c7314a7..2a2f0fc 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -1724,7 +1724,7 @@ __alloc_tpd(struct he_dev *he_dev)
 	struct he_tpd *tpd;
 	dma_addr_t dma_handle; 
 
-	tpd = pci_pool_alloc(he_dev->tpd_pool, SLAB_ATOMIC|SLAB_DMA, &dma_handle);              
+	tpd = pci_pool_alloc(he_dev->tpd_pool, GFP_ATOMIC|SLAB_DMA, &dma_handle);
 	if (tpd == NULL)
 		return NULL;
 			
diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c
index b2efbd4..fa46752 100644
--- a/drivers/base/dmapool.c
+++ b/drivers/base/dmapool.c
@@ -297,7 +297,7 @@ restart:
 			}
 		}
 	}
-	if (!(page = pool_alloc_page (pool, SLAB_ATOMIC))) {
+	if (!(page = pool_alloc_page (pool, GFP_ATOMIC))) {
 		if (mem_flags & __GFP_WAIT) {
 			DECLARE_WAITQUEUE (wait, current);
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 742d074..8d81a3a 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -324,13 +324,13 @@ static boolean DAC960_CreateAuxiliaryStructures(DAC960_Controller_T *Controller)
       Command->Next = Controller->FreeCommands;
       Controller->FreeCommands = Command;
       Controller->Commands[CommandIdentifier-1] = Command;
-      ScatterGatherCPU = pci_pool_alloc(ScatterGatherPool, SLAB_ATOMIC,
+      ScatterGatherCPU = pci_pool_alloc(ScatterGatherPool, GFP_ATOMIC,
 							&ScatterGatherDMA);
       if (ScatterGatherCPU == NULL)
 	  return DAC960_Failure(Controller, "AUXILIARY STRUCTURE CREATION");
 
       if (RequestSensePool != NULL) {
-  	  RequestSenseCPU = pci_pool_alloc(RequestSensePool, SLAB_ATOMIC,
+  	  RequestSenseCPU = pci_pool_alloc(RequestSensePool, GFP_ATOMIC,
 						&RequestSenseDMA);
   	  if (RequestSenseCPU == NULL) {
                 pci_pool_free(ScatterGatherPool, ScatterGatherCPU,
diff --git a/drivers/char/watchdog/pcwd_usb.c b/drivers/char/watchdog/pcwd_usb.c
index e275dd4..6113872 100644
--- a/drivers/char/watchdog/pcwd_usb.c
+++ b/drivers/char/watchdog/pcwd_usb.c
@@ -634,7 +634,7 @@ static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_devi
 	usb_pcwd->intr_size = (le16_to_cpu(endpoint->wMaxPacketSize) > 8 ? le16_to_cpu(endpoint->wMaxPacketSize) : 8);
 
 	/* set up the memory buffer's */
-	if (!(usb_pcwd->intr_buffer = usb_buffer_alloc(udev, usb_pcwd->intr_size, SLAB_ATOMIC, &usb_pcwd->intr_dma))) {
+	if (!(usb_pcwd->intr_buffer = usb_buffer_alloc(udev, usb_pcwd->intr_size, GFP_ATOMIC, &usb_pcwd->intr_dma))) {
 		printk(KERN_ERR PFX "Out of memory\n");
 		goto error;
 	}
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 5ec4f5e..47f6a4e 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -259,7 +259,7 @@ static void host_reset(struct hpsb_host *host)
 	if (hi != NULL) {
 		list_for_each_entry(fi, &hi->file_info_list, list) {
 			if (fi->notification == RAW1394_NOTIFY_ON) {
-				req = __alloc_pending_request(SLAB_ATOMIC);
+				req = __alloc_pending_request(GFP_ATOMIC);
 
 				if (req != NULL) {
 					req->file_info = fi;
@@ -306,13 +306,13 @@ static void iso_receive(struct hpsb_host *host, int channel, quadlet_t * data,
 			if (!(fi->listen_channels & (1ULL << channel)))
 				continue;
 
-			req = __alloc_pending_request(SLAB_ATOMIC);
+			req = __alloc_pending_request(GFP_ATOMIC);
 			if (!req)
 				break;
 
 			if (!ibs) {
 				ibs = kmalloc(sizeof(*ibs) + length,
-					      SLAB_ATOMIC);
+					      GFP_ATOMIC);
 				if (!ibs) {
 					kfree(req);
 					break;
@@ -367,13 +367,13 @@ static void fcp_request(struct hpsb_host *host, int nodeid, int direction,
 			if (!fi->fcp_buffer)
 				continue;
 
-			req = __alloc_pending_request(SLAB_ATOMIC);
+			req = __alloc_pending_request(GFP_ATOMIC);
 			if (!req)
 				break;
 
 			if (!ibs) {
 				ibs = kmalloc(sizeof(*ibs) + length,
-					      SLAB_ATOMIC);
+					      GFP_ATOMIC);
 				if (!ibs) {
 					kfree(req);
 					break;
@@ -593,7 +593,7 @@ static int state_initialized(struct file_info *fi, struct pending_request *req)
 	switch (req->req.type) {
 	case RAW1394_REQ_LIST_CARDS:
 		spin_lock_irqsave(&host_info_lock, flags);
-		khl = kmalloc(sizeof(*khl) * host_count, SLAB_ATOMIC);
+		khl = kmalloc(sizeof(*khl) * host_count, GFP_ATOMIC);
 
 		if (khl) {
 			req->req.misc = host_count;
@@ -1045,7 +1045,7 @@ static int arm_read(struct hpsb_host *host, int nodeid, quadlet_t * buffer,
 	}
 	if (arm_addr->notification_options & ARM_READ) {
 		DBGMSG("arm_read -> entering notification-section");
-		req = __alloc_pending_request(SLAB_ATOMIC);
+		req = __alloc_pending_request(GFP_ATOMIC);
 		if (!req) {
 			DBGMSG("arm_read -> rcode_conflict_error");
 			spin_unlock_irqrestore(&host_info_lock, irqflags);
@@ -1064,7 +1064,7 @@ static int arm_read(struct hpsb_host *host, int nodeid, quadlet_t * buffer,
 			    sizeof(struct arm_response) +
 			    sizeof(struct arm_request_response);
 		}
-		req->data = kmalloc(size, SLAB_ATOMIC);
+		req->data = kmalloc(size, GFP_ATOMIC);
 		if (!(req->data)) {
 			free_pending_request(req);
 			DBGMSG("arm_read -> rcode_conflict_error");
@@ -1198,7 +1198,7 @@ static int arm_write(struct hpsb_host *host, int nodeid, int destid,
 	}
 	if (arm_addr->notification_options & ARM_WRITE) {
 		DBGMSG("arm_write -> entering notification-section");
-		req = __alloc_pending_request(SLAB_ATOMIC);
+		req = __alloc_pending_request(GFP_ATOMIC);
 		if (!req) {
 			DBGMSG("arm_write -> rcode_conflict_error");
 			spin_unlock_irqrestore(&host_info_lock, irqflags);
@@ -1209,7 +1209,7 @@ static int arm_write(struct hpsb_host *host, int nodeid, int destid,
 		    sizeof(struct arm_request) + sizeof(struct arm_response) +
 		    (length) * sizeof(byte_t) +
 		    sizeof(struct arm_request_response);
-		req->data = kmalloc(size, SLAB_ATOMIC);
+		req->data = kmalloc(size, GFP_ATOMIC);
 		if (!(req->data)) {
 			free_pending_request(req);
 			DBGMSG("arm_write -> rcode_conflict_error");
@@ -1400,7 +1400,7 @@ static int arm_lock(struct hpsb_host *host, int nodeid, quadlet_t * store,
 	if (arm_addr->notification_options & ARM_LOCK) {
 		byte_t *buf1, *buf2;
 		DBGMSG("arm_lock -> entering notification-section");
-		req = __alloc_pending_request(SLAB_ATOMIC);
+		req = __alloc_pending_request(GFP_ATOMIC);
 		if (!req) {
 			DBGMSG("arm_lock -> rcode_conflict_error");
 			spin_unlock_irqrestore(&host_info_lock, irqflags);
@@ -1408,7 +1408,7 @@ static int arm_lock(struct hpsb_host *host, int nodeid, quadlet_t * store,
 							   The request may be retried */
 		}
 		size = sizeof(struct arm_request) + sizeof(struct arm_response) + 3 * sizeof(*store) + sizeof(struct arm_request_response);	/* maximum */
-		req->data = kmalloc(size, SLAB_ATOMIC);
+		req->data = kmalloc(size, GFP_ATOMIC);
 		if (!(req->data)) {
 			free_pending_request(req);
 			DBGMSG("arm_lock -> rcode_conflict_error");
@@ -1628,7 +1628,7 @@ static int arm_lock64(struct hpsb_host *host, int nodeid, octlet_t * store,
 	if (arm_addr->notification_options & ARM_LOCK) {
 		byte_t *buf1, *buf2;
 		DBGMSG("arm_lock64 -> entering notification-section");
-		req = __alloc_pending_request(SLAB_ATOMIC);
+		req = __alloc_pending_request(GFP_ATOMIC);
 		if (!req) {
 			spin_unlock_irqrestore(&host_info_lock, irqflags);
 			DBGMSG("arm_lock64 -> rcode_conflict_error");
@@ -1636,7 +1636,7 @@ static int arm_lock64(struct hpsb_host *host, int nodeid, octlet_t * store,
 							   The request may be retried */
 		}
 		size = sizeof(struct arm_request) + sizeof(struct arm_response) + 3 * sizeof(*store) + sizeof(struct arm_request_response);	/* maximum */
-		req->data = kmalloc(size, SLAB_ATOMIC);
+		req->data = kmalloc(size, GFP_ATOMIC);
 		if (!(req->data)) {
 			free_pending_request(req);
 			spin_unlock_irqrestore(&host_info_lock, irqflags);
@@ -2443,7 +2443,7 @@ static void queue_rawiso_event(struct file_info *fi)
 	/* only one ISO activity event may be in the queue */
 	if (!__rawiso_event_in_queue(fi)) {
 		struct pending_request *req =
-		    __alloc_pending_request(SLAB_ATOMIC);
+		    __alloc_pending_request(GFP_ATOMIC);
 
 		if (req) {
 			req->file_info = fi;
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c
index 40caeb5..36620a2 100644
--- a/drivers/infiniband/hw/amso1100/c2_vq.c
+++ b/drivers/infiniband/hw/amso1100/c2_vq.c
@@ -164,7 +164,7 @@ void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
  */
 void *vq_repbuf_alloc(struct c2_dev *c2dev)
 {
-	return kmem_cache_alloc(c2dev->host_msg_cache, SLAB_ATOMIC);
+	return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
 }
 
 /*
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 57cdc1b..27caf3b 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -189,7 +189,7 @@ int mthca_create_ah(struct mthca_dev *dev,
 on_hca_fail:
 	if (ah->type == MTHCA_AH_PCI_POOL) {
 		ah->av = pci_pool_alloc(dev->av_table.pool,
-					SLAB_ATOMIC, &ah->avdma);
+					GFP_ATOMIC, &ah->avdma);
 		if (!ah->av)
 			return -ENOMEM;
 
diff --git a/drivers/isdn/gigaset/bas-gigaset.c b/drivers/isdn/gigaset/bas-gigaset.c
index 0c93732..5857e7e 100644
--- a/drivers/isdn/gigaset/bas-gigaset.c
+++ b/drivers/isdn/gigaset/bas-gigaset.c
@@ -572,7 +572,7 @@ static int atread_submit(struct cardstate *cs, int timeout)
 			     ucs->rcvbuf, ucs->rcvbuf_size,
 			     read_ctrl_callback, cs->inbuf);
 
-	if ((ret = usb_submit_urb(ucs->urb_cmd_in, SLAB_ATOMIC)) != 0) {
+	if ((ret = usb_submit_urb(ucs->urb_cmd_in, GFP_ATOMIC)) != 0) {
 		update_basstate(ucs, 0, BS_ATRDPEND);
 		dev_err(cs->dev, "could not submit HD_READ_ATMESSAGE: %s\n",
 			get_usb_rcmsg(ret));
@@ -747,7 +747,7 @@ static void read_int_callback(struct urb *urb)
 	check_pending(ucs);
 
 resubmit:
-	rc = usb_submit_urb(urb, SLAB_ATOMIC);
+	rc = usb_submit_urb(urb, GFP_ATOMIC);
 	if (unlikely(rc != 0 && rc != -ENODEV)) {
 		dev_err(cs->dev, "could not resubmit interrupt URB: %s\n",
 			get_usb_rcmsg(rc));
@@ -807,7 +807,7 @@ static void read_iso_callback(struct urb *urb)
 			urb->number_of_packets = BAS_NUMFRAMES;
 			gig_dbg(DEBUG_ISO, "%s: isoc read overrun/resubmit",
 				__func__);
-			rc = usb_submit_urb(urb, SLAB_ATOMIC);
+			rc = usb_submit_urb(urb, GFP_ATOMIC);
 			if (unlikely(rc != 0 && rc != -ENODEV)) {
 				dev_err(bcs->cs->dev,
 					"could not resubmit isochronous read "
@@ -900,7 +900,7 @@ static int starturbs(struct bc_state *bcs)
 		}
 
 		dump_urb(DEBUG_ISO, "Initial isoc read", urb);
-		if ((rc = usb_submit_urb(urb, SLAB_ATOMIC)) != 0)
+		if ((rc = usb_submit_urb(urb, GFP_ATOMIC)) != 0)
 			goto error;
 	}
 
@@ -935,7 +935,7 @@ static int starturbs(struct bc_state *bcs)
 	/* submit two URBs, keep third one */
 	for (k = 0; k < 2; ++k) {
 		dump_urb(DEBUG_ISO, "Initial isoc write", urb);
-		rc = usb_submit_urb(ubc->isoouturbs[k].urb, SLAB_ATOMIC);
+		rc = usb_submit_urb(ubc->isoouturbs[k].urb, GFP_ATOMIC);
 		if (rc != 0)
 			goto error;
 	}
@@ -1042,7 +1042,7 @@ static int submit_iso_write_urb(struct isow_urbctx_t *ucx)
 		return 0;	/* no data to send */
 	urb->number_of_packets = nframe;
 
-	rc = usb_submit_urb(urb, SLAB_ATOMIC);
+	rc = usb_submit_urb(urb, GFP_ATOMIC);
 	if (unlikely(rc)) {
 		if (rc == -ENODEV)
 			/* device removed - give up silently */
@@ -1341,7 +1341,7 @@ static void read_iso_tasklet(unsigned long data)
 		urb->dev = bcs->cs->hw.bas->udev;
 		urb->transfer_flags = URB_ISO_ASAP;
 		urb->number_of_packets = BAS_NUMFRAMES;
-		rc = usb_submit_urb(urb, SLAB_ATOMIC);
+		rc = usb_submit_urb(urb, GFP_ATOMIC);
 		if (unlikely(rc != 0 && rc != -ENODEV)) {
 			dev_err(cs->dev,
 				"could not resubmit isochronous read URB: %s\n",
@@ -1458,7 +1458,7 @@ static void write_ctrl_callback(struct urb *urb)
 			   ucs->retry_ctrl);
 		/* urb->dev is clobbered by USB subsystem */
 		urb->dev = ucs->udev;
-		rc = usb_submit_urb(urb, SLAB_ATOMIC);
+		rc = usb_submit_urb(urb, GFP_ATOMIC);
 		if (unlikely(rc)) {
 			dev_err(&ucs->interface->dev,
 				"could not resubmit request 0x%02x: %s\n",
@@ -1517,7 +1517,7 @@ static int req_submit(struct bc_state *bcs, int req, int val, int timeout)
 			     (unsigned char*) &ucs->dr_ctrl, NULL, 0,
 			     write_ctrl_callback, ucs);
 	ucs->retry_ctrl = 0;
-	ret = usb_submit_urb(ucs->urb_ctrl, SLAB_ATOMIC);
+	ret = usb_submit_urb(ucs->urb_ctrl, GFP_ATOMIC);
 	if (unlikely(ret)) {
 		dev_err(bcs->cs->dev, "could not submit request 0x%02x: %s\n",
 			req, get_usb_rcmsg(ret));
@@ -1763,7 +1763,7 @@ static int atwrite_submit(struct cardstate *cs, unsigned char *buf, int len)
 			     usb_sndctrlpipe(ucs->udev, 0),
 			     (unsigned char*) &ucs->dr_cmd_out, buf, len,
 			     write_command_callback, cs);
-	rc = usb_submit_urb(ucs->urb_cmd_out, SLAB_ATOMIC);
+	rc = usb_submit_urb(ucs->urb_cmd_out, GFP_ATOMIC);
 	if (unlikely(rc)) {
 		update_basstate(ucs, 0, BS_ATWRPEND);
 		dev_err(cs->dev, "could not submit HD_WRITE_ATMESSAGE: %s\n",
diff --git a/drivers/isdn/gigaset/usb-gigaset.c b/drivers/isdn/gigaset/usb-gigaset.c
index 5ebf49ac9..af89ce1 100644
--- a/drivers/isdn/gigaset/usb-gigaset.c
+++ b/drivers/isdn/gigaset/usb-gigaset.c
@@ -410,7 +410,7 @@ static void gigaset_read_int_callback(struct urb *urb)
 
 	if (resubmit) {
 		spin_lock_irqsave(&cs->lock, flags);
-		r = cs->connected ? usb_submit_urb(urb, SLAB_ATOMIC) : -ENODEV;
+		r = cs->connected ? usb_submit_urb(urb, GFP_ATOMIC) : -ENODEV;
 		spin_unlock_irqrestore(&cs->lock, flags);
 		if (r)
 			dev_err(cs->dev, "error %d when resubmitting urb.\n",
@@ -486,7 +486,7 @@ static int send_cb(struct cardstate *cs, struct cmdbuf_t *cb)
 			atomic_set(&ucs->busy, 1);
 
 			spin_lock_irqsave(&cs->lock, flags);
-			status = cs->connected ? usb_submit_urb(ucs->bulk_out_urb, SLAB_ATOMIC) : -ENODEV;
+			status = cs->connected ? usb_submit_urb(ucs->bulk_out_urb, GFP_ATOMIC) : -ENODEV;
 			spin_unlock_irqrestore(&cs->lock, flags);
 
 			if (status) {
@@ -664,7 +664,7 @@ static int write_modem(struct cardstate *cs)
 						  ucs->bulk_out_endpointAddr & 0x0f),
 				  ucs->bulk_out_buffer, count,
 				  gigaset_write_bulk_callback, cs);
-		ret = usb_submit_urb(ucs->bulk_out_urb, SLAB_ATOMIC);
+		ret = usb_submit_urb(ucs->bulk_out_urb, GFP_ATOMIC);
 	} else {
 		ret = -ENODEV;
 	}
diff --git a/drivers/media/dvb/dvb-usb/usb-urb.c b/drivers/media/dvb/dvb-usb/usb-urb.c
index 78035ee..397f51a 100644
--- a/drivers/media/dvb/dvb-usb/usb-urb.c
+++ b/drivers/media/dvb/dvb-usb/usb-urb.c
@@ -116,7 +116,7 @@ static int usb_allocate_stream_buffers(struct usb_data_stream *stream, int num,
 	for (stream->buf_num = 0; stream->buf_num < num; stream->buf_num++) {
 		deb_mem("allocating buffer %d\n",stream->buf_num);
 		if (( stream->buf_list[stream->buf_num] =
-					usb_buffer_alloc(stream->udev, size, SLAB_ATOMIC,
+					usb_buffer_alloc(stream->udev, size, GFP_ATOMIC,
 					&stream->dma_addr[stream->buf_num]) ) == NULL) {
 			deb_mem("not enough memory for urb-buffer allocation.\n");
 			usb_free_stream_buffers(stream);
diff --git a/drivers/media/dvb/ttusb-dec/ttusb_dec.c b/drivers/media/dvb/ttusb-dec/ttusb_dec.c
index 8135f3e..10b121a 100644
--- a/drivers/media/dvb/ttusb-dec/ttusb_dec.c
+++ b/drivers/media/dvb/ttusb-dec/ttusb_dec.c
@@ -1244,7 +1244,7 @@ static int ttusb_dec_init_usb(struct ttusb_dec *dec)
 			return -ENOMEM;
 		}
 		dec->irq_buffer = usb_buffer_alloc(dec->udev,IRQ_PACKET_SIZE,
-					SLAB_ATOMIC, &dec->irq_dma_handle);
+					GFP_ATOMIC, &dec->irq_dma_handle);
 		if(!dec->irq_buffer) {
 			return -ENOMEM;
 		}
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index 277826c..067f151 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -109,7 +109,7 @@ zfcp_fsf_req_alloc(mempool_t *pool, int req_flags)
 			ptr = kmalloc(size, GFP_ATOMIC);
 		else
 			ptr = kmem_cache_alloc(zfcp_data.fsf_req_qtcb_cache,
-					       SLAB_ATOMIC);
+					       GFP_ATOMIC);
 	}
 
 	if (unlikely(!ptr))
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 9be41ed..0a46acf 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -460,7 +460,7 @@ void usb_hub_tt_clear_buffer (struct usb_device *udev, int pipe)
 	 * since each TT has "at least two" buffers that can need it (and
 	 * there can be many TTs per hub).  even if they're uncommon.
 	 */
-	if ((clear = kmalloc (sizeof *clear, SLAB_ATOMIC)) == NULL) {
+	if ((clear = kmalloc (sizeof *clear, GFP_ATOMIC)) == NULL) {
 		dev_err (&udev->dev, "can't save CLEAR_TT_BUFFER state\n");
 		/* FIXME recover somehow ... RESET_TT? */
 		return;
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 7390b67..149aa8b 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -488,7 +488,7 @@ void usb_sg_wait (struct usb_sg_request *io)
 		int	retval;
 
 		io->urbs [i]->dev = io->dev;
-		retval = usb_submit_urb (io->urbs [i], SLAB_ATOMIC);
+		retval = usb_submit_urb (io->urbs [i], GFP_ATOMIC);
 
 		/* after we submit, let completions or cancelations fire;
 		 * we handshake using io->status.
diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c
index 34b7a31..56349d2 100644
--- a/drivers/usb/host/ehci-dbg.c
+++ b/drivers/usb/host/ehci-dbg.c
@@ -492,7 +492,7 @@ show_periodic (struct class_device *class_dev, char *buf)
 	unsigned		i;
 	__le32			tag;
 
-	if (!(seen = kmalloc (DBG_SCHED_LIMIT * sizeof *seen, SLAB_ATOMIC)))
+	if (!(seen = kmalloc (DBG_SCHED_LIMIT * sizeof *seen, GFP_ATOMIC)))
 		return 0;
 	seen_count = 0;
 
diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c
index 87eca6a..396dc69 100644
--- a/drivers/usb/host/hc_crisv10.c
+++ b/drivers/usb/host/hc_crisv10.c
@@ -188,7 +188,7 @@ static DEFINE_TIMER(bulk_eot_timer, NULL, 0, 0);
 #define CHECK_ALIGN(x) if (((__u32)(x)) & 0x00000003) \
 {panic("Alignment check (DWORD) failed at %s:%s:%d\n", __FILE__, __FUNCTION__, __LINE__);}
 
-#define SLAB_FLAG     (in_interrupt() ? SLAB_ATOMIC : SLAB_KERNEL)
+#define SLAB_FLAG     (in_interrupt() ? GFP_ATOMIC : SLAB_KERNEL)
 #define KMALLOC_FLAG  (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
 
 /* Most helpful debugging aid */
@@ -1743,7 +1743,7 @@ static irqreturn_t etrax_usb_tx_interrupt(int irq, void *vhc)
 
 		*R_DMA_CH8_SUB3_CLR_INTR = IO_STATE(R_DMA_CH8_SUB3_CLR_INTR, clr_descr, do);
 
-		comp_data = (usb_isoc_complete_data_t*)kmem_cache_alloc(isoc_compl_cache, SLAB_ATOMIC);
+		comp_data = (usb_isoc_complete_data_t*)kmem_cache_alloc(isoc_compl_cache, GFP_ATOMIC);
 		assert(comp_data != NULL);
 
                 INIT_WORK(&comp_data->usb_bh, etrax_usb_isoc_descr_interrupt_bottom_half, comp_data);
@@ -3010,7 +3010,7 @@ static void etrax_usb_add_to_isoc_sb_list(struct urb *urb, int epid)
 			if (!urb->iso_frame_desc[i].length)
 				continue;
 
-			next_sb_desc = (USB_SB_Desc_t*)kmem_cache_alloc(usb_desc_cache, SLAB_ATOMIC);
+			next_sb_desc = (USB_SB_Desc_t*)kmem_cache_alloc(usb_desc_cache, GFP_ATOMIC);
 			assert(next_sb_desc != NULL);
 
 			if (urb->iso_frame_desc[i].length > 0) {
@@ -3063,7 +3063,7 @@ static void etrax_usb_add_to_isoc_sb_list(struct urb *urb, int epid)
 		if (TxIsocEPList[epid].sub == 0) {
 			dbg_isoc("Isoc traffic not already running, allocating SB");
 
-			next_sb_desc = (USB_SB_Desc_t*)kmem_cache_alloc(usb_desc_cache, SLAB_ATOMIC);
+			next_sb_desc = (USB_SB_Desc_t*)kmem_cache_alloc(usb_desc_cache, GFP_ATOMIC);
 			assert(next_sb_desc != NULL);
 
 			next_sb_desc->command = (IO_STATE(USB_SB_command, tt, in) |
@@ -3317,7 +3317,7 @@ static irqreturn_t etrax_usb_hc_interrupt_top_half(int irq, void *vhc)
 
 	restore_flags(flags);
 
-	reg = (usb_interrupt_registers_t *)kmem_cache_alloc(top_half_reg_cache, SLAB_ATOMIC);
+	reg = (usb_interrupt_registers_t *)kmem_cache_alloc(top_half_reg_cache, GFP_ATOMIC);
 
 	assert(reg != NULL);
 
diff --git a/drivers/usb/host/ohci-dbg.c b/drivers/usb/host/ohci-dbg.c
index 8293c1d..0f47a57 100644
--- a/drivers/usb/host/ohci-dbg.c
+++ b/drivers/usb/host/ohci-dbg.c
@@ -505,7 +505,7 @@ show_periodic (struct class_device *class_dev, char *buf)
 	char			*next;
 	unsigned		i;
 
-	if (!(seen = kmalloc (DBG_SCHED_LIMIT * sizeof *seen, SLAB_ATOMIC)))
+	if (!(seen = kmalloc (DBG_SCHED_LIMIT * sizeof *seen, GFP_ATOMIC)))
 		return 0;
 	seen_count = 0;
 
diff --git a/drivers/usb/host/uhci-q.c b/drivers/usb/host/uhci-q.c
index 06115f2..30b8845 100644
--- a/drivers/usb/host/uhci-q.c
+++ b/drivers/usb/host/uhci-q.c
@@ -498,7 +498,7 @@ static inline struct urb_priv *uhci_alloc_urb_priv(struct uhci_hcd *uhci,
 {
 	struct urb_priv *urbp;
 
-	urbp = kmem_cache_alloc(uhci_up_cachep, SLAB_ATOMIC);
+	urbp = kmem_cache_alloc(uhci_up_cachep, GFP_ATOMIC);
 	if (!urbp)
 		return NULL;
 
diff --git a/drivers/usb/input/aiptek.c b/drivers/usb/input/aiptek.c
index bf42818..9f52429 100644
--- a/drivers/usb/input/aiptek.c
+++ b/drivers/usb/input/aiptek.c
@@ -1988,7 +1988,7 @@ aiptek_probe(struct usb_interface *intf, const struct usb_device_id *id)
 		goto fail1;
 
 	aiptek->data = usb_buffer_alloc(usbdev, AIPTEK_PACKET_LENGTH,
-					SLAB_ATOMIC, &aiptek->data_dma);
+					GFP_ATOMIC, &aiptek->data_dma);
 	if (!aiptek->data)
 		goto fail1;
 
diff --git a/drivers/usb/input/ati_remote.c b/drivers/usb/input/ati_remote.c
index ff23318..b724e36 100644
--- a/drivers/usb/input/ati_remote.c
+++ b/drivers/usb/input/ati_remote.c
@@ -592,7 +592,7 @@ static void ati_remote_irq_in(struct urb *urb)
 			__FUNCTION__, urb->status);
 	}
 
-	retval = usb_submit_urb(urb, SLAB_ATOMIC);
+	retval = usb_submit_urb(urb, GFP_ATOMIC);
 	if (retval)
 		dev_err(&ati_remote->interface->dev, "%s: usb_submit_urb()=%d\n",
 			__FUNCTION__, retval);
@@ -604,12 +604,12 @@ static void ati_remote_irq_in(struct urb *urb)
 static int ati_remote_alloc_buffers(struct usb_device *udev,
 				    struct ati_remote *ati_remote)
 {
-	ati_remote->inbuf = usb_buffer_alloc(udev, DATA_BUFSIZE, SLAB_ATOMIC,
+	ati_remote->inbuf = usb_buffer_alloc(udev, DATA_BUFSIZE, GFP_ATOMIC,
 					     &ati_remote->inbuf_dma);
 	if (!ati_remote->inbuf)
 		return -1;
 
-	ati_remote->outbuf = usb_buffer_alloc(udev, DATA_BUFSIZE, SLAB_ATOMIC,
+	ati_remote->outbuf = usb_buffer_alloc(udev, DATA_BUFSIZE, GFP_ATOMIC,
 					      &ati_remote->outbuf_dma);
 	if (!ati_remote->outbuf)
 		return -1;
diff --git a/drivers/usb/input/hid-core.c b/drivers/usb/input/hid-core.c
index 4295bab..f1d0e1d 100644
--- a/drivers/usb/input/hid-core.c
+++ b/drivers/usb/input/hid-core.c
@@ -1079,7 +1079,7 @@ static void hid_irq_in(struct urb *urb)
 			warn("input irq status %d received", urb->status);
 	}
 
-	status = usb_submit_urb(urb, SLAB_ATOMIC);
+	status = usb_submit_urb(urb, GFP_ATOMIC);
 	if (status) {
 		clear_bit(HID_IN_RUNNING, &hid->iofl);
 		if (status != -EPERM) {
@@ -1864,13 +1864,13 @@ static void hid_find_max_report(struct hid_device *hid, unsigned int type, int *
 
 static int hid_alloc_buffers(struct usb_device *dev, struct hid_device *hid)
 {
-	if (!(hid->inbuf = usb_buffer_alloc(dev, hid->bufsize, SLAB_ATOMIC, &hid->inbuf_dma)))
+	if (!(hid->inbuf = usb_buffer_alloc(dev, hid->bufsize, GFP_ATOMIC, &hid->inbuf_dma)))
 		return -1;
-	if (!(hid->outbuf = usb_buffer_alloc(dev, hid->bufsize, SLAB_ATOMIC, &hid->outbuf_dma)))
+	if (!(hid->outbuf = usb_buffer_alloc(dev, hid->bufsize, GFP_ATOMIC, &hid->outbuf_dma)))
 		return -1;
-	if (!(hid->cr = usb_buffer_alloc(dev, sizeof(*(hid->cr)), SLAB_ATOMIC, &hid->cr_dma)))
+	if (!(hid->cr = usb_buffer_alloc(dev, sizeof(*(hid->cr)), GFP_ATOMIC, &hid->cr_dma)))
 		return -1;
-	if (!(hid->ctrlbuf = usb_buffer_alloc(dev, hid->bufsize, SLAB_ATOMIC, &hid->ctrlbuf_dma)))
+	if (!(hid->ctrlbuf = usb_buffer_alloc(dev, hid->bufsize, GFP_ATOMIC, &hid->ctrlbuf_dma)))
 		return -1;
 
 	return 0;
diff --git a/drivers/usb/input/keyspan_remote.c b/drivers/usb/input/keyspan_remote.c
index 50aa810..98bd323 100644
--- a/drivers/usb/input/keyspan_remote.c
+++ b/drivers/usb/input/keyspan_remote.c
@@ -456,7 +456,7 @@ static int keyspan_probe(struct usb_interface *interface, const struct usb_devic
 	remote->in_endpoint = endpoint;
 	remote->toggle = -1;	/* Set to -1 so we will always not match the toggle from the first remote message. */
 
-	remote->in_buffer = usb_buffer_alloc(udev, RECV_SIZE, SLAB_ATOMIC, &remote->in_dma);
+	remote->in_buffer = usb_buffer_alloc(udev, RECV_SIZE, GFP_ATOMIC, &remote->in_dma);
 	if (!remote->in_buffer) {
 		retval = -ENOMEM;
 		goto fail1;
diff --git a/drivers/usb/input/mtouchusb.c b/drivers/usb/input/mtouchusb.c
index 79a85d4..92c4e07 100644
--- a/drivers/usb/input/mtouchusb.c
+++ b/drivers/usb/input/mtouchusb.c
@@ -164,7 +164,7 @@ static int mtouchusb_alloc_buffers(struct usb_device *udev, struct mtouch_usb *m
 	dbg("%s - called", __FUNCTION__);
 
 	mtouch->data = usb_buffer_alloc(udev, MTOUCHUSB_REPORT_DATA_SIZE,
-					SLAB_ATOMIC, &mtouch->data_dma);
+					GFP_ATOMIC, &mtouch->data_dma);
 
 	if (!mtouch->data)
 		return -1;
diff --git a/drivers/usb/input/powermate.c b/drivers/usb/input/powermate.c
index 0bf9177..fea97e5 100644
--- a/drivers/usb/input/powermate.c
+++ b/drivers/usb/input/powermate.c
@@ -277,12 +277,12 @@ static int powermate_input_event(struct input_dev *dev, unsigned int type, unsig
 static int powermate_alloc_buffers(struct usb_device *udev, struct powermate_device *pm)
 {
 	pm->data = usb_buffer_alloc(udev, POWERMATE_PAYLOAD_SIZE_MAX,
-				    SLAB_ATOMIC, &pm->data_dma);
+				    GFP_ATOMIC, &pm->data_dma);
 	if (!pm->data)
 		return -1;
 
 	pm->configcr = usb_buffer_alloc(udev, sizeof(*(pm->configcr)),
-					SLAB_ATOMIC, &pm->configcr_dma);
+					GFP_ATOMIC, &pm->configcr_dma);
 	if (!pm->configcr)
 		return -1;
 
diff --git a/drivers/usb/input/touchkitusb.c b/drivers/usb/input/touchkitusb.c
index 05c0d1c..2a314b0 100644
--- a/drivers/usb/input/touchkitusb.c
+++ b/drivers/usb/input/touchkitusb.c
@@ -248,7 +248,7 @@ static int touchkit_alloc_buffers(struct usb_device *udev,
 				  struct touchkit_usb *touchkit)
 {
 	touchkit->data = usb_buffer_alloc(udev, TOUCHKIT_REPORT_DATA_SIZE,
-	                                  SLAB_ATOMIC, &touchkit->data_dma);
+	                                  GFP_ATOMIC, &touchkit->data_dma);
 
 	if (!touchkit->data)
 		return -1;
diff --git a/drivers/usb/input/usbkbd.c b/drivers/usb/input/usbkbd.c
index dac8864..8505824 100644
--- a/drivers/usb/input/usbkbd.c
+++ b/drivers/usb/input/usbkbd.c
@@ -122,7 +122,7 @@ static void usb_kbd_irq(struct urb *urb)
 	memcpy(kbd->old, kbd->new, 8);
 
 resubmit:
-	i = usb_submit_urb (urb, SLAB_ATOMIC);
+	i = usb_submit_urb (urb, GFP_ATOMIC);
 	if (i)
 		err ("can't resubmit intr, %s-%s/input0, status %d",
 				kbd->usbdev->bus->bus_name,
@@ -196,11 +196,11 @@ static int usb_kbd_alloc_mem(struct usb_device *dev, struct usb_kbd *kbd)
 		return -1;
 	if (!(kbd->led = usb_alloc_urb(0, GFP_KERNEL)))
 		return -1;
-	if (!(kbd->new = usb_buffer_alloc(dev, 8, SLAB_ATOMIC, &kbd->new_dma)))
+	if (!(kbd->new = usb_buffer_alloc(dev, 8, GFP_ATOMIC, &kbd->new_dma)))
 		return -1;
-	if (!(kbd->cr = usb_buffer_alloc(dev, sizeof(struct usb_ctrlrequest), SLAB_ATOMIC, &kbd->cr_dma)))
+	if (!(kbd->cr = usb_buffer_alloc(dev, sizeof(struct usb_ctrlrequest), GFP_ATOMIC, &kbd->cr_dma)))
 		return -1;
-	if (!(kbd->leds = usb_buffer_alloc(dev, 1, SLAB_ATOMIC, &kbd->leds_dma)))
+	if (!(kbd->leds = usb_buffer_alloc(dev, 1, GFP_ATOMIC, &kbd->leds_dma)))
 		return -1;
 
 	return 0;
diff --git a/drivers/usb/input/usbmouse.c b/drivers/usb/input/usbmouse.c
index 68a5564..64a33e4 100644
--- a/drivers/usb/input/usbmouse.c
+++ b/drivers/usb/input/usbmouse.c
@@ -86,7 +86,7 @@ static void usb_mouse_irq(struct urb *urb)
 
 	input_sync(dev);
 resubmit:
-	status = usb_submit_urb (urb, SLAB_ATOMIC);
+	status = usb_submit_urb (urb, GFP_ATOMIC);
 	if (status)
 		err ("can't resubmit intr, %s-%s/input0, status %d",
 				mouse->usbdev->bus->bus_name,
@@ -137,7 +137,7 @@ static int usb_mouse_probe(struct usb_interface *intf, const struct usb_device_i
 	if (!mouse || !input_dev)
 		goto fail1;
 
-	mouse->data = usb_buffer_alloc(dev, 8, SLAB_ATOMIC, &mouse->data_dma);
+	mouse->data = usb_buffer_alloc(dev, 8, GFP_ATOMIC, &mouse->data_dma);
 	if (!mouse->data)
 		goto fail1;
 
diff --git a/drivers/usb/input/xpad.c b/drivers/usb/input/xpad.c
index df97e5c..e4bc76e 100644
--- a/drivers/usb/input/xpad.c
+++ b/drivers/usb/input/xpad.c
@@ -325,7 +325,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 		goto fail1;
 
 	xpad->idata = usb_buffer_alloc(udev, XPAD_PKT_LEN,
-				       SLAB_ATOMIC, &xpad->idata_dma);
+				       GFP_ATOMIC, &xpad->idata_dma);
 	if (!xpad->idata)
 		goto fail1;
 
diff --git a/drivers/usb/input/yealink.c b/drivers/usb/input/yealink.c
index 2268ca3..caff8e6 100644
--- a/drivers/usb/input/yealink.c
+++ b/drivers/usb/input/yealink.c
@@ -874,17 +874,17 @@ static int usb_probe(struct usb_interface *intf, const struct usb_device_id *id)
 
 	/* allocate usb buffers */
 	yld->irq_data = usb_buffer_alloc(udev, USB_PKT_LEN,
-					SLAB_ATOMIC, &yld->irq_dma);
+					GFP_ATOMIC, &yld->irq_dma);
 	if (yld->irq_data == NULL)
 		return usb_cleanup(yld, -ENOMEM);
 
 	yld->ctl_data = usb_buffer_alloc(udev, USB_PKT_LEN,
-					SLAB_ATOMIC, &yld->ctl_dma);
+					GFP_ATOMIC, &yld->ctl_dma);
 	if (!yld->ctl_data)
 		return usb_cleanup(yld, -ENOMEM);
 
 	yld->ctl_req = usb_buffer_alloc(udev, sizeof(*(yld->ctl_req)),
-					SLAB_ATOMIC, &yld->ctl_req_dma);
+					GFP_ATOMIC, &yld->ctl_req_dma);
 	if (yld->ctl_req == NULL)
 		return usb_cleanup(yld, -ENOMEM);
 
diff --git a/drivers/usb/misc/phidgetkit.c b/drivers/usb/misc/phidgetkit.c
index 9659c79..371bf2b 100644
--- a/drivers/usb/misc/phidgetkit.c
+++ b/drivers/usb/misc/phidgetkit.c
@@ -377,7 +377,7 @@ static void interfacekit_irq(struct urb *urb)
 		schedule_delayed_work(&kit->do_notify, 0);
 
 resubmit:
-	status = usb_submit_urb(urb, SLAB_ATOMIC);
+	status = usb_submit_urb(urb, GFP_ATOMIC);
 	if (status)
 		err("can't resubmit intr, %s-%s/interfacekit0, status %d",
 			kit->udev->bus->bus_name,
@@ -568,7 +568,7 @@ static int interfacekit_probe(struct usb_interface *intf, const struct usb_devic
 
 	kit->dev_no = -1;
 	kit->ifkit = ifkit;
-	kit->data = usb_buffer_alloc(dev, URB_INT_SIZE, SLAB_ATOMIC, &kit->data_dma);
+	kit->data = usb_buffer_alloc(dev, URB_INT_SIZE, GFP_ATOMIC, &kit->data_dma);
 	if (!kit->data)
 		goto out;
 
diff --git a/drivers/usb/misc/phidgetmotorcontrol.c b/drivers/usb/misc/phidgetmotorcontrol.c
index 2bb4fa5..5727e1e 100644
--- a/drivers/usb/misc/phidgetmotorcontrol.c
+++ b/drivers/usb/misc/phidgetmotorcontrol.c
@@ -151,7 +151,7 @@ static void motorcontrol_irq(struct urb *urb)
 		schedule_delayed_work(&mc->do_notify, 0);
 
 resubmit:
-	status = usb_submit_urb(urb, SLAB_ATOMIC);
+	status = usb_submit_urb(urb, GFP_ATOMIC);
 	if (status)
 		dev_err(&mc->intf->dev,
 			"can't resubmit intr, %s-%s/motorcontrol0, status %d",
@@ -338,7 +338,7 @@ static int motorcontrol_probe(struct usb_interface *intf, const struct usb_devic
 		goto out;
 
 	mc->dev_no = -1;
-	mc->data = usb_buffer_alloc(dev, URB_INT_SIZE, SLAB_ATOMIC, &mc->data_dma);
+	mc->data = usb_buffer_alloc(dev, URB_INT_SIZE, GFP_ATOMIC, &mc->data_dma);
 	if (!mc->data)
 		goto out;
 
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index 194065d..ea04dcc 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -819,7 +819,7 @@ error:
 
 	/* resubmit if we need to, else mark this as done */
 	if ((status == 0) && (ctx->pending < ctx->count)) {
-		if ((status = usb_submit_urb (urb, SLAB_ATOMIC)) != 0) {
+		if ((status = usb_submit_urb (urb, GFP_ATOMIC)) != 0) {
 			dbg ("can't resubmit ctrl %02x.%02x, err %d",
 				reqp->bRequestType, reqp->bRequest, status);
 			urb->dev = NULL;
@@ -999,7 +999,7 @@ test_ctrl_queue (struct usbtest_dev *dev, struct usbtest_param *param)
 	context.urb = urb;
 	spin_lock_irq (&context.lock);
 	for (i = 0; i < param->sglen; i++) {
-		context.status = usb_submit_urb (urb [i], SLAB_ATOMIC);
+		context.status = usb_submit_urb (urb [i], GFP_ATOMIC);
 		if (context.status != 0) {
 			dbg ("can't submit urb[%d], status %d",
 					i, context.status);
@@ -1041,7 +1041,7 @@ static void unlink1_callback (struct urb *urb)
 
 	// we "know" -EPIPE (stall) never happens
 	if (!status)
-		status = usb_submit_urb (urb, SLAB_ATOMIC);
+		status = usb_submit_urb (urb, GFP_ATOMIC);
 	if (status) {
 		urb->status = status;
 		complete ((struct completion *) urb->context);
@@ -1481,7 +1481,7 @@ test_iso_queue (struct usbtest_dev *dev, struct usbtest_param *param,
 	spin_lock_irq (&context.lock);
 	for (i = 0; i < param->sglen; i++) {
 		++context.pending;
-		status = usb_submit_urb (urbs [i], SLAB_ATOMIC);
+		status = usb_submit_urb (urbs [i], GFP_ATOMIC);
 		if (status < 0) {
 			ERROR (dev, "submit iso[%d], error %d\n", i, status);
 			if (i == 0) {
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index 7a2346c..46aecc8 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -147,7 +147,7 @@ static void mon_text_event(struct mon_reader_text *rp, struct urb *urb,
 	stamp = mon_get_timestamp();
 
 	if (rp->nevents >= EVENT_MAX ||
-	    (ep = kmem_cache_alloc(rp->e_slab, SLAB_ATOMIC)) == NULL) {
+	    (ep = kmem_cache_alloc(rp->e_slab, GFP_ATOMIC)) == NULL) {
 		rp->r.m_bus->cnt_text_lost++;
 		return;
 	}
@@ -188,7 +188,7 @@ static void mon_text_error(void *data, struct urb *urb, int error)
 	struct mon_event_text *ep;
 
 	if (rp->nevents >= EVENT_MAX ||
-	    (ep = kmem_cache_alloc(rp->e_slab, SLAB_ATOMIC)) == NULL) {
+	    (ep = kmem_cache_alloc(rp->e_slab, GFP_ATOMIC)) == NULL) {
 		rp->r.m_bus->cnt_text_lost++;
 		return;
 	}
diff --git a/drivers/usb/net/catc.c b/drivers/usb/net/catc.c
index 907b820..4852012 100644
--- a/drivers/usb/net/catc.c
+++ b/drivers/usb/net/catc.c
@@ -345,7 +345,7 @@ static void catc_irq_done(struct urb *urb)
 		} 
 	}
 resubmit:
-	status = usb_submit_urb (urb, SLAB_ATOMIC);
+	status = usb_submit_urb (urb, GFP_ATOMIC);
 	if (status)
 		err ("can't resubmit intr, %s-%s, status %d",
 				catc->usbdev->bus->bus_name,
diff --git a/drivers/usb/net/net1080.c b/drivers/usb/net/net1080.c
index a774105..4936359 100644
--- a/drivers/usb/net/net1080.c
+++ b/drivers/usb/net/net1080.c
@@ -383,7 +383,7 @@ static void nc_ensure_sync(struct usbnet *dev)
 		int			status;
 
 		/* Send a flush */
-		urb = usb_alloc_urb(0, SLAB_ATOMIC);
+		urb = usb_alloc_urb(0, GFP_ATOMIC);
 		if (!urb)
 			return;
 
diff --git a/drivers/usb/net/pegasus.c b/drivers/usb/net/pegasus.c
index b5690b3..d48c024 100644
--- a/drivers/usb/net/pegasus.c
+++ b/drivers/usb/net/pegasus.c
@@ -856,7 +856,7 @@ static void intr_callback(struct urb *urb)
 		pegasus->stats.rx_missed_errors += ((d[3] & 0x7f) << 8) | d[4];
 	}
 
-	status = usb_submit_urb(urb, SLAB_ATOMIC);
+	status = usb_submit_urb(urb, GFP_ATOMIC);
 	if (status == -ENODEV)
 		netif_device_detach(pegasus->net);
 	if (status && netif_msg_timer(pegasus))
diff --git a/drivers/usb/net/rtl8150.c b/drivers/usb/net/rtl8150.c
index 72171f9..c54235f 100644
--- a/drivers/usb/net/rtl8150.c
+++ b/drivers/usb/net/rtl8150.c
@@ -587,7 +587,7 @@ static void intr_callback(struct urb *urb)
 	}
 
 resubmit:
-	status = usb_submit_urb (urb, SLAB_ATOMIC);
+	status = usb_submit_urb (urb, GFP_ATOMIC);
 	if (status == -ENODEV)
 		netif_device_detach(dev->netdev);
 	else if (status)
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 82cd15b..70f93b1 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -363,7 +363,7 @@ static int mos7720_open(struct usb_serial_port *port, struct file * filp)
 
 	/* Initialising the write urb pool */
 	for (j = 0; j < NUM_URBS; ++j) {
-		urb = usb_alloc_urb(0,SLAB_ATOMIC);
+		urb = usb_alloc_urb(0,GFP_ATOMIC);
 		mos7720_port->write_urb_pool[j] = urb;
 
 		if (urb == NULL) {
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 02c89e1..5432c63 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -826,7 +826,7 @@ static int mos7840_open(struct usb_serial_port *port, struct file *filp)
 
 	/* Initialising the write urb pool */
 	for (j = 0; j < NUM_URBS; ++j) {
-		urb = usb_alloc_urb(0, SLAB_ATOMIC);
+		urb = usb_alloc_urb(0, GFP_ATOMIC);
 		mos7840_port->write_urb_pool[j] = urb;
 
 		if (urb == NULL) {
@@ -2786,7 +2786,7 @@ static int mos7840_startup(struct usb_serial *serial)
 				    i + 1, status);
 
 		}
-		mos7840_port->control_urb = usb_alloc_urb(0, SLAB_ATOMIC);
+		mos7840_port->control_urb = usb_alloc_urb(0, GFP_ATOMIC);
 		mos7840_port->ctrl_buf = kmalloc(16, GFP_KERNEL);
 
 	}
diff --git a/drivers/usb/storage/onetouch.c b/drivers/usb/storage/onetouch.c
index 3a158d5..e565d3d 100644
--- a/drivers/usb/storage/onetouch.c
+++ b/drivers/usb/storage/onetouch.c
@@ -76,7 +76,7 @@ static void usb_onetouch_irq(struct urb *urb)
 	input_sync(dev);
 
 resubmit:
-	status = usb_submit_urb (urb, SLAB_ATOMIC);
+	status = usb_submit_urb (urb, GFP_ATOMIC);
 	if (status)
 		err ("can't resubmit intr, %s-%s/input0, status %d",
 			onetouch->udev->bus->bus_name,
@@ -154,7 +154,7 @@ int onetouch_connect_input(struct us_data *ss)
 		goto fail1;
 
 	onetouch->data = usb_buffer_alloc(udev, ONETOUCH_PKT_LEN,
-					  SLAB_ATOMIC, &onetouch->data_dma);
+					  GFP_ATOMIC, &onetouch->data_dma);
 	if (!onetouch->data)
 		goto fail1;
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d7ee28e..34b046e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -19,7 +19,6 @@ typedef struct kmem_cache kmem_cache_t;
 #include	<asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 
 /* flags for kmem_cache_alloc() */
-#define	SLAB_ATOMIC		GFP_ATOMIC
 #define	SLAB_KERNEL		GFP_KERNEL
 #define	SLAB_DMA		GFP_DMA
 
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index e37baaf..426f0fe 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -60,7 +60,7 @@ struct request_sock {
 
 static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
 {
-	struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
+	struct request_sock *req = kmem_cache_alloc(ops->slab, GFP_ATOMIC);
 
 	if (req != NULL)
 		req->rsk_ops = ops;
diff --git a/net/core/dst.c b/net/core/dst.c
index 1a5e49d..836ec66 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -125,7 +125,7 @@ void * dst_alloc(struct dst_ops * ops)
 		if (ops->gc())
 			return NULL;
 	}
-	dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC);
+	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
 	if (!dst)
 		return NULL;
 	memset(dst, 0, ops->entry_size);
diff --git a/net/core/flow.c b/net/core/flow.c
index b16d31a..5df3e29 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -211,7 +211,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 		if (flow_count(cpu) > flow_hwm)
 			flow_cache_shrink(cpu);
 
-		fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC);
+		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
 			fle->next = *head;
 			*head = fle;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ba509a4..0ab1987 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -251,7 +251,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
 			goto out_entries;
 	}
 
-	n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC);
+	n = kmem_cache_alloc(tbl->kmem_cachep, GFP_ATOMIC);
 	if (!n)
 		goto out_entries;
 
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index cf8c07b..66a27b9 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -295,7 +295,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
 	if (new_packet == NULL || new_packet->dccphtx_sent) {
 		new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
-						    SLAB_ATOMIC);
+						    GFP_ATOMIC);
 
 		if (unlikely(new_packet == NULL)) {
 			DCCP_WARN("%s, sk=%p, not enough mem to add to history,"
@@ -889,7 +889,7 @@ static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
 		/* new loss event detected */
 		/* calculate last interval length */
 		seq_temp = dccp_delta_seqno(head->dccplih_seqno, seq_loss);
-		entry = dccp_li_hist_entry_new(ccid3_li_hist, SLAB_ATOMIC);
+		entry = dccp_li_hist_entry_new(ccid3_li_hist, GFP_ATOMIC);
 
 		if (entry == NULL) {
 			DCCP_BUG("out of memory - can not allocate entry");
@@ -1011,7 +1011,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	packet = dccp_rx_hist_entry_new(ccid3_rx_hist, sk, opt_recv->dccpor_ndp,
-					skb, SLAB_ATOMIC);
+					skb, GFP_ATOMIC);
 	if (unlikely(packet == NULL)) {
 		DCCP_WARN("%s, sk=%p, Not enough mem to add rx packet "
 			  "to history, consider it lost!\n", dccp_role(sk), sk);
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 48b9b93..0a0baef 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -125,7 +125,7 @@ int dccp_li_hist_interval_new(struct dccp_li_hist *hist,
 	int i;
 
 	for (i = 0; i < DCCP_LI_HIST_IVAL_F_LENGTH; i++) {
-		entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
+		entry = dccp_li_hist_entry_new(hist, GFP_ATOMIC);
 		if (entry == NULL) {
 			dccp_li_hist_purge(hist, list);
 			DCCP_BUG("loss interval list entry is NULL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 244c4f4..bd6c9bc 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -31,7 +31,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
 						 struct inet_bind_hashbucket *head,
 						 const unsigned short snum)
 {
-	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
 	if (tb != NULL) {
 		tb->port      = snum;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 8c74f91..e28330a 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -91,7 +91,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 {
 	struct inet_timewait_sock *tw =
 		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
-				 SLAB_ATOMIC);
+				 GFP_ATOMIC);
 	if (tw != NULL) {
 		const struct inet_sock *inet = inet_sk(sk);
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index bf52611..97a8cfb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -150,7 +150,7 @@ static __inline__ struct fib6_node * node_alloc(void)
 {
 	struct fib6_node *fn;
 
-	if ((fn = kmem_cache_alloc(fib6_node_kmem, SLAB_ATOMIC)) != NULL)
+	if ((fn = kmem_cache_alloc(fib6_node_kmem, GFP_ATOMIC)) != NULL)
 		memset(fn, 0, sizeof(struct fib6_node));
 
 	return fn;
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 01a5c52..d4f68b0 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -180,7 +180,7 @@ try_next_2:;
 	spi = 0;
 	goto out;
 alloc_spi:
-	x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC);
+	x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC);
 	if (!x6spi)
 		goto out;
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 04954e5..8d55d10 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -979,7 +979,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
 {
 	struct sctp_chunk *retval;
 
-	retval = kmem_cache_alloc(sctp_chunk_cachep, SLAB_ATOMIC);
+	retval = kmem_cache_alloc(sctp_chunk_cachep, GFP_ATOMIC);
 
 	if (!retval)
 		goto nodata;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 02b2714..4960779 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4989,7 +4989,7 @@ static struct sctp_bind_bucket *sctp_bucket_create(
 {
 	struct sctp_bind_bucket *pp;
 
-	pp = kmem_cache_alloc(sctp_bucket_cachep, SLAB_ATOMIC);
+	pp = kmem_cache_alloc(sctp_bucket_cachep, GFP_ATOMIC);
 	SCTP_DBG_OBJCNT_INC(bind_bucket);
 	if (pp) {
 		pp->port = snum;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index e8198a2..a898a6a 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -27,7 +27,7 @@ struct sec_path *secpath_dup(struct sec_path *src)
 {
 	struct sec_path *sp;
 
-	sp = kmem_cache_alloc(secpath_cachep, SLAB_ATOMIC);
+	sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC);
 	if (!sp)
 		return NULL;
 
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index e73ac1a..65b4ec9 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -332,7 +332,7 @@ static struct avc_node *avc_alloc_node(void)
 {
 	struct avc_node *node;
 
-	node = kmem_cache_alloc(avc_node_cachep, SLAB_ATOMIC);
+	node = kmem_cache_alloc(avc_node_cachep, GFP_ATOMIC);
 	if (!node)
 		goto out;
 
-- 
cgit v0.10.2


From e94b1766097d53e6f3ccfb36c8baa562ffeda3fc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:17 -0800
Subject: [PATCH] slab: remove SLAB_KERNEL

SLAB_KERNEL is an alias of GFP_KERNEL.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 713ba39..0bbacd0 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -132,7 +132,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
 		goto up_fail;
 	}
 
-	vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma) {
 		ret = -ENOMEM;
 		goto up_fail;
diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c
index daa6b91..578737e 100644
--- a/arch/ia64/ia32/binfmt_elf32.c
+++ b/arch/ia64/ia32/binfmt_elf32.c
@@ -91,7 +91,7 @@ ia64_elf32_init (struct pt_regs *regs)
 	 * it with privilege level 3 because the IVE uses non-privileged accesses to these
 	 * tables.  IA-32 segmentation is used to protect against IA-32 accesses to them.
 	 */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (vma) {
 		memset(vma, 0, sizeof(*vma));
 		vma->vm_mm = current->mm;
@@ -117,7 +117,7 @@ ia64_elf32_init (struct pt_regs *regs)
 	 * code is locked in specific gate page, which is pointed by pretcode
 	 * when setup_frame_ia32
 	 */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (vma) {
 		memset(vma, 0, sizeof(*vma));
 		vma->vm_mm = current->mm;
@@ -142,7 +142,7 @@ ia64_elf32_init (struct pt_regs *regs)
 	 * Install LDT as anonymous memory.  This gives us all-zero segment descriptors
 	 * until a task modifies them via modify_ldt().
 	 */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (vma) {
 		memset(vma, 0, sizeof(*vma));
 		vma->vm_mm = current->mm;
@@ -214,7 +214,7 @@ ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack)
 		bprm->loader += stack_base;
 	bprm->exec += stack_base;
 
-	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!mpnt)
 		return -ENOMEM;
 
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 3aaede0..e232153 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2302,7 +2302,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	DPRINT(("smpl_buf @%p\n", smpl_buf));
 
 	/* allocate vma */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma) {
 		DPRINT(("Cannot allocate vma\n"));
 		goto error_kmem;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index ff87a5c..56dc2024 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -156,7 +156,7 @@ ia64_init_addr_space (void)
 	 * the problem.  When the process attempts to write to the register backing store
 	 * for the first time, it will get a SEGFAULT in this case.
 	 */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (vma) {
 		memset(vma, 0, sizeof(*vma));
 		vma->vm_mm = current->mm;
@@ -175,7 +175,7 @@ ia64_init_addr_space (void)
 
 	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
 	if (!(current->personality & MMAP_PAGE_ZERO)) {
-		vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (vma) {
 			memset(vma, 0, sizeof(*vma));
 			vma->vm_mm = current->mm;
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index c913ad5..a4b28c7 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -264,7 +264,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 
 
 	/* Allocate a VMA structure and fill it up */
-	vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (vma == NULL) {
 		rc = -ENOMEM;
 		goto fail_mmapsem;
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index c7d0107..7edfcc9 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -48,7 +48,7 @@ spufs_alloc_inode(struct super_block *sb)
 {
 	struct spufs_inode_info *ei;
 
-	ei = kmem_cache_alloc(spufs_inode_cache, SLAB_KERNEL);
+	ei = kmem_cache_alloc(spufs_inode_cache, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 
diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c
index 075d6cc..deb4694 100644
--- a/arch/sh/kernel/vsyscall/vsyscall.c
+++ b/arch/sh/kernel/vsyscall/vsyscall.c
@@ -97,7 +97,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 		goto up_fail;
 	}
 
-	vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma) {
 		ret = -ENOMEM;
 		goto up_fail;
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 82ef182..932a62a 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -351,7 +351,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
 		bprm->loader += stack_base;
 	bprm->exec += stack_base;
 
-	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!mpnt) 
 		return -ENOMEM; 
 
diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c
index 3a01329..3e5ed20 100644
--- a/arch/x86_64/ia32/syscall32.c
+++ b/arch/x86_64/ia32/syscall32.c
@@ -49,7 +49,7 @@ int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
 	struct mm_struct *mm = current->mm;
 	int ret;
 
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma)
 		return -ENOMEM;
 
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index 2a2f0fc..ec8a7a6 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -820,7 +820,7 @@ he_init_group(struct he_dev *he_dev, int group)
 		void *cpuaddr;
 
 #ifdef USE_RBPS_POOL 
-		cpuaddr = pci_pool_alloc(he_dev->rbps_pool, SLAB_KERNEL|SLAB_DMA, &dma_handle);
+		cpuaddr = pci_pool_alloc(he_dev->rbps_pool, GFP_KERNEL|SLAB_DMA, &dma_handle);
 		if (cpuaddr == NULL)
 			return -ENOMEM;
 #else
@@ -884,7 +884,7 @@ he_init_group(struct he_dev *he_dev, int group)
 		void *cpuaddr;
 
 #ifdef USE_RBPL_POOL
-		cpuaddr = pci_pool_alloc(he_dev->rbpl_pool, SLAB_KERNEL|SLAB_DMA, &dma_handle);
+		cpuaddr = pci_pool_alloc(he_dev->rbpl_pool, GFP_KERNEL|SLAB_DMA, &dma_handle);
 		if (cpuaddr == NULL)
 			return -ENOMEM;
 #else
diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c
index fa46752..dbe0735 100644
--- a/drivers/base/dmapool.c
+++ b/drivers/base/dmapool.c
@@ -126,7 +126,7 @@ dma_pool_create (const char *name, struct device *dev,
 	} else if (allocation < size)
 		return NULL;
 
-	if (!(retval = kmalloc (sizeof *retval, SLAB_KERNEL)))
+	if (!(retval = kmalloc (sizeof *retval, GFP_KERNEL)))
 		return retval;
 
 	strlcpy (retval->name, name, sizeof retval->name);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 0358419..8e87261 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -636,10 +636,10 @@ static int ioat_self_test(struct ioat_device *device)
 	dma_cookie_t cookie;
 	int err = 0;
 
-	src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL);
+	src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
 	if (!src)
 		return -ENOMEM;
-	dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL);
+	dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
 	if (!dest) {
 		kfree(src);
 		return -ENOMEM;
diff --git a/drivers/ieee1394/hosts.c b/drivers/ieee1394/hosts.c
index 8f4378a..b935e08 100644
--- a/drivers/ieee1394/hosts.c
+++ b/drivers/ieee1394/hosts.c
@@ -123,7 +123,7 @@ struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
 	int i;
 	int hostnum = 0;
 
-	h = kzalloc(sizeof(*h) + extra, SLAB_KERNEL);
+	h = kzalloc(sizeof(*h) + extra, GFP_KERNEL);
 	if (!h)
 		return NULL;
 
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 6e8ea91..eae97d8 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -1225,7 +1225,7 @@ static int ohci_iso_recv_init(struct hpsb_iso *iso)
 	int ctx;
 	int ret = -ENOMEM;
 
-	recv = kmalloc(sizeof(*recv), SLAB_KERNEL);
+	recv = kmalloc(sizeof(*recv), GFP_KERNEL);
 	if (!recv)
 		return -ENOMEM;
 
@@ -1918,7 +1918,7 @@ static int ohci_iso_xmit_init(struct hpsb_iso *iso)
 	int ctx;
 	int ret = -ENOMEM;
 
-	xmit = kmalloc(sizeof(*xmit), SLAB_KERNEL);
+	xmit = kmalloc(sizeof(*xmit), GFP_KERNEL);
 	if (!xmit)
 		return -ENOMEM;
 
@@ -3021,7 +3021,7 @@ alloc_dma_rcv_ctx(struct ti_ohci *ohci, struct dma_rcv_ctx *d,
 			return -ENOMEM;
 		}
 
-		d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, SLAB_KERNEL, d->prg_bus+i);
+		d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, GFP_KERNEL, d->prg_bus+i);
 		OHCI_DMA_ALLOC("pool dma_rcv prg[%d]", i);
 
                 if (d->prg_cpu[i] != NULL) {
@@ -3117,7 +3117,7 @@ alloc_dma_trm_ctx(struct ti_ohci *ohci, struct dma_trm_ctx *d,
 	OHCI_DMA_ALLOC("dma_rcv prg pool");
 
 	for (i = 0; i < d->num_desc; i++) {
-		d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, SLAB_KERNEL, d->prg_bus+i);
+		d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, GFP_KERNEL, d->prg_bus+i);
 		OHCI_DMA_ALLOC("pool dma_trm prg[%d]", i);
 
                 if (d->prg_cpu[i] != NULL) {
diff --git a/drivers/ieee1394/pcilynx.c b/drivers/ieee1394/pcilynx.c
index 0a7412e..9cab1d6 100644
--- a/drivers/ieee1394/pcilynx.c
+++ b/drivers/ieee1394/pcilynx.c
@@ -1428,7 +1428,7 @@ static int __devinit add_card(struct pci_dev *dev,
         	struct i2c_algo_bit_data i2c_adapter_data;
 
         	error = -ENOMEM;
-		i2c_ad = kmalloc(sizeof(*i2c_ad), SLAB_KERNEL);
+		i2c_ad = kmalloc(sizeof(*i2c_ad), GFP_KERNEL);
         	if (!i2c_ad) FAIL("failed to allocate I2C adapter memory");
 
 		memcpy(i2c_ad, &bit_ops, sizeof(struct i2c_adapter));
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 47f6a4e..bf71e06 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -112,7 +112,7 @@ static struct pending_request *__alloc_pending_request(gfp_t flags)
 
 static inline struct pending_request *alloc_pending_request(void)
 {
-	return __alloc_pending_request(SLAB_KERNEL);
+	return __alloc_pending_request(GFP_KERNEL);
 }
 
 static void free_pending_request(struct pending_request *req)
@@ -1737,7 +1737,7 @@ static int arm_register(struct file_info *fi, struct pending_request *req)
 		return (-EINVAL);
 	}
 	/* addr-list-entry for fileinfo */
-	addr = kmalloc(sizeof(*addr), SLAB_KERNEL);
+	addr = kmalloc(sizeof(*addr), GFP_KERNEL);
 	if (!addr) {
 		req->req.length = 0;
 		return (-ENOMEM);
@@ -2103,7 +2103,7 @@ static int write_phypacket(struct file_info *fi, struct pending_request *req)
 static int get_config_rom(struct file_info *fi, struct pending_request *req)
 {
 	int ret = sizeof(struct raw1394_request);
-	quadlet_t *data = kmalloc(req->req.length, SLAB_KERNEL);
+	quadlet_t *data = kmalloc(req->req.length, GFP_KERNEL);
 	int status;
 
 	if (!data)
@@ -2133,7 +2133,7 @@ static int get_config_rom(struct file_info *fi, struct pending_request *req)
 static int update_config_rom(struct file_info *fi, struct pending_request *req)
 {
 	int ret = sizeof(struct raw1394_request);
-	quadlet_t *data = kmalloc(req->req.length, SLAB_KERNEL);
+	quadlet_t *data = kmalloc(req->req.length, GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 	if (copy_from_user(data, int2ptr(req->req.sendb), req->req.length)) {
@@ -2779,7 +2779,7 @@ static int raw1394_open(struct inode *inode, struct file *file)
 {
 	struct file_info *fi;
 
-	fi = kzalloc(sizeof(*fi), SLAB_KERNEL);
+	fi = kzalloc(sizeof(*fi), GFP_KERNEL);
 	if (!fi)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c
index 214e2fd..0d6e2c4 100644
--- a/drivers/infiniband/hw/ehca/ehca_av.c
+++ b/drivers/infiniband/hw/ehca/ehca_av.c
@@ -57,7 +57,7 @@ struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 	struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
 					      ib_device);
 
-	av = kmem_cache_alloc(av_cache, SLAB_KERNEL);
+	av = kmem_cache_alloc(av_cache, GFP_KERNEL);
 	if (!av) {
 		ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
 			 pd, ah_attr);
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
index 458fe19..93995b6 100644
--- a/drivers/infiniband/hw/ehca/ehca_cq.c
+++ b/drivers/infiniband/hw/ehca/ehca_cq.c
@@ -134,7 +134,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe,
 	if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
 		return ERR_PTR(-EINVAL);
 
-	my_cq = kmem_cache_alloc(cq_cache, SLAB_KERNEL);
+	my_cq = kmem_cache_alloc(cq_cache, GFP_KERNEL);
 	if (!my_cq) {
 		ehca_err(device, "Out of memory for ehca_cq struct device=%p",
 			 device);
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 3d1c1c5..cc47e4c 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -108,7 +108,7 @@ static struct kmem_cache *ctblk_cache = NULL;
 
 void *ehca_alloc_fw_ctrlblock(void)
 {
-	void *ret = kmem_cache_zalloc(ctblk_cache, SLAB_KERNEL);
+	void *ret = kmem_cache_zalloc(ctblk_cache, GFP_KERNEL);
 	if (!ret)
 		ehca_gen_err("Out of memory for ctblk");
 	return ret;
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index abce676..0a5e221 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -53,7 +53,7 @@ static struct ehca_mr *ehca_mr_new(void)
 {
 	struct ehca_mr *me;
 
-	me = kmem_cache_alloc(mr_cache, SLAB_KERNEL);
+	me = kmem_cache_alloc(mr_cache, GFP_KERNEL);
 	if (me) {
 		memset(me, 0, sizeof(struct ehca_mr));
 		spin_lock_init(&me->mrlock);
@@ -72,7 +72,7 @@ static struct ehca_mw *ehca_mw_new(void)
 {
 	struct ehca_mw *me;
 
-	me = kmem_cache_alloc(mw_cache, SLAB_KERNEL);
+	me = kmem_cache_alloc(mw_cache, GFP_KERNEL);
 	if (me) {
 		memset(me, 0, sizeof(struct ehca_mw));
 		spin_lock_init(&me->mwlock);
diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c
index 2c3cdc6..d5345e5 100644
--- a/drivers/infiniband/hw/ehca/ehca_pd.c
+++ b/drivers/infiniband/hw/ehca/ehca_pd.c
@@ -50,7 +50,7 @@ struct ib_pd *ehca_alloc_pd(struct ib_device *device,
 {
 	struct ehca_pd *pd;
 
-	pd = kmem_cache_alloc(pd_cache, SLAB_KERNEL);
+	pd = kmem_cache_alloc(pd_cache, GFP_KERNEL);
 	if (!pd) {
 		ehca_err(device, "device=%p context=%p out of memory",
 			 device, context);
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index 8682aa5..c6c9cef 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -450,7 +450,7 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd,
 	if (pd->uobject && udata)
 		context = pd->uobject->context;
 
-	my_qp = kmem_cache_alloc(qp_cache, SLAB_KERNEL);
+	my_qp = kmem_cache_alloc(qp_cache, GFP_KERNEL);
 	if (!my_qp) {
 		ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index f56d6a0..0517c73 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -189,7 +189,7 @@ static int ads7846_read12_ser(struct device *dev, unsigned command)
 {
 	struct spi_device	*spi = to_spi_device(dev);
 	struct ads7846		*ts = dev_get_drvdata(dev);
-	struct ser_req		*req = kzalloc(sizeof *req, SLAB_KERNEL);
+	struct ser_req		*req = kzalloc(sizeof *req, GFP_KERNEL);
 	int			status;
 	int			sample;
 	int			i;
diff --git a/drivers/isdn/gigaset/bas-gigaset.c b/drivers/isdn/gigaset/bas-gigaset.c
index 5857e7e..63b629b 100644
--- a/drivers/isdn/gigaset/bas-gigaset.c
+++ b/drivers/isdn/gigaset/bas-gigaset.c
@@ -2218,21 +2218,21 @@ static int gigaset_probe(struct usb_interface *interface,
 	 * - three for the different uses of the default control pipe
 	 * - three for each isochronous pipe
 	 */
-	if (!(ucs->urb_int_in = usb_alloc_urb(0, SLAB_KERNEL)) ||
-	    !(ucs->urb_cmd_in = usb_alloc_urb(0, SLAB_KERNEL)) ||
-	    !(ucs->urb_cmd_out = usb_alloc_urb(0, SLAB_KERNEL)) ||
-	    !(ucs->urb_ctrl = usb_alloc_urb(0, SLAB_KERNEL)))
+	if (!(ucs->urb_int_in = usb_alloc_urb(0, GFP_KERNEL)) ||
+	    !(ucs->urb_cmd_in = usb_alloc_urb(0, GFP_KERNEL)) ||
+	    !(ucs->urb_cmd_out = usb_alloc_urb(0, GFP_KERNEL)) ||
+	    !(ucs->urb_ctrl = usb_alloc_urb(0, GFP_KERNEL)))
 		goto allocerr;
 
 	for (j = 0; j < 2; ++j) {
 		ubc = cs->bcs[j].hw.bas;
 		for (i = 0; i < BAS_OUTURBS; ++i)
 			if (!(ubc->isoouturbs[i].urb =
-			      usb_alloc_urb(BAS_NUMFRAMES, SLAB_KERNEL)))
+			      usb_alloc_urb(BAS_NUMFRAMES, GFP_KERNEL)))
 				goto allocerr;
 		for (i = 0; i < BAS_INURBS; ++i)
 			if (!(ubc->isoinurbs[i] =
-			      usb_alloc_urb(BAS_NUMFRAMES, SLAB_KERNEL)))
+			      usb_alloc_urb(BAS_NUMFRAMES, GFP_KERNEL)))
 				goto allocerr;
 	}
 
@@ -2246,7 +2246,7 @@ static int gigaset_probe(struct usb_interface *interface,
 					(endpoint->bEndpointAddress) & 0x0f),
 			 ucs->int_in_buf, 3, read_int_callback, cs,
 			 endpoint->bInterval);
-	if ((rc = usb_submit_urb(ucs->urb_int_in, SLAB_KERNEL)) != 0) {
+	if ((rc = usb_submit_urb(ucs->urb_int_in, GFP_KERNEL)) != 0) {
 		dev_err(cs->dev, "could not submit interrupt URB: %s\n",
 			get_usb_rcmsg(rc));
 		goto error;
diff --git a/drivers/isdn/gigaset/usb-gigaset.c b/drivers/isdn/gigaset/usb-gigaset.c
index af89ce1..04f2ad7 100644
--- a/drivers/isdn/gigaset/usb-gigaset.c
+++ b/drivers/isdn/gigaset/usb-gigaset.c
@@ -763,7 +763,7 @@ static int gigaset_probe(struct usb_interface *interface,
 		goto error;
 	}
 
-	ucs->bulk_out_urb = usb_alloc_urb(0, SLAB_KERNEL);
+	ucs->bulk_out_urb = usb_alloc_urb(0, GFP_KERNEL);
 	if (!ucs->bulk_out_urb) {
 		dev_err(cs->dev, "Couldn't allocate bulk_out_urb\n");
 		retval = -ENOMEM;
@@ -774,7 +774,7 @@ static int gigaset_probe(struct usb_interface *interface,
 
 	atomic_set(&ucs->busy, 0);
 
-	ucs->read_urb = usb_alloc_urb(0, SLAB_KERNEL);
+	ucs->read_urb = usb_alloc_urb(0, GFP_KERNEL);
 	if (!ucs->read_urb) {
 		dev_err(cs->dev, "No free urbs available\n");
 		retval = -ENOMEM;
@@ -797,7 +797,7 @@ static int gigaset_probe(struct usb_interface *interface,
 			 gigaset_read_int_callback,
 			 cs->inbuf + 0, endpoint->bInterval);
 
-	retval = usb_submit_urb(ucs->read_urb, SLAB_KERNEL);
+	retval = usb_submit_urb(ucs->read_urb, GFP_KERNEL);
 	if (retval) {
 		dev_err(cs->dev, "Could not submit URB (error %d)\n", -retval);
 		goto error;
diff --git a/drivers/media/dvb/cinergyT2/cinergyT2.c b/drivers/media/dvb/cinergyT2/cinergyT2.c
index 206c13e..9123147 100644
--- a/drivers/media/dvb/cinergyT2/cinergyT2.c
+++ b/drivers/media/dvb/cinergyT2/cinergyT2.c
@@ -287,7 +287,7 @@ static int cinergyt2_alloc_stream_urbs (struct cinergyt2 *cinergyt2)
 	int i;
 
 	cinergyt2->streambuf = usb_buffer_alloc(cinergyt2->udev, STREAM_URB_COUNT*STREAM_BUF_SIZE,
-					      SLAB_KERNEL, &cinergyt2->streambuf_dmahandle);
+					      GFP_KERNEL, &cinergyt2->streambuf_dmahandle);
 	if (!cinergyt2->streambuf) {
 		dprintk(1, "failed to alloc consistent stream memory area, bailing out!\n");
 		return -ENOMEM;
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index ef4a731..334e078f 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -451,7 +451,7 @@ static int __devinit m25p_probe(struct spi_device *spi)
 		return -ENODEV;
 	}
 
-	flash = kzalloc(sizeof *flash, SLAB_KERNEL);
+	flash = kzalloc(sizeof *flash, GFP_KERNEL);
 	if (!flash)
 		return -ENOMEM;
 
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index ccd4daf..b318500 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -6940,7 +6940,7 @@ static int __devinit ipr_alloc_cmd_blks(struct ipr_ioa_cfg *ioa_cfg)
 		return -ENOMEM;
 
 	for (i = 0; i < IPR_NUM_CMD_BLKS; i++) {
-		ipr_cmd = pci_pool_alloc (ioa_cfg->ipr_cmd_pool, SLAB_KERNEL, &dma_addr);
+		ipr_cmd = pci_pool_alloc (ioa_cfg->ipr_cmd_pool, GFP_KERNEL, &dma_addr);
 
 		if (!ipr_cmd) {
 			ipr_free_cmd_blks(ioa_cfg);
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index c3c0626..09f2c74 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -360,7 +360,7 @@ spi_alloc_master(struct device *dev, unsigned size)
 	if (!dev)
 		return NULL;
 
-	master = kzalloc(size + sizeof *master, SLAB_KERNEL);
+	master = kzalloc(size + sizeof *master, GFP_KERNEL);
 	if (!master)
 		return NULL;
 
@@ -607,7 +607,7 @@ static int __init spi_init(void)
 {
 	int	status;
 
-	buf = kmalloc(SPI_BUFSIZ, SLAB_KERNEL);
+	buf = kmalloc(SPI_BUFSIZ, GFP_KERNEL);
 	if (!buf) {
 		status = -ENOMEM;
 		goto err0;
diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c
index 08c1c57..57289b6 100644
--- a/drivers/spi/spi_bitbang.c
+++ b/drivers/spi/spi_bitbang.c
@@ -196,7 +196,7 @@ int spi_bitbang_setup(struct spi_device *spi)
 		return -EINVAL;
 
 	if (!cs) {
-		cs = kzalloc(sizeof *cs, SLAB_KERNEL);
+		cs = kzalloc(sizeof *cs, GFP_KERNEL);
 		if (!cs)
 			return -ENOMEM;
 		spi->controller_state = cs;
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 0a46acf..77c05be 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2371,7 +2371,7 @@ check_highspeed (struct usb_hub *hub, struct usb_device *udev, int port1)
 	struct usb_qualifier_descriptor	*qual;
 	int				status;
 
-	qual = kmalloc (sizeof *qual, SLAB_KERNEL);
+	qual = kmalloc (sizeof *qual, GFP_KERNEL);
 	if (qual == NULL)
 		return;
 
@@ -2922,7 +2922,7 @@ static int config_descriptors_changed(struct usb_device *udev)
 		if (len < le16_to_cpu(udev->config[index].desc.wTotalLength))
 			len = le16_to_cpu(udev->config[index].desc.wTotalLength);
 	}
-	buf = kmalloc (len, SLAB_KERNEL);
+	buf = kmalloc (len, GFP_KERNEL);
 	if (buf == NULL) {
 		dev_err(&udev->dev, "no mem to re-read configs after reset\n");
 		/* assume the worst */
diff --git a/drivers/usb/gadget/gmidi.c b/drivers/usb/gadget/gmidi.c
index 64554ac..3135182 100644
--- a/drivers/usb/gadget/gmidi.c
+++ b/drivers/usb/gadget/gmidi.c
@@ -1236,7 +1236,7 @@ autoconf_fail:
 
 
 	/* ok, we made sense of the hardware ... */
-	dev = kzalloc(sizeof(*dev), SLAB_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev) {
 		return -ENOMEM;
 	}
diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c
index a3076da..805a982 100644
--- a/drivers/usb/gadget/goku_udc.c
+++ b/drivers/usb/gadget/goku_udc.c
@@ -1864,7 +1864,7 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	}
 
 	/* alloc, and start init */
-	dev = kmalloc (sizeof *dev, SLAB_KERNEL);
+	dev = kmalloc (sizeof *dev, GFP_KERNEL);
 	if (dev == NULL){
 		pr_debug("enomem %s\n", pci_name(pdev));
 		retval = -ENOMEM;
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 86924f9..3fb1044a 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -412,7 +412,7 @@ ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr)
 	/* FIXME readahead for O_NONBLOCK and poll(); careful with ZLPs */
 
 	value = -ENOMEM;
-	kbuf = kmalloc (len, SLAB_KERNEL);
+	kbuf = kmalloc (len, GFP_KERNEL);
 	if (unlikely (!kbuf))
 		goto free1;
 
@@ -456,7 +456,7 @@ ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr)
 	/* FIXME writebehind for O_NONBLOCK and poll(), qlen = 1 */
 
 	value = -ENOMEM;
-	kbuf = kmalloc (len, SLAB_KERNEL);
+	kbuf = kmalloc (len, GFP_KERNEL);
 	if (!kbuf)
 		goto free1;
 	if (copy_from_user (kbuf, buf, len)) {
@@ -1898,7 +1898,7 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr)
 	buf += 4;
 	length -= 4;
 
-	kbuf = kmalloc (length, SLAB_KERNEL);
+	kbuf = kmalloc (length, GFP_KERNEL);
 	if (!kbuf)
 		return -ENOMEM;
 	if (copy_from_user (kbuf, buf, length)) {
diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c
index 0b59083..3024c67 100644
--- a/drivers/usb/gadget/net2280.c
+++ b/drivers/usb/gadget/net2280.c
@@ -2861,7 +2861,7 @@ static int net2280_probe (struct pci_dev *pdev, const struct pci_device_id *id)
 	}
 
 	/* alloc, and start init */
-	dev = kzalloc (sizeof *dev, SLAB_KERNEL);
+	dev = kzalloc (sizeof *dev, GFP_KERNEL);
 	if (dev == NULL){
 		retval = -ENOMEM;
 		goto done;
diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c
index 48a09fd..030d87c 100644
--- a/drivers/usb/gadget/omap_udc.c
+++ b/drivers/usb/gadget/omap_udc.c
@@ -2581,7 +2581,7 @@ omap_udc_setup(struct platform_device *odev, struct otg_transceiver *xceiv)
 	/* UDC_PULLUP_EN gates the chip clock */
 	// OTG_SYSCON_1_REG |= DEV_IDLE_EN;
 
-	udc = kzalloc(sizeof(*udc), SLAB_KERNEL);
+	udc = kzalloc(sizeof(*udc), GFP_KERNEL);
 	if (!udc)
 		return -ENOMEM;
 
diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c
index 0f809dd..40710ea 100644
--- a/drivers/usb/gadget/zero.c
+++ b/drivers/usb/gadget/zero.c
@@ -1190,7 +1190,7 @@ autoconf_fail:
 
 
 	/* ok, we made sense of the hardware ... */
-	dev = kzalloc(sizeof(*dev), SLAB_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev)
 		return -ENOMEM;
 	spin_lock_init (&dev->lock);
diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c
index 396dc69..7fd872aa 100644
--- a/drivers/usb/host/hc_crisv10.c
+++ b/drivers/usb/host/hc_crisv10.c
@@ -188,7 +188,7 @@ static DEFINE_TIMER(bulk_eot_timer, NULL, 0, 0);
 #define CHECK_ALIGN(x) if (((__u32)(x)) & 0x00000003) \
 {panic("Alignment check (DWORD) failed at %s:%s:%d\n", __FILE__, __FUNCTION__, __LINE__);}
 
-#define SLAB_FLAG     (in_interrupt() ? GFP_ATOMIC : SLAB_KERNEL)
+#define SLAB_FLAG     (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
 #define KMALLOC_FLAG  (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
 
 /* Most helpful debugging aid */
diff --git a/drivers/usb/host/ohci-pnx4008.c b/drivers/usb/host/ohci-pnx4008.c
index 2dbb774..7f26f9b 100644
--- a/drivers/usb/host/ohci-pnx4008.c
+++ b/drivers/usb/host/ohci-pnx4008.c
@@ -134,7 +134,7 @@ static int isp1301_attach(struct i2c_adapter *adap, int addr, int kind)
 {
 	struct i2c_client *c;
 
-	c = (struct i2c_client *)kzalloc(sizeof(*c), SLAB_KERNEL);
+	c = (struct i2c_client *)kzalloc(sizeof(*c), GFP_KERNEL);
 
 	if (!c)
 		return -ENOMEM;
diff --git a/drivers/usb/input/acecad.c b/drivers/usb/input/acecad.c
index 0096373..909138e 100644
--- a/drivers/usb/input/acecad.c
+++ b/drivers/usb/input/acecad.c
@@ -152,7 +152,7 @@ static int usb_acecad_probe(struct usb_interface *intf, const struct usb_device_
 	if (!acecad || !input_dev)
 		goto fail1;
 
-	acecad->data = usb_buffer_alloc(dev, 8, SLAB_KERNEL, &acecad->data_dma);
+	acecad->data = usb_buffer_alloc(dev, 8, GFP_KERNEL, &acecad->data_dma);
 	if (!acecad->data)
 		goto fail1;
 
diff --git a/drivers/usb/input/usbtouchscreen.c b/drivers/usb/input/usbtouchscreen.c
index 49704d4..7f3c57d 100644
--- a/drivers/usb/input/usbtouchscreen.c
+++ b/drivers/usb/input/usbtouchscreen.c
@@ -680,7 +680,7 @@ static int usbtouch_probe(struct usb_interface *intf,
 		type->process_pkt = usbtouch_process_pkt;
 
 	usbtouch->data = usb_buffer_alloc(udev, type->rept_size,
-	                                  SLAB_KERNEL, &usbtouch->data_dma);
+	                                  GFP_KERNEL, &usbtouch->data_dma);
 	if (!usbtouch->data)
 		goto out_free;
 
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index ea04dcc..fb32186 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -213,7 +213,7 @@ static struct urb *simple_alloc_urb (
 
 	if (bytes < 0)
 		return NULL;
-	urb = usb_alloc_urb (0, SLAB_KERNEL);
+	urb = usb_alloc_urb (0, GFP_KERNEL);
 	if (!urb)
 		return urb;
 	usb_fill_bulk_urb (urb, udev, pipe, NULL, bytes, simple_callback, NULL);
@@ -223,7 +223,7 @@ static struct urb *simple_alloc_urb (
 	urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP;
 	if (usb_pipein (pipe))
 		urb->transfer_flags |= URB_SHORT_NOT_OK;
-	urb->transfer_buffer = usb_buffer_alloc (udev, bytes, SLAB_KERNEL,
+	urb->transfer_buffer = usb_buffer_alloc (udev, bytes, GFP_KERNEL,
 			&urb->transfer_dma);
 	if (!urb->transfer_buffer) {
 		usb_free_urb (urb);
@@ -315,7 +315,7 @@ static int simple_io (
 		init_completion (&completion);
 		if (usb_pipeout (urb->pipe))
 			simple_fill_buf (urb);
-		if ((retval = usb_submit_urb (urb, SLAB_KERNEL)) != 0)
+		if ((retval = usb_submit_urb (urb, GFP_KERNEL)) != 0)
 			break;
 
 		/* NOTE:  no timeouts; can't be broken out of by interrupt */
@@ -374,7 +374,7 @@ alloc_sglist (int nents, int max, int vary)
 	unsigned		i;
 	unsigned		size = max;
 
-	sg = kmalloc (nents * sizeof *sg, SLAB_KERNEL);
+	sg = kmalloc (nents * sizeof *sg, GFP_KERNEL);
 	if (!sg)
 		return NULL;
 
@@ -382,7 +382,7 @@ alloc_sglist (int nents, int max, int vary)
 		char		*buf;
 		unsigned	j;
 
-		buf = kzalloc (size, SLAB_KERNEL);
+		buf = kzalloc (size, GFP_KERNEL);
 		if (!buf) {
 			free_sglist (sg, i);
 			return NULL;
@@ -428,7 +428,7 @@ static int perform_sglist (
 				(udev->speed == USB_SPEED_HIGH)
 					? (INTERRUPT_RATE << 3)
 					: INTERRUPT_RATE,
-				sg, nents, 0, SLAB_KERNEL);
+				sg, nents, 0, GFP_KERNEL);
 		
 		if (retval)
 			break;
@@ -855,7 +855,7 @@ test_ctrl_queue (struct usbtest_dev *dev, struct usbtest_param *param)
 	 * as with bulk/intr sglists, sglen is the queue depth; it also
 	 * controls which subtests run (more tests than sglen) or rerun.
 	 */
-	urb = kcalloc(param->sglen, sizeof(struct urb *), SLAB_KERNEL);
+	urb = kcalloc(param->sglen, sizeof(struct urb *), GFP_KERNEL);
 	if (!urb)
 		return -ENOMEM;
 	for (i = 0; i < param->sglen; i++) {
@@ -981,7 +981,7 @@ test_ctrl_queue (struct usbtest_dev *dev, struct usbtest_param *param)
 		if (!u)
 			goto cleanup;
 
-		reqp = usb_buffer_alloc (udev, sizeof *reqp, SLAB_KERNEL,
+		reqp = usb_buffer_alloc (udev, sizeof *reqp, GFP_KERNEL,
 				&u->setup_dma);
 		if (!reqp)
 			goto cleanup;
@@ -1067,7 +1067,7 @@ static int unlink1 (struct usbtest_dev *dev, int pipe, int size, int async)
 	 * FIXME want additional tests for when endpoint is STALLing
 	 * due to errors, or is just NAKing requests.
 	 */
-	if ((retval = usb_submit_urb (urb, SLAB_KERNEL)) != 0) {
+	if ((retval = usb_submit_urb (urb, GFP_KERNEL)) != 0) {
 		dev_dbg (&dev->intf->dev, "submit fail %d\n", retval);
 		return retval;
 	}
@@ -1251,7 +1251,7 @@ static int ctrl_out (struct usbtest_dev *dev,
 	if (length < 1 || length > 0xffff || vary >= length)
 		return -EINVAL;
 
-	buf = kmalloc(length, SLAB_KERNEL);
+	buf = kmalloc(length, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
@@ -1403,7 +1403,7 @@ static struct urb *iso_alloc_urb (
 	maxp *= 1 + (0x3 & (le16_to_cpu(desc->wMaxPacketSize) >> 11));
 	packets = (bytes + maxp - 1) / maxp;
 
-	urb = usb_alloc_urb (packets, SLAB_KERNEL);
+	urb = usb_alloc_urb (packets, GFP_KERNEL);
 	if (!urb)
 		return urb;
 	urb->dev = udev;
@@ -1411,7 +1411,7 @@ static struct urb *iso_alloc_urb (
 
 	urb->number_of_packets = packets;
 	urb->transfer_buffer_length = bytes;
-	urb->transfer_buffer = usb_buffer_alloc (udev, bytes, SLAB_KERNEL,
+	urb->transfer_buffer = usb_buffer_alloc (udev, bytes, GFP_KERNEL,
 			&urb->transfer_dma);
 	if (!urb->transfer_buffer) {
 		usb_free_urb (urb);
@@ -1900,7 +1900,7 @@ usbtest_probe (struct usb_interface *intf, const struct usb_device_id *id)
 	}
 #endif
 
-	dev = kzalloc(sizeof(*dev), SLAB_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev)
 		return -ENOMEM;
 	info = (struct usbtest_info *) id->driver_info;
@@ -1910,7 +1910,7 @@ usbtest_probe (struct usb_interface *intf, const struct usb_device_id *id)
 	dev->intf = intf;
 
 	/* cacheline-aligned scratch for i/o */
-	if ((dev->buf = kmalloc (TBUF_SIZE, SLAB_KERNEL)) == NULL) {
+	if ((dev->buf = kmalloc (TBUF_SIZE, GFP_KERNEL)) == NULL) {
 		kfree (dev);
 		return -ENOMEM;
 	}
diff --git a/drivers/usb/net/rndis_host.c b/drivers/usb/net/rndis_host.c
index c2a28d8..99f26b3 100644
--- a/drivers/usb/net/rndis_host.c
+++ b/drivers/usb/net/rndis_host.c
@@ -469,7 +469,7 @@ static void rndis_unbind(struct usbnet *dev, struct usb_interface *intf)
 	struct rndis_halt	*halt;
 
 	/* try to clear any rndis state/activity (no i/o from stack!) */
-	halt = kcalloc(1, sizeof *halt, SLAB_KERNEL);
+	halt = kcalloc(1, sizeof *halt, GFP_KERNEL);
 	if (halt) {
 		halt->msg_type = RNDIS_MSG_HALT;
 		halt->msg_len = ccpu2(sizeof *halt);
diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c
index 327f975..6e39e99 100644
--- a/drivers/usb/net/usbnet.c
+++ b/drivers/usb/net/usbnet.c
@@ -179,9 +179,9 @@ static int init_status (struct usbnet *dev, struct usb_interface *intf)
 	period = max ((int) dev->status->desc.bInterval,
 		(dev->udev->speed == USB_SPEED_HIGH) ? 7 : 3);
 
-	buf = kmalloc (maxp, SLAB_KERNEL);
+	buf = kmalloc (maxp, GFP_KERNEL);
 	if (buf) {
-		dev->interrupt = usb_alloc_urb (0, SLAB_KERNEL);
+		dev->interrupt = usb_alloc_urb (0, GFP_KERNEL);
 		if (!dev->interrupt) {
 			kfree (buf);
 			return -ENOMEM;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9ade139..52eb10c 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -217,7 +217,7 @@ static kmem_cache_t *adfs_inode_cachep;
 static struct inode *adfs_alloc_inode(struct super_block *sb)
 {
 	struct adfs_inode_info *ei;
-	ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, SLAB_KERNEL);
+	ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5ea72c3..81c73ec 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -71,7 +71,7 @@ static kmem_cache_t * affs_inode_cachep;
 static struct inode *affs_alloc_inode(struct super_block *sb)
 {
 	struct affs_inode_info *ei;
-	ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, SLAB_KERNEL);
+	ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	ei->vfs_inode.i_version = 1;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 67d1f5c..c6ead00 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -412,7 +412,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	struct afs_vnode *vnode;
 
 	vnode = (struct afs_vnode *)
-		kmem_cache_alloc(afs_inode_cachep, SLAB_KERNEL);
+		kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
 	if (!vnode)
 		return NULL;
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 07f7144..995348d 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb)
 {
         struct befs_inode_info *bi;
         bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep,
-							SLAB_KERNEL);
+							GFP_KERNEL);
         if (!bi)
                 return NULL;
         return &bi->vfs_inode;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index ed27ffb..2e45123 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -233,7 +233,7 @@ static kmem_cache_t * bfs_inode_cachep;
 static struct inode *bfs_alloc_inode(struct super_block *sb)
 {
 	struct bfs_inode_info *bi;
-	bi = kmem_cache_alloc(bfs_inode_cachep, SLAB_KERNEL);
+	bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL);
 	if (!bi)
 		return NULL;
 	return &bi->vfs_inode;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 36c0e7a..0635067 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -239,7 +239,7 @@ static kmem_cache_t * bdev_cachep __read_mostly;
 
 static struct inode *bdev_alloc_inode(struct super_block *sb)
 {
-	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, SLAB_KERNEL);
+	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84976cd..8416862 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -245,7 +245,7 @@ static struct inode *
 cifs_alloc_inode(struct super_block *sb)
 {
 	struct cifsInodeInfo *cifs_inode;
-	cifs_inode = kmem_cache_alloc(cifs_inode_cachep, SLAB_KERNEL);
+	cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
 	if (!cifs_inode)
 		return NULL;
 	cifs_inode->cifsAttrs = 0x20;	/* default */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 8355daf..aedf683 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -153,7 +153,7 @@ cifs_buf_get(void)
    albeit slightly larger than necessary and maxbuffersize 
    defaults to this and can not be bigger */
 	ret_buf =
-	    (struct smb_hdr *) mempool_alloc(cifs_req_poolp, SLAB_KERNEL | GFP_NOFS);
+	    (struct smb_hdr *) mempool_alloc(cifs_req_poolp, GFP_KERNEL | GFP_NOFS);
 
 	/* clear the first few header bytes */
 	/* for most paths, more is cleared in header_assemble */
@@ -192,7 +192,7 @@ cifs_small_buf_get(void)
    albeit slightly larger than necessary and maxbuffersize 
    defaults to this and can not be bigger */
 	ret_buf =
-	    (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, SLAB_KERNEL | GFP_NOFS);
+	    (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, GFP_KERNEL | GFP_NOFS);
 	if (ret_buf) {
 	/* No need to clear memory here, cleared in header assemble */
 	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7514237..1f72776 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -51,7 +51,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
 	}
 	
 	temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp,
-						    SLAB_KERNEL | GFP_NOFS);
+						    GFP_KERNEL | GFP_NOFS);
 	if (temp == NULL)
 		return temp;
 	else {
@@ -118,7 +118,7 @@ AllocOplockQEntry(struct inode * pinode, __u16 fid, struct cifsTconInfo * tcon)
 		return NULL;
 	}
 	temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep,
-						       SLAB_KERNEL);
+						       GFP_KERNEL);
 	if (temp == NULL)
 		return temp;
 	else {
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 88d1233..50cedd2 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -43,7 +43,7 @@ static kmem_cache_t * coda_inode_cachep;
 static struct inode *coda_alloc_inode(struct super_block *sb)
 {
 	struct coda_inode_info *ei;
-	ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, SLAB_KERNEL);
+	ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	memset(&ei->c_fid, 0, sizeof(struct CodaFid));
diff --git a/fs/dnotify.c b/fs/dnotify.c
index 2b0442d..e778b17 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -77,7 +77,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	inode = filp->f_dentry->d_inode;
 	if (!S_ISDIR(inode->i_mode))
 		return -ENOTDIR;
-	dn = kmem_cache_alloc(dn_cache, SLAB_KERNEL);
+	dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
 	if (dn == NULL)
 		return -ENOMEM;
 	spin_lock(&inode->i_lock);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 776b2ee..7196f50 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -628,7 +628,7 @@ int ecryptfs_decrypt_page(struct file *file, struct page *page)
 	num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
 	base_extent = (page->index * num_extents_per_page);
 	lower_page_virt = kmem_cache_alloc(ecryptfs_lower_page_cache,
-					   SLAB_KERNEL);
+					   GFP_KERNEL);
 	if (!lower_page_virt) {
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR, "Error getting page for encrypted "
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index a92ef05..42099e7 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -250,7 +250,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 	int lower_flags;
 
 	/* Released in ecryptfs_release or end of function if failure */
-	file_info = kmem_cache_alloc(ecryptfs_file_info_cache, SLAB_KERNEL);
+	file_info = kmem_cache_alloc(ecryptfs_file_info_cache, GFP_KERNEL);
 	ecryptfs_set_file_private(file, file_info);
 	if (!file_info) {
 		ecryptfs_printk(KERN_ERR,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 7091141..8a1945a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -369,7 +369,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 	BUG_ON(!atomic_read(&lower_dentry->d_count));
 	ecryptfs_set_dentry_private(dentry,
 				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
-						     SLAB_KERNEL));
+						     GFP_KERNEL));
 	if (!ecryptfs_dentry_to_private(dentry)) {
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
@@ -795,7 +795,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 	/* Released at out_free: label */
 	ecryptfs_set_file_private(&fake_ecryptfs_file,
 				  kmem_cache_alloc(ecryptfs_file_info_cache,
-						   SLAB_KERNEL));
+						   GFP_KERNEL));
 	if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
 		rc = -ENOMEM;
 		goto out;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index c3746f5..745c0f1 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -207,7 +207,7 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
 	/* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
 	 * at end of function upon failure */
 	auth_tok_list_item =
-	    kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, SLAB_KERNEL);
+	    kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL);
 	if (!auth_tok_list_item) {
 		ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n");
 		rc = -ENOMEM;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a78d87d..a2c6ccb 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -378,7 +378,7 @@ ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
 	/* Released in ecryptfs_put_super() */
 	ecryptfs_set_superblock_private(sb,
 					kmem_cache_alloc(ecryptfs_sb_info_cache,
-							 SLAB_KERNEL));
+							 GFP_KERNEL));
 	if (!ecryptfs_superblock_to_private(sb)) {
 		ecryptfs_printk(KERN_WARNING, "Out of memory\n");
 		rc = -ENOMEM;
@@ -402,7 +402,7 @@ ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
 	/* through deactivate_super(sb) from get_sb_nodev() */
 	ecryptfs_set_dentry_private(sb->s_root,
 				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
-						     SLAB_KERNEL));
+						     GFP_KERNEL));
 	if (!ecryptfs_dentry_to_private(sb->s_root)) {
 		ecryptfs_printk(KERN_ERR,
 				"dentry_info_cache alloc failed\n");
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 825757a..eaa5daa 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -50,7 +50,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
 	struct inode *inode = NULL;
 
 	ecryptfs_inode = kmem_cache_alloc(ecryptfs_inode_info_cache,
-					  SLAB_KERNEL);
+					  GFP_KERNEL);
 	if (unlikely(!ecryptfs_inode))
 		goto out;
 	ecryptfs_init_crypt_stat(&ecryptfs_inode->crypt_stat);
diff --git a/fs/efs/super.c b/fs/efs/super.c
index b3f5065..69b15a9 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -57,7 +57,7 @@ static kmem_cache_t * efs_inode_cachep;
 static struct inode *efs_alloc_inode(struct super_block *sb)
 {
 	struct efs_inode_info *ei;
-	ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, SLAB_KERNEL);
+	ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ae228ec..f5c8843 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -961,7 +961,7 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 	struct epitem *epi = ep_item_from_epqueue(pt);
 	struct eppoll_entry *pwq;
 
-	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, SLAB_KERNEL))) {
+	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
 		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
 		pwq->whead = whead;
 		pwq->base = epi;
@@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	struct ep_pqueue epq;
 
 	error = -ENOMEM;
-	if (!(epi = kmem_cache_alloc(epi_cache, SLAB_KERNEL)))
+	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
 		goto eexit_1;
 
 	/* Item initialization follow here ... */
diff --git a/fs/exec.c b/fs/exec.c
index d993ea1..2092bd2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -404,7 +404,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 		bprm->loader += stack_base;
 	bprm->exec += stack_base;
 
-	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!mpnt)
 		return -ENOMEM;
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d8b9abd..85c237e 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -140,7 +140,7 @@ static kmem_cache_t * ext2_inode_cachep;
 static struct inode *ext2_alloc_inode(struct super_block *sb)
 {
 	struct ext2_inode_info *ei;
-	ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+	ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 82cc4f59..8c27227 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -63,7 +63,7 @@ void fat_cache_destroy(void)
 
 static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
 {
-	return kmem_cache_alloc(fat_cache_cachep, SLAB_KERNEL);
+	return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
 }
 
 static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 78945b5..b58fd0c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -482,7 +482,7 @@ static kmem_cache_t *fat_inode_cachep;
 static struct inode *fat_alloc_inode(struct super_block *sb)
 {
 	struct msdos_inode_info *ei;
-	ei = kmem_cache_alloc(fat_inode_cachep, SLAB_KERNEL);
+	ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index e4f2616..c03dc9c 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -567,7 +567,7 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 	int result = 0;
 
 	if (on) {
-		new = kmem_cache_alloc(fasync_cache, SLAB_KERNEL);
+		new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
 		if (!new)
 			return -ENOMEM;
 	}
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 4786d51..d2dd0d7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -103,7 +103,7 @@ vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino)
 		struct vxfs_inode_info	*vip;
 		struct vxfs_dinode	*dip;
 
-		if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, SLAB_KERNEL)))
+		if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
 			goto fail;
 		dip = (struct vxfs_dinode *)(bp->b_data + offset);
 		memcpy(vip, dip, sizeof(*vip));
@@ -145,7 +145,7 @@ __vxfs_iget(ino_t ino, struct inode *ilistp)
 		struct vxfs_dinode	*dip;
 		caddr_t			kaddr = (char *)page_address(pp);
 
-		if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, SLAB_KERNEL)))
+		if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
 			goto fail;
 		dip = (struct vxfs_dinode *)(kaddr + offset);
 		memcpy(vip, dip, sizeof(*vip));
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 66571ea..8c15139 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -41,7 +41,7 @@ static void fuse_request_init(struct fuse_req *req)
 
 struct fuse_req *fuse_request_alloc(void)
 {
-	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
+	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL);
 	if (req)
 		fuse_request_init(req);
 	return req;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fc42035..e039e20 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -46,7 +46,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	struct inode *inode;
 	struct fuse_inode *fi;
 
-	inode = kmem_cache_alloc(fuse_inode_cachep, SLAB_KERNEL);
+	inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
 	if (!inode)
 		return NULL;
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 85b17b3..ffc6409 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -145,7 +145,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
 {
 	struct hfs_inode_info *i;
 
-	i = kmem_cache_alloc(hfs_inode_cachep, SLAB_KERNEL);
+	i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL);
 	return i ? &i->vfs_inode : NULL;
 }
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 194eede..4a0c70c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -440,7 +440,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 {
 	struct hfsplus_inode_info *i;
 
-	i = kmem_cache_alloc(hfsplus_inode_cachep, SLAB_KERNEL);
+	i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL);
 	return i ? &i->vfs_inode : NULL;
 }
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7f47569..36e5217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -522,7 +522,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 
 	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
 		return NULL;
-	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
+	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
 	if (unlikely(!p)) {
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
diff --git a/fs/inode.c b/fs/inode.c
index 26cdb11..dd15984 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -109,7 +109,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 	if (sb->s_op->alloc_inode)
 		inode = sb->s_op->alloc_inode(sb);
 	else
-		inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
 	if (inode) {
 		struct address_space * const mapping = &inode->i_data;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index c34b862..4b6381c 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -62,7 +62,7 @@ static kmem_cache_t *isofs_inode_cachep;
 static struct inode *isofs_alloc_inode(struct super_block *sb)
 {
 	struct iso_inode_info *ei;
-	ei = kmem_cache_alloc(isofs_inode_cachep, SLAB_KERNEL);
+	ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index bc4b810..77be534 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -33,7 +33,7 @@ static kmem_cache_t *jffs2_inode_cachep;
 static struct inode *jffs2_alloc_inode(struct super_block *sb)
 {
 	struct jffs2_inode_info *ei;
-	ei = (struct jffs2_inode_info *)kmem_cache_alloc(jffs2_inode_cachep, SLAB_KERNEL);
+	ei = (struct jffs2_inode_info *)kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/locks.c b/fs/locks.c
index e0b6a80..a7b97d5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -147,7 +147,7 @@ static kmem_cache_t *filelock_cache __read_mostly;
 /* Allocate an empty lock structure. */
 static struct file_lock *locks_alloc_lock(void)
 {
-	return kmem_cache_alloc(filelock_cache, SLAB_KERNEL);
+	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
 
 static void locks_release_private(struct file_lock *fl)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 1e36bae..ce532c2 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -56,7 +56,7 @@ static kmem_cache_t * minix_inode_cachep;
 static struct inode *minix_alloc_inode(struct super_block *sb)
 {
 	struct minix_inode_info *ei;
-	ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, SLAB_KERNEL);
+	ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 72dad55..ed84d89 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -45,7 +45,7 @@ static kmem_cache_t * ncp_inode_cachep;
 static struct inode *ncp_alloc_inode(struct super_block *sb)
 {
 	struct ncp_inode_info *ei;
-	ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, SLAB_KERNEL);
+	ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index bdfabf8..769fd0a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -143,7 +143,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
 
-	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
+	dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL);
 	if (!dreq)
 		return NULL;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 08cc4c5..6b53aae 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1080,7 +1080,7 @@ void nfs4_clear_inode(struct inode *inode)
 struct inode *nfs_alloc_inode(struct super_block *sb)
 {
 	struct nfs_inode *nfsi;
-	nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
+	nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
 	if (!nfsi)
 		return NULL;
 	nfsi->flags = 0UL;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 829af32..a1561a8 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,7 +26,7 @@ static inline struct nfs_page *
 nfs_page_alloc(void)
 {
 	struct nfs_page	*p;
-	p = kmem_cache_alloc(nfs_page_cachep, SLAB_KERNEL);
+	p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL);
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->wb_list);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 592a640..911d1bc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -336,7 +336,7 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
 {
 	struct op_inode_info *oi;
 
-	oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL);
+	oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL);
 	if (!oi)
 		return NULL;
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 49dfb2a..b24cdb2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -88,7 +88,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
+	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	ei->pid = NULL;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 5a41db2..5b943eb 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -520,7 +520,7 @@ static kmem_cache_t *qnx4_inode_cachep;
 static struct inode *qnx4_alloc_inode(struct super_block *sb)
 {
 	struct qnx4_inode_info *ei;
-	ei = kmem_cache_alloc(qnx4_inode_cachep, SLAB_KERNEL);
+	ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1724999..3233251 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -496,7 +496,7 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 {
 	struct reiserfs_inode_info *ei;
 	ei = (struct reiserfs_inode_info *)
-	    kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL);
+	    kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index ddcd9e1..d1b455f 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -555,7 +555,7 @@ static kmem_cache_t * romfs_inode_cachep;
 static struct inode *romfs_alloc_inode(struct super_block *sb)
 {
 	struct romfs_inode_info *ei;
-	ei = (struct romfs_inode_info *)kmem_cache_alloc(romfs_inode_cachep, SLAB_KERNEL);
+	ei = (struct romfs_inode_info *)kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 2c122ee..2216171 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -55,7 +55,7 @@ static kmem_cache_t *smb_inode_cachep;
 static struct inode *smb_alloc_inode(struct super_block *sb)
 {
 	struct smb_inode_info *ei;
-	ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, SLAB_KERNEL);
+	ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index 0fb7469..3eb1402 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -61,7 +61,7 @@ static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
 	struct smb_request *req;
 	unsigned char *buf = NULL;
 
-	req = kmem_cache_alloc(req_cachep, SLAB_KERNEL);
+	req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 	VERBOSE("allocating request: %p\n", req);
 	if (!req)
 		goto out;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index d63c5e4..a6ca12b 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -307,7 +307,7 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
 {
 	struct sysv_inode_info *si;
 
-	si = kmem_cache_alloc(sysv_inode_cachep, SLAB_KERNEL);
+	si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL);
 	if (!si)
 		return NULL;
 	return &si->vfs_inode;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1aea6a4..e50f242 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -112,7 +112,7 @@ static kmem_cache_t * udf_inode_cachep;
 static struct inode *udf_alloc_inode(struct super_block *sb)
 {
 	struct udf_inode_info *ei;
-	ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, SLAB_KERNEL);
+	ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index ec79e30..85a88c0 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1209,7 +1209,7 @@ static kmem_cache_t * ufs_inode_cachep;
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, SLAB_KERNEL);
+	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	ei->vfs_inode.i_version = 1;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a8039c8..94b831b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1483,7 +1483,7 @@ extern void __init vfs_caches_init(unsigned long);
 
 extern struct kmem_cache *names_cachep;
 
-#define __getname()	kmem_cache_alloc(names_cachep, SLAB_KERNEL)
+#define __getname()	kmem_cache_alloc(names_cachep, GFP_KERNEL)
 #define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
 #ifndef CONFIG_AUDITSYSCALL
 #define putname(name)   __putname(name)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index db2c1df..61c2ab6 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -34,7 +34,7 @@ extern kmem_cache_t *anon_vma_cachep;
 
 static inline struct anon_vma *anon_vma_alloc(void)
 {
-	return kmem_cache_alloc(anon_vma_cachep, SLAB_KERNEL);
+	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
 }
 
 static inline void anon_vma_free(struct anon_vma *anon_vma)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 34b046e..639f65e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -19,7 +19,6 @@ typedef struct kmem_cache kmem_cache_t;
 #include	<asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 
 /* flags for kmem_cache_alloc() */
-#define	SLAB_KERNEL		GFP_KERNEL
 #define	SLAB_DMA		GFP_DMA
 
 /* flags to pass to kmem_cache_create().
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 6562a20..f81a5af 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -35,7 +35,7 @@ static inline void taskstats_tgid_alloc(struct task_struct *tsk)
 		return;
 
 	/* No problem if kmem_cache_zalloc() fails */
-	stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!sig->stats) {
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 7c27400..813bb94 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -224,7 +224,7 @@ static struct inode *mqueue_alloc_inode(struct super_block *sb)
 {
 	struct mqueue_inode_info *ei;
 
-	ei = kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL);
+	ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 66a0ea4..70e9ec6 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -41,7 +41,7 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
-	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+	tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 5678e6c..711aa5f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,7 +237,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -319,7 +319,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
 #include <linux/init_task.h>
@@ -621,7 +621,7 @@ static struct files_struct *alloc_files(void)
 	struct files_struct *newf;
 	struct fdtable *fdt;
 
-	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 	if (!newf)
 		goto out;
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d3d2891..1b2b326 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -425,7 +425,7 @@ void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 	*mycpu = raw_smp_processor_id();
 
 	*ptidstats = NULL;
-	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	tmp = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 	if (!tmp)
 		return;
 
diff --git a/kernel/user.c b/kernel/user.c
index 220e586..c1f93c1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
 	if (!up) {
 		struct user_struct *new;
 
-		new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
+		new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
 		if (!new)
 			return NULL;
 		new->uid = uid;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7b69c9..ad864f8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1326,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
 		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
+		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
 		if (!new->v.zonelist) {
 			kmem_cache_free(policy_cache, new);
 			return ERR_PTR(-ENOMEM);
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b40abd..7be110e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (mm->map_count >= sysctl_max_map_count)
 		return -ENOMEM;
 
-	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		    vma_start < new_vma->vm_end)
 			*vmap = new_vma;
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			pol = mpol_copy(vma_policy(vma));
diff --git a/mm/shmem.c b/mm/shmem.c
index 4959535..bdaecfd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2263,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep;
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
 	struct shmem_inode_info *p;
-	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
+	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 	if (!p)
 		return NULL;
 	return &p->vfs_inode;
diff --git a/mm/slab.c b/mm/slab.c
index 9f34b49..1f374c1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2237,7 +2237,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	align = ralign;
 
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
 	if (!cachep)
 		goto oops;
 
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index bdbc3f4..101e5cc 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -590,7 +590,7 @@ create:
 
 replace:
 	err = -ENOBUFS;
-	new_f = kmem_cache_alloc(dn_hash_kmem, SLAB_KERNEL);
+	new_f = kmem_cache_alloc(dn_hash_kmem, GFP_KERNEL);
 	if (new_f == NULL)
 		goto out;
 
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 107bb6c..4463443 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -485,13 +485,13 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
 		goto out;
 
 	err = -ENOBUFS;
-	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
 	if (new_fa == NULL)
 		goto out;
 
 	new_f = NULL;
 	if (!f) {
-		new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
+		new_f = kmem_cache_alloc(fn_hash_kmem, GFP_KERNEL);
 		if (new_f == NULL)
 			goto out_free_new_fa;
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d17990e..6be6caf 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1187,7 +1187,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
 			u8 state;
 
 			err = -ENOBUFS;
-			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
 			if (new_fa == NULL)
 				goto out;
 
@@ -1232,7 +1232,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
 		goto out;
 
 	err = -ENOBUFS;
-	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
 	if (new_fa == NULL)
 		goto out;
 
diff --git a/net/socket.c b/net/socket.c
index e8db547..4f417c2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -236,7 +236,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 {
 	struct socket_alloc *ei;
 
-	ei = kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL);
+	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	init_waitqueue_head(&ei->socket.wait);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 49dba5f..df753d0 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -143,7 +143,7 @@ static struct inode *
 rpc_alloc_inode(struct super_block *sb)
 {
 	struct rpc_inode *rpci;
-	rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, SLAB_KERNEL);
+	rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
 	if (!rpci)
 		return NULL;
 	return &rpci->vfs_inode;
diff --git a/security/keys/key.c b/security/keys/key.c
index 70eacbe..157bac6 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -285,7 +285,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	}
 
 	/* allocate and initialise the key and its description */
-	key = kmem_cache_alloc(key_jar, SLAB_KERNEL);
+	key = kmem_cache_alloc(key_jar, GFP_KERNEL);
 	if (!key)
 		goto no_memory_2;
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 78f98fe..ac1aeed 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -181,7 +181,7 @@ static int inode_alloc_security(struct inode *inode)
 	struct task_security_struct *tsec = current->security;
 	struct inode_security_struct *isec;
 
-	isec = kmem_cache_alloc(sel_inode_cache, SLAB_KERNEL);
+	isec = kmem_cache_alloc(sel_inode_cache, GFP_KERNEL);
 	if (!isec)
 		return -ENOMEM;
 
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index d049c7a..2dfc613 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -36,7 +36,7 @@ avtab_insert_node(struct avtab *h, int hvalue,
 		  struct avtab_key *key, struct avtab_datum *datum)
 {
 	struct avtab_node * newnode;
-	newnode = kmem_cache_alloc(avtab_node_cachep, SLAB_KERNEL);
+	newnode = kmem_cache_alloc(avtab_node_cachep, GFP_KERNEL);
 	if (newnode == NULL)
 		return NULL;
 	memset(newnode, 0, sizeof(struct avtab_node));
-- 
cgit v0.10.2


From 441e143e95f5aa1e04026cb0aa71c801ba53982f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:19 -0800
Subject: [PATCH] slab: remove SLAB_DMA

SLAB_DMA is an alias of GFP_DMA. This is the last one so we
remove the leftover comment too.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index ec8a7a6..7d9b4e5 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -820,7 +820,7 @@ he_init_group(struct he_dev *he_dev, int group)
 		void *cpuaddr;
 
 #ifdef USE_RBPS_POOL 
-		cpuaddr = pci_pool_alloc(he_dev->rbps_pool, GFP_KERNEL|SLAB_DMA, &dma_handle);
+		cpuaddr = pci_pool_alloc(he_dev->rbps_pool, GFP_KERNEL|GFP_DMA, &dma_handle);
 		if (cpuaddr == NULL)
 			return -ENOMEM;
 #else
@@ -884,7 +884,7 @@ he_init_group(struct he_dev *he_dev, int group)
 		void *cpuaddr;
 
 #ifdef USE_RBPL_POOL
-		cpuaddr = pci_pool_alloc(he_dev->rbpl_pool, GFP_KERNEL|SLAB_DMA, &dma_handle);
+		cpuaddr = pci_pool_alloc(he_dev->rbpl_pool, GFP_KERNEL|GFP_DMA, &dma_handle);
 		if (cpuaddr == NULL)
 			return -ENOMEM;
 #else
@@ -1724,7 +1724,7 @@ __alloc_tpd(struct he_dev *he_dev)
 	struct he_tpd *tpd;
 	dma_addr_t dma_handle; 
 
-	tpd = pci_pool_alloc(he_dev->tpd_pool, GFP_ATOMIC|SLAB_DMA, &dma_handle);
+	tpd = pci_pool_alloc(he_dev->tpd_pool, GFP_ATOMIC|GFP_DMA, &dma_handle);
 	if (tpd == NULL)
 		return NULL;
 			
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 5ecea3e..fdaa471 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -1215,7 +1215,7 @@ dasd_eckd_build_cp(struct dasd_device * device, struct request *req)
 		dst = page_address(bv->bv_page) + bv->bv_offset;
 		if (dasd_page_cache) {
 			char *copy = kmem_cache_alloc(dasd_page_cache,
-						      SLAB_DMA | __GFP_NOWARN);
+						      GFP_DMA | __GFP_NOWARN);
 			if (copy && rq_data_dir(req) == WRITE)
 				memcpy(copy + bv->bv_offset, dst, bv->bv_len);
 			if (copy)
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index 80926c5..b857fd5 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -308,7 +308,7 @@ dasd_fba_build_cp(struct dasd_device * device, struct request *req)
 		dst = page_address(bv->bv_page) + bv->bv_offset;
 		if (dasd_page_cache) {
 			char *copy = kmem_cache_alloc(dasd_page_cache,
-						      SLAB_DMA | __GFP_NOWARN);
+						      GFP_DMA | __GFP_NOWARN);
 			if (copy && rq_data_dir(req) == WRITE)
 				memcpy(copy + bv->bv_offset, dst, bv->bv_len);
 			if (copy)
diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c
index 840442a..c3915dc 100644
--- a/drivers/usb/core/buffer.c
+++ b/drivers/usb/core/buffer.c
@@ -93,7 +93,7 @@ void hcd_buffer_destroy (struct usb_hcd *hcd)
 }
 
 
-/* sometimes alloc/free could use kmalloc with SLAB_DMA, for
+/* sometimes alloc/free could use kmalloc with GFP_DMA, for
  * better sharing and to leverage mm/slab.c intelligence.
  */
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 639f65e..fbcfc20 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -18,9 +18,6 @@ typedef struct kmem_cache kmem_cache_t;
 #include	<asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
 #include	<asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 
-/* flags for kmem_cache_alloc() */
-#define	SLAB_DMA		GFP_DMA
-
 /* flags to pass to kmem_cache_create().
  * The first 3 are only valid when the allocator as been build
  * SLAB_DEBUG_SUPPORT.
diff --git a/mm/slab.c b/mm/slab.c
index 1f374c1..bb831ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2637,7 +2637,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (flags & SLAB_DMA)
+	if (flags & GFP_DMA)
 		BUG_ON(!(cachep->gfpflags & GFP_DMA));
 	else
 		BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2721,7 +2721,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Be lazy and only check for valid flags here,  keeping it out of the
 	 * critical path in kmem_cache_alloc().
 	 */
-	BUG_ON(flags & ~(SLAB_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
+	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
 	if (flags & __GFP_NO_GROW)
 		return 0;
 
-- 
cgit v0.10.2


From e18b890bb0881bbab6f4f1a6cd20d9c60d66b003 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:20 -0800
Subject: [PATCH] slab: remove kmem_cache_t

Replace all uses of kmem_cache_t with struct kmem_cache.

The patch was generated using the following script:

	#!/bin/sh
	#
	# Replace one string by another in all the kernel sources.
	#

	set -e

	for file in `find * -name "*.c" -o -name "*.h"|xargs grep -l $1`; do
		quilt add $file
		sed -e "1,\$s/$1/$2/g" $file >/tmp/$$
		mv /tmp/$$ $file
		quilt refresh
	done

The script was run like this

	sh replace kmem_cache_t "struct kmem_cache"

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 0543162..8621a06 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -77,7 +77,7 @@ To get this part of the dma_ API, you must #include <linux/dmapool.h>
 Many drivers need lots of small dma-coherent memory regions for DMA
 descriptors or I/O buffers.  Rather than allocating in units of a page
 or more using dma_alloc_coherent(), you can use DMA pools.  These work
-much like a kmem_cache_t, except that they use the dma-coherent allocator
+much like a struct kmem_cache, except that they use the dma-coherent allocator
 not __get_free_pages().  Also, they understand common hardware constraints
 for alignment, like queue heads needing to be aligned on N byte boundaries.
 
@@ -94,7 +94,7 @@ The pool create() routines initialize a pool of dma-coherent buffers
 for use with a given device.  It must be called in a context which
 can sleep.
 
-The "name" is for diagnostics (like a kmem_cache_t name); dev and size
+The "name" is for diagnostics (like a struct kmem_cache name); dev and size
 are like what you'd pass to dma_alloc_coherent().  The device's hardware
 alignment requirement for this type of data is "align" (which is expressed
 in bytes, and must be a power of two).  If your device has no boundary
diff --git a/arch/arm/mach-s3c2410/dma.c b/arch/arm/mach-s3c2410/dma.c
index 3d211dc..01abb0a 100644
--- a/arch/arm/mach-s3c2410/dma.c
+++ b/arch/arm/mach-s3c2410/dma.c
@@ -40,7 +40,7 @@
 
 /* io map for dma */
 static void __iomem *dma_base;
-static kmem_cache_t *dma_kmem;
+static struct kmem_cache *dma_kmem;
 
 struct s3c24xx_dma_selection dma_sel;
 
@@ -1271,7 +1271,7 @@ struct sysdev_class dma_sysclass = {
 
 /* kmem cache implementation */
 
-static void s3c2410_dma_cache_ctor(void *p, kmem_cache_t *c, unsigned long f)
+static void s3c2410_dma_cache_ctor(void *p, struct kmem_cache *c, unsigned long f)
 {
 	memset(p, 0, sizeof(struct s3c2410_dma_buf));
 }
diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c
index 34def63..f290158 100644
--- a/arch/arm26/mm/memc.c
+++ b/arch/arm26/mm/memc.c
@@ -24,7 +24,7 @@
 
 #define MEMC_TABLE_SIZE (256*sizeof(unsigned long))
 
-kmem_cache_t *pte_cache, *pgd_cache;
+struct kmem_cache *pte_cache, *pgd_cache;
 int page_nr;
 
 /*
@@ -162,12 +162,12 @@ void __init create_memmap_holes(struct meminfo *mi)
 {
 }
 
-static void pte_cache_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+static void pte_cache_ctor(void *pte, struct kmem_cache *cache, unsigned long flags)
 {
 	memzero(pte, sizeof(pte_t) * PTRS_PER_PTE);
 }
 
-static void pgd_cache_ctor(void *pgd, kmem_cache_t *cache, unsigned long flags)
+static void pgd_cache_ctor(void *pgd, struct kmem_cache *cache, unsigned long flags)
 {
 	memzero(pgd + MEMC_TABLE_SIZE, USER_PTRS_PER_PGD * sizeof(pgd_t));
 }
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c
index f76dd03..19b13be 100644
--- a/arch/frv/mm/pgalloc.c
+++ b/arch/frv/mm/pgalloc.c
@@ -18,7 +18,7 @@
 #include <asm/cacheflush.h>
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((aligned(PAGE_SIZE)));
-kmem_cache_t *pgd_cache;
+struct kmem_cache *pgd_cache;
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
@@ -100,7 +100,7 @@ static inline void pgd_list_del(pgd_t *pgd)
 		set_page_private(next, (unsigned long) pprev);
 }
 
-void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags;
 
@@ -120,7 +120,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 }
 
 /* never called when PTRS_PER_PMD > 1 */
-void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags; /* can be called from interrupt context */
 
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 1674161..a6a8e39 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -699,8 +699,8 @@ int remove_memory(u64 start, u64 size)
 #endif
 #endif
 
-kmem_cache_t *pgd_cache;
-kmem_cache_t *pmd_cache;
+struct kmem_cache *pgd_cache;
+struct kmem_cache *pmd_cache;
 
 void __init pgtable_cache_init(void)
 {
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 10126e3..33be236 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -193,7 +193,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return pte;
 }
 
-void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
+void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
 {
 	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
 }
@@ -233,7 +233,7 @@ static inline void pgd_list_del(pgd_t *pgd)
 		set_page_private(next, (unsigned long)pprev);
 }
 
-void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags;
 
@@ -253,7 +253,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 }
 
 /* never called when PTRS_PER_PMD > 1 */
-void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags; /* can be called from interrupt context */
 
diff --git a/arch/ia64/ia32/ia32_support.c b/arch/ia64/ia32/ia32_support.c
index c187743..6af400a 100644
--- a/arch/ia64/ia32/ia32_support.c
+++ b/arch/ia64/ia32/ia32_support.c
@@ -249,7 +249,7 @@ ia32_init (void)
 
 #if PAGE_SHIFT > IA32_PAGE_SHIFT
 	{
-		extern kmem_cache_t *partial_page_cachep;
+		extern struct kmem_cache *partial_page_cachep;
 
 		partial_page_cachep = kmem_cache_create("partial_page_cache",
 							sizeof(struct partial_page), 0, 0,
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 9d6a3f2..a4a6e14 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -254,7 +254,7 @@ mmap_subpage (struct file *file, unsigned long start, unsigned long end, int pro
 }
 
 /* SLAB cache for partial_page structures */
-kmem_cache_t *partial_page_cachep;
+struct kmem_cache *partial_page_cachep;
 
 /*
  * init partial_page_list.
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index b9561d3..7d0f13f 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -101,7 +101,7 @@ struct flash_block_list_header { /* just the header of flash_block_list */
 static struct flash_block_list_header rtas_firmware_flash_list = {0, NULL};
 
 /* Use slab cache to guarantee 4k alignment */
-static kmem_cache_t *flash_block_cache = NULL;
+static struct kmem_cache *flash_block_cache = NULL;
 
 #define FLASH_BLOCK_LIST_VERSION (1UL)
 
@@ -286,7 +286,7 @@ static ssize_t rtas_flash_read(struct file *file, char __user *buf,
 }
 
 /* constructor for flash_block_cache */
-void rtas_block_ctor(void *ptr, kmem_cache_t *cache, unsigned long flags)
+void rtas_block_ctor(void *ptr, struct kmem_cache *cache, unsigned long flags)
 {
 	memset(ptr, 0, RTAS_BLK_SIZE);
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 424a8f5..89c836d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1047,7 +1047,7 @@ repeat:
 	return err;
 }
 
-static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags)
 {
 	memset(addr, 0, kmem_cache_size(cache));
 }
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 9a17854..d12a87e 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -141,7 +141,7 @@ static int __init setup_kcore(void)
 }
 module_init(setup_kcore);
 
-static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags)
 {
 	memset(addr, 0, kmem_cache_size(cache));
 }
@@ -166,9 +166,9 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 /* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
  * can't put into the tables above, because HPAGE_SHIFT is not compile
  * time constant. */
-kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
 #else
-kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 #endif
 
 void pgtable_cache_init(void)
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7edfcc9..e3af911 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -40,7 +40,7 @@
 
 #include "spufs.h"
 
-static kmem_cache_t *spufs_inode_cache;
+static struct kmem_cache *spufs_inode_cache;
 char *isolated_loader;
 
 static struct inode *
@@ -65,7 +65,7 @@ spufs_destroy_inode(struct inode *inode)
 }
 
 static void
-spufs_init_once(void *p, kmem_cache_t * cachep, unsigned long flags)
+spufs_init_once(void *p, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct spufs_inode_info *ei = p;
 
diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index 55f4350..0c9ea38 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -38,7 +38,7 @@ struct sq_mapping {
 
 static struct sq_mapping *sq_mapping_list;
 static DEFINE_SPINLOCK(sq_mapping_lock);
-static kmem_cache_t *sq_cache;
+static struct kmem_cache *sq_cache;
 static unsigned long *sq_bitmap;
 
 #define store_queue_barrier()			\
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index 92e7453..b60ad83 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -30,7 +30,7 @@
 
 #define NR_PMB_ENTRIES	16
 
-static kmem_cache_t *pmb_cache;
+static struct kmem_cache *pmb_cache;
 static unsigned long pmb_map;
 
 static struct pmb_entry pmb_init_map[] = {
@@ -283,7 +283,7 @@ void pmb_unmap(unsigned long addr)
 	} while (pmbe);
 }
 
-static void pmb_cache_ctor(void *pmb, kmem_cache_t *cachep, unsigned long flags)
+static void pmb_cache_ctor(void *pmb, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct pmb_entry *pmbe = pmb;
 
@@ -297,7 +297,7 @@ static void pmb_cache_ctor(void *pmb, kmem_cache_t *cachep, unsigned long flags)
 	spin_unlock_irq(&pmb_list_lock);
 }
 
-static void pmb_cache_dtor(void *pmb, kmem_cache_t *cachep, unsigned long flags)
+static void pmb_cache_dtor(void *pmb, struct kmem_cache *cachep, unsigned long flags)
 {
 	spin_lock_irq(&pmb_list_lock);
 	pmb_list_del(pmb);
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 09cb7fc..a8e8802 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -176,9 +176,9 @@ unsigned long sparc64_kern_sec_context __read_mostly;
 
 int bigkernel = 0;
 
-kmem_cache_t *pgtable_cache __read_mostly;
+struct kmem_cache *pgtable_cache __read_mostly;
 
-static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags)
 {
 	clear_page(addr);
 }
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index beaa028..236d02f 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -239,7 +239,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign
 	}
 }
 
-static kmem_cache_t *tsb_caches[8] __read_mostly;
+static struct kmem_cache *tsb_caches[8] __read_mostly;
 
 static const char *tsb_cache_names[8] = {
 	"tsb_8KB",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 84e9be0..78c6b31 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -43,8 +43,8 @@ static int cfq_slice_idle = HZ / 125;
 #define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
 #define RQ_CFQQ(rq)		((rq)->elevator_private2)
 
-static kmem_cache_t *cfq_pool;
-static kmem_cache_t *cfq_ioc_pool;
+static struct kmem_cache *cfq_pool;
+static struct kmem_cache *cfq_ioc_pool;
 
 static DEFINE_PER_CPU(unsigned long, ioc_count);
 static struct completion *ioc_gone;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index cc6e95f..a4ff327 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -44,17 +44,17 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node);
 /*
  * For the allocated request tables
  */
-static kmem_cache_t *request_cachep;
+static struct kmem_cache *request_cachep;
 
 /*
  * For queue allocation
  */
-static kmem_cache_t *requestq_cachep;
+static struct kmem_cache *requestq_cachep;
 
 /*
  * For io context allocations
  */
-static kmem_cache_t *iocontext_cachep;
+static struct kmem_cache *iocontext_cachep;
 
 /*
  * Controlling structure to kblockd
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index aa25f8b..478489c 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -12,7 +12,7 @@
 #include <linux/netdevice.h>
 #include "aoe.h"
 
-static kmem_cache_t *buf_pool_cache;
+static struct kmem_cache *buf_pool_cache;
 
 static ssize_t aoedisk_show_state(struct gendisk * disk, char *page)
 {
diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c
index 31e5cc4..27d6c64 100644
--- a/drivers/ieee1394/eth1394.c
+++ b/drivers/ieee1394/eth1394.c
@@ -133,7 +133,7 @@ struct eth1394_node_info {
 #define ETH1394_DRIVER_NAME "eth1394"
 static const char driver_name[] = ETH1394_DRIVER_NAME;
 
-static kmem_cache_t *packet_task_cache;
+static struct kmem_cache *packet_task_cache;
 
 static struct hpsb_highlevel eth1394_highlevel;
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ed2d4ef..c7bee4f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -101,7 +101,7 @@ struct crypt_config {
 #define MIN_POOL_PAGES 32
 #define MIN_BIO_PAGES  8
 
-static kmem_cache_t *_crypt_io_pool;
+static struct kmem_cache *_crypt_io_pool;
 
 /*
  * Different IV generation algorithms:
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e77ee6f..cf8bf05 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -101,7 +101,7 @@ typedef int (*action_fn) (struct pgpath *pgpath);
 
 #define MIN_IOS 256	/* Mempool size */
 
-static kmem_cache_t *_mpio_cache;
+static struct kmem_cache *_mpio_cache;
 
 struct workqueue_struct *kmultipathd;
 static void process_queued_ios(struct work_struct *work);
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 91c7aa1..b0ce2ce 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -88,8 +88,8 @@ struct pending_exception {
  * Hash table mapping origin volumes to lists of snapshots and
  * a lock to protect it
  */
-static kmem_cache_t *exception_cache;
-static kmem_cache_t *pending_cache;
+static struct kmem_cache *exception_cache;
+static struct kmem_cache *pending_cache;
 static mempool_t *pending_pool;
 
 /*
@@ -228,7 +228,7 @@ static int init_exception_table(struct exception_table *et, uint32_t size)
 	return 0;
 }
 
-static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
+static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem)
 {
 	struct list_head *slot;
 	struct exception *ex, *next;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fc4f743..7ec1b11 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -121,8 +121,8 @@ struct mapped_device {
 };
 
 #define MIN_IOS 256
-static kmem_cache_t *_io_cache;
-static kmem_cache_t *_tio_cache;
+static struct kmem_cache *_io_cache;
+static struct kmem_cache *_tio_cache;
 
 static int __init local_init(void)
 {
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index b3c0149..b46f6c5 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -203,7 +203,7 @@ struct kcopyd_job {
 /* FIXME: this should scale with the number of pages */
 #define MIN_JOBS 512
 
-static kmem_cache_t *_job_cache;
+static struct kmem_cache *_job_cache;
 static mempool_t *_job_pool;
 
 /*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69c3e20..52914d5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -348,7 +348,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
 
 static int grow_stripes(raid5_conf_t *conf, int num)
 {
-	kmem_cache_t *sc;
+	struct kmem_cache *sc;
 	int devs = conf->raid_disks;
 
 	sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
@@ -397,7 +397,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
 	int err = 0;
-	kmem_cache_t *sc;
+	struct kmem_cache *sc;
 	int i;
 
 	if (newsize <= conf->pool_size)
diff --git a/drivers/message/i2o/i2o_block.h b/drivers/message/i2o/i2o_block.h
index d9fdc95..67f921b 100644
--- a/drivers/message/i2o/i2o_block.h
+++ b/drivers/message/i2o/i2o_block.h
@@ -64,7 +64,7 @@
 
 /* I2O Block OSM mempool struct */
 struct i2o_block_mempool {
-	kmem_cache_t *slab;
+	struct kmem_cache *slab;
 	mempool_t *pool;
 };
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 9fc9a34..9168401 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -26,7 +26,7 @@
 
 static DEFINE_SPINLOCK(msi_lock);
 static struct msi_desc* msi_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = NULL };
-static kmem_cache_t* msi_cachep;
+static struct kmem_cache* msi_cachep;
 
 static int pci_msi_enable = 1;
 
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 17fdd8c..cf28ccc 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -25,7 +25,7 @@
 
 #include "dasd_int.h"
 
-kmem_cache_t *dasd_page_cache;
+struct kmem_cache *dasd_page_cache;
 EXPORT_SYMBOL_GPL(dasd_page_cache);
 
 /*
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 9f52004..dc5dd50 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -474,7 +474,7 @@ extern struct dasd_profile_info_t dasd_global_profile;
 extern unsigned int dasd_profile_level;
 extern struct block_device_operations dasd_device_operations;
 
-extern kmem_cache_t *dasd_page_cache;
+extern struct kmem_cache *dasd_page_cache;
 
 struct dasd_ccw_req *
 dasd_kmalloc_request(char *, int, int, struct dasd_device *);
diff --git a/drivers/s390/scsi/zfcp_def.h b/drivers/s390/scsi/zfcp_def.h
index 74c0eac..32933ed 100644
--- a/drivers/s390/scsi/zfcp_def.h
+++ b/drivers/s390/scsi/zfcp_def.h
@@ -1032,9 +1032,9 @@ struct zfcp_data {
 	wwn_t                   init_wwpn;
 	fcp_lun_t               init_fcp_lun;
 	char 			*driver_version;
-	kmem_cache_t		*fsf_req_qtcb_cache;
-	kmem_cache_t		*sr_buffer_cache;
-	kmem_cache_t		*gid_pn_cache;
+	struct kmem_cache		*fsf_req_qtcb_cache;
+	struct kmem_cache		*sr_buffer_cache;
+	struct kmem_cache		*gid_pn_cache;
 };
 
 /**
diff --git a/drivers/scsi/aic94xx/aic94xx.h b/drivers/scsi/aic94xx/aic94xx.h
index 71a031d..32f513b 100644
--- a/drivers/scsi/aic94xx/aic94xx.h
+++ b/drivers/scsi/aic94xx/aic94xx.h
@@ -56,8 +56,8 @@
 /* 2*ITNL timeout + 1 second */
 #define AIC94XX_SCB_TIMEOUT  (5*HZ)
 
-extern kmem_cache_t *asd_dma_token_cache;
-extern kmem_cache_t *asd_ascb_cache;
+extern struct kmem_cache *asd_dma_token_cache;
+extern struct kmem_cache *asd_ascb_cache;
 extern char sas_addr_str[2*SAS_ADDR_SIZE + 1];
 
 static inline void asd_stringify_sas_addr(char *p, const u8 *sas_addr)
diff --git a/drivers/scsi/aic94xx/aic94xx_hwi.c b/drivers/scsi/aic94xx/aic94xx_hwi.c
index af7e011..da94e12 100644
--- a/drivers/scsi/aic94xx/aic94xx_hwi.c
+++ b/drivers/scsi/aic94xx/aic94xx_hwi.c
@@ -1047,7 +1047,7 @@ irqreturn_t asd_hw_isr(int irq, void *dev_id)
 static inline struct asd_ascb *asd_ascb_alloc(struct asd_ha_struct *asd_ha,
 					      gfp_t gfp_flags)
 {
-	extern kmem_cache_t *asd_ascb_cache;
+	extern struct kmem_cache *asd_ascb_cache;
 	struct asd_seq_data *seq = &asd_ha->seq;
 	struct asd_ascb *ascb;
 	unsigned long flags;
diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c
index 42302ef..fbc82b0 100644
--- a/drivers/scsi/aic94xx/aic94xx_init.c
+++ b/drivers/scsi/aic94xx/aic94xx_init.c
@@ -450,8 +450,8 @@ static inline void asd_destroy_ha_caches(struct asd_ha_struct *asd_ha)
 	asd_ha->scb_pool = NULL;
 }
 
-kmem_cache_t *asd_dma_token_cache;
-kmem_cache_t *asd_ascb_cache;
+struct kmem_cache *asd_dma_token_cache;
+struct kmem_cache *asd_ascb_cache;
 
 static int asd_create_global_caches(void)
 {
diff --git a/drivers/scsi/libsas/sas_init.c b/drivers/scsi/libsas/sas_init.c
index d65bc4e..2f0c07f 100644
--- a/drivers/scsi/libsas/sas_init.c
+++ b/drivers/scsi/libsas/sas_init.c
@@ -36,7 +36,7 @@
 
 #include "../scsi_sas_internal.h"
 
-kmem_cache_t *sas_task_cache;
+struct kmem_cache *sas_task_cache;
 
 /*------------ SAS addr hash -----------*/
 void sas_hash_addr(u8 *hashed, const u8 *sas_addr)
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index cbe0cad..d03523d 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -24,7 +24,7 @@ char qla2x00_version_str[40];
 /*
  * SRB allocation cache
  */
-static kmem_cache_t *srb_cachep;
+static struct kmem_cache *srb_cachep;
 
 /*
  * Ioctl related information.
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 969c9e4..9ef693c 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -19,7 +19,7 @@ char qla4xxx_version_str[40];
 /*
  * SRB allocation cache
  */
-static kmem_cache_t *srb_cachep;
+static struct kmem_cache *srb_cachep;
 
 /*
  * Module parameter information and variables
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index fafc00d..24cffd9 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -136,7 +136,7 @@ const char * scsi_device_type(unsigned type)
 EXPORT_SYMBOL(scsi_device_type);
 
 struct scsi_host_cmd_pool {
-	kmem_cache_t	*slab;
+	struct kmem_cache	*slab;
 	unsigned int	users;
 	char		*name;
 	unsigned int	slab_flags;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index fb616c6..1748e27 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -36,7 +36,7 @@
 struct scsi_host_sg_pool {
 	size_t		size;
 	char		*name; 
-	kmem_cache_t	*slab;
+	struct kmem_cache	*slab;
 	mempool_t	*pool;
 };
 
@@ -241,7 +241,7 @@ struct scsi_io_context {
 	char sense[SCSI_SENSE_BUFFERSIZE];
 };
 
-static kmem_cache_t *scsi_io_context_cache;
+static struct kmem_cache *scsi_io_context_cache;
 
 static void scsi_end_async(struct request *req, int uptodate)
 {
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 386dbae..d402aff 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -33,7 +33,7 @@
 #include "scsi_tgt_priv.h"
 
 static struct workqueue_struct *scsi_tgtd;
-static kmem_cache_t *scsi_tgt_cmd_cache;
+static struct kmem_cache *scsi_tgt_cmd_cache;
 
 /*
  * TODO: this struct will be killed when the block layer supports large bios
diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c
index 7fd872aa..9325e46 100644
--- a/drivers/usb/host/hc_crisv10.c
+++ b/drivers/usb/host/hc_crisv10.c
@@ -275,13 +275,13 @@ static volatile USB_SB_Desc_t TxIntrSB_zout __attribute__ ((aligned (4)));
 static int zout_buffer[4] __attribute__ ((aligned (4)));
 
 /* Cache for allocating new EP and SB descriptors. */
-static kmem_cache_t *usb_desc_cache;
+static struct kmem_cache *usb_desc_cache;
 
 /* Cache for the registers allocated in the top half. */
-static kmem_cache_t *top_half_reg_cache;
+static struct kmem_cache *top_half_reg_cache;
 
 /* Cache for the data allocated in the isoc descr top half. */
-static kmem_cache_t *isoc_compl_cache;
+static struct kmem_cache *isoc_compl_cache;
 
 static struct usb_bus *etrax_usb_bus;
 
diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index 226bf3d..e87692c 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -81,7 +81,7 @@ MODULE_PARM_DESC(debug, "Debug level");
 static char *errbuf;
 #define ERRBUF_LEN    (32 * 1024)
 
-static kmem_cache_t *uhci_up_cachep;	/* urb_priv */
+static struct kmem_cache *uhci_up_cachep;	/* urb_priv */
 
 static void suspend_rh(struct uhci_hcd *uhci, enum uhci_rh_state new_state);
 static void wakeup_rh(struct uhci_hcd *uhci);
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index 46aecc8..05cf2c9 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -50,7 +50,7 @@ struct mon_event_text {
 
 #define SLAB_NAME_SZ  30
 struct mon_reader_text {
-	kmem_cache_t *e_slab;
+	struct kmem_cache *e_slab;
 	int nevents;
 	struct list_head e_list;
 	struct mon_reader r;	/* In C, parent class can be placed anywhere */
@@ -63,7 +63,7 @@ struct mon_reader_text {
 	char slab_name[SLAB_NAME_SZ];
 };
 
-static void mon_text_ctor(void *, kmem_cache_t *, unsigned long);
+static void mon_text_ctor(void *, struct kmem_cache *, unsigned long);
 
 /*
  * mon_text_submit
@@ -450,7 +450,7 @@ const struct file_operations mon_fops_text = {
 /*
  * Slab interface: constructor.
  */
-static void mon_text_ctor(void *mem, kmem_cache_t *slab, unsigned long sflags)
+static void mon_text_ctor(void *mem, struct kmem_cache *slab, unsigned long sflags)
 {
 	/*
 	 * Nothing to initialize. No, really!
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 52eb10c..e5a205c 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -212,7 +212,7 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static kmem_cache_t *adfs_inode_cachep;
+static struct kmem_cache *adfs_inode_cachep;
 
 static struct inode *adfs_alloc_inode(struct super_block *sb)
 {
@@ -228,7 +228,7 @@ static void adfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 81c73ec..3de93e7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -66,7 +66,7 @@ affs_write_super(struct super_block *sb)
 	pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
 }
 
-static kmem_cache_t * affs_inode_cachep;
+static struct kmem_cache * affs_inode_cachep;
 
 static struct inode *affs_alloc_inode(struct super_block *sb)
 {
@@ -83,7 +83,7 @@ static void affs_destroy_inode(struct inode *inode)
 	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index c6ead00..9a351c4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -35,7 +35,7 @@ struct afs_mount_params {
 	struct afs_volume	*volume;
 };
 
-static void afs_i_init_once(void *foo, kmem_cache_t *cachep,
+static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
 			    unsigned long flags);
 
 static int afs_get_sb(struct file_system_type *fs_type,
@@ -65,7 +65,7 @@ static struct super_operations afs_super_ops = {
 	.put_super	= afs_put_super,
 };
 
-static kmem_cache_t *afs_inode_cachep;
+static struct kmem_cache *afs_inode_cachep;
 static atomic_t afs_count_active_inodes;
 
 /*****************************************************************************/
@@ -384,7 +384,7 @@ static void afs_put_super(struct super_block *sb)
 /*
  * initialise an inode cache slab element prior to any use
  */
-static void afs_i_init_once(void *_vnode, kmem_cache_t *cachep,
+static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
 			    unsigned long flags)
 {
 	struct afs_vnode *vnode = (struct afs_vnode *) _vnode;
diff --git a/fs/aio.c b/fs/aio.c
index 287a1bc..13aa929 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -47,8 +47,8 @@ unsigned long aio_nr;		/* current system wide number of aio requests */
 unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
 /*----end sysctl variables---*/
 
-static kmem_cache_t	*kiocb_cachep;
-static kmem_cache_t	*kioctx_cachep;
+static struct kmem_cache	*kiocb_cachep;
+static struct kmem_cache	*kioctx_cachep;
 
 static struct workqueue_struct *aio_wq;
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 995348d..bce402e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -61,7 +61,7 @@ static const struct super_operations befs_sops = {
 };
 
 /* slab cache for befs_inode_info objects */
-static kmem_cache_t *befs_inode_cachep;
+static struct kmem_cache *befs_inode_cachep;
 
 static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
@@ -289,7 +289,7 @@ befs_destroy_inode(struct inode *inode)
         kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
         struct befs_inode_info *bi = (struct befs_inode_info *) foo;
 	
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 2e45123..eac175e 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -228,7 +228,7 @@ static void bfs_write_super(struct super_block *s)
 	unlock_kernel();
 }
 
-static kmem_cache_t * bfs_inode_cachep;
+static struct kmem_cache * bfs_inode_cachep;
 
 static struct inode *bfs_alloc_inode(struct super_block *sb)
 {
@@ -244,7 +244,7 @@ static void bfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct bfs_inode_info *bi = foo;
 
diff --git a/fs/bio.c b/fs/bio.c
index 50c40ce..7ec737e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
 
 #define BIO_POOL_SIZE 256
 
-static kmem_cache_t *bio_slab __read_mostly;
+static struct kmem_cache *bio_slab __read_mostly;
 
 #define BIOVEC_NR_POOLS 6
 
@@ -44,7 +44,7 @@ mempool_t *bio_split_pool __read_mostly;
 struct biovec_slab {
 	int nr_vecs;
 	char *name; 
-	kmem_cache_t *slab;
+	struct kmem_cache *slab;
 };
 
 /*
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0635067..13816b4d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -235,7 +235,7 @@ static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
  */
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
-static kmem_cache_t * bdev_cachep __read_mostly;
+static struct kmem_cache * bdev_cachep __read_mostly;
 
 static struct inode *bdev_alloc_inode(struct super_block *sb)
 {
@@ -253,7 +253,7 @@ static void bdev_destroy_inode(struct inode *inode)
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
 	struct block_device *bdev = &ei->bdev;
diff --git a/fs/buffer.c b/fs/buffer.c
index 35527dc..a8ca0ac 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2908,7 +2908,7 @@ asmlinkage long sys_bdflush(int func, long data)
 /*
  * Buffer-head allocation
  */
-static kmem_cache_t *bh_cachep;
+static struct kmem_cache *bh_cachep;
 
 /*
  * Once the number of bh's in the machine exceeds this level, we start
@@ -2961,7 +2961,7 @@ void free_buffer_head(struct buffer_head *bh)
 EXPORT_SYMBOL(free_buffer_head);
 
 static void
-init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
+init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
 {
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 			    SLAB_CTOR_CONSTRUCTOR) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8416862..e6b5866 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -81,7 +81,7 @@ extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
 
-extern kmem_cache_t *cifs_oplock_cachep;
+extern struct kmem_cache *cifs_oplock_cachep;
 
 static int
 cifs_read_super(struct super_block *sb, void *data,
@@ -232,11 +232,11 @@ static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd)
 		return generic_permission(inode, mask, NULL);
 }
 
-static kmem_cache_t *cifs_inode_cachep;
-static kmem_cache_t *cifs_req_cachep;
-static kmem_cache_t *cifs_mid_cachep;
-kmem_cache_t *cifs_oplock_cachep;
-static kmem_cache_t *cifs_sm_req_cachep;
+static struct kmem_cache *cifs_inode_cachep;
+static struct kmem_cache *cifs_req_cachep;
+static struct kmem_cache *cifs_mid_cachep;
+struct kmem_cache *cifs_oplock_cachep;
+static struct kmem_cache *cifs_sm_req_cachep;
 mempool_t *cifs_sm_req_poolp;
 mempool_t *cifs_req_poolp;
 mempool_t *cifs_mid_poolp;
@@ -668,7 +668,7 @@ const struct file_operations cifs_dir_ops = {
 };
 
 static void
-cifs_init_once(void *inode, kmem_cache_t * cachep, unsigned long flags)
+cifs_init_once(void *inode, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct cifsInodeInfo *cifsi = inode;
 
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1f72776..f80007e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -34,7 +34,7 @@
 #include "cifs_debug.h"
   
 extern mempool_t *cifs_mid_poolp;
-extern kmem_cache_t *cifs_oplock_cachep;
+extern struct kmem_cache *cifs_oplock_cachep;
 
 static struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 50cedd2..b64659f 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -38,7 +38,7 @@ static void coda_clear_inode(struct inode *);
 static void coda_put_super(struct super_block *);
 static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 
-static kmem_cache_t * coda_inode_cachep;
+static struct kmem_cache * coda_inode_cachep;
 
 static struct inode *coda_alloc_inode(struct super_block *sb)
 {
@@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode)
 	kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 3f4ff7a..f92cd30 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -49,7 +49,7 @@ struct configfs_dirent {
 #define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
 
 extern struct vfsmount * configfs_mount;
-extern kmem_cache_t *configfs_dir_cachep;
+extern struct kmem_cache *configfs_dir_cachep;
 
 extern int configfs_is_root(struct config_item *item);
 
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 68bd5c9..ed67852 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -38,7 +38,7 @@
 
 struct vfsmount * configfs_mount = NULL;
 struct super_block * configfs_sb = NULL;
-kmem_cache_t *configfs_dir_cachep;
+struct kmem_cache *configfs_dir_cachep;
 static int configfs_mnt_count = 0;
 
 static struct super_operations configfs_ops = {
diff --git a/fs/dcache.c b/fs/dcache.c
index fd4a428..a7c67ce 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -43,7 +43,7 @@ static __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(dcache_lock);
 
-static kmem_cache_t *dentry_cache __read_mostly;
+static struct kmem_cache *dentry_cache __read_mostly;
 
 #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
 
@@ -2072,10 +2072,10 @@ static void __init dcache_init(unsigned long mempages)
 }
 
 /* SLAB cache for __getname() consumers */
-kmem_cache_t *names_cachep __read_mostly;
+struct kmem_cache *names_cachep __read_mostly;
 
 /* SLAB cache for file structures */
-kmem_cache_t *filp_cachep __read_mostly;
+struct kmem_cache *filp_cachep __read_mostly;
 
 EXPORT_SYMBOL(d_genocide);
 
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 0c4b067..21af162 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -37,7 +37,7 @@ struct dcookie_struct {
 
 static LIST_HEAD(dcookie_users);
 static DEFINE_MUTEX(dcookie_mutex);
-static kmem_cache_t *dcookie_cache __read_mostly;
+static struct kmem_cache *dcookie_cache __read_mostly;
 static struct list_head *dcookie_hashtable __read_mostly;
 static size_t hash_size __read_mostly;
 
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 989b608..5352b03 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -15,7 +15,7 @@
 #include "config.h"
 #include "memory.h"
 
-static kmem_cache_t *lkb_cache;
+static struct kmem_cache *lkb_cache;
 
 
 int dlm_memory_init(void)
diff --git a/fs/dnotify.c b/fs/dnotify.c
index e778b17..1f26a2b 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -23,7 +23,7 @@
 
 int dir_notify_enable __read_mostly = 1;
 
-static kmem_cache_t *dn_cache __read_mostly;
+static struct kmem_cache *dn_cache __read_mostly;
 
 static void redo_inode_mask(struct inode *inode)
 {
diff --git a/fs/dquot.c b/fs/dquot.c
index c6ae6c0..f9cd5e2 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -131,7 +131,7 @@ static struct quota_format_type *quota_formats;	/* List of registered formats */
 static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
 
 /* SLAB cache for dquot structures */
-static kmem_cache_t *dquot_cachep;
+static struct kmem_cache *dquot_cachep;
 
 int register_quota_format(struct quota_format_type *fmt)
 {
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a2c6ccb..306f8fb 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -546,7 +546,7 @@ inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
 }
 
 static struct ecryptfs_cache_info {
-	kmem_cache_t **cache;
+	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
 	void (*ctor)(void*, struct kmem_cache *, unsigned long);
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 69b15a9..dfebf21 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -52,7 +52,7 @@ static struct pt_types sgi_pt_types[] = {
 };
 
 
-static kmem_cache_t * efs_inode_cachep;
+static struct kmem_cache * efs_inode_cachep;
 
 static struct inode *efs_alloc_inode(struct super_block *sb)
 {
@@ -68,7 +68,7 @@ static void efs_destroy_inode(struct inode *inode)
 	kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f5c8843..88a6f8d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -283,10 +283,10 @@ static struct mutex epmutex;
 static struct poll_safewake psw;
 
 /* Slab cache used to allocate "struct epitem" */
-static kmem_cache_t *epi_cache __read_mostly;
+static struct kmem_cache *epi_cache __read_mostly;
 
 /* Slab cache used to allocate "struct eppoll_entry" */
-static kmem_cache_t *pwq_cache __read_mostly;
+static struct kmem_cache *pwq_cache __read_mostly;
 
 /* Virtual fs used to allocate inodes for eventpoll files */
 static struct vfsmount *eventpoll_mnt __read_mostly;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 85c237e..3aafb1d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -135,7 +135,7 @@ static void ext2_put_super (struct super_block * sb)
 	return;
 }
 
-static kmem_cache_t * ext2_inode_cachep;
+static struct kmem_cache * ext2_inode_cachep;
 
 static struct inode *ext2_alloc_inode(struct super_block *sb)
 {
@@ -156,7 +156,7 @@ static void ext2_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0cf633f..9856565 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -436,7 +436,7 @@ static void ext3_put_super (struct super_block * sb)
 	return;
 }
 
-static kmem_cache_t *ext3_inode_cachep;
+static struct kmem_cache *ext3_inode_cachep;
 
 /*
  * Called inside transaction, so use GFP_NOFS
@@ -462,7 +462,7 @@ static void ext3_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c730cbc8..f2e8c4a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -486,7 +486,7 @@ static void ext4_put_super (struct super_block * sb)
 	return;
 }
 
-static kmem_cache_t *ext4_inode_cachep;
+static struct kmem_cache *ext4_inode_cachep;
 
 /*
  * Called inside transaction, so use GFP_NOFS
@@ -513,7 +513,7 @@ static void ext4_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 8c27227..05c2941 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -34,9 +34,9 @@ static inline int fat_max_cache(struct inode *inode)
 	return FAT_MAX_CACHE;
 }
 
-static kmem_cache_t *fat_cache_cachep;
+static struct kmem_cache *fat_cache_cachep;
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct fat_cache *cache = (struct fat_cache *)foo;
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index b58fd0c..a9e4688 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -477,7 +477,7 @@ static void fat_put_super(struct super_block *sb)
 	kfree(sbi);
 }
 
-static kmem_cache_t *fat_inode_cachep;
+static struct kmem_cache *fat_inode_cachep;
 
 static struct inode *fat_alloc_inode(struct super_block *sb)
 {
@@ -493,7 +493,7 @@ static void fat_destroy_inode(struct inode *inode)
 	kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
 
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c03dc9c..4740d35 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -553,7 +553,7 @@ int send_sigurg(struct fown_struct *fown)
 }
 
 static DEFINE_RWLOCK(fasync_lock);
-static kmem_cache_t *fasync_cache __read_mostly;
+static struct kmem_cache *fasync_cache __read_mostly;
 
 /*
  * fasync_helper() is used by some character device drivers (mainly mice)
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index d2dd0d7..0b7ae89 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -46,7 +46,7 @@ extern const struct address_space_operations vxfs_immed_aops;
 
 extern struct inode_operations vxfs_immed_symlink_iops;
 
-kmem_cache_t		*vxfs_inode_cachep;
+struct kmem_cache		*vxfs_inode_cachep;
 
 
 #ifdef DIAGNOSTIC
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8c15139..357764d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,7 +19,7 @@
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 
-static kmem_cache_t *fuse_req_cachep;
+static struct kmem_cache *fuse_req_cachep;
 
 static struct fuse_conn *fuse_get_conn(struct file *file)
 {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e039e20..2bdc652 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -22,7 +22,7 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
 MODULE_LICENSE("GPL");
 
-static kmem_cache_t *fuse_inode_cachep;
+static struct kmem_cache *fuse_inode_cachep;
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 
@@ -601,7 +601,7 @@ static struct file_system_type fuse_fs_type = {
 static decl_subsys(fuse, NULL, NULL);
 static decl_subsys(connections, NULL, NULL);
 
-static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
+static void fuse_inode_init_once(void *foo, struct kmem_cache *cachep,
 				 unsigned long flags)
 {
 	struct inode * inode = foo;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 9889c1e..7c1a9e2 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -25,7 +25,7 @@
 #include "util.h"
 #include "glock.h"
 
-static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct gfs2_inode *ip = foo;
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
@@ -37,7 +37,7 @@ static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long
 	}
 }
 
-static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct gfs2_glock *gl = foo;
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 196c604..e5707a9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -23,9 +23,9 @@
 #include "lm.h"
 #include "util.h"
 
-kmem_cache_t *gfs2_glock_cachep __read_mostly;
-kmem_cache_t *gfs2_inode_cachep __read_mostly;
-kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
+struct kmem_cache *gfs2_glock_cachep __read_mostly;
+struct kmem_cache *gfs2_inode_cachep __read_mostly;
+struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 
 void gfs2_assert_i(struct gfs2_sbd *sdp)
 {
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 76a5089..7984dcf 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -146,9 +146,9 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
 gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
 
 
-extern kmem_cache_t *gfs2_glock_cachep;
-extern kmem_cache_t *gfs2_inode_cachep;
-extern kmem_cache_t *gfs2_bufdata_cachep;
+extern struct kmem_cache *gfs2_glock_cachep;
+extern struct kmem_cache *gfs2_inode_cachep;
+extern struct kmem_cache *gfs2_bufdata_cachep;
 
 static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
 					   unsigned int *p)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ffc6409..a369879 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -24,7 +24,7 @@
 #include "hfs_fs.h"
 #include "btree.h"
 
-static kmem_cache_t *hfs_inode_cachep;
+static struct kmem_cache *hfs_inode_cachep;
 
 MODULE_LICENSE("GPL");
 
@@ -430,7 +430,7 @@ static struct file_system_type hfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfs_init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
+static void hfs_init_once(void *p, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct hfs_inode_info *i = p;
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4a0c70c..0f513c6 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -434,7 +434,7 @@ MODULE_AUTHOR("Brad Boyer");
 MODULE_DESCRIPTION("Extended Macintosh Filesystem");
 MODULE_LICENSE("GPL");
 
-static kmem_cache_t *hfsplus_inode_cachep;
+static struct kmem_cache *hfsplus_inode_cachep;
 
 static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 {
@@ -467,7 +467,7 @@ static struct file_system_type hfsplus_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfsplus_init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
+static void hfsplus_init_once(void *p, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct hfsplus_inode_info *i = p;
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 46ceadd..34d68e2 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -160,7 +160,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static kmem_cache_t * hpfs_inode_cachep;
+static struct kmem_cache * hpfs_inode_cachep;
 
 static struct inode *hpfs_alloc_inode(struct super_block *sb)
 {
@@ -177,7 +177,7 @@ static void hpfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 36e5217..0706f5a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -513,7 +513,7 @@ static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
 }
 
 
-static kmem_cache_t *hugetlbfs_inode_cachep;
+static struct kmem_cache *hugetlbfs_inode_cachep;
 
 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 {
@@ -545,7 +545,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 };
 
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
 
diff --git a/fs/inode.c b/fs/inode.c
index dd15984..699aa4f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -97,7 +97,7 @@ static DEFINE_MUTEX(iprune_mutex);
  */
 struct inodes_stat_t inodes_stat;
 
-static kmem_cache_t * inode_cachep __read_mostly;
+static struct kmem_cache * inode_cachep __read_mostly;
 
 static struct inode *alloc_inode(struct super_block *sb)
 {
@@ -209,7 +209,7 @@ void inode_init_once(struct inode *inode)
 
 EXPORT_SYMBOL(inode_init_once);
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct inode * inode = (struct inode *) foo;
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 017cb0f..e1956e6 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -34,8 +34,8 @@
 
 #include <asm/ioctls.h>
 
-static kmem_cache_t *watch_cachep __read_mostly;
-static kmem_cache_t *event_cachep __read_mostly;
+static struct kmem_cache *watch_cachep __read_mostly;
+static struct kmem_cache *event_cachep __read_mostly;
 
 static struct vfsmount *inotify_mnt __read_mostly;
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4b6381c..ea55b6c 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -57,7 +57,7 @@ static void isofs_put_super(struct super_block *sb)
 static void isofs_read_inode(struct inode *);
 static int isofs_statfs (struct dentry *, struct kstatfs *);
 
-static kmem_cache_t *isofs_inode_cachep;
+static struct kmem_cache *isofs_inode_cachep;
 
 static struct inode *isofs_alloc_inode(struct super_block *sb)
 {
@@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode)
 	kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
 
-static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct iso_inode_info *ei = foo;
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b85c686..a8774be 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1630,7 +1630,7 @@ void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
 #define JBD_MAX_SLABS 5
 #define JBD_SLAB_INDEX(size)  (size >> 11)
 
-static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+static struct kmem_cache *jbd_slab[JBD_MAX_SLABS];
 static const char *jbd_slab_names[JBD_MAX_SLABS] = {
 	"jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
 };
@@ -1693,7 +1693,7 @@ void jbd_slab_free(void *ptr,  size_t size)
 /*
  * Journal_head storage management
  */
-static kmem_cache_t *journal_head_cache;
+static struct kmem_cache *journal_head_cache;
 #ifdef CONFIG_JBD_DEBUG
 static atomic_t nr_journal_heads = ATOMIC_INIT(0);
 #endif
@@ -1996,7 +1996,7 @@ static void __exit remove_jbd_proc_entry(void)
 
 #endif
 
-kmem_cache_t *jbd_handle_cache;
+struct kmem_cache *jbd_handle_cache;
 
 static int __init journal_init_handle_cache(void)
 {
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c532429..d204ab3 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -70,8 +70,8 @@
 #include <linux/init.h>
 #endif
 
-static kmem_cache_t *revoke_record_cache;
-static kmem_cache_t *revoke_table_cache;
+static struct kmem_cache *revoke_record_cache;
+static struct kmem_cache *revoke_table_cache;
 
 /* Each revoke record represents one single revoked block.  During
    journal replay, this involves recording the transaction ID of the
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c60f378..5035601 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1641,7 +1641,7 @@ void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
 #define JBD_MAX_SLABS 5
 #define JBD_SLAB_INDEX(size)  (size >> 11)
 
-static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+static struct kmem_cache *jbd_slab[JBD_MAX_SLABS];
 static const char *jbd_slab_names[JBD_MAX_SLABS] = {
 	"jbd2_1k", "jbd2_2k", "jbd2_4k", NULL, "jbd2_8k"
 };
@@ -1704,7 +1704,7 @@ void jbd2_slab_free(void *ptr,  size_t size)
 /*
  * Journal_head storage management
  */
-static kmem_cache_t *jbd2_journal_head_cache;
+static struct kmem_cache *jbd2_journal_head_cache;
 #ifdef CONFIG_JBD_DEBUG
 static atomic_t nr_journal_heads = ATOMIC_INIT(0);
 #endif
@@ -2007,7 +2007,7 @@ static void __exit jbd2_remove_jbd_proc_entry(void)
 
 #endif
 
-kmem_cache_t *jbd2_handle_cache;
+struct kmem_cache *jbd2_handle_cache;
 
 static int __init journal_init_handle_cache(void)
 {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 380d199..f506646 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -70,8 +70,8 @@
 #include <linux/init.h>
 #endif
 
-static kmem_cache_t *jbd2_revoke_record_cache;
-static kmem_cache_t *jbd2_revoke_table_cache;
+static struct kmem_cache *jbd2_revoke_record_cache;
+static struct kmem_cache *jbd2_revoke_table_cache;
 
 /* Each revoke record represents one single revoked block.  During
    journal replay, this involves recording the transaction ID of the
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 3f7899e..9f15bce 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -61,8 +61,8 @@ static const struct file_operations jffs_dir_operations;
 static struct inode_operations jffs_dir_inode_operations;
 static const struct address_space_operations jffs_address_operations;
 
-kmem_cache_t     *node_cache = NULL;
-kmem_cache_t     *fm_cache = NULL;
+struct kmem_cache     *node_cache = NULL;
+struct kmem_cache     *fm_cache = NULL;
 
 /* Called by the VFS at mount time to initialize the whole file system.  */
 static int jffs_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c
index 29b68d9..077258b 100644
--- a/fs/jffs/jffs_fm.c
+++ b/fs/jffs/jffs_fm.c
@@ -29,8 +29,8 @@ static int jffs_mark_obsolete(struct jffs_fmcontrol *fmc, __u32 fm_offset);
 static struct jffs_fm *jffs_alloc_fm(void);
 static void jffs_free_fm(struct jffs_fm *n);
 
-extern kmem_cache_t     *fm_cache;
-extern kmem_cache_t     *node_cache;
+extern struct kmem_cache     *fm_cache;
+extern struct kmem_cache     *node_cache;
 
 #if CONFIG_JFFS_FS_VERBOSE > 0
 void
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 33f2910..83f9881 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -19,16 +19,16 @@
 
 /* These are initialised to NULL in the kernel startup code.
    If you're porting to other operating systems, beware */
-static kmem_cache_t *full_dnode_slab;
-static kmem_cache_t *raw_dirent_slab;
-static kmem_cache_t *raw_inode_slab;
-static kmem_cache_t *tmp_dnode_info_slab;
-static kmem_cache_t *raw_node_ref_slab;
-static kmem_cache_t *node_frag_slab;
-static kmem_cache_t *inode_cache_slab;
+static struct kmem_cache *full_dnode_slab;
+static struct kmem_cache *raw_dirent_slab;
+static struct kmem_cache *raw_inode_slab;
+static struct kmem_cache *tmp_dnode_info_slab;
+static struct kmem_cache *raw_node_ref_slab;
+static struct kmem_cache *node_frag_slab;
+static struct kmem_cache *inode_cache_slab;
 #ifdef CONFIG_JFFS2_FS_XATTR
-static kmem_cache_t *xattr_datum_cache;
-static kmem_cache_t *xattr_ref_cache;
+static struct kmem_cache *xattr_datum_cache;
+static struct kmem_cache *xattr_ref_cache;
 #endif
 
 int __init jffs2_create_slab_caches(void)
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 77be534..7deb782 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -28,7 +28,7 @@
 
 static void jffs2_put_super(struct super_block *);
 
-static kmem_cache_t *jffs2_inode_cachep;
+static struct kmem_cache *jffs2_inode_cachep;
 
 static struct inode *jffs2_alloc_inode(struct super_block *sb)
 {
@@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode)
 	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
 }
 
-static void jffs2_i_init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void jffs2_i_init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo;
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 0cccd1c..b1a1c72 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -74,7 +74,7 @@ static inline void lock_metapage(struct metapage *mp)
 }
 
 #define METAPOOL_MIN_PAGES 32
-static kmem_cache_t *metapage_cache;
+static struct kmem_cache *metapage_cache;
 static mempool_t *metapage_mempool;
 
 #define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE)
@@ -180,7 +180,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 
 #endif
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct metapage *mp = (struct metapage *)foo;
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9c1c6e0..ca3e191 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -44,7 +44,7 @@ MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
 MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
 MODULE_LICENSE("GPL");
 
-static kmem_cache_t * jfs_inode_cachep;
+static struct kmem_cache * jfs_inode_cachep;
 
 static struct super_operations jfs_super_operations;
 static struct export_operations jfs_export_operations;
@@ -748,7 +748,7 @@ static struct file_system_type jfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
 
diff --git a/fs/locks.c b/fs/locks.c
index a7b97d5..1cb0c57 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,7 +142,7 @@ int lease_break_time = 45;
 static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
 
-static kmem_cache_t *filelock_cache __read_mostly;
+static struct kmem_cache *filelock_cache __read_mostly;
 
 /* Allocate an empty lock structure. */
 static struct file_lock *locks_alloc_lock(void)
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(locks_init_lock);
  * Initialises the fields of the file lock which are invariant for
  * free file_locks.
  */
-static void init_once(void *foo, kmem_cache_t *cache, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cache, unsigned long flags)
 {
 	struct file_lock *lock = (struct file_lock *) foo;
 
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 0ff7125..deeb9dc 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -85,7 +85,7 @@ struct mb_cache {
 #ifndef MB_CACHE_INDEXES_COUNT
 	int				c_indexes_count;
 #endif
-	kmem_cache_t			*c_entry_cache;
+	struct kmem_cache			*c_entry_cache;
 	struct list_head		*c_block_hash;
 	struct list_head		*c_indexes_hash[0];
 };
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ce532c2..629e09b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -51,7 +51,7 @@ static void minix_put_super(struct super_block *sb)
 	return;
 }
 
-static kmem_cache_t * minix_inode_cachep;
+static struct kmem_cache * minix_inode_cachep;
 
 static struct inode *minix_alloc_inode(struct super_block *sb)
 {
@@ -67,7 +67,7 @@ static void minix_destroy_inode(struct inode *inode)
 	kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 55442a6..b00ac84 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -36,7 +36,7 @@ static int event;
 
 static struct list_head *mount_hashtable __read_mostly;
 static int hash_mask __read_mostly, hash_bits __read_mostly;
-static kmem_cache_t *mnt_cache __read_mostly;
+static struct kmem_cache *mnt_cache __read_mostly;
 static struct rw_semaphore namespace_sem;
 
 /* /sys/fs */
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index ed84d89..fae5324 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -40,7 +40,7 @@ static void ncp_delete_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
 static int  ncp_statfs(struct dentry *, struct kstatfs *);
 
-static kmem_cache_t * ncp_inode_cachep;
+static struct kmem_cache * ncp_inode_cachep;
 
 static struct inode *ncp_alloc_inode(struct super_block *sb)
 {
@@ -56,7 +56,7 @@ static void ncp_destroy_inode(struct inode *inode)
 	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 769fd0a..2f488e1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -58,7 +58,7 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
-static kmem_cache_t *nfs_direct_cachep;
+static struct kmem_cache *nfs_direct_cachep;
 
 /*
  * This represents a set of asynchronous requests that we're waiting on
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6b53aae..15afa46 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -55,7 +55,7 @@ static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
 static void nfs_zap_acl_cache(struct inode *);
 
-static kmem_cache_t * nfs_inode_cachep;
+static struct kmem_cache * nfs_inode_cachep;
 
 static inline unsigned long
 nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
@@ -1111,7 +1111,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 #endif
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a1561a8..3fbfc2f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,7 +20,7 @@
 
 #define NFS_PARANOIA 1
 
-static kmem_cache_t *nfs_page_cachep;
+static struct kmem_cache *nfs_page_cachep;
 
 static inline struct nfs_page *
 nfs_page_alloc(void)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 56f66f0..244a8c4 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -38,7 +38,7 @@ static int nfs_pagein_one(struct list_head *, struct inode *);
 static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
-static kmem_cache_t *nfs_rdata_cachep;
+static struct kmem_cache *nfs_rdata_cachep;
 static mempool_t *nfs_rdata_mempool;
 
 #define MIN_POOL_READ	(32)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f7dd0d0..41b0728 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -85,7 +85,7 @@ static const struct rpc_call_ops nfs_write_partial_ops;
 static const struct rpc_call_ops nfs_write_full_ops;
 static const struct rpc_call_ops nfs_commit_ops;
 
-static kmem_cache_t *nfs_wdata_cachep;
+static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
 static mempool_t *nfs_commit_mempool;
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e431e93..640c92b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -84,10 +84,10 @@ static void nfs4_set_recdir(char *recdir);
  */
 static DEFINE_MUTEX(client_mutex);
 
-static kmem_cache_t *stateowner_slab = NULL;
-static kmem_cache_t *file_slab = NULL;
-static kmem_cache_t *stateid_slab = NULL;
-static kmem_cache_t *deleg_slab = NULL;
+static struct kmem_cache *stateowner_slab = NULL;
+static struct kmem_cache *file_slab = NULL;
+static struct kmem_cache *stateid_slab = NULL;
+static struct kmem_cache *deleg_slab = NULL;
 
 void
 nfs4_lock_state(void)
@@ -1003,7 +1003,7 @@ alloc_init_file(struct inode *ino)
 }
 
 static void
-nfsd4_free_slab(kmem_cache_t **slab)
+nfsd4_free_slab(struct kmem_cache **slab)
 {
 	if (*slab == NULL)
 		return;
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 01f9150..941acf1 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -66,7 +66,7 @@ static struct file_operations dlmfs_file_operations;
 static struct inode_operations dlmfs_dir_inode_operations;
 static struct inode_operations dlmfs_root_inode_operations;
 static struct inode_operations dlmfs_file_inode_operations;
-static kmem_cache_t *dlmfs_inode_cache;
+static struct kmem_cache *dlmfs_inode_cache;
 
 struct workqueue_struct *user_dlm_worker;
 
@@ -257,7 +257,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 }
 
 static void dlmfs_init_once(void *foo,
-			    kmem_cache_t *cachep,
+			    struct kmem_cache *cachep,
 			    unsigned long flags)
 {
 	struct dlmfs_inode_private *ip =
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f784177..856012b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -221,7 +221,7 @@ EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
 #endif  /*  0  */
 
 
-static kmem_cache_t *dlm_mle_cache = NULL;
+static struct kmem_cache *dlm_mle_cache = NULL;
 
 
 static void dlm_mle_release(struct kref *kref);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index fcd4475..80ac69f 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -61,7 +61,7 @@ struct ocfs2_em_insert_context {
 	struct ocfs2_extent_map_entry *right_ent;
 };
 
-static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
+static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
 
 
 static struct ocfs2_extent_map_entry *
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 46a378f..1a7dd29 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -106,7 +106,7 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
 #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
 
-extern kmem_cache_t *ocfs2_inode_cache;
+extern struct kmem_cache *ocfs2_inode_cache;
 
 extern const struct address_space_operations ocfs2_aops;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7574d26..0524f8a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,7 @@
 
 #include "buffer_head_io.h"
 
-static kmem_cache_t *ocfs2_inode_cachep = NULL;
+static struct kmem_cache *ocfs2_inode_cachep = NULL;
 
 /* OCFS2 needs to schedule several differnt types of work which
  * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -914,7 +914,7 @@ bail:
 }
 
 static void ocfs2_inode_init_once(void *data,
-				  kmem_cache_t *cachep,
+				  struct kmem_cache *cachep,
 				  unsigned long flags)
 {
 	struct ocfs2_inode_info *oi = data;
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 9707ed7..39814b9 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -69,7 +69,7 @@ struct ocfs2_meta_cache_item {
 	sector_t	c_block;
 };
 
-static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
+static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
 
 void ocfs2_metadata_cache_init(struct inode *inode)
 {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 911d1bc..26f44e0 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -330,7 +330,7 @@ out:
 	return 0;
 }
 
-static kmem_cache_t *op_inode_cachep;
+static struct kmem_cache *op_inode_cachep;
 
 static struct inode *openprom_alloc_inode(struct super_block *sb)
 {
@@ -415,7 +415,7 @@ static struct file_system_type openprom_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags)
+static void op_inode_init_once(void *data, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct op_inode_info *oi = (struct op_inode_info *) data;
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b24cdb2..e26945b 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -81,7 +81,7 @@ static void proc_read_inode(struct inode * inode)
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 }
 
-static kmem_cache_t * proc_inode_cachep;
+static struct kmem_cache * proc_inode_cachep;
 
 static struct inode *proc_alloc_inode(struct super_block *sb)
 {
@@ -105,7 +105,7 @@ static void proc_destroy_inode(struct inode *inode)
 	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 5b943eb..c047dc6 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -515,7 +515,7 @@ static void qnx4_read_inode(struct inode *inode)
 	brelse(bh);
 }
 
-static kmem_cache_t *qnx4_inode_cachep;
+static struct kmem_cache *qnx4_inode_cachep;
 
 static struct inode *qnx4_alloc_inode(struct super_block *sb)
 {
@@ -531,7 +531,7 @@ static void qnx4_destroy_inode(struct inode *inode)
 	kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
-static void init_once(void *foo, kmem_cache_t * cachep,
+static void init_once(void *foo, struct kmem_cache * cachep,
 		      unsigned long flags)
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3233251..745bc71 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -490,7 +490,7 @@ static void reiserfs_put_super(struct super_block *s)
 	return;
 }
 
-static kmem_cache_t *reiserfs_inode_cachep;
+static struct kmem_cache *reiserfs_inode_cachep;
 
 static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 {
@@ -507,7 +507,7 @@ static void reiserfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
-static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
 
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index d1b455f..c5af088 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -550,7 +550,7 @@ romfs_read_inode(struct inode *i)
 	}
 }
 
-static kmem_cache_t * romfs_inode_cachep;
+static struct kmem_cache * romfs_inode_cachep;
 
 static struct inode *romfs_alloc_inode(struct super_block *sb)
 {
@@ -566,7 +566,7 @@ static void romfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct romfs_inode_info *ei = (struct romfs_inode_info *) foo;
 
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 2216171..4af4cd7 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -50,7 +50,7 @@ static void smb_put_super(struct super_block *);
 static int  smb_statfs(struct dentry *, struct kstatfs *);
 static int  smb_show_options(struct seq_file *, struct vfsmount *);
 
-static kmem_cache_t *smb_inode_cachep;
+static struct kmem_cache *smb_inode_cachep;
 
 static struct inode *smb_alloc_inode(struct super_block *sb)
 {
@@ -66,7 +66,7 @@ static void smb_destroy_inode(struct inode *inode)
 	kmem_cache_free(smb_inode_cachep, SMB_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
 	unsigned long flagmask = SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index 3eb1402..a4bcae8 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -25,7 +25,7 @@
 #define ROUND_UP(x) (((x)+3) & ~3)
 
 /* cache for request structures */
-static kmem_cache_t *req_cachep;
+static struct kmem_cache *req_cachep;
 
 static int smb_request_send_req(struct smb_request *req);
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 20551a1..e503f85 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -16,7 +16,7 @@
 
 struct vfsmount *sysfs_mount;
 struct super_block * sysfs_sb = NULL;
-kmem_cache_t *sysfs_dir_cachep;
+struct kmem_cache *sysfs_dir_cachep;
 
 static struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6f3d6bd..bd7cec2 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,6 +1,6 @@
 
 extern struct vfsmount * sysfs_mount;
-extern kmem_cache_t *sysfs_dir_cachep;
+extern struct kmem_cache *sysfs_dir_cachep;
 
 extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
 extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index a6ca12b..ead9864 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -301,7 +301,7 @@ static void sysv_delete_inode(struct inode *inode)
 	unlock_kernel();
 }
 
-static kmem_cache_t *sysv_inode_cachep;
+static struct kmem_cache *sysv_inode_cachep;
 
 static struct inode *sysv_alloc_inode(struct super_block *sb)
 {
@@ -318,7 +318,7 @@ static void sysv_destroy_inode(struct inode *inode)
 	kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
 
-static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *p, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e50f242..397f54a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -107,7 +107,7 @@ static struct file_system_type udf_fstype = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static kmem_cache_t * udf_inode_cachep;
+static struct kmem_cache * udf_inode_cachep;
 
 static struct inode *udf_alloc_inode(struct super_block *sb)
 {
@@ -130,7 +130,7 @@ static void udf_destroy_inode(struct inode *inode)
 	kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *) foo;
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 85a88c0..0b18d2c 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1204,7 +1204,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static kmem_cache_t * ufs_inode_cachep;
+static struct kmem_cache * ufs_inode_cachep;
 
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
@@ -1221,7 +1221,7 @@ static void ufs_destroy_inode(struct inode *inode)
 	kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
 
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 47faf27..7f1e929 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -64,7 +64,7 @@
 /* Host-dependent types and defines */
 
 #define ACPI_MACHINE_WIDTH          BITS_PER_LONG
-#define acpi_cache_t                        kmem_cache_t
+#define acpi_cache_t                        struct kmem_cache
 #define acpi_spinlock                   spinlock_t *
 #define ACPI_EXPORT_SYMBOL(symbol)  EXPORT_SYMBOL(symbol);
 #define strtoul                     simple_strtoul
diff --git a/include/asm-arm26/pgalloc.h b/include/asm-arm26/pgalloc.h
index 6437167..7725af3 100644
--- a/include/asm-arm26/pgalloc.h
+++ b/include/asm-arm26/pgalloc.h
@@ -15,7 +15,7 @@
 #include <asm/tlbflush.h>
 #include <linux/slab.h>
 
-extern kmem_cache_t *pte_cache;
+extern struct kmem_cache *pte_cache;
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr){
 	return kmem_cache_alloc(pte_cache, GFP_KERNEL);
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 7d398f4..bfee7dd 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -34,14 +34,14 @@ struct vm_area_struct;
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
 extern pgd_t swapper_pg_dir[1024];
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
+extern struct kmem_cache *pgd_cache;
+extern struct kmem_cache *pmd_cache;
 extern spinlock_t pgd_lock;
 extern struct page *pgd_list;
 
-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_dtor(void *, kmem_cache_t *, unsigned long);
+void pmd_ctor(void *, struct kmem_cache *, unsigned long);
+void pgd_ctor(void *, struct kmem_cache *, unsigned long);
+void pgd_dtor(void *, struct kmem_cache *, unsigned long);
 void pgtable_cache_init(void);
 void paging_init(void);
 
diff --git a/include/asm-powerpc/pgalloc.h b/include/asm-powerpc/pgalloc.h
index ae63db7..b0830db 100644
--- a/include/asm-powerpc/pgalloc.h
+++ b/include/asm-powerpc/pgalloc.h
@@ -11,7 +11,7 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 
-extern kmem_cache_t *pgtable_cache[];
+extern struct kmem_cache *pgtable_cache[];
 
 #ifdef CONFIG_PPC_64K_PAGES
 #define PTE_CACHE_NUM	0
diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h
index 010f9cd..5891ff7 100644
--- a/include/asm-sparc64/pgalloc.h
+++ b/include/asm-sparc64/pgalloc.h
@@ -13,7 +13,7 @@
 #include <asm/page.h>
 
 /* Page table allocation/freeing. */
-extern kmem_cache_t *pgtable_cache;
+extern struct kmem_cache *pgtable_cache;
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 561e2a7..55d1ca5 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -30,7 +30,7 @@
 #ifdef CONFIG_TASK_DELAY_ACCT
 
 extern int delayacct_on;	/* Delay accounting turned on/off */
-extern kmem_cache_t *delayacct_cache;
+extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
 extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 1fb02e1..2514f4e 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -490,7 +490,7 @@ struct i2o_dma {
  */
 struct i2o_pool {
 	char *name;
-	kmem_cache_t *slab;
+	struct kmem_cache *slab;
 	mempool_t *mempool;
 };
 
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index fe89444..dacd566 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -949,7 +949,7 @@ void journal_put_journal_head(struct journal_head *jh);
 /*
  * handle management
  */
-extern kmem_cache_t *jbd_handle_cache;
+extern struct kmem_cache *jbd_handle_cache;
 
 static inline handle_t *jbd_alloc_handle(gfp_t gfp_flags)
 {
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index ddb1287..98a0ae5 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -958,7 +958,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh);
 /*
  * handle management
  */
-extern kmem_cache_t *jbd2_handle_cache;
+extern struct kmem_cache *jbd2_handle_cache;
 
 static inline handle_t *jbd_alloc_handle(gfp_t gfp_flags)
 {
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f13299a..03636d7 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -235,7 +235,7 @@ struct raid5_private_data {
 	 */
 	int			active_name;
 	char			cache_name[2][20];
-	kmem_cache_t		*slab_cache; /* for allocating stripes */
+	struct kmem_cache		*slab_cache; /* for allocating stripes */
 
 	int			seq_flush, seq_write;
 	int			quiesce;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 61c2ab6..36f8503 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -30,7 +30,7 @@ struct anon_vma {
 
 #ifdef CONFIG_MMU
 
-extern kmem_cache_t *anon_vma_cachep;
+extern struct kmem_cache *anon_vma_cachep;
 
 static inline struct anon_vma *anon_vma_alloc(void)
 {
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1d649f3..4ff3940 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -345,7 +345,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 	return __alloc_skb(size, priority, 1, -1);
 }
 
-extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
+extern struct sk_buff *alloc_skb_from_cache(struct kmem_cache *cp,
 					    unsigned int size,
 					    gfp_t priority);
 extern void	       kfree_skbmem(struct sk_buff *skb);
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index f81a5af..ce8a912 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -12,7 +12,7 @@
 #include <net/genetlink.h>
 
 #ifdef CONFIG_TASKSTATS
-extern kmem_cache_t *taskstats_cache;
+extern struct kmem_cache *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
 
 static inline void taskstats_exit_free(struct taskstats *tidstats)
diff --git a/include/net/dst.h b/include/net/dst.h
index e156e38..62b7e75 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -98,7 +98,7 @@ struct dst_ops
 	int			entry_size;
 
 	atomic_t		entries;
-	kmem_cache_t 		*kmem_cachep;
+	struct kmem_cache 		*kmem_cachep;
 };
 
 #ifdef __KERNEL__
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index a9eb2ea..34cc76e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -125,7 +125,7 @@ struct inet_hashinfo {
 	rwlock_t			lhash_lock ____cacheline_aligned;
 	atomic_t			lhash_users;
 	wait_queue_head_t		lhash_wait;
-	kmem_cache_t			*bind_bucket_cachep;
+	struct kmem_cache			*bind_bucket_cachep;
 };
 
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
@@ -136,10 +136,10 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
 }
 
 extern struct inet_bind_bucket *
-		    inet_bind_bucket_create(kmem_cache_t *cachep,
+		    inet_bind_bucket_create(struct kmem_cache *cachep,
 					    struct inet_bind_hashbucket *head,
 					    const unsigned short snum);
-extern void inet_bind_bucket_destroy(kmem_cache_t *cachep,
+extern void inet_bind_bucket_destroy(struct kmem_cache *cachep,
 				     struct inet_bind_bucket *tb);
 
 static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index c8aacbd..2396703 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -160,7 +160,7 @@ struct neigh_table
 	atomic_t		entries;
 	rwlock_t		lock;
 	unsigned long		last_rand;
-	kmem_cache_t		*kmem_cachep;
+	struct kmem_cache		*kmem_cachep;
 	struct neigh_statistics	*stats;
 	struct neighbour	**hash_buckets;
 	unsigned int		hash_mask;
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index cef3136..41bcc9e 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -7,7 +7,7 @@
 #include <net/netfilter/nf_conntrack.h>
 
 extern struct list_head nf_conntrack_expect_list;
-extern kmem_cache_t *nf_conntrack_expect_cachep;
+extern struct kmem_cache *nf_conntrack_expect_cachep;
 extern struct file_operations exp_file_ops;
 
 struct nf_conntrack_expect
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 426f0fe..7aed02c 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -29,7 +29,7 @@ struct proto;
 struct request_sock_ops {
 	int		family;
 	int		obj_size;
-	kmem_cache_t	*slab;
+	struct kmem_cache	*slab;
 	int		(*rtx_syn_ack)(struct sock *sk,
 				       struct request_sock *req,
 				       struct dst_entry *dst);
diff --git a/include/net/sock.h b/include/net/sock.h
index fe3a33f..730899c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -571,7 +571,7 @@ struct proto {
 	int			*sysctl_rmem;
 	int			max_header;
 
-	kmem_cache_t		*slab;
+	struct kmem_cache		*slab;
 	unsigned int		obj_size;
 
 	atomic_t		*orphan_count;
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
index d7a306e..1e1ee32 100644
--- a/include/net/timewait_sock.h
+++ b/include/net/timewait_sock.h
@@ -15,7 +15,7 @@
 #include <net/sock.h>
 
 struct timewait_sock_ops {
-	kmem_cache_t	*twsk_slab;
+	struct kmem_cache	*twsk_slab;
 	unsigned int	twsk_obj_size;
 	int		(*twsk_unique)(struct sock *sk,
 				       struct sock *sktw, void *twp);
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index 9233ed5..0c775fc 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -557,7 +557,7 @@ struct sas_task {
 
 static inline struct sas_task *sas_alloc_task(gfp_t flags)
 {
-	extern kmem_cache_t *sas_task_cache;
+	extern struct kmem_cache *sas_task_cache;
 	struct sas_task *task = kmem_cache_alloc(sas_task_cache, flags);
 
 	if (task) {
@@ -575,7 +575,7 @@ static inline struct sas_task *sas_alloc_task(gfp_t flags)
 static inline void sas_free_task(struct sas_task *task)
 {
 	if (task) {
-		extern kmem_cache_t *sas_task_cache;
+		extern struct kmem_cache *sas_task_cache;
 		BUG_ON(!list_empty(&task->list));
 		kmem_cache_free(sas_task_cache, task);
 	}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 813bb94..3acc166 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -90,7 +90,7 @@ static struct super_operations mqueue_super_ops;
 static void remove_notification(struct mqueue_inode_info *info);
 
 static spinlock_t mq_lock;
-static kmem_cache_t *mqueue_inode_cachep;
+static struct kmem_cache *mqueue_inode_cachep;
 static struct vfsmount *mqueue_mnt;
 
 static unsigned int queues_count;
@@ -211,7 +211,7 @@ static int mqueue_get_sb(struct file_system_type *fs_type,
 	return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
 }
 
-static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 70e9ec6..766d591 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
 #include <linux/delayacct.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
-kmem_cache_t *delayacct_cache;
+struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 711aa5f..2cf74ed 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -82,26 +82,26 @@ int nr_processes(void)
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 # define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
 # define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
-static kmem_cache_t *task_struct_cachep;
+static struct kmem_cache *task_struct_cachep;
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-static kmem_cache_t *signal_cachep;
+static struct kmem_cache *signal_cachep;
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
-kmem_cache_t *sighand_cachep;
+struct kmem_cache *sighand_cachep;
 
 /* SLAB cache for files_struct structures (tsk->files) */
-kmem_cache_t *files_cachep;
+struct kmem_cache *files_cachep;
 
 /* SLAB cache for fs_struct structures (tsk->fs) */
-kmem_cache_t *fs_cachep;
+struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-kmem_cache_t *vm_area_cachep;
+struct kmem_cache *vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
-static kmem_cache_t *mm_cachep;
+static struct kmem_cache *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
@@ -1421,7 +1421,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
-static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct sighand_struct *sighand = data;
 
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392..a48879b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -31,7 +31,7 @@
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct hlist_head *pid_hash;
 static int pidhash_shift;
-static kmem_cache_t *pid_cachep;
+static struct kmem_cache *pid_cachep;
 
 int pid_max = PID_MAX_DEFAULT;
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1..5fe87de 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
 /*
  * Lets keep our timers in a slab cache :-)
  */
-static kmem_cache_t *posix_timers_cache;
+static struct kmem_cache *posix_timers_cache;
 static struct idr posix_timers_id;
 static DEFINE_SPINLOCK(idr_lock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index df18c16..8e19d27 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -33,7 +33,7 @@
  * SLAB caches for signal bits.
  */
 
-static kmem_cache_t *sigqueue_cachep;
+static struct kmem_cache *sigqueue_cachep;
 
 /*
  * In POSIX a signal is sent either to a specific thread (Linux task)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 1b2b326..f5f9201 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
 
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
-kmem_cache_t *taskstats_cache;
+struct kmem_cache *taskstats_cache;
 
 static struct genl_family family = {
 	.id		= GENL_ID_GENERATE,
diff --git a/kernel/user.c b/kernel/user.c
index c1f93c1..4869563 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
 #define uidhashentry(uid)	(uidhash_table + __uidhashfn((uid)))
 
-static kmem_cache_t *uid_cachep;
+static struct kmem_cache *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
 
 /*
diff --git a/lib/idr.c b/lib/idr.c
index 16d2143..7185353 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -33,7 +33,7 @@
 #include <linux/string.h>
 #include <linux/idr.h>
 
-static kmem_cache_t *idr_layer_cache;
+static struct kmem_cache *idr_layer_cache;
 
 static struct idr_layer *alloc_layer(struct idr *idp)
 {
@@ -445,7 +445,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 }
 EXPORT_SYMBOL(idr_replace);
 
-static void idr_cache_ctor(void * idr_layer, kmem_cache_t *idr_layer_cache,
+static void idr_cache_ctor(void * idr_layer, struct kmem_cache *idr_layer_cache,
 		unsigned long flags)
 {
 	memset(idr_layer, 0, sizeof(struct idr_layer));
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index aa9bfd0..9eb2595 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -63,7 +63,7 @@ static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
 /*
  * Radix tree node cache.
  */
-static kmem_cache_t *radix_tree_node_cachep;
+static struct kmem_cache *radix_tree_node_cachep;
 
 /*
  * Per-cpu pool of preloaded nodes
@@ -846,7 +846,7 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
+radix_tree_node_ctor(void *node, struct kmem_cache *cachep, unsigned long flags)
 {
 	memset(node, 0, sizeof(struct radix_tree_node));
 }
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9f0486..8ca448d 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -23,7 +23,7 @@
 #include <asm/atomic.h>
 #include "br_private.h"
 
-static kmem_cache_t *br_fdb_cache __read_mostly;
+static struct kmem_cache *br_fdb_cache __read_mostly;
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		      const unsigned char *addr);
 
diff --git a/net/core/flow.c b/net/core/flow.c
index 5df3e29..104c25d 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -44,7 +44,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
 
 #define flow_table(cpu) (per_cpu(flow_tables, cpu))
 
-static kmem_cache_t *flow_cachep __read_mostly;
+static struct kmem_cache *flow_cachep __read_mostly;
 
 static int flow_lwm, flow_hwm;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7217fb8..de7801d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -68,8 +68,8 @@
 
 #include "kmap_skb.h"
 
-static kmem_cache_t *skbuff_head_cache __read_mostly;
-static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+static struct kmem_cache *skbuff_head_cache __read_mostly;
+static struct kmem_cache *skbuff_fclone_cache __read_mostly;
 
 /*
  *	Keep out-of-line to prevent kernel bloat.
@@ -144,7 +144,7 @@ EXPORT_SYMBOL(skb_truesize_bug);
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 			    int fclone, int node)
 {
-	kmem_cache_t *cache;
+	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
@@ -211,7 +211,7 @@ nodata:
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
+struct sk_buff *alloc_skb_from_cache(struct kmem_cache *cp,
 				     unsigned int size,
 				     gfp_t gfp_mask)
 {
diff --git a/net/core/sock.c b/net/core/sock.c
index 419c7d3..4a432da 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -841,7 +841,7 @@ struct sock *sk_alloc(int family, gfp_t priority,
 		      struct proto *prot, int zero_it)
 {
 	struct sock *sk = NULL;
-	kmem_cache_t *slab = prot->slab;
+	struct kmem_cache *slab = prot->slab;
 
 	if (slab != NULL)
 		sk = kmem_cache_alloc(slab, priority);
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index bdf1bb7..1f4727d 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -21,8 +21,8 @@
 
 #include <net/sock.h>
 
-static kmem_cache_t *dccp_ackvec_slab;
-static kmem_cache_t *dccp_ackvec_record_slab;
+static struct kmem_cache *dccp_ackvec_slab;
+static struct kmem_cache *dccp_ackvec_record_slab;
 
 static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
 {
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index ff05e59..d8cf92f 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -55,9 +55,9 @@ static inline void ccids_read_unlock(void)
 #define ccids_read_unlock() do { } while(0)
 #endif
 
-static kmem_cache_t *ccid_kmem_cache_create(int obj_size, const char *fmt,...)
+static struct kmem_cache *ccid_kmem_cache_create(int obj_size, const char *fmt,...)
 {
-	kmem_cache_t *slab;
+	struct kmem_cache *slab;
 	char slab_name_fmt[32], *slab_name;
 	va_list args;
 
@@ -75,7 +75,7 @@ static kmem_cache_t *ccid_kmem_cache_create(int obj_size, const char *fmt,...)
 	return slab;
 }
 
-static void ccid_kmem_cache_destroy(kmem_cache_t *slab)
+static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
 {
 	if (slab != NULL) {
 		const char *name = kmem_cache_name(slab);
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c7c2951..bcc2d12 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -27,9 +27,9 @@ struct ccid_operations {
 	unsigned char	ccid_id;
 	const char	*ccid_name;
 	struct module	*ccid_owner;
-	kmem_cache_t	*ccid_hc_rx_slab;
+	struct kmem_cache	*ccid_hc_rx_slab;
 	__u32		ccid_hc_rx_obj_size;
-	kmem_cache_t	*ccid_hc_tx_slab;
+	struct kmem_cache	*ccid_hc_tx_slab;
 	__u32		ccid_hc_tx_obj_size;
 	int		(*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
 	int		(*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 0ae85f0..eb25701 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -20,7 +20,7 @@
 #define DCCP_LI_HIST_IVAL_F_LENGTH  8
 
 struct dccp_li_hist {
-	kmem_cache_t *dccplih_slab;
+	struct kmem_cache *dccplih_slab;
 };
 
 extern struct dccp_li_hist *dccp_li_hist_new(const char *name);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 067cf1c..9a8bcf2 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -68,14 +68,14 @@ struct dccp_rx_hist_entry {
 };
 
 struct dccp_tx_hist {
-	kmem_cache_t *dccptxh_slab;
+	struct kmem_cache *dccptxh_slab;
 };
 
 extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name);
 extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist);
 
 struct dccp_rx_hist {
-	kmem_cache_t *dccprxh_slab;
+	struct kmem_cache *dccprxh_slab;
 };
 
 extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 101e5cc..13b2421 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n
 static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ];
 static DEFINE_RWLOCK(dn_fib_tables_lock);
 
-static kmem_cache_t *dn_hash_kmem __read_mostly;
+static struct kmem_cache *dn_hash_kmem __read_mostly;
 static int dn_fib_hash_zombies;
 
 static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4463443..648f47c 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
 
 #include "fib_lookup.h"
 
-static kmem_cache_t *fn_hash_kmem __read_mostly;
-static kmem_cache_t *fn_alias_kmem __read_mostly;
+static struct kmem_cache *fn_hash_kmem __read_mostly;
+static struct kmem_cache *fn_alias_kmem __read_mostly;
 
 struct fib_node {
 	struct hlist_node	fn_hash;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 6be6caf..cfb249c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -172,7 +172,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
 static void tnode_free(struct tnode *tn);
 
-static kmem_cache_t *fn_alias_kmem __read_mostly;
+static struct kmem_cache *fn_alias_kmem __read_mostly;
 static struct trie *trie_local = NULL, *trie_main = NULL;
 
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bd6c9bc..8c79c8a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -27,7 +27,7 @@
  * Allocate and initialize a new local port bind bucket.
  * The bindhash mutex for snum's hash chain must be held here.
  */
-struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 						 struct inet_bind_hashbucket *head,
 						 const unsigned short snum)
 {
@@ -45,7 +45,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
 /*
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
-void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
 {
 	if (hlist_empty(&tb->owners)) {
 		__hlist_del(&tb->node);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index f072f38..711eb6d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -73,7 +73,7 @@
 /* Exported for inet_getid inline function.  */
 DEFINE_SPINLOCK(inet_peer_idlock);
 
-static kmem_cache_t *peer_cachep __read_mostly;
+static struct kmem_cache *peer_cachep __read_mostly;
 
 #define node_height(x) x->avl_height
 static struct inet_peer peer_fake_node = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index efcf45e..ecb5422 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -105,7 +105,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
    In this case data path is free of exclusive locks at all.
  */
 
-static kmem_cache_t *mrt_cachep __read_mostly;
+static struct kmem_cache *mrt_cachep __read_mostly;
 
 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 8832eb5..8086787 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -44,7 +44,7 @@
 static struct list_head *ip_vs_conn_tab;
 
 /*  SLAB cache for IPVS connections */
-static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
+static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
 
 /*  counter for current IPVS connections */
 static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index f4b0e68..8556a4f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -65,8 +65,8 @@ static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size __read_mostly = 0;
 int ip_conntrack_max __read_mostly;
 struct list_head *ip_conntrack_hash __read_mostly;
-static kmem_cache_t *ip_conntrack_cachep __read_mostly;
-static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
+static struct kmem_cache *ip_conntrack_cachep __read_mostly;
+static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
 struct ip_conntrack ip_conntrack_untracked;
 unsigned int ip_ct_log_invalid __read_mostly;
 static LIST_HEAD(unconfirmed);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 97a8cfb..96d8310 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -50,7 +50,7 @@
 
 struct rt6_statistics	rt6_stats;
 
-static kmem_cache_t * fib6_node_kmem __read_mostly;
+static struct kmem_cache * fib6_node_kmem __read_mostly;
 
 enum fib_walk_state_t
 {
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index d4f68b0..12e426b 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -50,7 +50,7 @@ static u32 xfrm6_tunnel_spi;
 #define XFRM6_TUNNEL_SPI_MIN	1
 #define XFRM6_TUNNEL_SPI_MAX	0xffffffff
 
-static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly;
+static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly;
 
 #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
 #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index eaa0f8a..a9638ff 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -108,7 +108,7 @@ static struct {
 	size_t size;
 
 	/* slab cache pointer */
-	kmem_cache_t *cachep;
+	struct kmem_cache *cachep;
 
 	/* allocated slab cache + modules which uses this slab cache */
 	int use;
@@ -147,7 +147,7 @@ int nf_conntrack_register_cache(u_int32_t features, const char *name,
 {
 	int ret = 0;
 	char *cache_name;
-	kmem_cache_t *cachep;
+	struct kmem_cache *cachep;
 
 	DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
 	       features, name, size);
@@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_register_cache);
 /* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */
 void nf_conntrack_unregister_cache(u_int32_t features)
 {
-	kmem_cache_t *cachep;
+	struct kmem_cache *cachep;
 	char *name;
 
 	/*
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 588d379..c20f901 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -29,7 +29,7 @@
 LIST_HEAD(nf_conntrack_expect_list);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_list);
 
-kmem_cache_t *nf_conntrack_expect_cachep __read_mostly;
+struct kmem_cache *nf_conntrack_expect_cachep __read_mostly;
 static unsigned int nf_conntrack_expect_next_id;
 
 /* nf_conntrack_expect helper functions */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index a98de0b..a5a6e19 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -92,7 +92,7 @@ struct xt_hashlimit_htable {
 static DEFINE_SPINLOCK(hashlimit_lock);	/* protects htables list */
 static DEFINE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
 static HLIST_HEAD(hashlimit_htables);
-static kmem_cache_t *hashlimit_cachep __read_mostly;
+static struct kmem_cache *hashlimit_cachep __read_mostly;
 
 static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
 {
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 11f3b54..f2ba861 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -79,8 +79,8 @@ static struct sctp_pf *sctp_pf_inet_specific;
 static struct sctp_af *sctp_af_v4_specific;
 static struct sctp_af *sctp_af_v6_specific;
 
-kmem_cache_t *sctp_chunk_cachep __read_mostly;
-kmem_cache_t *sctp_bucket_cachep __read_mostly;
+struct kmem_cache *sctp_chunk_cachep __read_mostly;
+struct kmem_cache *sctp_bucket_cachep __read_mostly;
 
 /* Return the address of the control sock. */
 struct sock *sctp_get_ctl_sock(void)
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 8d55d10..30927d3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -65,7 +65,7 @@
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
 
-extern kmem_cache_t *sctp_chunk_cachep;
+extern struct kmem_cache *sctp_chunk_cachep;
 
 SCTP_STATIC
 struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 4960779..1e8132b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -107,7 +107,7 @@ static void sctp_sock_migrate(struct sock *, struct sock *,
 			      struct sctp_association *, sctp_socket_type_t);
 static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG;
 
-extern kmem_cache_t *sctp_bucket_cachep;
+extern struct kmem_cache *sctp_bucket_cachep;
 
 /* Get the sndbuf space available at the time on the association.  */
 static inline int sctp_wspace(struct sctp_association *asoc)
diff --git a/net/socket.c b/net/socket.c
index 4f417c2..43eff48 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -230,7 +230,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr,
 
 #define SOCKFS_MAGIC 0x534F434B
 
-static kmem_cache_t *sock_inode_cachep __read_mostly;
+static struct kmem_cache *sock_inode_cachep __read_mostly;
 
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
@@ -257,7 +257,7 @@ static void sock_destroy_inode(struct inode *inode)
 			container_of(inode, struct socket_alloc, vfs_inode));
 }
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct socket_alloc *ei = (struct socket_alloc *)foo;
 
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index df753d0..19703aa 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -33,7 +33,7 @@ static int rpc_mount_count;
 static struct file_system_type rpc_pipe_fs_type;
 
 
-static kmem_cache_t *rpc_inode_cachep __read_mostly;
+static struct kmem_cache *rpc_inode_cachep __read_mostly;
 
 #define RPC_UPCALL_TIMEOUT (30*HZ)
 
@@ -824,7 +824,7 @@ static struct file_system_type rpc_pipe_fs_type = {
 };
 
 static void
-init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct rpc_inode *rpci = (struct rpc_inode *) foo;
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index eff44bc..225e651 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -34,8 +34,8 @@ static int			rpc_task_id;
 #define RPC_BUFFER_MAXSIZE	(2048)
 #define RPC_BUFFER_POOLSIZE	(8)
 #define RPC_TASK_POOLSIZE	(8)
-static kmem_cache_t	*rpc_task_slabp __read_mostly;
-static kmem_cache_t	*rpc_buffer_slabp __read_mostly;
+static struct kmem_cache	*rpc_task_slabp __read_mostly;
+static struct kmem_cache	*rpc_buffer_slabp __read_mostly;
 static mempool_t	*rpc_task_mempool __read_mostly;
 static mempool_t	*rpc_buffer_mempool __read_mostly;
 
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
index ae6ddf0..eb80778 100644
--- a/net/tipc/handler.c
+++ b/net/tipc/handler.c
@@ -42,7 +42,7 @@ struct queue_item {
 	unsigned long data;
 };
 
-static kmem_cache_t *tipc_queue_item_cache;
+static struct kmem_cache *tipc_queue_item_cache;
 static struct list_head signal_queue_head;
 static DEFINE_SPINLOCK(qitem_lock);
 static int handler_enabled = 0;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index a898a6a..414f890 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -12,7 +12,7 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-static kmem_cache_t *secpath_cachep __read_mostly;
+static struct kmem_cache *secpath_cachep __read_mostly;
 
 void __secpath_destroy(struct sec_path *sp)
 {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f6c77bd..3f3f563 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -39,7 +39,7 @@ EXPORT_SYMBOL(xfrm_policy_count);
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
-static kmem_cache_t *xfrm_dst_cache __read_mostly;
+static struct kmem_cache *xfrm_dst_cache __read_mostly;
 
 static struct work_struct xfrm_policy_gc_work;
 static HLIST_HEAD(xfrm_policy_gc_list);
diff --git a/security/keys/key.c b/security/keys/key.c
index 157bac6..0db816f 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -20,7 +20,7 @@
 #include <linux/err.h>
 #include "internal.h"
 
-static kmem_cache_t	*key_jar;
+static struct kmem_cache	*key_jar;
 struct rb_root		key_serial_tree; /* tree of keys indexed by serial */
 DEFINE_SPINLOCK(key_serial_lock);
 
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 65b4ec9..e7c0b5e 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -124,7 +124,7 @@ DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
 
 static struct avc_cache avc_cache;
 static struct avc_callback_node *avc_callbacks;
-static kmem_cache_t *avc_node_cachep;
+static struct kmem_cache *avc_node_cachep;
 
 static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
 {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index ac1aeed..44e9cd4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -124,7 +124,7 @@ static struct security_operations *secondary_ops = NULL;
 static LIST_HEAD(superblock_security_head);
 static DEFINE_SPINLOCK(sb_security_lock);
 
-static kmem_cache_t *sel_inode_cache;
+static struct kmem_cache *sel_inode_cache;
 
 /* Return security context for a given sid or just the context 
    length if the buffer is null or length is 0 */
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index 2dfc613..ebb993c 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -28,7 +28,7 @@
  (keyp->source_type << 9)) & \
  AVTAB_HASH_MASK)
 
-static kmem_cache_t *avtab_node_cachep;
+static struct kmem_cache *avtab_node_cachep;
 
 static struct avtab_node*
 avtab_insert_node(struct avtab *h, int hvalue,
-- 
cgit v0.10.2


From 1b1cec4bbc59feac89670d5d6d222a02545bac94 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:33:22 -0800
Subject: [PATCH] slab: deprecate kmem_cache_t

Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/slab.h b/include/linux/slab.h
index fbcfc20..2271886 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -7,16 +7,17 @@
 #ifndef _LINUX_SLAB_H
 #define	_LINUX_SLAB_H
 
-#if	defined(__KERNEL__)
+#ifdef __KERNEL__
 
-/* kmem_cache_t exists for legacy reasons and is not used by code in mm */
-typedef struct kmem_cache kmem_cache_t;
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
+#include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
+#include <linux/compiler.h>
 
-#include	<linux/gfp.h>
-#include	<linux/init.h>
-#include	<linux/types.h>
-#include	<asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
-#include	<asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
+/* kmem_cache_t exists for legacy reasons and is not used by code in mm */
+typedef struct kmem_cache kmem_cache_t __deprecated;
 
 /* flags to pass to kmem_cache_create().
  * The first 3 are only valid when the allocator as been build
-- 
cgit v0.10.2


From 5bcd234d881d83ac0259c6d42d98f134e31c60a8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:24 -0800
Subject: [PATCH] slab: fix two issues in kmalloc_node / __cache_alloc_node

This addresses two issues:

1. Kmalloc_node() may intermittently return NULL if we are allocating
   from the current node and are unable to obtain memory for the current
   node from the page allocator.  This is because we call ___cache_alloc()
   if nodeid == numa_node_id() and ____cache_alloc is not able to fallback
   to other nodes.

   This was introduced in the 2.6.19 development cycle.  <= 2.6.18 in
   that case does not do a restricted allocation and blindly trusts the
   page allocator to have given us memory from the indicated node.  It
   inserts the page regardless of the node it came from into the queues for
   the current node.

2. If kmalloc_node() is used on a node that has not been bootstrapped
   yet then we may try to pass an invalid node number to
   ____cache_alloc_node() triggering a BUG().

   Change the function to call fallback_alloc() instead.  Only call
   fallback_alloc() if we are allowed to fallback at all.  The need to
   handle a node not bootstrapped yet also first surfaced in the 2.6.19
   cycle.

Update the comments since they were still describing the old kmalloc_node
from 2.6.12.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/slab.c b/mm/slab.c
index bb831ba..6da554f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3459,29 +3459,45 @@ out:
  * @flags: See kmalloc().
  * @nodeid: node number of the target node.
  *
- * Identical to kmem_cache_alloc, except that this function is slow
- * and can sleep. And it will allocate memory on the given node, which
- * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
  */
 static __always_inline void *
 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 		int nodeid, void *caller)
 {
 	unsigned long save_flags;
-	void *ptr;
+	void *ptr = NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 
-	if (nodeid == -1 || nodeid == numa_node_id() ||
-			!cachep->nodelists[nodeid])
-		ptr = ____cache_alloc(cachep, flags);
-	else
-		ptr = ____cache_alloc_node(cachep, flags, nodeid);
-	local_irq_restore(save_flags);
+	if (unlikely(nodeid == -1))
+		nodeid = numa_node_id();
 
+	if (likely(cachep->nodelists[nodeid])) {
+		if (nodeid == numa_node_id()) {
+			/*
+			 * Use the locally cached objects if possible.
+			 * However ____cache_alloc does not allow fallback
+			 * to other nodes. It may fail while we still have
+			 * objects on other nodes available.
+			 */
+			ptr = ____cache_alloc(cachep, flags);
+		}
+		if (!ptr) {
+			/* ___cache_alloc_node can fall back to other nodes */
+			ptr = ____cache_alloc_node(cachep, flags, nodeid);
+		}
+	} else {
+		/* Node not bootstrapped yet */
+		if (!(flags & __GFP_THISNODE))
+			ptr = fallback_alloc(cachep, flags);
+	}
+
+	local_irq_restore(save_flags);
 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
 
 	return ptr;
-- 
cgit v0.10.2


From 952f3b51beb592f3f1de15adcdef802fc086ea91 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:26 -0800
Subject: [PATCH] GFP_THISNODE must not trigger global reclaim

The intent of GFP_THISNODE is to make sure that an allocation occurs on a
particular node.  If this is not possible then NULL needs to be returned so
that the caller can choose what to do next on its own (the slab allocator
depends on that).

However, GFP_THISNODE currently triggers reclaim before returning a failure
(GFP_THISNODE means GFP_NORETRY is set).  If we have over allocated a node
then we will currently do some reclaim before returning NULL.  The caller
may want memory from other nodes before reclaim should be triggered.  (If
the caller wants reclaim then he can directly use __GFP_THISNODE instead).

There is no flag to avoid reclaim in the page allocator and adding yet
another GFP_xx flag would be difficult given that we are out of available
flags.

So just compare and see if all bits for GFP_THISNODE (__GFP_THISNODE,
__GFP_NORETRY and __GFP_NOWARN) are set.  If so then we return NULL before
waking up kswapd.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d123b3..dc8753b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1151,6 +1151,17 @@ restart:
 	if (page)
 		goto got_pg;
 
+	/*
+	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
+	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+	 * using a larger set of nodes after it has established that the
+	 * allowed per node queues are empty and that nodes are
+	 * over allocated.
+	 */
+	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+		goto nopage;
+
 	for (z = zonelist->zones; *z; z++)
 		wakeup_kswapd(*z, order);
 
-- 
cgit v0.10.2


From 3c517a6132098ca37e122a2980fc64a9e798b0d7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:29 -0800
Subject: [PATCH] slab: better fallback allocation behavior

Currently we simply attempt to allocate from all allowed nodes using
GFP_THISNODE.  However, GFP_THISNODE does not do reclaim (it wont do any at
all if the recent GFP_THISNODE patch is accepted).  If we truly run out of
memory in the whole system then fallback_alloc may return NULL although
memory may still be available if we would perform more thorough reclaim.

This patch changes fallback_alloc() so that we first only inspect all the
per node queues for available slabs.  If we find any then we allocate from
those.  This avoids slab fragmentation by first getting rid of all partial
allocated slabs on every node before allocating new memory.

If we cannot satisfy the allocation from any per node queue then we extend
a slab.  We now call into the page allocator without specifying
GFP_THISNODE.  The page allocator will then implement its own fallback (in
the given cpuset context), perform necessary reclaim (again considering not
a single node but the whole set of allowed nodes) and then return pages for
a new slab.

We identify from which node the pages were allocated and then insert the
pages into the corresponding per node structure.  In order to do so we need
to modify cache_grow() to take a parameter that specifies the new slab.
kmem_getpages() can no longer set the GFP_THISNODE flag since we need to be
able to use kmem_getpage to allocate from an arbitrary node.  GFP_THISNODE
needs to be specified when calling cache_grow().

One key advantage is that the decision from which node to allocate new
memory is removed from slab fallback processing.  The patch allows to go
back to use of the page allocators fallback/reclaim logic.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/slab.c b/mm/slab.c
index 6da554f..7b8e5d6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1605,12 +1605,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	flags |= __GFP_COMP;
 #endif
 
-	/*
-	 * Under NUMA we want memory on the indicated node. We will handle
-	 * the needed fallback ourselves since we want to serve from our
-	 * per node object lists first for other nodes.
-	 */
-	flags |= cachep->gfpflags | GFP_THISNODE;
+	flags |= cachep->gfpflags;
 
 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
 	if (!page)
@@ -2567,7 +2562,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	if (OFF_SLAB(cachep)) {
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
-					      local_flags, nodeid);
+					      local_flags & ~GFP_THISNODE, nodeid);
 		if (!slabp)
 			return NULL;
 	} else {
@@ -2708,10 +2703,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+		gfp_t flags, int nodeid, void *objp)
 {
 	struct slab *slabp;
-	void *objp;
 	size_t offset;
 	gfp_t local_flags;
 	unsigned long ctor_flags;
@@ -2763,12 +2758,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Get mem for the objs.  Attempt to allocate a physical page from
 	 * 'nodeid'.
 	 */
-	objp = kmem_getpages(cachep, flags, nodeid);
+	if (!objp)
+		objp = kmem_getpages(cachep, flags, nodeid);
 	if (!objp)
 		goto failed;
 
 	/* Get slab management. */
-	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
+	slabp = alloc_slabmgmt(cachep, objp, offset,
+			local_flags & ~GFP_THISNODE, nodeid);
 	if (!slabp)
 		goto opps1;
 
@@ -3006,7 +3003,7 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, node);
+		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
@@ -3166,9 +3163,11 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 
 /*
  * Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fall back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
  */
 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 {
@@ -3176,15 +3175,51 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 					->node_zonelists[gfp_zone(flags)];
 	struct zone **z;
 	void *obj = NULL;
+	int nid;
 
+retry:
+	/*
+	 * Look through allowed nodes for objects available
+	 * from existing per node queues.
+	 */
 	for (z = zonelist->zones; *z && !obj; z++) {
-		int nid = zone_to_nid(*z);
+		nid = zone_to_nid(*z);
+
+		if (cpuset_zone_allowed(*z, flags) &&
+			cache->nodelists[nid] &&
+			cache->nodelists[nid]->free_objects)
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+	}
 
-		if (zone_idx(*z) <= ZONE_NORMAL &&
-				cpuset_zone_allowed(*z, flags) &&
-				cache->nodelists[nid])
-			obj = ____cache_alloc_node(cache,
-					flags | __GFP_THISNODE, nid);
+	if (!obj) {
+		/*
+		 * This allocation will be performed within the constraints
+		 * of the current cpuset / memory policy requirements.
+		 * We may trigger various forms of reclaim on the allowed
+		 * set and go into memory reserves if necessary.
+		 */
+		obj = kmem_getpages(cache, flags, -1);
+		if (obj) {
+			/*
+			 * Insert into the appropriate per node queues
+			 */
+			nid = page_to_nid(virt_to_page(obj));
+			if (cache_grow(cache, flags, nid, obj)) {
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+				if (!obj)
+					/*
+					 * Another processor may allocate the
+					 * objects in the slab since we are
+					 * not holding any locks.
+					 */
+					goto retry;
+			} else {
+				kmem_freepages(cache, obj);
+				obj = NULL;
+			}
+		}
 	}
 	return obj;
 }
@@ -3241,7 +3276,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
-	x = cache_grow(cachep, flags, nodeid);
+	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
 	if (x)
 		goto retry;
 
-- 
cgit v0.10.2


From 33f2ef89f8e181486b63fdbdc97c6afa6ca9f34b Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Wed, 6 Dec 2006 20:33:32 -0800
Subject: [PATCH] mm: make compound page destructor handling explicit

Currently we we use the lru head link of the second page of a compound page
to hold its destructor.  This was ok when it was purely an internal
implmentation detail.  However, hugetlbfs overrides this destructor
violating the layering.  Abstract this out as explicit calls, also
introduce a type for the callback function allowing them to be type
checked.  For each callback we pre-declare the function, causing a type
error on definition rather than on use elsewhere.

[akpm@osdl.org: cleanups]
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e266fe..a17b147 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -296,6 +296,24 @@ void put_pages_list(struct list_head *pages);
 void split_page(struct page *page, unsigned int order);
 
 /*
+ * Compound pages have a destructor function.  Provide a
+ * prototype for that function and accessor functions.
+ * These are _only_ valid on the head of a PG_compound page.
+ */
+typedef void compound_page_dtor(struct page *);
+
+static inline void set_compound_page_dtor(struct page *page,
+						compound_page_dtor *dtor)
+{
+	page[1].lru.next = (void *)dtor;
+}
+
+static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
+{
+	return (compound_page_dtor *)page[1].lru.next;
+}
+
+/*
  * Multiple processes may "see" the same page. E.g. for untouched
  * mappings of /dev/null, all processes see the same page full of
  * zeroes, and text pages of executables and shared libraries have
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2911a36..0ccc7f2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void)
 	if (nid == MAX_NUMNODES)
 		nid = first_node(node_online_map);
 	if (page) {
-		page[1].lru.next = (void *)free_huge_page;	/* dtor */
+		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dc8753b..d539f83 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -230,7 +230,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	set_compound_page_dtor(page, free_compound_page);
 	page[1].lru.prev = (void *)order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
diff --git a/mm/swap.c b/mm/swap.c
index d9a3770..017e72c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page)
 {
 	page = (struct page *)page_private(page);
 	if (put_page_testzero(page)) {
-		void (*dtor)(struct page *page);
+		compound_page_dtor *dtor;
 
-		dtor = (void (*)(struct page *))page[1].lru.next;
+		dtor = get_compound_page_dtor(page);
 		(*dtor)(page);
 	}
 }
-- 
cgit v0.10.2


From 36de6437866bbb1d37e2312ff4f95ee4ed6d2b61 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Wed, 6 Dec 2006 20:33:42 -0800
Subject: [PATCH] Save some bytes in struct mm_struct

Before:
[acme@newtoy net-2.6.20]$ pahole --cacheline 32 kernel/sched.o mm_struct

/* include2/asm/processor.h:542 */
struct mm_struct {
        struct vm_area_struct *    mmap;                 /*     0     4 */
        struct rb_root             mm_rb;                /*     4     4 */
        struct vm_area_struct *    mmap_cache;           /*     8     4 */
        long unsigned int          (*get_unmapped_area)(); /*    12     4 */
        void                       (*unmap_area)();      /*    16     4 */
        long unsigned int          mmap_base;            /*    20     4 */
        long unsigned int          task_size;            /*    24     4 */
        long unsigned int          cached_hole_size;     /*    28     4 */
        /* ---------- cacheline 1 boundary ---------- */
        long unsigned int          free_area_cache;      /*    32     4 */
        pgd_t *                    pgd;                  /*    36     4 */
        atomic_t                   mm_users;             /*    40     4 */
        atomic_t                   mm_count;             /*    44     4 */
        int                        map_count;            /*    48     4 */
        struct rw_semaphore        mmap_sem;             /*    52    64 */
        spinlock_t                 page_table_lock;      /*   116    40 */
        struct list_head           mmlist;               /*   156     8 */
        mm_counter_t               _file_rss;            /*   164     4 */
        mm_counter_t               _anon_rss;            /*   168     4 */
        long unsigned int          hiwater_rss;          /*   172     4 */
        long unsigned int          hiwater_vm;           /*   176     4 */
        long unsigned int          total_vm;             /*   180     4 */
        long unsigned int          locked_vm;            /*   184     4 */
        long unsigned int          shared_vm;            /*   188     4 */
        /* ---------- cacheline 6 boundary ---------- */
        long unsigned int          exec_vm;              /*   192     4 */
        long unsigned int          stack_vm;             /*   196     4 */
        long unsigned int          reserved_vm;          /*   200     4 */
        long unsigned int          def_flags;            /*   204     4 */
        long unsigned int          nr_ptes;              /*   208     4 */
        long unsigned int          start_code;           /*   212     4 */
        long unsigned int          end_code;             /*   216     4 */
        long unsigned int          start_data;           /*   220     4 */
        /* ---------- cacheline 7 boundary ---------- */
        long unsigned int          end_data;             /*   224     4 */
        long unsigned int          start_brk;            /*   228     4 */
        long unsigned int          brk;                  /*   232     4 */
        long unsigned int          start_stack;          /*   236     4 */
        long unsigned int          arg_start;            /*   240     4 */
        long unsigned int          arg_end;              /*   244     4 */
        long unsigned int          env_start;            /*   248     4 */
        long unsigned int          env_end;              /*   252     4 */
        /* ---------- cacheline 8 boundary ---------- */
        long unsigned int          saved_auxv[44];       /*   256   176 */
        unsigned int               dumpable:2;           /*   432     4 */
        cpumask_t                  cpu_vm_mask;          /*   436     4 */
        mm_context_t               context;              /*   440    68 */
        long unsigned int          swap_token_time;      /*   508     4 */
        /* ---------- cacheline 16 boundary ---------- */
        char                       recent_pagein;        /*   512     1 */

        /* XXX 3 bytes hole, try to pack */

        int                        core_waiters;         /*   516     4 */
        struct completion *        core_startup_done;    /*   520     4 */
        struct completion          core_done;            /*   524    52 */
        rwlock_t                   ioctx_list_lock;      /*   576    36 */
        struct kioctx *            ioctx_list;           /*   612     4 */
}; /* size: 616, sum members: 613, holes: 1, sum holes: 3, cachelines: 20,
      last cacheline: 8 bytes */

After:

[acme@newtoy net-2.6.20]$ pahole --cacheline 32 kernel/sched.o mm_struct
/* include2/asm/processor.h:542 */
struct mm_struct {
        struct vm_area_struct *    mmap;                 /*     0     4 */
        struct rb_root             mm_rb;                /*     4     4 */
        struct vm_area_struct *    mmap_cache;           /*     8     4 */
        long unsigned int          (*get_unmapped_area)(); /*    12     4 */
        void                       (*unmap_area)();      /*    16     4 */
        long unsigned int          mmap_base;            /*    20     4 */
        long unsigned int          task_size;            /*    24     4 */
        long unsigned int          cached_hole_size;     /*    28     4 */
        /* ---------- cacheline 1 boundary ---------- */
        long unsigned int          free_area_cache;      /*    32     4 */
        pgd_t *                    pgd;                  /*    36     4 */
        atomic_t                   mm_users;             /*    40     4 */
        atomic_t                   mm_count;             /*    44     4 */
        int                        map_count;            /*    48     4 */
        struct rw_semaphore        mmap_sem;             /*    52    64 */
        spinlock_t                 page_table_lock;      /*   116    40 */
        struct list_head           mmlist;               /*   156     8 */
        mm_counter_t               _file_rss;            /*   164     4 */
        mm_counter_t               _anon_rss;            /*   168     4 */
        long unsigned int          hiwater_rss;          /*   172     4 */
        long unsigned int          hiwater_vm;           /*   176     4 */
        long unsigned int          total_vm;             /*   180     4 */
        long unsigned int          locked_vm;            /*   184     4 */
        long unsigned int          shared_vm;            /*   188     4 */
        /* ---------- cacheline 6 boundary ---------- */
        long unsigned int          exec_vm;              /*   192     4 */
        long unsigned int          stack_vm;             /*   196     4 */
        long unsigned int          reserved_vm;          /*   200     4 */
        long unsigned int          def_flags;            /*   204     4 */
        long unsigned int          nr_ptes;              /*   208     4 */
        long unsigned int          start_code;           /*   212     4 */
        long unsigned int          end_code;             /*   216     4 */
        long unsigned int          start_data;           /*   220     4 */
        /* ---------- cacheline 7 boundary ---------- */
        long unsigned int          end_data;             /*   224     4 */
        long unsigned int          start_brk;            /*   228     4 */
        long unsigned int          brk;                  /*   232     4 */
        long unsigned int          start_stack;          /*   236     4 */
        long unsigned int          arg_start;            /*   240     4 */
        long unsigned int          arg_end;              /*   244     4 */
        long unsigned int          env_start;            /*   248     4 */
        long unsigned int          env_end;              /*   252     4 */
        /* ---------- cacheline 8 boundary ---------- */
        long unsigned int          saved_auxv[44];       /*   256   176 */
        cpumask_t                  cpu_vm_mask;          /*   432     4 */
        mm_context_t               context;              /*   436    68 */
        long unsigned int          swap_token_time;      /*   504     4 */
        char                       recent_pagein;        /*   508     1 */
        unsigned char              dumpable:2;           /*   509     1 */

        /* XXX 2 bytes hole, try to pack */

        int                        core_waiters;         /*   512     4 */
        struct completion *        core_startup_done;    /*   516     4 */
        struct completion          core_done;            /*   520    52 */
        rwlock_t                   ioctx_list_lock;      /*   572    36 */
        struct kioctx *            ioctx_list;           /*   608     4 */
}; /* size: 612, sum members: 610, holes: 1, sum holes: 2, cachelines: 20,
      last cacheline: 4 bytes */

[acme@newtoy net-2.6.20]$ codiff -V /tmp/sched.o.before kernel/sched.o
/pub/scm/linux/kernel/git/acme/net-2.6.20/kernel/sched.c:
  struct mm_struct |   -4
    dumpable:2;
     from: unsigned int          /*   432(30)    4(2) */
     to:   unsigned char         /*   509(6)     1(2) */
< SNIP other offset changes >
 1 struct changed
[acme@newtoy net-2.6.20]$

I'm not aware of any problem about using 2 byte wide bitfields where
previously a 4 byte wide one was, holler if there is any, I wouldn't be
surprised, bitfields are things from hell.

For the curious, 432(30) means: at offset 432 from the struct start, at
offset 30 in the bitfield (yeah, it comes backwards, hellish, huh?) ditto
for 509(6), while 4(2) and 1(2) means "struct field size(bitfield size)".

Now we have a 2 bytes hole and are using only 4 bytes of the last 32
bytes cacheline, any takers? :-)

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cad6a16..acfd2e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -338,7 +338,6 @@ struct mm_struct {
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
-	unsigned dumpable:2;
 	cpumask_t cpu_vm_mask;
 
 	/* Architecture-specific MM context */
@@ -355,6 +354,8 @@ struct mm_struct {
 	unsigned int token_priority;
 	unsigned int last_interval;
 
+	unsigned char dumpable:2;
+
 	/* coredumping support */
 	int core_waiters;
 	struct completion *core_startup_done, core_done;
-- 
cgit v0.10.2


From 7cf9c2c76c1a17b32f2da85b50cd4fe468ed44b5 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 6 Dec 2006 20:33:44 -0800
Subject: [PATCH] radix-tree: RCU lockless readside

Make radix tree lookups safe to be performed without locks.  Readers are
protected against nodes being deleted by using RCU based freeing.  Readers
are protected against new node insertion by using memory barriers to ensure
the node itself will be properly written before it is visible in the radix
tree.

Each radix tree node keeps a record of their height (above leaf nodes).
This height does not change after insertion -- when the radix tree is
extended, higher nodes are only inserted in the top.  So a lookup can take
the pointer to what is *now* the root node, and traverse down it even if
the tree is concurrently extended and this node becomes a subtree of a new
root.

"Direct" pointers (tree height of 0, where root->rnode points directly to
the data item) are handled by using the low bit of the pointer to signal
whether rnode is a direct pointer or a pointer to a radix tree node.

When a reader wants to traverse the next branch, they will take a copy of
the pointer.  This pointer will be either NULL (and the branch is empty) or
non-NULL (and will point to a valid node).

[akpm@osdl.org: cleanups]
[Lee.Schermerhorn@hp.com: bugfixes, comments, simplifications]
[clameter@sgi.com: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index cbfa115..0deb842 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (C) 2001 Momchil Velikov
  * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2006 Nick Piggin
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -21,6 +22,35 @@
 
 #include <linux/preempt.h>
 #include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+
+/*
+ * A direct pointer (root->rnode pointing directly to a data item,
+ * rather than another radix_tree_node) is signalled by the low bit
+ * set in the root->rnode pointer.
+ *
+ * In this case root->height is also NULL, but the direct pointer tests are
+ * needed for RCU lookups when root->height is unreliable.
+ */
+#define RADIX_TREE_DIRECT_PTR	1
+
+static inline void *radix_tree_ptr_to_direct(void *ptr)
+{
+	return (void *)((unsigned long)ptr | RADIX_TREE_DIRECT_PTR);
+}
+
+static inline void *radix_tree_direct_to_ptr(void *ptr)
+{
+	return (void *)((unsigned long)ptr & ~RADIX_TREE_DIRECT_PTR);
+}
+
+static inline int radix_tree_is_direct_ptr(void *ptr)
+{
+	return (int)((unsigned long)ptr & RADIX_TREE_DIRECT_PTR);
+}
+
+/*** radix-tree API starts here ***/
 
 #define RADIX_TREE_MAX_TAGS 2
 
@@ -47,6 +77,77 @@ do {									\
 	(root)->rnode = NULL;						\
 } while (0)
 
+/**
+ * Radix-tree synchronization
+ *
+ * The radix-tree API requires that users provide all synchronisation (with
+ * specific exceptions, noted below).
+ *
+ * Synchronization of access to the data items being stored in the tree, and
+ * management of their lifetimes must be completely managed by API users.
+ *
+ * For API usage, in general,
+ * - any function _modifying_ the the tree or tags (inserting or deleting
+ *   items, setting or clearing tags must exclude other modifications, and
+ *   exclude any functions reading the tree.
+ * - any function _reading_ the the tree or tags (looking up items or tags,
+ *   gang lookups) must exclude modifications to the tree, but may occur
+ *   concurrently with other readers.
+ *
+ * The notable exceptions to this rule are the following functions:
+ * radix_tree_lookup
+ * radix_tree_tag_get
+ * radix_tree_gang_lookup
+ * radix_tree_gang_lookup_tag
+ * radix_tree_tagged
+ *
+ * The first 4 functions are able to be called locklessly, using RCU. The
+ * caller must ensure calls to these functions are made within rcu_read_lock()
+ * regions. Other readers (lock-free or otherwise) and modifications may be
+ * running concurrently.
+ *
+ * It is still required that the caller manage the synchronization and lifetimes
+ * of the items. So if RCU lock-free lookups are used, typically this would mean
+ * that the items have their own locks, or are amenable to lock-free access; and
+ * that the items are freed by RCU (or only freed after having been deleted from
+ * the radix tree *and* a synchronize_rcu() grace period).
+ *
+ * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
+ * access to data items when inserting into or looking up from the radix tree)
+ *
+ * radix_tree_tagged is able to be called without locking or RCU.
+ */
+
+/**
+ * radix_tree_deref_slot	- dereference a slot
+ * @pslot:	pointer to slot, returned by radix_tree_lookup_slot
+ * Returns:	item that was stored in that slot with any direct pointer flag
+ *		removed.
+ *
+ * For use with radix_tree_lookup_slot().  Caller must hold tree at least read
+ * locked across slot lookup and dereference.  More likely, will be used with
+ * radix_tree_replace_slot(), as well, so caller will hold tree write locked.
+ */
+static inline void *radix_tree_deref_slot(void **pslot)
+{
+	return radix_tree_direct_to_ptr(*pslot);
+}
+/**
+ * radix_tree_replace_slot	- replace item in a slot
+ * @pslot:	pointer to slot, returned by radix_tree_lookup_slot
+ * @item:	new item to store in the slot.
+ *
+ * For use with radix_tree_lookup_slot().  Caller must hold tree write locked
+ * across slot lookup and replacement.
+ */
+static inline void radix_tree_replace_slot(void **pslot, void *item)
+{
+	BUG_ON(radix_tree_is_direct_ptr(item));
+	rcu_assign_pointer(*pslot,
+		(void *)((unsigned long)item |
+			((unsigned long)*pslot & RADIX_TREE_DIRECT_PTR)));
+}
+
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9eb2595..e2cefab 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2001 Momchil Velikov
  * Portions Copyright (C) 2001 Christoph Hellwig
  * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ * Copyright (C) 2006 Nick Piggin
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -30,6 +31,7 @@
 #include <linux/gfp.h>
 #include <linux/string.h>
 #include <linux/bitops.h>
+#include <linux/rcupdate.h>
 
 
 #ifdef __KERNEL__
@@ -45,7 +47,9 @@
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
 struct radix_tree_node {
+	unsigned int	height;		/* Height from the bottom */
 	unsigned int	count;
+	struct rcu_head	rcu_head;
 	void		*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
@@ -100,13 +104,21 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 			rtp->nr--;
 		}
 	}
+	BUG_ON(radix_tree_is_direct_ptr(ret));
 	return ret;
 }
 
+static void radix_tree_node_rcu_free(struct rcu_head *head)
+{
+	struct radix_tree_node *node =
+			container_of(head, struct radix_tree_node, rcu_head);
+	kmem_cache_free(radix_tree_node_cachep, node);
+}
+
 static inline void
 radix_tree_node_free(struct radix_tree_node *node)
 {
-	kmem_cache_free(radix_tree_node_cachep, node);
+	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
 }
 
 /*
@@ -222,11 +234,12 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 	}
 
 	do {
+		unsigned int newheight;
 		if (!(node = radix_tree_node_alloc(root)))
 			return -ENOMEM;
 
 		/* Increase the height.  */
-		node->slots[0] = root->rnode;
+		node->slots[0] = radix_tree_direct_to_ptr(root->rnode);
 
 		/* Propagate the aggregated tag info into the new root */
 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -234,9 +247,11 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 				tag_set(node, tag, 0);
 		}
 
+		newheight = root->height+1;
+		node->height = newheight;
 		node->count = 1;
-		root->rnode = node;
-		root->height++;
+		rcu_assign_pointer(root->rnode, node);
+		root->height = newheight;
 	} while (height > root->height);
 out:
 	return 0;
@@ -258,6 +273,8 @@ int radix_tree_insert(struct radix_tree_root *root,
 	int offset;
 	int error;
 
+	BUG_ON(radix_tree_is_direct_ptr(item));
+
 	/* Make sure the tree is high enough.  */
 	if (index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
@@ -275,11 +292,12 @@ int radix_tree_insert(struct radix_tree_root *root,
 			/* Have to add a child node.  */
 			if (!(slot = radix_tree_node_alloc(root)))
 				return -ENOMEM;
+			slot->height = height;
 			if (node) {
-				node->slots[offset] = slot;
+				rcu_assign_pointer(node->slots[offset], slot);
 				node->count++;
 			} else
-				root->rnode = slot;
+				rcu_assign_pointer(root->rnode, slot);
 		}
 
 		/* Go a level down */
@@ -295,11 +313,11 @@ int radix_tree_insert(struct radix_tree_root *root,
 
 	if (node) {
 		node->count++;
-		node->slots[offset] = item;
+		rcu_assign_pointer(node->slots[offset], item);
 		BUG_ON(tag_get(node, 0, offset));
 		BUG_ON(tag_get(node, 1, offset));
 	} else {
-		root->rnode = item;
+		rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item));
 		BUG_ON(root_tag_get(root, 0));
 		BUG_ON(root_tag_get(root, 1));
 	}
@@ -308,49 +326,54 @@ int radix_tree_insert(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_insert);
 
-static inline void **__lookup_slot(struct radix_tree_root *root,
-				   unsigned long index)
+/**
+ *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Returns:  the slot corresponding to the position @index in the
+ *	radix tree @root. This is useful for update-if-exists operations.
+ *
+ *	This function cannot be called under rcu_read_lock, it must be
+ *	excluded from writers, as must the returned slot for subsequent
+ *	use by radix_tree_deref_slot() and radix_tree_replace slot.
+ *	Caller must hold tree write locked across slot lookup and
+ *	replace.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
 {
 	unsigned int height, shift;
-	struct radix_tree_node **slot;
-
-	height = root->height;
+	struct radix_tree_node *node, **slot;
 
-	if (index > radix_tree_maxindex(height))
+	node = root->rnode;
+	if (node == NULL)
 		return NULL;
 
-	if (height == 0 && root->rnode)
+	if (radix_tree_is_direct_ptr(node)) {
+		if (index > 0)
+			return NULL;
 		return (void **)&root->rnode;
+	}
+
+	height = node->height;
+	if (index > radix_tree_maxindex(height))
+		return NULL;
 
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = &root->rnode;
 
-	while (height > 0) {
-		if (*slot == NULL)
+	do {
+		slot = (struct radix_tree_node **)
+			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
+		node = *slot;
+		if (node == NULL)
 			return NULL;
 
-		slot = (struct radix_tree_node **)
-			((*slot)->slots +
-				((index >> shift) & RADIX_TREE_MAP_MASK));
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
-	}
+	} while (height > 0);
 
 	return (void **)slot;
 }
-
-/**
- *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Lookup the slot corresponding to the position @index in the radix tree
- *	@root. This is useful for update-if-exists operations.
- */
-void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
-{
-	return __lookup_slot(root, index);
-}
 EXPORT_SYMBOL(radix_tree_lookup_slot);
 
 /**
@@ -359,13 +382,45 @@ EXPORT_SYMBOL(radix_tree_lookup_slot);
  *	@index:		index key
  *
  *	Lookup the item at the position @index in the radix tree @root.
+ *
+ *	This function can be called under rcu_read_lock, however the caller
+ *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
+ *	them safely). No RCU barriers are required to access or modify the
+ *	returned item, however.
  */
 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 {
-	void **slot;
+	unsigned int height, shift;
+	struct radix_tree_node *node, **slot;
+
+	node = rcu_dereference(root->rnode);
+	if (node == NULL)
+		return NULL;
+
+	if (radix_tree_is_direct_ptr(node)) {
+		if (index > 0)
+			return NULL;
+		return radix_tree_direct_to_ptr(node);
+	}
+
+	height = node->height;
+	if (index > radix_tree_maxindex(height))
+		return NULL;
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
-	slot = __lookup_slot(root, index);
-	return slot != NULL ? *slot : NULL;
+	do {
+		slot = (struct radix_tree_node **)
+			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
+		node = rcu_dereference(*slot);
+		if (node == NULL)
+			return NULL;
+
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	} while (height > 0);
+
+	return node;
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
@@ -495,27 +550,30 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
 	unsigned int height, shift;
-	struct radix_tree_node *slot;
+	struct radix_tree_node *node;
 	int saw_unset_tag = 0;
 
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		return 0;
-
 	/* check the root's tag bit */
 	if (!root_tag_get(root, tag))
 		return 0;
 
-	if (height == 0)
-		return 1;
+	node = rcu_dereference(root->rnode);
+	if (node == NULL)
+		return 0;
+
+	if (radix_tree_is_direct_ptr(node))
+		return (index == 0);
+
+	height = node->height;
+	if (index > radix_tree_maxindex(height))
+		return 0;
 
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
 
 	for ( ; ; ) {
 		int offset;
 
-		if (slot == NULL)
+		if (node == NULL)
 			return 0;
 
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -524,15 +582,15 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 		 * This is just a debug check.  Later, we can bale as soon as
 		 * we see an unset tag.
 		 */
-		if (!tag_get(slot, tag, offset))
+		if (!tag_get(node, tag, offset))
 			saw_unset_tag = 1;
 		if (height == 1) {
-			int ret = tag_get(slot, tag, offset);
+			int ret = tag_get(node, tag, offset);
 
 			BUG_ON(ret && saw_unset_tag);
 			return !!ret;
 		}
-		slot = slot->slots[offset];
+		node = rcu_dereference(node->slots[offset]);
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	}
@@ -541,47 +599,45 @@ EXPORT_SYMBOL(radix_tree_tag_get);
 #endif
 
 static unsigned int
-__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+__lookup(struct radix_tree_node *slot, void **results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)
 {
 	unsigned int nr_found = 0;
 	unsigned int shift, height;
-	struct radix_tree_node *slot;
 	unsigned long i;
 
-	height = root->height;
-	if (height == 0) {
-		if (root->rnode && index == 0)
-			results[nr_found++] = root->rnode;
+	height = slot->height;
+	if (height == 0)
 		goto out;
-	}
-
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
 
 	for ( ; height > 1; height--) {
-
-		for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
-				i < RADIX_TREE_MAP_SIZE; i++) {
+		i = (index >> shift) & RADIX_TREE_MAP_MASK;
+		for (;;) {
 			if (slot->slots[i] != NULL)
 				break;
 			index &= ~((1UL << shift) - 1);
 			index += 1UL << shift;
 			if (index == 0)
 				goto out;	/* 32-bit wraparound */
+			i++;
+			if (i == RADIX_TREE_MAP_SIZE)
+				goto out;
 		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
 
 		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
+		slot = rcu_dereference(slot->slots[i]);
+		if (slot == NULL)
+			goto out;
 	}
 
 	/* Bottom level: grab some items */
 	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+		struct radix_tree_node *node;
 		index++;
-		if (slot->slots[i]) {
-			results[nr_found++] = slot->slots[i];
+		node = slot->slots[i];
+		if (node) {
+			results[nr_found++] = rcu_dereference(node);
 			if (nr_found == max_items)
 				goto out;
 		}
@@ -603,28 +659,51 @@ out:
  *	*@results.
  *
  *	The implementation is naive.
+ *
+ *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
+ *	rcu_read_lock. In this case, rather than the returned results being
+ *	an atomic snapshot of the tree at a single point in time, the semantics
+ *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
+ *	have been issued in individual locks, and results stored in 'results'.
  */
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items)
 {
-	const unsigned long max_index = radix_tree_maxindex(root->height);
+	unsigned long max_index;
+	struct radix_tree_node *node;
 	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
+	unsigned int ret;
+
+	node = rcu_dereference(root->rnode);
+	if (!node)
+		return 0;
 
+	if (radix_tree_is_direct_ptr(node)) {
+		if (first_index > 0)
+			return 0;
+		node = radix_tree_direct_to_ptr(node);
+		results[0] = rcu_dereference(node);
+		return 1;
+	}
+
+	max_index = radix_tree_maxindex(node->height);
+
+	ret = 0;
 	while (ret < max_items) {
 		unsigned int nr_found;
 		unsigned long next_index;	/* Index of next search */
 
 		if (cur_index > max_index)
 			break;
-		nr_found = __lookup(root, results + ret, cur_index,
+		nr_found = __lookup(node, results + ret, cur_index,
 					max_items - ret, &next_index);
 		ret += nr_found;
 		if (next_index == 0)
 			break;
 		cur_index = next_index;
 	}
+
 	return ret;
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup);
@@ -634,55 +713,64 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  * open-coding the search.
  */
 static unsigned int
-__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index, unsigned int tag)
 {
 	unsigned int nr_found = 0;
-	unsigned int shift;
-	unsigned int height = root->height;
-	struct radix_tree_node *slot;
+	unsigned int shift, height;
 
-	if (height == 0) {
-		if (root->rnode && index == 0)
-			results[nr_found++] = root->rnode;
+	height = slot->height;
+	if (height == 0)
 		goto out;
-	}
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
-	do {
-		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+	while (height > 0) {
+		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
 
-		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (tag_get(slot, tag, i)) {
-				BUG_ON(slot->slots[i] == NULL);
+		for (;;) {
+			if (tag_get(slot, tag, i))
 				break;
-			}
 			index &= ~((1UL << shift) - 1);
 			index += 1UL << shift;
 			if (index == 0)
 				goto out;	/* 32-bit wraparound */
+			i++;
+			if (i == RADIX_TREE_MAP_SIZE)
+				goto out;
 		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
 		height--;
 		if (height == 0) {	/* Bottom level: grab some items */
 			unsigned long j = index & RADIX_TREE_MAP_MASK;
 
 			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+				struct radix_tree_node *node;
 				index++;
-				if (tag_get(slot, tag, j)) {
-					BUG_ON(slot->slots[j] == NULL);
-					results[nr_found++] = slot->slots[j];
+				if (!tag_get(slot, tag, j))
+					continue;
+				node = slot->slots[j];
+				/*
+				 * Even though the tag was found set, we need to
+				 * recheck that we have a non-NULL node, because
+				 * if this lookup is lockless, it may have been
+				 * subsequently deleted.
+				 *
+				 * Similar care must be taken in any place that
+				 * lookup ->slots[x] without a lock (ie. can't
+				 * rely on its value remaining the same).
+				 */
+				if (node) {
+					node = rcu_dereference(node);
+					results[nr_found++] = node;
 					if (nr_found == max_items)
 						goto out;
 				}
 			}
 		}
 		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	} while (height > 0);
+		slot = rcu_dereference(slot->slots[i]);
+		if (slot == NULL)
+			break;
+	}
 out:
 	*next_index = index;
 	return nr_found;
@@ -706,27 +794,44 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag)
 {
-	const unsigned long max_index = radix_tree_maxindex(root->height);
+	struct radix_tree_node *node;
+	unsigned long max_index;
 	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
+	unsigned int ret;
 
 	/* check the root's tag bit */
 	if (!root_tag_get(root, tag))
 		return 0;
 
+	node = rcu_dereference(root->rnode);
+	if (!node)
+		return 0;
+
+	if (radix_tree_is_direct_ptr(node)) {
+		if (first_index > 0)
+			return 0;
+		node = radix_tree_direct_to_ptr(node);
+		results[0] = rcu_dereference(node);
+		return 1;
+	}
+
+	max_index = radix_tree_maxindex(node->height);
+
+	ret = 0;
 	while (ret < max_items) {
 		unsigned int nr_found;
 		unsigned long next_index;	/* Index of next search */
 
 		if (cur_index > max_index)
 			break;
-		nr_found = __lookup_tag(root, results + ret, cur_index,
+		nr_found = __lookup_tag(node, results + ret, cur_index,
 					max_items - ret, &next_index, tag);
 		ret += nr_found;
 		if (next_index == 0)
 			break;
 		cur_index = next_index;
 	}
+
 	return ret;
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
@@ -742,8 +847,19 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 			root->rnode->count == 1 &&
 			root->rnode->slots[0]) {
 		struct radix_tree_node *to_free = root->rnode;
+		void *newptr;
 
-		root->rnode = to_free->slots[0];
+		/*
+		 * We don't need rcu_assign_pointer(), since we are simply
+		 * moving the node from one part of the tree to another. If
+		 * it was safe to dereference the old pointer to it
+		 * (to_free->slots[0]), it will be safe to dereference the new
+		 * one (root->rnode).
+		 */
+		newptr = to_free->slots[0];
+		if (root->height == 1)
+			newptr = radix_tree_ptr_to_direct(newptr);
+		root->rnode = newptr;
 		root->height--;
 		/* must only free zeroed nodes into the slab */
 		tag_clear(to_free, 0, 0);
@@ -767,6 +883,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
 	struct radix_tree_node *slot = NULL;
+	struct radix_tree_node *to_free;
 	unsigned int height, shift;
 	int tag;
 	int offset;
@@ -777,6 +894,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 
 	slot = root->rnode;
 	if (height == 0 && root->rnode) {
+		slot = radix_tree_direct_to_ptr(slot);
 		root_tag_clear_all(root);
 		root->rnode = NULL;
 		goto out;
@@ -809,10 +927,17 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 			radix_tree_tag_clear(root, index, tag);
 	}
 
+	to_free = NULL;
 	/* Now free the nodes we do not need anymore */
 	while (pathp->node) {
 		pathp->node->slots[pathp->offset] = NULL;
 		pathp->node->count--;
+		/*
+		 * Queue the node for deferred freeing after the
+		 * last reference to it disappears (set NULL, above).
+		 */
+		if (to_free)
+			radix_tree_node_free(to_free);
 
 		if (pathp->node->count) {
 			if (pathp->node == root->rnode)
@@ -821,13 +946,15 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 		}
 
 		/* Node with zero slots in use so free it */
-		radix_tree_node_free(pathp->node);
-
+		to_free = pathp->node;
 		pathp--;
+
 	}
 	root_tag_clear_all(root);
 	root->height = 0;
 	root->rnode = NULL;
+	if (to_free)
+		radix_tree_node_free(to_free);
 
 out:
 	return slot;
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d4..e9b161b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page)
 {
-	struct page **radix_pointer;
+	void **pslot;
 
 	if (!mapping) {
 		/* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
 	write_lock_irq(&mapping->tree_lock);
 
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ 					page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
-			*radix_pointer != page) {
+			(struct page *)radix_tree_deref_slot(pslot) != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	/*
 	 * Now we know that no one else is looking at the page.
 	 */
-	get_page(newpage);
+	get_page(newpage);	/* add cache reference */
 #ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	}
 #endif
 
-	*radix_pointer = newpage;
+	radix_tree_replace_slot(pslot, newpage);
+
+	/*
+	 * Drop cache reference from old page.
+	 * We know this isn't the last reference.
+	 */
 	__put_page(page);
+
 	write_unlock_irq(&mapping->tree_lock);
 
 	return 0;
-- 
cgit v0.10.2


From 48ad504ee7d598431cb2d0b2f01c6d1aff1d2a07 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Wed, 6 Dec 2006 20:33:47 -0800
Subject: [PATCH] security/keys/*: user kmemdup()

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/security/keys/key.c b/security/keys/key.c
index 0db816f..ac9326c 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -290,11 +290,9 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 		goto no_memory_2;
 
 	if (desc) {
-		key->description = kmalloc(desclen, GFP_KERNEL);
+		key->description = kmemdup(desc, desclen, GFP_KERNEL);
 		if (!key->description)
 			goto no_memory_3;
-
-		memcpy(key->description, desc, desclen);
 	}
 
 	atomic_set(&key->usage, 1);
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index e8d02ac..ad45ce7 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -706,12 +706,10 @@ int __key_link(struct key *keyring, struct key *key)
 				BUG_ON(size > PAGE_SIZE);
 
 				ret = -ENOMEM;
-				nklist = kmalloc(size, GFP_KERNEL);
+				nklist = kmemdup(klist, size, GFP_KERNEL);
 				if (!nklist)
 					goto error2;
 
-				memcpy(nklist, klist, size);
-
 				/* replace matched key */
 				atomic_inc(&key->usage);
 				nklist->keys[loop] = key;
-- 
cgit v0.10.2


From e9c1528a429c831458e54c8701a0b80ba563a7a8 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:33:50 -0800
Subject: [PATCH] arch/frv/kernel/futex.c must #include <linux/uaccess.h>

This patch fixes the following compile error with
-Werror-implicit-function-declaration
(without -Werror-implicit-function-declaration it's a link error):

    ...
      CC      arch/frv/kernel/futex.o
    /home/bunk/linux/kernel-2.6/linux-2.6.19-rc6-mm2/arch/frv/kernel/futex.c:
    In function 'futex_atomic_op_inuser':
    /home/bunk/linux/kernel-2.6/linux-2.6.19-rc6-mm2/arch/frv/kernel/futex.c:203:
    error: implicit declaration of function 'pagefault_disable'
    /home/bunk/linux/kernel-2.6/linux-2.6.19-rc6-mm2/arch/frv/kernel/futex.c:226:
    error: implicit declaration of function 'pagefault_enable'
    make[2]: *** [arch/frv/kernel/futex.o] Error 1
    ...

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index 53dc5ed..14f64b0 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -10,9 +10,9 @@
  */
 
 #include <linux/futex.h>
+#include <linux/uaccess.h>
 #include <asm/futex.h>
 #include <asm/errno.h>
-#include <asm/uaccess.h>
 
 /*
  * the various futex operations; MMU fault checking is ignored under no-MMU
-- 
cgit v0.10.2


From 4d3eeeac97a6e4fc1ff3ad184f1c3bf328de7cb6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 6 Dec 2006 20:33:54 -0800
Subject: [PATCH] avr32: fixup kprobes preemption handling

While working on SH kprobes, I noticed that avr32 got the preemption
handling wrong in the no probe case.  The idea is that upon entry of
kprobe_handler() preemption is disabled outright across the life of the
kprobe, only to be re-enabled in post_kprobe_handler().

However, in the event that the probe is never activated, there's never any
chance of hitting the post probe handler, which allows for the current
avr32 implementation to disable preemption indefinitely, as it's currently
missing a re-enable when no probe is activated.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/avr32/kernel/kprobes.c b/arch/avr32/kernel/kprobes.c
index ca41fc1..d0abbca 100644
--- a/arch/avr32/kernel/kprobes.c
+++ b/arch/avr32/kernel/kprobes.c
@@ -154,6 +154,7 @@ ss_probe:
 	return 1;
 
 no_kprobe:
+	preempt_enable_no_resched();
 	return ret;
 }
 
-- 
cgit v0.10.2


From 3869aa292fbd24103b8338937cb351459efe3f82 Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Wed, 6 Dec 2006 20:33:57 -0800
Subject: [PATCH] h8300 stray bracket fix

Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c
index 1077b71..6adf8f4 100644
--- a/arch/h8300/kernel/setup.c
+++ b/arch/h8300/kernel/setup.c
@@ -116,7 +116,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #else
 	if ((memory_end < CONFIG_BLKDEV_RESERVE_ADDRESS) && 
-	    (memory_end > CONFIG_BLKDEV_RESERVE_ADDRESS)
+	    (memory_end > CONFIG_BLKDEV_RESERVE_ADDRESS))
 	    /* overlap userarea */
 	    memory_end = CONFIG_BLKDEV_RESERVE_ADDRESS; 
 #endif
-- 
cgit v0.10.2


From 074cec54d1049ab580ecd0026623b553e0e270c4 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 6 Dec 2006 20:33:59 -0800
Subject: [PATCH] alpha: switch to pci_get API

Now that we have pci_get_bus_and_slot we can do the job correctly.  Note that
some of these calls intentionally leak a device - this is because the device
in question is always needed from boot to reboot.

Signed-off-by: Alan Cox <alan@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index ffb7d54..3c10b9a 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -516,10 +516,11 @@ sys_pciconfig_iobase(long which, unsigned long bus, unsigned long dfn)
 		if (bus == 0 && dfn == 0) {
 			hose = pci_isa_hose;
 		} else {
-			dev = pci_find_slot(bus, dfn);
+			dev = pci_get_bus_and_slot(bus, dfn);
 			if (!dev)
 				return -ENODEV;
 			hose = dev->sysdata;
+			pci_dev_put(dev);
 		}
 	}
 
diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c
index b8b817f..910b43c 100644
--- a/arch/alpha/kernel/sys_miata.c
+++ b/arch/alpha/kernel/sys_miata.c
@@ -183,11 +183,15 @@ miata_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
 
 	if((slot == 7) && (PCI_FUNC(dev->devfn) == 3)) {
 		u8 irq=0;
-
-		if(pci_read_config_byte(pci_find_slot(dev->bus->number, dev->devfn & ~(7)), 0x40,&irq)!=PCIBIOS_SUCCESSFUL)
+		struct pci_dev *pdev = pci_get_slot(dev->bus, dev->devfn & ~7);
+		if(pdev == NULL || pci_read_config_byte(pdev, 0x40,&irq) != PCIBIOS_SUCCESSFUL) {
+			pci_dev_put(pdev);
 			return -1;
-		else	
+		}
+		else	{
+			pci_dev_put(pdev);
 			return irq;
+		}
 	}
 
 	return COMMON_TABLE_LOOKUP;
diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c
index 93744ba..e7594a7 100644
--- a/arch/alpha/kernel/sys_nautilus.c
+++ b/arch/alpha/kernel/sys_nautilus.c
@@ -200,7 +200,7 @@ nautilus_init_pci(void)
 	bus = pci_scan_bus(0, alpha_mv.pci_ops, hose);
 	hose->bus = bus;
 
-	irongate = pci_find_slot(0, 0);
+	irongate = pci_get_bus_and_slot(0, 0);
 	bus->self = irongate;
 	bus->resource[1] = &irongate_mem;
 
-- 
cgit v0.10.2


From 3592695c363c3f3119621bdcf5ed852d6b9d1a5c Mon Sep 17 00:00:00 2001
From: Stefan Seyfried <seife@suse.de>
Date: Wed, 6 Dec 2006 20:34:06 -0800
Subject: [PATCH] uswsusp: add pmops->{prepare,enter,finish} support (aka
 "platform mode")

Add an ioctl to the userspace swsusp code that enables the usage of the
pmops->prepare, pmops->enter and pmops->finish methods (the in-kernel
suspend knows these as "platform method").  These are needed on many
machines to (among others) speed up resuming by letting the BIOS skip some
steps or let my hp nx5000 recognise the correct ac_adapter state after
resume again.

It also ensures on many machines, that changed hardware (unplugged AC
adapters) gets correctly detected and that kacpid does not run wild after
resume.

Signed-off-by: Stefan Seyfried <seife@suse.de>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f..87ecb18 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -117,7 +117,12 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 #define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
 #define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
 #define SNAPSHOT_S2RAM			_IO(SNAPSHOT_IOC_MAGIC, 11)
-#define SNAPSHOT_IOC_MAXNR	11
+#define SNAPSHOT_PMOPS			_IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
+#define SNAPSHOT_IOC_MAXNR	12
+
+#define PMOPS_PREPARE	1
+#define PMOPS_ENTER	2
+#define PMOPS_FINISH	3
 
 /**
  *	The bitmap is used for tracing allocated swap pages
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b..4c24ca5 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
 
 #include <linux/suspend.h>
 #include <linux/syscalls.h>
+#include <linux/reboot.h>
 #include <linux/string.h>
 #include <linux/device.h>
 #include <linux/miscdevice.h>
@@ -313,6 +314,33 @@ OutS3:
 		up(&pm_sem);
 		break;
 
+	case SNAPSHOT_PMOPS:
+		switch (arg) {
+
+		case PMOPS_PREPARE:
+			if (pm_ops->prepare) {
+				error = pm_ops->prepare(PM_SUSPEND_DISK);
+			}
+			break;
+
+		case PMOPS_ENTER:
+			kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
+			error = pm_ops->enter(PM_SUSPEND_DISK);
+			break;
+
+		case PMOPS_FINISH:
+			if (pm_ops && pm_ops->finish) {
+				pm_ops->finish(PM_SUSPEND_DISK);
+			}
+			break;
+
+		default:
+			printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
+			error = -EINVAL;
+
+		}
+		break;
+
 	default:
 		error = -ENOTTY;
 
-- 
cgit v0.10.2


From 915bae9ebe41e52d71ad8b06d50e4ab26189f964 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:07 -0800
Subject: [PATCH] swsusp: use partition device and offset to identify swap
 areas

The Linux kernel handles swap files almost in the same way as it handles swap
partitions and there are only two differences between these two types of swap
areas:

(1) swap files need not be contiguous,

(2) the header of a swap file is not in the first block of the partition
    that holds it.  From the swsusp's point of view (1) is not a problem,
    because it is already taken care of by the swap-handling code, but (2) has
    to be taken into consideration.

In principle the location of a swap file's header may be determined with the
help of appropriate filesystem driver.  Unfortunately, however, it requires
the filesystem holding the swap file to be mounted, and if this filesystem is
journaled, it cannot be mounted during a resume from disk.  For this reason we
need some other means by which swap areas can be identified.

For example, to identify a swap area we can use the partition that holds the
area and the offset from the beginning of this partition at which the swap
header is located.

The following patch allows swsusp to identify swap areas this way.  It changes
swap_type_of() so that it takes an additional argument representing an offset
of the swap header within the partition represented by its first argument.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 89f8a39..d51e35e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -247,7 +247,7 @@ extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
 extern void free_swap_and_cache(swp_entry_t);
-extern int swap_type_of(dev_t);
+extern int swap_type_of(dev_t, sector_t);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd..7b10da1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -74,7 +74,7 @@ static int mark_swapfiles(swp_entry_t start)
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int res = swap_type_of(swsusp_resume_device);
+	int res = swap_type_of(swsusp_resume_device, 0);
 
 	if (res >= 0) {
 		root_swap = res;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4c24ca5..a327b18 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -55,7 +55,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	filp->private_data = data;
 	memset(&data->handle, 0, sizeof(struct snapshot_handle));
 	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
-		data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+		data->swap = swsusp_resume_device ?
+				swap_type_of(swsusp_resume_device, 0) : -1;
 		data->mode = O_RDONLY;
 	} else {
 		data->swap = -1;
@@ -265,7 +266,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			 * so we need to recode them
 			 */
 			if (old_decode_dev(arg)) {
-				data->swap = swap_type_of(old_decode_dev(arg));
+				data->swap = swap_type_of(old_decode_dev(arg), 0);
 				if (data->swap < 0)
 					error = -ENODEV;
 			} else {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f315131..2bfacba 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -427,34 +427,48 @@ void free_swap_and_cache(swp_entry_t entry)
 
 #ifdef CONFIG_SOFTWARE_SUSPEND
 /*
- * Find the swap type that corresponds to given device (if any)
+ * Find the swap type that corresponds to given device (if any).
  *
- * This is needed for software suspend and is done in such a way that inode
- * aliasing is allowed.
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
+ *
+ * This is needed for the suspend to disk (aka swsusp).
  */
-int swap_type_of(dev_t device)
+int swap_type_of(dev_t device, sector_t offset)
 {
+	struct block_device *bdev = NULL;
 	int i;
 
+	if (device)
+		bdev = bdget(device);
+
 	spin_lock(&swap_lock);
 	for (i = 0; i < nr_swapfiles; i++) {
-		struct inode *inode;
+		struct swap_info_struct *sis = swap_info + i;
 
-		if (!(swap_info[i].flags & SWP_WRITEOK))
+		if (!(sis->flags & SWP_WRITEOK))
 			continue;
 
-		if (!device) {
+		if (!bdev) {
 			spin_unlock(&swap_lock);
 			return i;
 		}
-		inode = swap_info[i].swap_file->f_dentry->d_inode;
-		if (S_ISBLK(inode->i_mode) &&
-		    device == MKDEV(imajor(inode), iminor(inode))) {
-			spin_unlock(&swap_lock);
-			return i;
+		if (bdev == sis->bdev) {
+			struct swap_extent *se;
+
+			se = list_entry(sis->extent_list.next,
+					struct swap_extent, list);
+			if (se->start_block == offset) {
+				spin_unlock(&swap_lock);
+				bdput(bdev);
+				return i;
+			}
 		}
 	}
 	spin_unlock(&swap_lock);
+	if (bdev)
+		bdput(bdev);
+
 	return -ENODEV;
 }
 
-- 
cgit v0.10.2


From 3fc6b34f4803b959c1e30c15247e2180cd529115 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:09 -0800
Subject: [PATCH] swsusp: rearrange swap-handling code

Rearrange the code in kernel/power/swap.c so that the next patch is more
readable.

[This patch only moves the existing code.]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7b10da1..7768c2b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -41,10 +41,121 @@ static struct swsusp_header {
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
 
 /*
- * Saving part...
+ * General things
  */
 
 static unsigned short root_swap = 0xffff;
+static struct block_device *resume_bdev;
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@off	physical offset of page.
+ *	@page:	page we're reading or writing.
+ *	@bio_chain: list of pending biod (for async reading)
+ *
+ *	Straight from the textbook - allocate and initialize the bio.
+ *	If we're reading, make sure the page is marked as dirty.
+ *	Then submit it and, if @bio_chain == NULL, wait.
+ */
+static int submit(int rw, pgoff_t page_off, struct page *page,
+			struct bio **bio_chain)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio->bi_bdev = resume_bdev;
+	bio->bi_end_io = end_swap_bio_read;
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
+		bio_put(bio);
+		return -EFAULT;
+	}
+
+	lock_page(page);
+	bio_get(bio);
+
+	if (bio_chain == NULL) {
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		wait_on_page_locked(page);
+		if (rw == READ)
+			bio_set_pages_dirty(bio);
+		bio_put(bio);
+	} else {
+		if (rw == READ)
+			get_page(page);	/* These pages are freed later */
+		bio->bi_private = *bio_chain;
+		*bio_chain = bio;
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	}
+	return 0;
+}
+
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+	return submit(READ, page_off, virt_to_page(addr), bio_chain);
+}
+
+static int bio_write_page(pgoff_t page_off, void *addr)
+{
+	return submit(WRITE, page_off, virt_to_page(addr), NULL);
+}
+
+static int wait_on_bio_chain(struct bio **bio_chain)
+{
+	struct bio *bio;
+	struct bio *next_bio;
+	int ret = 0;
+
+	if (bio_chain == NULL)
+		return 0;
+
+	bio = *bio_chain;
+	if (bio == NULL)
+		return 0;
+	while (bio) {
+		struct page *page;
+
+		next_bio = bio->bi_private;
+		page = bio->bi_io_vec[0].bv_page;
+		wait_on_page_locked(page);
+		if (!PageUptodate(page) || PageError(page))
+			ret = -EIO;
+		put_page(page);
+		bio_put(bio);
+		bio = next_bio;
+	}
+	*bio_chain = NULL;
+	return ret;
+}
+
+static void show_speed(struct timeval *start, struct timeval *stop,
+			unsigned nr_pages, char *msg)
+{
+	s64 elapsed_centisecs64;
+	int centisecs;
+	int k;
+	int kps;
+
+	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+	centisecs = elapsed_centisecs64;
+	if (centisecs == 0)
+		centisecs = 1;	/* avoid div-by-zero */
+	k = nr_pages * (PAGE_SIZE / 1024);
+	kps = (k * 100) / centisecs;
+	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+			centisecs / 100, centisecs % 100,
+			kps / 1000, (kps % 1000) / 10);
+}
+
+/*
+ * Saving part
+ */
 
 static int mark_swapfiles(swp_entry_t start)
 {
@@ -166,26 +277,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
 	handle->bitmap = NULL;
 }
 
-static void show_speed(struct timeval *start, struct timeval *stop,
-			unsigned nr_pages, char *msg)
-{
-	s64 elapsed_centisecs64;
-	int centisecs;
-	int k;
-	int kps;
-
-	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
-	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
-	centisecs = elapsed_centisecs64;
-	if (centisecs == 0)
-		centisecs = 1;	/* avoid div-by-zero */
-	k = nr_pages * (PAGE_SIZE / 1024);
-	kps = (k * 100) / centisecs;
-	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
-			centisecs / 100, centisecs % 100,
-			kps / 1000, (kps % 1000) / 10);
-}
-
 static int get_swap_writer(struct swap_map_handle *handle)
 {
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -205,34 +296,6 @@ static int get_swap_writer(struct swap_map_handle *handle)
 	return 0;
 }
 
-static int wait_on_bio_chain(struct bio **bio_chain)
-{
-	struct bio *bio;
-	struct bio *next_bio;
-	int ret = 0;
-
-	if (bio_chain == NULL)
-		return 0;
-
-	bio = *bio_chain;
-	if (bio == NULL)
-		return 0;
-	while (bio) {
-		struct page *page;
-
-		next_bio = bio->bi_private;
-		page = bio->bi_io_vec[0].bv_page;
-		wait_on_page_locked(page);
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-		put_page(page);
-		bio_put(bio);
-		bio = next_bio;
-	}
-	*bio_chain = NULL;
-	return ret;
-}
-
 static int swap_write_page(struct swap_map_handle *handle, void *buf,
 				struct bio **bio_chain)
 {
@@ -384,66 +447,6 @@ int swsusp_write(void)
 	return error;
 }
 
-static struct block_device *resume_bdev;
-
-/**
- *	submit - submit BIO request.
- *	@rw:	READ or WRITE.
- *	@off	physical offset of page.
- *	@page:	page we're reading or writing.
- *	@bio_chain: list of pending biod (for async reading)
- *
- *	Straight from the textbook - allocate and initialize the bio.
- *	If we're reading, make sure the page is marked as dirty.
- *	Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, pgoff_t page_off, struct page *page,
-			struct bio **bio_chain)
-{
-	struct bio *bio;
-
-	bio = bio_alloc(GFP_ATOMIC, 1);
-	if (!bio)
-		return -ENOMEM;
-	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
-	bio->bi_bdev = resume_bdev;
-	bio->bi_end_io = end_swap_bio_read;
-
-	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
-		bio_put(bio);
-		return -EFAULT;
-	}
-
-	lock_page(page);
-	bio_get(bio);
-
-	if (bio_chain == NULL) {
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-		wait_on_page_locked(page);
-		if (rw == READ)
-			bio_set_pages_dirty(bio);
-		bio_put(bio);
-	} else {
-		if (rw == READ)
-			get_page(page);	/* These pages are freed later */
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	}
-	return 0;
-}
-
-static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(READ, page_off, virt_to_page(addr), bio_chain);
-}
-
-static int bio_write_page(pgoff_t page_off, void *addr)
-{
-	return submit(WRITE, page_off, virt_to_page(addr), NULL);
-}
-
 /**
  *	The following functions allow us to read data using a swap map
  *	in a file-alike way
-- 
cgit v0.10.2


From 3aef83e0ef1ffb8ea3bea97be46821a45c952173 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:10 -0800
Subject: [PATCH] swsusp: use block device offsets to identify swap locations

Make swsusp use block device offsets instead of swap offsets to identify swap
locations and make it use the same code paths for writing as well as for
reading data.

This allows us to use the same code for handling swap files and swap
partitions and to simplify the code, eg.  by dropping rw_swap_page_sync().

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d51e35e..add51ce 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -218,8 +218,6 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
-				struct bio **bio_chain);
 extern int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err);
 
 /* linux/mm/swap_state.c */
@@ -250,6 +248,7 @@ extern void free_swap_and_cache(swp_entry_t);
 extern int swap_type_of(dev_t, sector_t);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
+extern sector_t swapdev_block(int, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 87ecb18..210ebba 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -146,7 +146,7 @@ struct bitmap_page {
 
 extern void free_bitmap(struct bitmap_page *bitmap);
 extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
-extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
+extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
 extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
 
 extern int swsusp_check(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7768c2b..1b08f46 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,8 +34,8 @@ extern char resume_file[];
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
-	swp_entry_t image;
+	char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
+	sector_t image;
 	char	orig_sig[10];
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
@@ -100,9 +100,9 @@ static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
 	return submit(READ, page_off, virt_to_page(addr), bio_chain);
 }
 
-static int bio_write_page(pgoff_t page_off, void *addr)
+static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
 {
-	return submit(WRITE, page_off, virt_to_page(addr), NULL);
+	return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
 }
 
 static int wait_on_bio_chain(struct bio **bio_chain)
@@ -157,22 +157,19 @@ static void show_speed(struct timeval *start, struct timeval *stop,
  * Saving part
  */
 
-static int mark_swapfiles(swp_entry_t start)
+static int mark_swapfiles(sector_t start)
 {
 	int error;
 
-	rw_swap_page_sync(READ, swp_entry(root_swap, 0),
-			  virt_to_page((unsigned long)&swsusp_header), NULL);
+	bio_read_page(0, &swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
 		swsusp_header.image = start;
-		error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
-				virt_to_page((unsigned long)&swsusp_header),
-				NULL);
+		error = bio_write_page(0, &swsusp_header, NULL);
 	} else {
-		pr_debug("swsusp: Partition is not swap space.\n");
+		printk(KERN_ERR "swsusp: Swap header not found!\n");
 		error = -ENODEV;
 	}
 	return error;
@@ -185,12 +182,21 @@ static int mark_swapfiles(swp_entry_t start)
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int res = swap_type_of(swsusp_resume_device, 0);
+	int res;
+
+	res = swap_type_of(swsusp_resume_device, 0);
+	if (res < 0)
+		return res;
+
+	root_swap = res;
+	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
+	if (IS_ERR(resume_bdev))
+		return PTR_ERR(resume_bdev);
+
+	res = set_blocksize(resume_bdev, PAGE_SIZE);
+	if (res < 0)
+		blkdev_put(resume_bdev);
 
-	if (res >= 0) {
-		root_swap = res;
-		return 0;
-	}
 	return res;
 }
 
@@ -201,36 +207,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
  *	@bio_chain:	Link the next write BIO here
  */
 
-static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 {
-	swp_entry_t entry;
-	int error = -ENOSPC;
-
-	if (offset) {
-		struct page *page = virt_to_page(buf);
-
-		if (bio_chain) {
-			/*
-			 * Whether or not we successfully allocated a copy page,
-			 * we take a ref on the page here.  It gets undone in
-			 * wait_on_bio_chain().
-			 */
-			struct page *page_copy;
-			page_copy = alloc_page(GFP_ATOMIC);
-			if (page_copy == NULL) {
-				WARN_ON_ONCE(1);
-				bio_chain = NULL;	/* Go synchronous */
-				get_page(page);
-			} else {
-				memcpy(page_address(page_copy),
-					page_address(page), PAGE_SIZE);
-				page = page_copy;
-			}
+	void *src;
+
+	if (!offset)
+		return -ENOSPC;
+
+	if (bio_chain) {
+		src = (void *)__get_free_page(GFP_ATOMIC);
+		if (src) {
+			memcpy(src, buf, PAGE_SIZE);
+		} else {
+			WARN_ON_ONCE(1);
+			bio_chain = NULL;	/* Go synchronous */
+			src = buf;
 		}
-		entry = swp_entry(root_swap, offset);
-		error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
+	} else {
+		src = buf;
 	}
-	return error;
+	return bio_write_page(offset, src, bio_chain);
 }
 
 /*
@@ -248,11 +244,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
  *	at a time.
  */
 
-#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(sector_t) - 1)
 
 struct swap_map_page {
-	unsigned long		entries[MAP_PAGE_ENTRIES];
-	unsigned long		next_swap;
+	sector_t entries[MAP_PAGE_ENTRIES];
+	sector_t next_swap;
 };
 
 /**
@@ -262,7 +258,7 @@ struct swap_map_page {
 
 struct swap_map_handle {
 	struct swap_map_page *cur;
-	unsigned long cur_swap;
+	sector_t cur_swap;
 	struct bitmap_page *bitmap;
 	unsigned int k;
 };
@@ -287,7 +283,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
 		release_swap_writer(handle);
 		return -ENOMEM;
 	}
-	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
 	if (!handle->cur_swap) {
 		release_swap_writer(handle);
 		return -ENOSPC;
@@ -300,11 +296,11 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 				struct bio **bio_chain)
 {
 	int error = 0;
-	unsigned long offset;
+	sector_t offset;
 
 	if (!handle->cur)
 		return -EINVAL;
-	offset = alloc_swap_page(root_swap, handle->bitmap);
+	offset = alloc_swapdev_block(root_swap, handle->bitmap);
 	error = write_page(buf, offset, bio_chain);
 	if (error)
 		return error;
@@ -313,7 +309,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		error = wait_on_bio_chain(bio_chain);
 		if (error)
 			goto out;
-		offset = alloc_swap_page(root_swap, handle->bitmap);
+		offset = alloc_swapdev_block(root_swap, handle->bitmap);
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
@@ -413,37 +409,47 @@ int swsusp_write(void)
 	struct swsusp_info *header;
 	int error;
 
-	if ((error = swsusp_swap_check())) {
+	error = swsusp_swap_check();
+	if (error) {
 		printk(KERN_ERR "swsusp: Cannot find swap device, try "
 				"swapon -a.\n");
 		return error;
 	}
 	memset(&snapshot, 0, sizeof(struct snapshot_handle));
 	error = snapshot_read_next(&snapshot, PAGE_SIZE);
-	if (error < PAGE_SIZE)
-		return error < 0 ? error : -EFAULT;
+	if (error < PAGE_SIZE) {
+		if (error >= 0)
+			error = -EFAULT;
+
+		goto out;
+	}
 	header = (struct swsusp_info *)data_of(snapshot);
 	if (!enough_swap(header->pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
-		return -ENOSPC;
+		error = -ENOSPC;
+		goto out;
 	}
 	error = get_swap_writer(&handle);
 	if (!error) {
-		unsigned long start = handle.cur_swap;
+		sector_t start = handle.cur_swap;
+
 		error = swap_write_page(&handle, header, NULL);
 		if (!error)
 			error = save_image(&handle, &snapshot,
 					header->pages - 1);
+
 		if (!error) {
 			flush_swap_writer(&handle);
 			printk("S");
-			error = mark_swapfiles(swp_entry(root_swap, start));
+			error = mark_swapfiles(start);
 			printk("|\n");
 		}
 	}
 	if (error)
 		free_all_swap_pages(root_swap, handle.bitmap);
 	release_swap_writer(&handle);
+out:
+	swsusp_close();
 	return error;
 }
 
@@ -459,17 +465,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
 	handle->cur = NULL;
 }
 
-static int get_swap_reader(struct swap_map_handle *handle,
-                                      swp_entry_t start)
+static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
 {
 	int error;
 
-	if (!swp_offset(start))
+	if (!start)
 		return -EINVAL;
+
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
 	if (!handle->cur)
 		return -ENOMEM;
-	error = bio_read_page(swp_offset(start), handle->cur, NULL);
+
+	error = bio_read_page(start, handle->cur, NULL);
 	if (error) {
 		release_swap_reader(handle);
 		return error;
@@ -481,7 +488,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 static int swap_read_page(struct swap_map_handle *handle, void *buf,
 				struct bio **bio_chain)
 {
-	unsigned long offset;
+	sector_t offset;
 	int error;
 
 	if (!handle->cur)
@@ -608,7 +615,7 @@ int swsusp_check(void)
 		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
 			/* Reset swap signature now */
-			error = bio_write_page(0, &swsusp_header);
+			error = bio_write_page(0, &swsusp_header, NULL);
 		} else {
 			return -EINVAL;
 		}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659..4147a75 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -134,18 +134,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 	return 0;
 }
 
-unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
+sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
 {
 	unsigned long offset;
 
 	offset = swp_offset(get_swap_page_of_type(swap));
 	if (offset) {
-		if (bitmap_set(bitmap, offset)) {
+		if (bitmap_set(bitmap, offset))
 			swap_free(swp_entry(swap, offset));
-			offset = 0;
-		}
+		else
+			return swapdev_block(swap, offset);
 	}
-	return offset;
+	return 0;
 }
 
 void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a327b18..f0b7ef8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -126,7 +126,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 {
 	int error = 0;
 	struct snapshot_data *data;
-	loff_t offset, avail;
+	loff_t avail;
+	sector_t offset;
 
 	if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
 		return -ENOTTY;
@@ -240,10 +241,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 				break;
 			}
 		}
-		offset = alloc_swap_page(data->swap, data->bitmap);
+		offset = alloc_swapdev_block(data->swap, data->bitmap);
 		if (offset) {
 			offset <<= PAGE_SHIFT;
-			error = put_user(offset, (loff_t __user *)arg);
+			error = put_user(offset, (sector_t __user *)arg);
 		} else {
 			error = -ENOSPC;
 		}
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ec..dbffec0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page)
 out:
 	return ret;
 }
-
-#ifdef CONFIG_SOFTWARE_SUSPEND
-/*
- * A scruffy utility function to read or write an arbitrary swap page
- * and wait on the I/O.  The caller must have a ref on the page.
- *
- * We use end_swap_bio_read() even for writes, because it happens to do what
- * we want.
- */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
-			struct bio **bio_chain)
-{
-	struct bio *bio;
-	int ret = 0;
-	int bio_rw;
-
-	lock_page(page);
-
-	bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
-	if (bio == NULL) {
-		unlock_page(page);
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	bio_rw = rw;
-	if (!bio_chain)
-		bio_rw |= (1 << BIO_RW_SYNC);
-	if (bio_chain)
-		bio_get(bio);
-	submit_bio(bio_rw, bio);
-	if (bio_chain == NULL) {
-		wait_on_page_locked(page);
-
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-	}
-	if (bio_chain) {
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-	}
-out:
-	return ret;
-}
-#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2bfacba..5524236 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -945,6 +945,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
 	}
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int swap_type, pgoff_t offset)
+{
+	struct swap_info_struct *sis;
+
+	if (swap_type >= nr_swapfiles)
+		return 0;
+
+	sis = swap_info + swap_type;
+	return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
+
 /*
  * Free all of a swapdev's extent information
  */
-- 
cgit v0.10.2


From 9a154d9d95b7b9845938242f5c62505b3cab5bcd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:12 -0800
Subject: [PATCH] swsusp: add resume_offset command line parameter

Add the kernel command line parameter "resume_offset=" allowing us to specify
the offset, in <PAGE_SIZE> units, from the beginning of the partition pointed
to by the "resume=" parameter at which the swap header is located.

This offset can be determined, for example, by an application using the FIBMAP
ioctl to obtain the swap header's block number for given file.

[akpm@osdl.org: we don't know what type sector_t is]
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b1fb786..d79feeb 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -27,6 +27,7 @@
 static int noresume = 0;
 char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
+sector_t swsusp_resume_block;
 
 /**
  *	power_down - Shut machine down for hibernate.
@@ -423,6 +424,19 @@ static int __init resume_setup(char *str)
 	return 1;
 }
 
+static int __init resume_offset_setup(char *str)
+{
+	unsigned long long offset;
+
+	if (noresume)
+		return 1;
+
+	if (sscanf(str, "%llu", &offset) == 1)
+		swsusp_resume_block = offset;
+
+	return 1;
+}
+
 static int __init noresume_setup(char *str)
 {
 	noresume = 1;
@@ -430,4 +444,5 @@ static int __init noresume_setup(char *str)
 }
 
 __setup("noresume", noresume_setup);
+__setup("resume_offset=", resume_offset_setup);
 __setup("resume=", resume_setup);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 210ebba..adaf7d4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -42,6 +42,7 @@ extern const void __nosave_begin, __nosave_end;
 extern unsigned long image_size;
 extern int in_suspend;
 extern dev_t swsusp_resume_device;
+extern sector_t swsusp_resume_block;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1b08f46..aa5a9bf 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -161,13 +161,14 @@ static int mark_swapfiles(sector_t start)
 {
 	int error;
 
-	bio_read_page(0, &swsusp_header, NULL);
+	bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
 		swsusp_header.image = start;
-		error = bio_write_page(0, &swsusp_header, NULL);
+		error = bio_write_page(swsusp_resume_block,
+					&swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "swsusp: Swap header not found!\n");
 		error = -ENODEV;
@@ -184,7 +185,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 {
 	int res;
 
-	res = swap_type_of(swsusp_resume_device, 0);
+	res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
 	if (res < 0)
 		return res;
 
@@ -610,12 +611,16 @@ int swsusp_check(void)
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
 		memset(&swsusp_header, 0, sizeof(swsusp_header));
-		if ((error = bio_read_page(0, &swsusp_header, NULL)))
+		error = bio_read_page(swsusp_resume_block,
+					&swsusp_header, NULL);
+		if (error)
 			return error;
+
 		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
 			/* Reset swap signature now */
-			error = bio_write_page(0, &swsusp_header, NULL);
+			error = bio_write_page(swsusp_resume_block,
+						&swsusp_header, NULL);
 		} else {
 			return -EINVAL;
 		}
-- 
cgit v0.10.2


From ecbd0da1eced957e0cbb611b4a4cb5b0cf63ba31 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:13 -0800
Subject: [PATCH] swsusp: document support for swap files

Document the "resume_offset=" command line parameter as well as the way in
which swap files are supported by swsusp.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2ddc43e..1e183bd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1370,6 +1370,12 @@ and is between 256 and 4096 characters. It is defined in the file
 	resume=		[SWSUSP]
 			Specify the partition device for software suspend
 
+	resume_offset=	[SWSUSP]
+			Specify the offset from the beginning of the partition
+			given by "resume=" at which the swap header is located,
+			in <PAGE_SIZE> units (needed only for swap files).
+			See  Documentation/power/swsusp-and-swap-files.txt
+
 	rhash_entries=	[KNL,NET]
 			Set number of hash buckets for route cache
 
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
new file mode 100644
index 0000000..e171d11
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.txt
@@ -0,0 +1,54 @@
+Using swap files with software suspend (swsusp)
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it.  From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of appropriate filesystem driver.  Unfortunately, however, it requires the
+filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk.  For this reason to
+identify a swap file swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located.  For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, eg.
+
+# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+# mkswap <swap_file_path>
+# swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
+
+3) Add the following parameters to the kernel command line:
+
+resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2).  [Of course, this step may be carried out automatically
+by the same application that determies the swap file's header offset using the
+FIBMAP ioctl.]
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition.  [Of course this means that the resume from a swap file cannot be
+initiated from whithin an initrd of initramfs image.]  In particular, the
+swap file has to be active (ie. be present in /proc/swaps) so that it can be
+used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before.  Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index e635e6f..0761ff6 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -297,20 +297,12 @@ system is shut down or suspended. Additionally use the encrypted
 suspend image to prevent sensitive data from being stolen after
 resume.
 
-Q: Why can't we suspend to a swap file?
+Q: Can I suspend to a swap file?
 
-A: Because accessing swap file needs the filesystem mounted, and
-filesystem might do something wrong (like replaying the journal)
-during mount.
-
-There are few ways to get that fixed:
-
-1) Probably could be solved by modifying every filesystem to support
-some kind of "really read-only!" option. Patches welcome.
-
-2) suspend2 gets around that by storing absolute positions in on-disk
-image (and blocksize), with resume parameter pointing directly to
-suspend header.
+A: Generally, yes, you can.  However, it requires you to use the "resume=" and
+"resume_offset=" kernel command line parameters, so the resume from a swap file
+cannot be initiated from an initrd or initramfs image.  See
+swsusp-and-swap-files.txt for details.
 
 Q: Is there a maximum system RAM size that is supported by swsusp?
 
-- 
cgit v0.10.2


From 37b2ba12df88f0e29f2d52aaf1ab22789377d5b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:15 -0800
Subject: [PATCH] swsusp: add ioctl for swap files support

To be able to use swap files as suspend storage from the userland suspend
tools we need an additional ioctl() that will allow us to provide the kernel
with both the swap header's offset and the identification of the resume
partition.

The new ioctl() should be regarded as a replacement for the
SNAPSHOT_SET_SWAP_FILE ioctl() that from now on will be considered as
obsolete, but has to stay for backwards compatibility of the interface.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/power.h b/kernel/power/power.h
index adaf7d4..7dbfd9f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -106,6 +106,16 @@ extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_image_loaded(struct snapshot_handle *handle);
 extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 
+/*
+ * This structure is used to pass the values needed for the identification
+ * of the resume swap area from a user space to the kernel via the
+ * SNAPSHOT_SET_SWAP_AREA ioctl
+ */
+struct resume_swap_area {
+	loff_t offset;
+	u_int32_t dev;
+} __attribute__((packed));
+
 #define SNAPSHOT_IOC_MAGIC	'3'
 #define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
 #define SNAPSHOT_UNFREEZE		_IO(SNAPSHOT_IOC_MAGIC, 2)
@@ -119,7 +129,9 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 #define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
 #define SNAPSHOT_S2RAM			_IO(SNAPSHOT_IOC_MAGIC, 11)
 #define SNAPSHOT_PMOPS			_IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
-#define SNAPSHOT_IOC_MAXNR	12
+#define SNAPSHOT_SET_SWAP_AREA		_IOW(SNAPSHOT_IOC_MAGIC, 13, \
+							struct resume_swap_area)
+#define SNAPSHOT_IOC_MAXNR	13
 
 #define PMOPS_PREPARE	1
 #define PMOPS_ENTER	2
diff --git a/kernel/power/user.c b/kernel/power/user.c
index f0b7ef8..05c58a2 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -343,6 +343,37 @@ OutS3:
 		}
 		break;
 
+	case SNAPSHOT_SET_SWAP_AREA:
+		if (data->bitmap) {
+			error = -EPERM;
+		} else {
+			struct resume_swap_area swap_area;
+			dev_t swdev;
+
+			error = copy_from_user(&swap_area, (void __user *)arg,
+					sizeof(struct resume_swap_area));
+			if (error) {
+				error = -EFAULT;
+				break;
+			}
+
+			/*
+			 * User space encodes device types as two-byte values,
+			 * so we need to recode them
+			 */
+			swdev = old_decode_dev(swap_area.dev);
+			if (swdev) {
+				offset = swap_area.offset;
+				data->swap = swap_type_of(swdev, offset);
+				if (data->swap < 0)
+					error = -ENODEV;
+			} else {
+				data->swap = -1;
+				error = -EINVAL;
+			}
+		}
+		break;
+
 	default:
 		error = -ENOTTY;
 
-- 
cgit v0.10.2


From bf73bae6ba0dc4bd4f1e570feb34a06b72725af6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:16 -0800
Subject: [PATCH] swsusp: update userland interface documentation

The swsusp userland interface has recently changed for a couple of times, but
the changes have not been documented.  Fix this, and document the
SNAPSHOT_SET_SWAP_AREA ioctl().

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
index e171d11..06f911a 100644
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ b/Documentation/power/swsusp-and-swap-files.txt
@@ -38,15 +38,21 @@ resume=<swap_file_partition> resume_offset=<swap_file_offset>
 
 where <swap_file_partition> is the partition on which the swap file is located
 and <swap_file_offset> is the offset of the swap header determined by the
-application in 2).  [Of course, this step may be carried out automatically
+application in 2) (of course, this step may be carried out automatically
 by the same application that determies the swap file's header offset using the
-FIBMAP ioctl.]
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.txt (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
 
 Now, swsusp will use the swap file in the same way in which it would use a swap
-partition.  [Of course this means that the resume from a swap file cannot be
-initiated from whithin an initrd of initramfs image.]  In particular, the
-swap file has to be active (ie. be present in /proc/swaps) so that it can be
-used for suspending.
+partition.  In particular, the swap file has to be active (ie. be present in
+/proc/swaps) so that it can be used for suspending.
 
 Note that if the swap file used for suspending is deleted and recreated,
 the location of its header need not be the same as before.  Thus every time
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index 64755e9..000556c 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -9,9 +9,8 @@ done it already.
 Now, to use the userland interface for software suspend you need special
 utilities that will read/write the system memory snapshot from/to the
 kernel.  Such utilities are available, for example, from
-<http://www.sisk.pl/kernel/utilities/suspend>.  You may want to have
-a look at them if you are going to develop your own suspend/resume
-utilities.
+<http://suspend.sourceforge.net>.  You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
 
 The interface consists of a character device providing the open(),
 release(), read(), and write() operations as well as several ioctl()
@@ -21,9 +20,9 @@ be read from /sys/class/misc/snapshot/dev.
 
 The device can be open either for reading or for writing.  If open for
 reading, it is considered to be in the suspend mode.  Otherwise it is
-assumed to be in the resume mode.  The device cannot be open for reading
-and writing.  It is also impossible to have the device open more than once
-at a time.
+assumed to be in the resume mode.  The device cannot be open for simultaneous
+reading and writing.  It is also impossible to have the device open more than
+once at a time.
 
 The ioctl() commands recognized by the device are:
 
@@ -69,9 +68,46 @@ SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated with
 SNAPSHOT_SET_SWAP_FILE - set the resume partition (the last ioctl() argument
 	should specify the device's major and minor numbers in the old
 	two-byte format, as returned by the stat() function in the .st_rdev
-	member of the stat structure); it is recommended to always use this
-	call, because the code to set the resume partition could be removed from
-	future kernels
+	member of the stat structure)
+
+SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in <PAGE_SIZE>
+	units) from the beginning of the partition at which the swap header is
+	located (the last ioctl() argument should point to a struct
+	resume_swap_area, as defined in kernel/power/power.h, containing the
+	resume device specification, as for the SNAPSHOT_SET_SWAP_FILE ioctl(),
+	and the offset); for swap partitions the offset is always 0, but it is
+	different to zero for swap files (please see
+	Documentation/swsusp-and-swap-files.txt for details).
+	The SNAPSHOT_SET_SWAP_AREA ioctl() is considered as a replacement for
+	SNAPSHOT_SET_SWAP_FILE which is regarded as obsolete.   It is
+	recommended to always use this call, because the code to set the resume
+	partition may be removed from future kernels
+
+SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
+	immediately enter the suspend-to-RAM state, so this call must always
+	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+	to use the SNAPSHOT_UNFREEZE call after the system wakes up.  This call
+	is needed to implement the suspend-to-both mechanism in which the
+	suspend image is first created, as though the system had been suspended
+	to disk, and then the system is suspended to RAM (this makes it possible
+	to resume the system from RAM if there's enough battery power or restore
+	its state on the basis of the saved suspend image otherwise)
+
+SNAPSHOT_PMOPS - enable the usage of the pmops->prepare, pmops->enter and
+	pmops->finish methods (the in-kernel swsusp knows these as the "platform
+	method") which are needed on many machines to (among others) speed up
+	the resume by letting the BIOS skip some steps or to let the system
+	recognise the correct state of the hardware after the resume (in
+	particular on many machines this ensures that unplugged AC
+	adapters get correctly detected and that kacpid does not run wild after
+	the resume).  The last ioctl() argument can take one of the three
+	values, defined in kernel/power/power.h:
+	PMOPS_PREPARE - make the kernel carry out the
+		pm_ops->prepare(PM_SUSPEND_DISK) operation
+	PMOPS_ENTER - make the kernel power off the system by calling
+		pm_ops->enter(PM_SUSPEND_DISK)
+	PMOPS_FINISH - make the kernel carry out the
+		pm_ops->finish(PM_SUSPEND_DISK) operation
 
 The device's read() operation can be used to transfer the snapshot image from
 the kernel.  It has the following limitations:
@@ -91,10 +127,12 @@ unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
 still frozen when the device is being closed).
 
 Currently it is assumed that the userland utilities reading/writing the
-snapshot image from/to the kernel will use a swap partition, called the resume
-partition, as storage space.  However, this is not really required, as they
-can use, for example, a special (blank) suspend partition or a file on a partition
-that is unmounted before SNAPSHOT_ATOMIC_SNAPSHOT and mounted afterwards.
+snapshot image from/to the kernel will use a swap parition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file).  However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_ATOMIC_SNAPSHOT and
+mounted afterwards.
 
 These utilities SHOULD NOT make any assumptions regarding the ordering of
 data within the snapshot image, except for the image header that MAY be
-- 
cgit v0.10.2


From 8357376d3df21b7d6f857931a57ac50da9c66e26 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:18 -0800
Subject: [PATCH] swsusp: Improve handling of highmem

Currently swsusp saves the contents of highmem pages by copying them to the
normal zone which is quite inefficient (eg.  it requires two normal pages
to be used for saving one highmem page).  This may be improved by using
highmem for saving the contents of saveable highmem pages.

Namely, during the suspend phase of the suspend-resume cycle we try to
allocate as many free highmem pages as there are saveable highmem pages.
If there are not enough highmem image pages to store the contents of all of
the saveable highmem pages, some of them will be stored in the "normal"
memory.  Next, we allocate as many free "normal" pages as needed to store
the (remaining) image data.  We use a memory bitmap to mark the allocated
free pages (ie.  highmem as well as "normal" image pages).

Now, we use another memory bitmap to mark all of the saveable pages
(highmem as well as "normal") and the contents of the saveable pages are
copied into the image pages.  Then, the second bitmap is used to save the
pfns corresponding to the saveable pages and the first one is used to save
their data.

During the resume phase the pfns of the pages that were saveable during the
suspend are loaded from the image and used to mark the "unsafe" page
frames.  Next, we try to allocate as many free highmem page frames as to
load all of the image data that had been in the highmem before the suspend
and we allocate so many free "normal" page frames that the total number of
allocated free pages (highmem and "normal") is equal to the size of the
image.  While doing this we have to make sure that there will be some extra
free "normal" and "safe" page frames for two lists of PBEs constructed
later.

Now, the image data are loaded, if possible, into their "original" page
frames.  The image data that cannot be written into their "original" page
frames are loaded into "safe" page frames and their "original" kernel
virtual addresses, as well as the addresses of the "safe" pages containing
their copies, are stored in one of two lists of PBEs.

One list of PBEs is for the copies of "normal" suspend pages (ie.  "normal"
pages that were saveable during the suspend) and it is used in the same way
as previously (ie.  by the architecture-dependent parts of swsusp).  The
other list of PBEs is for the copies of highmem suspend pages.  The pages
in this list are restored (in a reversible way) right before the
arch-dependent code is called.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index b1237f1..bf99bd4 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -9,10 +9,13 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 
-/* page backup entry */
+/* struct pbe is used for creating lists of pages that should be restored
+ * atomically during the resume from disk, because the page frames they have
+ * occupied before the suspend are in use.
+ */
 struct pbe {
-	unsigned long address;		/* address of the copy */
-	unsigned long orig_address;	/* original address of page */
+	void *address;		/* address of the copy */
+	void *orig_address;	/* original address of a page */
 	struct pbe *next;
 };
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7dbfd9f..3763343 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -103,8 +103,8 @@ struct snapshot_handle {
 extern unsigned int snapshot_additional_pages(struct zone *zone);
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+extern void snapshot_write_finalize(struct snapshot_handle *handle);
 extern int snapshot_image_loaded(struct snapshot_handle *handle);
-extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 
 /*
  * This structure is used to pass the values needed for the identification
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d..fd8251d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
 /*
  * linux/kernel/power/snapshot.c
  *
- * This file provide system snapshot/restore functionality.
+ * This file provides system snapshot/restore functionality for swsusp.
  *
  * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
  *
- * This file is released under the GPLv2, and is based on swsusp.c.
+ * This file is released under the GPLv2.
  *
  */
 
-
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -34,137 +34,24 @@
 
 #include "power.h"
 
-/* List of PBEs used for creating and restoring the suspend image */
+/* List of PBEs needed for restoring the pages that were allocated before
+ * the suspend and included in the suspend image, but have also been
+ * allocated by the "resume" kernel, so their contents cannot be written
+ * directly to their "original" page frames.
+ */
 struct pbe *restore_pblist;
 
-static unsigned int nr_copy_pages;
-static unsigned int nr_meta_pages;
+/* Pointer to an auxiliary buffer (1 page) */
 static void *buffer;
 
-#ifdef CONFIG_HIGHMEM
-unsigned int count_highmem_pages(void)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	unsigned int n = 0;
-
-	for_each_zone (zone)
-		if (is_highmem(zone)) {
-			mark_free_pages(zone);
-			for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
-				struct page *page;
-				unsigned long pfn = zone_pfn + zone->zone_start_pfn;
-				if (!pfn_valid(pfn))
-					continue;
-				page = pfn_to_page(pfn);
-				if (PageReserved(page))
-					continue;
-				if (PageNosaveFree(page))
-					continue;
-				n++;
-			}
-		}
-	return n;
-}
-
-struct highmem_page {
-	char *data;
-	struct page *page;
-	struct highmem_page *next;
-};
-
-static struct highmem_page *highmem_copy;
-
-static int save_highmem_zone(struct zone *zone)
-{
-	unsigned long zone_pfn;
-	mark_free_pages(zone);
-	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-		struct page *page;
-		struct highmem_page *save;
-		void *kaddr;
-		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
-
-		if (!(pfn%10000))
-			printk(".");
-		if (!pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		/*
-		 * This condition results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations. This should be
-		 * corrected eventually when the cases giving rise to this
-		 * are better understood.
-		 */
-		if (PageReserved(page))
-			continue;
-		BUG_ON(PageNosave(page));
-		if (PageNosaveFree(page))
-			continue;
-		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
-		if (!save)
-			return -ENOMEM;
-		save->next = highmem_copy;
-		save->page = page;
-		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
-		if (!save->data) {
-			kfree(save);
-			return -ENOMEM;
-		}
-		kaddr = kmap_atomic(page, KM_USER0);
-		memcpy(save->data, kaddr, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		highmem_copy = save;
-	}
-	return 0;
-}
-
-int save_highmem(void)
-{
-	struct zone *zone;
-	int res = 0;
-
-	pr_debug("swsusp: Saving Highmem");
-	drain_local_pages();
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			res = save_highmem_zone(zone);
-		if (res)
-			return res;
-	}
-	printk("\n");
-	return 0;
-}
-
-int restore_highmem(void)
-{
-	printk("swsusp: Restoring Highmem\n");
-	while (highmem_copy) {
-		struct highmem_page *save = highmem_copy;
-		void *kaddr;
-		highmem_copy = save->next;
-
-		kaddr = kmap_atomic(save->page, KM_USER0);
-		memcpy(kaddr, save->data, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		free_page((long) save->data);
-		kfree(save);
-	}
-	return 0;
-}
-#else
-static inline unsigned int count_highmem_pages(void) {return 0;}
-static inline int save_highmem(void) {return 0;}
-static inline int restore_highmem(void) {return 0;}
-#endif
-
 /**
  *	@safe_needed - on resume, for storing the PBE list and the image,
  *	we can only use memory pages that do not conflict with the pages
- *	used before suspend.
+ *	used before suspend.  The unsafe pages have PageNosaveFree set
+ *	and we count them using unsafe_pages.
  *
- *	The unsafe pages are marked with the PG_nosave_free flag
- *	and we count them using unsafe_pages
+ *	Each allocated image page is marked as PageNosave and PageNosaveFree
+ *	so that swsusp_free() can release it.
  */
 
 #define PG_ANY		0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
 
 static unsigned int allocated_unsafe_pages;
 
-static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
 
@@ -195,20 +82,38 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 
 unsigned long get_safe_page(gfp_t gfp_mask)
 {
-	return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
+	return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+}
+
+static struct page *alloc_image_page(gfp_t gfp_mask) {
+	struct page *page;
+
+	page = alloc_page(gfp_mask);
+	if (page) {
+		SetPageNosave(page);
+		SetPageNosaveFree(page);
+	}
+	return page;
 }
 
 /**
  *	free_image_page - free page represented by @addr, allocated with
- *	alloc_image_page (page flags set by it must be cleared)
+ *	get_image_page (page flags set by it must be cleared)
  */
 
 static inline void free_image_page(void *addr, int clear_nosave_free)
 {
-	ClearPageNosave(virt_to_page(addr));
+	struct page *page;
+
+	BUG_ON(!virt_addr_valid(addr));
+
+	page = virt_to_page(addr);
+
+	ClearPageNosave(page);
 	if (clear_nosave_free)
-		ClearPageNosaveFree(virt_to_page(addr));
-	free_page((unsigned long)addr);
+		ClearPageNosaveFree(page);
+
+	__free_page(page);
 }
 
 /* struct linked_page is used to build chains of pages */
@@ -269,7 +174,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
 		struct linked_page *lp;
 
-		lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
+		lp = get_image_page(ca->gfp_mask, ca->safe_needed);
 		if (!lp)
 			return NULL;
 
@@ -446,8 +351,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 
 	/* Compute the number of zones */
 	nr = 0;
-	for_each_zone (zone)
-		if (populated_zone(zone) && !is_highmem(zone))
+	for_each_zone(zone)
+		if (populated_zone(zone))
 			nr++;
 
 	/* Allocate the list of zones bitmap objects */
@@ -459,10 +364,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 	}
 
 	/* Initialize the zone bitmap objects */
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		unsigned long pfn;
 
-		if (!populated_zone(zone) || is_highmem(zone))
+		if (!populated_zone(zone))
 			continue;
 
 		zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +386,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 		while (bb) {
 			unsigned long *ptr;
 
-			ptr = alloc_image_page(gfp_mask, safe_needed);
+			ptr = get_image_page(gfp_mask, safe_needed);
 			bb->data = ptr;
 			if (!ptr)
 				goto Free;
@@ -669,9 +574,81 @@ unsigned int snapshot_additional_pages(struct zone *zone)
 
 	res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
 	res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
-	return res;
+	return 2 * res;
 }
 
+#ifdef CONFIG_HIGHMEM
+/**
+ *	count_free_highmem_pages - compute the total number of free highmem
+ *	pages, system-wide.
+ */
+
+static unsigned int count_free_highmem_pages(void)
+{
+	struct zone *zone;
+	unsigned int cnt = 0;
+
+	for_each_zone(zone)
+		if (populated_zone(zone) && is_highmem(zone))
+			cnt += zone->free_pages;
+
+	return cnt;
+}
+
+/**
+ *	saveable_highmem_page - Determine whether a highmem page should be
+ *	included in the suspend image.
+ *
+ *	We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ *	and it isn't a part of a free chunk of pages.
+ */
+
+static struct page *saveable_highmem_page(unsigned long pfn)
+{
+	struct page *page;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	page = pfn_to_page(pfn);
+
+	BUG_ON(!PageHighMem(page));
+
+	if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
+		return NULL;
+
+	return page;
+}
+
+/**
+ *	count_highmem_pages - compute the total number of saveable highmem
+ *	pages.
+ */
+
+unsigned int count_highmem_pages(void)
+{
+	struct zone *zone;
+	unsigned int n = 0;
+
+	for_each_zone(zone) {
+		unsigned long pfn, max_zone_pfn;
+
+		if (!is_highmem(zone))
+			continue;
+
+		mark_free_pages(zone);
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (saveable_highmem_page(pfn))
+				n++;
+	}
+	return n;
+}
+#else
+static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
+static inline unsigned int count_highmem_pages(void) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
 /**
  *	pfn_is_nosave - check if given pfn is in the 'nosave' section
  */
@@ -684,12 +661,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
 }
 
 /**
- *	saveable - Determine whether a page should be cloned or not.
- *	@pfn:	The page
+ *	saveable - Determine whether a non-highmem page should be included in
+ *	the suspend image.
  *
- *	We save a page if it isn't Nosave, and is not in the range of pages
- *	statically defined as 'unsaveable', and it
- *	isn't a part of a free chunk of pages.
+ *	We should save the page if it isn't Nosave, and is not in the range
+ *	of pages statically defined as 'unsaveable', and it isn't a part of
+ *	a free chunk of pages.
  */
 
 static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +678,130 @@ static struct page *saveable_page(unsigned long pfn)
 
 	page = pfn_to_page(pfn);
 
-	if (PageNosave(page))
+	BUG_ON(PageHighMem(page));
+
+	if (PageNosave(page) || PageNosaveFree(page))
 		return NULL;
+
 	if (PageReserved(page) && pfn_is_nosave(pfn))
 		return NULL;
-	if (PageNosaveFree(page))
-		return NULL;
 
 	return page;
 }
 
+/**
+ *	count_data_pages - compute the total number of saveable non-highmem
+ *	pages.
+ */
+
 unsigned int count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
 	unsigned int n = 0;
 
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		if (is_highmem(zone))
 			continue;
+
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			n += !!saveable_page(pfn);
+			if(saveable_page(pfn))
+				n++;
 	}
 	return n;
 }
 
-static inline void copy_data_page(long *dst, long *src)
+/* This is needed, because copy_page and memcpy are not usable for copying
+ * task structs.
+ */
+static inline void do_copy_page(long *dst, long *src)
 {
 	int n;
 
-	/* copy_page and memcpy are not usable for copying task structs. */
 	for (n = PAGE_SIZE / sizeof(long); n; n--)
 		*dst++ = *src++;
 }
 
+#ifdef CONFIG_HIGHMEM
+static inline struct page *
+page_is_saveable(struct zone *zone, unsigned long pfn)
+{
+	return is_highmem(zone) ?
+			saveable_highmem_page(pfn) : saveable_page(pfn);
+}
+
+static inline void
+copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+	struct page *s_page, *d_page;
+	void *src, *dst;
+
+	s_page = pfn_to_page(src_pfn);
+	d_page = pfn_to_page(dst_pfn);
+	if (PageHighMem(s_page)) {
+		src = kmap_atomic(s_page, KM_USER0);
+		dst = kmap_atomic(d_page, KM_USER1);
+		do_copy_page(dst, src);
+		kunmap_atomic(src, KM_USER0);
+		kunmap_atomic(dst, KM_USER1);
+	} else {
+		src = page_address(s_page);
+		if (PageHighMem(d_page)) {
+			/* Page pointed to by src may contain some kernel
+			 * data modified by kmap_atomic()
+			 */
+			do_copy_page(buffer, src);
+			dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
+			memcpy(dst, buffer, PAGE_SIZE);
+			kunmap_atomic(dst, KM_USER0);
+		} else {
+			dst = page_address(d_page);
+			do_copy_page(dst, src);
+		}
+	}
+}
+#else
+#define page_is_saveable(zone, pfn)	saveable_page(pfn)
+
+static inline void
+copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+	do_copy_page(page_address(pfn_to_page(dst_pfn)),
+			page_address(pfn_to_page(src_pfn)));
+}
+#endif /* CONFIG_HIGHMEM */
+
 static void
 copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 {
 	struct zone *zone;
 	unsigned long pfn;
 
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		unsigned long max_zone_pfn;
 
-		if (is_highmem(zone))
-			continue;
-
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			if (saveable_page(pfn))
+			if (page_is_saveable(zone, pfn))
 				memory_bm_set_bit(orig_bm, pfn);
 	}
 	memory_bm_position_reset(orig_bm);
 	memory_bm_position_reset(copy_bm);
 	do {
 		pfn = memory_bm_next_pfn(orig_bm);
-		if (likely(pfn != BM_END_OF_MAP)) {
-			struct page *page;
-			void *src;
-
-			page = pfn_to_page(pfn);
-			src = page_address(page);
-			page = pfn_to_page(memory_bm_next_pfn(copy_bm));
-			copy_data_page(page_address(page), src);
-		}
+		if (likely(pfn != BM_END_OF_MAP))
+			copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
 	} while (pfn != BM_END_OF_MAP);
 }
 
+/* Total number of image pages */
+static unsigned int nr_copy_pages;
+/* Number of pages needed for saving the original pfns of the image pages */
+static unsigned int nr_meta_pages;
+
 /**
  *	swsusp_free - free pages allocated for the suspend.
  *
@@ -792,7 +823,7 @@ void swsusp_free(void)
 				if (PageNosave(page) && PageNosaveFree(page)) {
 					ClearPageNosave(page);
 					ClearPageNosaveFree(page);
-					free_page((long) page_address(page));
+					__free_page(page);
 				}
 			}
 	}
@@ -802,34 +833,108 @@ void swsusp_free(void)
 	buffer = NULL;
 }
 
+#ifdef CONFIG_HIGHMEM
+/**
+  *	count_pages_for_highmem - compute the number of non-highmem pages
+  *	that will be necessary for creating copies of highmem pages.
+  */
+
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
+{
+	unsigned int free_highmem = count_free_highmem_pages();
+
+	if (free_highmem >= nr_highmem)
+		nr_highmem = 0;
+	else
+		nr_highmem -= free_highmem;
+
+	return nr_highmem;
+}
+#else
+static unsigned int
+count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+#endif /* CONFIG_HIGHMEM */
 
 /**
- *	enough_free_mem - Make sure we enough free memory to snapshot.
- *
- *	Returns TRUE or FALSE after checking the number of available
- *	free pages.
+ *	enough_free_mem - Make sure we have enough free memory for the
+ *	snapshot image.
  */
 
-static int enough_free_mem(unsigned int nr_pages)
+static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 {
 	struct zone *zone;
 	unsigned int free = 0, meta = 0;
 
-	for_each_zone (zone)
-		if (!is_highmem(zone)) {
+	for_each_zone(zone) {
+		meta += snapshot_additional_pages(zone);
+		if (!is_highmem(zone))
 			free += zone->free_pages;
-			meta += snapshot_additional_pages(zone);
-		}
+	}
 
-	pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
+	nr_pages += count_pages_for_highmem(nr_highmem);
+	pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
 		nr_pages, PAGES_FOR_IO, meta, free);
 
 	return free > nr_pages + PAGES_FOR_IO + meta;
 }
 
+#ifdef CONFIG_HIGHMEM
+/**
+ *	get_highmem_buffer - if there are some highmem pages in the suspend
+ *	image, we may need the buffer to copy them and/or load their data.
+ */
+
+static inline int get_highmem_buffer(int safe_needed)
+{
+	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+	return buffer ? 0 : -ENOMEM;
+}
+
+/**
+ *	alloc_highmem_image_pages - allocate some highmem pages for the image.
+ *	Try to allocate as many pages as needed, but if the number of free
+ *	highmem pages is lesser than that, allocate them all.
+ */
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+{
+	unsigned int to_alloc = count_free_highmem_pages();
+
+	if (to_alloc > nr_highmem)
+		to_alloc = nr_highmem;
+
+	nr_highmem -= to_alloc;
+	while (to_alloc-- > 0) {
+		struct page *page;
+
+		page = alloc_image_page(__GFP_HIGHMEM);
+		memory_bm_set_bit(bm, page_to_pfn(page));
+	}
+	return nr_highmem;
+}
+#else
+static inline int get_highmem_buffer(int safe_needed) { return 0; }
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ *	swsusp_alloc - allocate memory for the suspend image
+ *
+ *	We first try to allocate as many highmem pages as there are
+ *	saveable highmem pages in the system.  If that fails, we allocate
+ *	non-highmem pages for the copies of the remaining highmem ones.
+ *
+ *	In this approach it is likely that the copies of highmem pages will
+ *	also be located in the high memory, because of the way in which
+ *	copy_data_pages() works.
+ */
+
 static int
 swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
-		unsigned int nr_pages)
+		unsigned int nr_pages, unsigned int nr_highmem)
 {
 	int error;
 
@@ -841,13 +946,19 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 	if (error)
 		goto Free;
 
+	if (nr_highmem > 0) {
+		error = get_highmem_buffer(PG_ANY);
+		if (error)
+			goto Free;
+
+		nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
+	}
 	while (nr_pages-- > 0) {
-		struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+
 		if (!page)
 			goto Free;
 
-		SetPageNosave(page);
-		SetPageNosaveFree(page);
 		memory_bm_set_bit(copy_bm, page_to_pfn(page));
 	}
 	return 0;
@@ -857,30 +968,39 @@ Free:
 	return -ENOMEM;
 }
 
-/* Memory bitmap used for marking saveable pages */
+/* Memory bitmap used for marking saveable pages (during suspend) or the
+ * suspend image pages (during resume)
+ */
 static struct memory_bitmap orig_bm;
-/* Memory bitmap used for marking allocated pages that will contain the copies
- * of saveable pages
+/* Memory bitmap used on suspend for marking allocated pages that will contain
+ * the copies of saveable pages.  During resume it is initially used for
+ * marking the suspend image pages, but then its set bits are duplicated in
+ * @orig_bm and it is released.  Next, on systems with high memory, it may be
+ * used for marking "safe" highmem pages, but it has to be reinitialized for
+ * this purpose.
  */
 static struct memory_bitmap copy_bm;
 
 asmlinkage int swsusp_save(void)
 {
-	unsigned int nr_pages;
+	unsigned int nr_pages, nr_highmem;
 
-	pr_debug("swsusp: critical section: \n");
+	printk("swsusp: critical section: \n");
 
 	drain_local_pages();
 	nr_pages = count_data_pages();
-	printk("swsusp: Need to copy %u pages\n", nr_pages);
+	nr_highmem = count_highmem_pages();
+	printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
 
-	if (!enough_free_mem(nr_pages)) {
+	if (!enough_free_mem(nr_pages, nr_highmem)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
 	}
 
-	if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
+	if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+		printk(KERN_ERR "swsusp: Memory allocation failed\n");
 		return -ENOMEM;
+	}
 
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
@@ -894,10 +1014,12 @@ asmlinkage int swsusp_save(void)
 	 * touch swap space! Except we must write out our image of course.
 	 */
 
+	nr_pages += nr_highmem;
 	nr_copy_pages = nr_pages;
-	nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
 
 	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+
 	return 0;
 }
 
@@ -960,7 +1082,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+		buffer = get_image_page(GFP_ATOMIC, PG_ANY);
 		if (!buffer)
 			return -ENOMEM;
 	}
@@ -975,9 +1097,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 			memset(buffer, 0, PAGE_SIZE);
 			pack_pfns(buffer, &orig_bm);
 		} else {
-			unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+			struct page *page;
 
-			handle->buffer = page_address(pfn_to_page(pfn));
+			page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
+			if (PageHighMem(page)) {
+				/* Highmem pages are copied to the buffer,
+				 * because we can't return with a kmapped
+				 * highmem page (we may not be called again).
+				 */
+				void *kaddr;
+
+				kaddr = kmap_atomic(page, KM_USER0);
+				memcpy(buffer, kaddr, PAGE_SIZE);
+				kunmap_atomic(kaddr, KM_USER0);
+				handle->buffer = buffer;
+			} else {
+				handle->buffer = page_address(page);
+			}
 		}
 		handle->prev = handle->cur;
 	}
@@ -1005,7 +1141,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 	unsigned long pfn, max_zone_pfn;
 
 	/* Clear page flags */
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (pfn_valid(pfn))
@@ -1101,6 +1237,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 	}
 }
 
+/* List of "safe" pages that may be used to store data loaded from the suspend
+ * image
+ */
+static struct linked_page *safe_pages_list;
+
+#ifdef CONFIG_HIGHMEM
+/* struct highmem_pbe is used for creating the list of highmem pages that
+ * should be restored atomically during the resume from disk, because the page
+ * frames they have occupied before the suspend are in use.
+ */
+struct highmem_pbe {
+	struct page *copy_page;	/* data is here now */
+	struct page *orig_page;	/* data was here before the suspend */
+	struct highmem_pbe *next;
+};
+
+/* List of highmem PBEs needed for restoring the highmem pages that were
+ * allocated before the suspend and included in the suspend image, but have
+ * also been allocated by the "resume" kernel, so their contents cannot be
+ * written directly to their "original" page frames.
+ */
+static struct highmem_pbe *highmem_pblist;
+
+/**
+ *	count_highmem_image_pages - compute the number of highmem pages in the
+ *	suspend image.  The bits in the memory bitmap @bm that correspond to the
+ *	image pages are assumed to be set.
+ */
+
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
+{
+	unsigned long pfn;
+	unsigned int cnt = 0;
+
+	memory_bm_position_reset(bm);
+	pfn = memory_bm_next_pfn(bm);
+	while (pfn != BM_END_OF_MAP) {
+		if (PageHighMem(pfn_to_page(pfn)))
+			cnt++;
+
+		pfn = memory_bm_next_pfn(bm);
+	}
+	return cnt;
+}
+
+/**
+ *	prepare_highmem_image - try to allocate as many highmem pages as
+ *	there are highmem image pages (@nr_highmem_p points to the variable
+ *	containing the number of highmem image pages).  The pages that are
+ *	"safe" (ie. will not be overwritten when the suspend image is
+ *	restored) have the corresponding bits set in @bm (it must be
+ *	unitialized).
+ *
+ *	NOTE: This function should not be called if there are no highmem
+ *	image pages.
+ */
+
+static unsigned int safe_highmem_pages;
+
+static struct memory_bitmap *safe_highmem_bm;
+
+static int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+	unsigned int to_alloc;
+
+	if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
+		return -ENOMEM;
+
+	if (get_highmem_buffer(PG_SAFE))
+		return -ENOMEM;
+
+	to_alloc = count_free_highmem_pages();
+	if (to_alloc > *nr_highmem_p)
+		to_alloc = *nr_highmem_p;
+	else
+		*nr_highmem_p = to_alloc;
+
+	safe_highmem_pages = 0;
+	while (to_alloc-- > 0) {
+		struct page *page;
+
+		page = alloc_page(__GFP_HIGHMEM);
+		if (!PageNosaveFree(page)) {
+			/* The page is "safe", set its bit the bitmap */
+			memory_bm_set_bit(bm, page_to_pfn(page));
+			safe_highmem_pages++;
+		}
+		/* Mark the page as allocated */
+		SetPageNosave(page);
+		SetPageNosaveFree(page);
+	}
+	memory_bm_position_reset(bm);
+	safe_highmem_bm = bm;
+	return 0;
+}
+
+/**
+ *	get_highmem_page_buffer - for given highmem image page find the buffer
+ *	that suspend_write_next() should set for its caller to write to.
+ *
+ *	If the page is to be saved to its "original" page frame or a copy of
+ *	the page is to be made in the highmem, @buffer is returned.  Otherwise,
+ *	the copy of the page is to be made in normal memory, so the address of
+ *	the copy is returned.
+ *
+ *	If @buffer is returned, the caller of suspend_write_next() will write
+ *	the page's contents to @buffer, so they will have to be copied to the
+ *	right location on the next call to suspend_write_next() and it is done
+ *	with the help of copy_last_highmem_page().  For this purpose, if
+ *	@buffer is returned, @last_highmem page is set to the page to which
+ *	the data will have to be copied from @buffer.
+ */
+
+static struct page *last_highmem_page;
+
+static void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+	struct highmem_pbe *pbe;
+	void *kaddr;
+
+	if (PageNosave(page) && PageNosaveFree(page)) {
+		/* We have allocated the "original" page frame and we can
+		 * use it directly to store the loaded page.
+		 */
+		last_highmem_page = page;
+		return buffer;
+	}
+	/* The "original" page frame has not been allocated and we have to
+	 * use a "safe" page frame to store the loaded page.
+	 */
+	pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
+	if (!pbe) {
+		swsusp_free();
+		return NULL;
+	}
+	pbe->orig_page = page;
+	if (safe_highmem_pages > 0) {
+		struct page *tmp;
+
+		/* Copy of the page will be stored in high memory */
+		kaddr = buffer;
+		tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
+		safe_highmem_pages--;
+		last_highmem_page = tmp;
+		pbe->copy_page = tmp;
+	} else {
+		/* Copy of the page will be stored in normal memory */
+		kaddr = safe_pages_list;
+		safe_pages_list = safe_pages_list->next;
+		pbe->copy_page = virt_to_page(kaddr);
+	}
+	pbe->next = highmem_pblist;
+	highmem_pblist = pbe;
+	return kaddr;
+}
+
+/**
+ *	copy_last_highmem_page - copy the contents of a highmem image from
+ *	@buffer, where the caller of snapshot_write_next() has place them,
+ *	to the right location represented by @last_highmem_page .
+ */
+
+static void copy_last_highmem_page(void)
+{
+	if (last_highmem_page) {
+		void *dst;
+
+		dst = kmap_atomic(last_highmem_page, KM_USER0);
+		memcpy(dst, buffer, PAGE_SIZE);
+		kunmap_atomic(dst, KM_USER0);
+		last_highmem_page = NULL;
+	}
+}
+
+static inline int last_highmem_page_copied(void)
+{
+	return !last_highmem_page;
+}
+
+static inline void free_highmem_data(void)
+{
+	if (safe_highmem_bm)
+		memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
+
+	if (buffer)
+		free_image_page(buffer, PG_UNSAFE_CLEAR);
+}
+#else
+static inline int get_safe_write_buffer(void) { return 0; }
+
+static unsigned int
+count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+
+static inline int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+	return 0;
+}
+
+static inline void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+	return NULL;
+}
+
+static inline void copy_last_highmem_page(void) {}
+static inline int last_highmem_page_copied(void) { return 1; }
+static inline void free_highmem_data(void) {}
+#endif /* CONFIG_HIGHMEM */
+
 /**
  *	prepare_image - use the memory bitmap @bm to mark the pages that will
  *	be overwritten in the process of restoring the system memory state
@@ -1110,20 +1458,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
  *	The idea is to allocate a new memory bitmap first and then allocate
  *	as many pages as needed for the image data, but not to assign these
  *	pages to specific tasks initially.  Instead, we just mark them as
- *	allocated and create a list of "safe" pages that will be used later.
+ *	allocated and create a lists of "safe" pages that will be used
+ *	later.  On systems with high memory a list of "safe" highmem pages is
+ *	also created.
  */
 
 #define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
 
-static struct linked_page *safe_pages_list;
-
 static int
 prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
-	unsigned int nr_pages;
+	unsigned int nr_pages, nr_highmem;
 	struct linked_page *sp_list, *lp;
 	int error;
 
+	/* If there is no highmem, the buffer will not be necessary */
+	free_image_page(buffer, PG_UNSAFE_CLEAR);
+	buffer = NULL;
+
+	nr_highmem = count_highmem_image_pages(bm);
 	error = mark_unsafe_pages(bm);
 	if (error)
 		goto Free;
@@ -1134,6 +1487,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 
 	duplicate_memory_bitmap(new_bm, bm);
 	memory_bm_free(bm, PG_UNSAFE_KEEP);
+	if (nr_highmem > 0) {
+		error = prepare_highmem_image(bm, &nr_highmem);
+		if (error)
+			goto Free;
+	}
 	/* Reserve some safe pages for potential later use.
 	 *
 	 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1500,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	 */
 	sp_list = NULL;
 	/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
-	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
 	while (nr_pages > 0) {
-		lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
+		lp = get_image_page(GFP_ATOMIC, PG_SAFE);
 		if (!lp) {
 			error = -ENOMEM;
 			goto Free;
@@ -1156,7 +1514,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	}
 	/* Preallocate memory for the image */
 	safe_pages_list = NULL;
-	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	while (nr_pages > 0) {
 		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
 		if (!lp) {
@@ -1196,6 +1554,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 	struct pbe *pbe;
 	struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
 
+	if (PageHighMem(page))
+		return get_highmem_page_buffer(page, ca);
+
 	if (PageNosave(page) && PageNosaveFree(page))
 		/* We have allocated the "original" page frame and we can
 		 * use it directly to store the loaded page.
@@ -1210,12 +1571,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 		swsusp_free();
 		return NULL;
 	}
-	pbe->orig_address = (unsigned long)page_address(page);
-	pbe->address = (unsigned long)safe_pages_list;
+	pbe->orig_address = page_address(page);
+	pbe->address = safe_pages_list;
 	safe_pages_list = safe_pages_list->next;
 	pbe->next = restore_pblist;
 	restore_pblist = pbe;
-	return (void *)pbe->address;
+	return pbe->address;
 }
 
 /**
@@ -1249,14 +1610,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
 
-	if (!buffer) {
-		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+	if (handle->offset == 0) {
+		if (!buffer)
+			/* This makes the buffer be freed by swsusp_free() */
+			buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+
 		if (!buffer)
 			return -ENOMEM;
-	}
-	if (!handle->offset)
+
 		handle->buffer = buffer;
+	}
 	handle->sync_read = 1;
 	if (handle->prev < handle->cur) {
 		if (handle->prev == 0) {
@@ -1284,8 +1647,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 					return -ENOMEM;
 			}
 		} else {
+			copy_last_highmem_page();
 			handle->buffer = get_buffer(&orig_bm, &ca);
-			handle->sync_read = 0;
+			if (handle->buffer != buffer)
+				handle->sync_read = 0;
 		}
 		handle->prev = handle->cur;
 	}
@@ -1301,15 +1666,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	return count;
 }
 
+/**
+ *	snapshot_write_finalize - must be called after the last call to
+ *	snapshot_write_next() in case the last page in the image happens
+ *	to be a highmem page and its contents should be stored in the
+ *	highmem.  Additionally, it releases the memory that will not be
+ *	used any more.
+ */
+
+void snapshot_write_finalize(struct snapshot_handle *handle)
+{
+	copy_last_highmem_page();
+	/* Free only if we have loaded the image entirely */
+	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
+		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+		free_highmem_data();
+	}
+}
+
 int snapshot_image_loaded(struct snapshot_handle *handle)
 {
-	return !(!nr_copy_pages ||
+	return !(!nr_copy_pages || !last_highmem_page_copied() ||
 			handle->cur <= nr_meta_pages + nr_copy_pages);
 }
 
-void snapshot_free_unused_memory(struct snapshot_handle *handle)
+#ifdef CONFIG_HIGHMEM
+/* Assumes that @buf is ready and points to a "safe" page */
+static inline void
+swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
 {
-	/* Free only if we have loaded the image entirely */
-	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
-		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+	void *kaddr1, *kaddr2;
+
+	kaddr1 = kmap_atomic(p1, KM_USER0);
+	kaddr2 = kmap_atomic(p2, KM_USER1);
+	memcpy(buf, kaddr1, PAGE_SIZE);
+	memcpy(kaddr1, kaddr2, PAGE_SIZE);
+	memcpy(kaddr2, buf, PAGE_SIZE);
+	kunmap_atomic(kaddr1, KM_USER0);
+	kunmap_atomic(kaddr2, KM_USER1);
+}
+
+/**
+ *	restore_highmem - for each highmem page that was allocated before
+ *	the suspend and included in the suspend image, and also has been
+ *	allocated by the "resume" kernel swap its current (ie. "before
+ *	resume") contents with the previous (ie. "before suspend") one.
+ *
+ *	If the resume eventually fails, we can call this function once
+ *	again and restore the "before resume" highmem state.
+ */
+
+int restore_highmem(void)
+{
+	struct highmem_pbe *pbe = highmem_pblist;
+	void *buf;
+
+	if (!pbe)
+		return 0;
+
+	buf = get_image_page(GFP_ATOMIC, PG_SAFE);
+	if (!buf)
+		return -ENOMEM;
+
+	while (pbe) {
+		swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
+		pbe = pbe->next;
+	}
+	free_image_page(buf, PG_UNSAFE_CLEAR);
+	return 0;
 }
+#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aa5a9bf..cbd187e 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -558,7 +558,7 @@ static int load_image(struct swap_map_handle *handle,
 		error = err2;
 	if (!error) {
 		printk("\b\b\b\bdone\n");
-		snapshot_free_unused_memory(snapshot);
+		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4147a75..68de5c1 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -64,10 +64,8 @@ int in_suspend __nosavedata = 0;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
-int save_highmem(void);
 int restore_highmem(void);
 #else
-static inline int save_highmem(void) { return 0; }
 static inline int restore_highmem(void) { return 0; }
 static inline unsigned int count_highmem_pages(void) { return 0; }
 #endif
@@ -184,7 +182,7 @@ static inline unsigned long __shrink_memory(long tmp)
 
 int swsusp_shrink_memory(void)
 {
-	long size, tmp;
+	long tmp;
 	struct zone *zone;
 	unsigned long pages = 0;
 	unsigned int i = 0;
@@ -192,15 +190,27 @@ int swsusp_shrink_memory(void)
 
 	printk("Shrinking memory...  ");
 	do {
-		size = 2 * count_highmem_pages();
-		size += size / 50 + count_data_pages() + PAGES_FOR_IO;
+		long size, highmem_size;
+
+		highmem_size = count_highmem_pages();
+		size = count_data_pages() + PAGES_FOR_IO;
 		tmp = size;
+		size += highmem_size;
 		for_each_zone (zone)
-			if (!is_highmem(zone) && populated_zone(zone)) {
-				tmp -= zone->free_pages;
-				tmp += zone->lowmem_reserve[ZONE_NORMAL];
-				tmp += snapshot_additional_pages(zone);
+			if (populated_zone(zone)) {
+				if (is_highmem(zone)) {
+					highmem_size -= zone->free_pages;
+				} else {
+					tmp -= zone->free_pages;
+					tmp += zone->lowmem_reserve[ZONE_NORMAL];
+					tmp += snapshot_additional_pages(zone);
+				}
 			}
+
+		if (highmem_size < 0)
+			highmem_size = 0;
+
+		tmp += highmem_size;
 		if (tmp > 0) {
 			tmp = __shrink_memory(tmp);
 			if (!tmp)
@@ -223,6 +233,7 @@ int swsusp_suspend(void)
 
 	if ((error = arch_prepare_suspend()))
 		return error;
+
 	local_irq_disable();
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* device_power_down() now.
@@ -235,18 +246,11 @@ int swsusp_suspend(void)
 		goto Enable_irqs;
 	}
 
-	if ((error = save_highmem())) {
-		printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
-		goto Restore_highmem;
-	}
-
 	save_processor_state();
 	if ((error = swsusp_arch_suspend()))
 		printk(KERN_ERR "Error %d suspending\n", error);
 	/* Restore control flow magically appears here */
 	restore_processor_state();
-Restore_highmem:
-	restore_highmem();
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
@@ -268,18 +272,23 @@ int swsusp_resume(void)
 		printk(KERN_ERR "Some devices failed to power down, very bad\n");
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
-	error = swsusp_arch_resume();
-	/* Code below is only ever reached in case of failure. Otherwise
-	 * execution continues at place where swsusp_arch_suspend was called
-         */
-	BUG_ON(!error);
+	error = restore_highmem();
+	if (!error) {
+		error = swsusp_arch_resume();
+		/* The code below is only ever reached in case of a failure.
+		 * Otherwise execution continues at place where
+		 * swsusp_arch_suspend() was called
+        	 */
+		BUG_ON(!error);
+		/* This call to restore_highmem() undos the previous one */
+		restore_highmem();
+	}
 	/* The only reason why swsusp_arch_resume() can fail is memory being
 	 * very tight, so we have to free it as soon as we can to avoid
 	 * subsequent failures
 	 */
 	swsusp_free();
 	restore_processor_state();
-	restore_highmem();
 	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 05c58a2..a63b25c 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -194,12 +194,12 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		break;
 
 	case SNAPSHOT_ATOMIC_RESTORE:
+		snapshot_write_finalize(&data->handle);
 		if (data->mode != O_WRONLY || !data->frozen ||
 		    !snapshot_image_loaded(&data->handle)) {
 			error = -EPERM;
 			break;
 		}
-		snapshot_free_unused_memory(&data->handle);
 		down(&pm_sem);
 		pm_prepare_console();
 		suspend_console();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e97baa..2a6a79f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1260,6 +1260,9 @@ out:
 	}
 	if (!all_zones_ok) {
 		cond_resched();
+
+		try_to_freeze();
+
 		goto loop_again;
 	}
 
-- 
cgit v0.10.2


From 859491218770315ba95ee3fa09961fc71c506cae Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:19 -0800
Subject: [PATCH] swsusp: use __GFP_WAIT

swsusp uses GFP_ATOMIC, but it can afford to use __GFP_WAIT, which will
permit it to reclaim clean pagecache instead of emitting scary
page-allocation-failure messages.

Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index cbd187e..52e70ca 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -63,7 +63,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 {
 	struct bio *bio;
 
-	bio = bio_alloc(GFP_ATOMIC, 1);
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
 	if (!bio)
 		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
@@ -216,7 +216,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 		return -ENOSPC;
 
 	if (bio_chain) {
-		src = (void *)__get_free_page(GFP_ATOMIC);
+		src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
 		if (src) {
 			memcpy(src, buf, PAGE_SIZE);
 		} else {
@@ -473,7 +473,7 @@ static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
 	if (!start)
 		return -EINVAL;
 
-	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
 	if (!handle->cur)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From 8a05aac2631aa0e6494d9dc990f8c68ed8b8fde7 Mon Sep 17 00:00:00 2001
From: Stefan Seyfried <seife@suse.de>
Date: Wed, 6 Dec 2006 20:34:21 -0800
Subject: [PATCH] swsusp: fix platform mode

At some point after 2.6.13, in-kernel software suspend got "incomplete" for
the so-called "platform" mode.  pm_ops->prepare() is never called.  A
visible sign of this is the "moon" light on thinkpads not flashing during
suspend.  Fix by readding the pm_ops->prepare call during suspend.

Signed-off-by: Stefan Seyfried <seife@suse.de>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d79feeb..f507923 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,6 +30,22 @@ dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
 
 /**
+ *	platform_prepare - prepare the machine for hibernation using the
+ *	platform driver if so configured and return an error code if it fails
+ */
+
+static inline int platform_prepare(void)
+{
+	int error = 0;
+
+	if (pm_disk_mode == PM_DISK_PLATFORM) {
+		if (pm_ops && pm_ops->prepare)
+			error = pm_ops->prepare(PM_SUSPEND_DISK);
+	}
+	return error;
+}
+
+/**
  *	power_down - Shut machine down for hibernate.
  *	@mode:		Suspend-to-disk mode
  *
@@ -91,9 +107,15 @@ static int prepare_processes(void)
 		goto thaw;
 	}
 
+	error = platform_prepare();
+	if (error)
+		goto thaw;
+
 	/* Free memory before shutting down devices. */
 	if (!(error = swsusp_shrink_memory()))
 		return 0;
+
+	platform_finish();
 thaw:
 	thaw_processes();
 enable_cpus:
-- 
cgit v0.10.2


From 7dfb71030f7636a0d65200158113c37764552f93 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:23 -0800
Subject: [PATCH] Add include/linux/freezer.h and move definitions from sched.h

Move process freezing functions from include/linux/sched.h to freezer.h, so
that modifications to the freezer or the kernel configuration don't require
recompiling just about everything.

[akpm@osdl.org: fix ueagle driver]
Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 48cf7ff..f38a60a 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -11,6 +11,7 @@
 #include <linux/signal.h>
 #include <linux/ptrace.h>
 #include <linux/personality.h>
+#include <linux/freezer.h>
 
 #include <asm/cacheflush.h>
 #include <asm/ucontext.h>
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index 3309665..0ec14854 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -15,7 +15,7 @@
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/unistd.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 #include <asm/ucontext.h>
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index b8a5882..85baeae 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -21,7 +21,7 @@
 #include <linux/ptrace.h>
 #include <linux/unistd.h>
 #include <linux/personality.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index 7787f70..0295560 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -38,7 +38,7 @@
 #include <linux/personality.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 
 #include <asm/setup.h>
 #include <asm/uaccess.h>
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 3b7a63e..44c5a32 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -34,6 +34,7 @@
 #include <linux/pci.h>
 #include <linux/msi.h>
 #include <linux/htirq.h>
+#include <linux/freezer.h>
 
 #include <asm/io.h>
 #include <asm/smp.h>
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index b60cea4..092ea86 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -21,7 +21,7 @@
 #include <linux/unistd.h>
 #include <linux/stddef.h>
 #include <linux/personality.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <asm/cacheflush.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 320353f..e4ebe1a 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -36,7 +36,7 @@
 #include <linux/stddef.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #endif
 
 #include <asm/uaccess.h>
diff --git a/arch/sh/kernel/signal.c b/arch/sh/kernel/signal.c
index 50d7c49..bb1c480 100644
--- a/arch/sh/kernel/signal.c
+++ b/arch/sh/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/elf.h>
 #include <linux/personality.h>
 #include <linux/binfmts.h>
+#include <linux/freezer.h>
 
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
diff --git a/arch/sh64/kernel/signal.c b/arch/sh64/kernel/signal.c
index 9e2ffc4..1666d3e 100644
--- a/arch/sh64/kernel/signal.c
+++ b/arch/sh64/kernel/signal.c
@@ -22,7 +22,7 @@
 #include <linux/errno.h>
 #include <linux/wait.h>
 #include <linux/personality.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index f2904f6..e45eaa2 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -54,7 +54,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/miscdevice.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/mutex.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_ioctl.h>
diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index 9902ffa..cc2cd46 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -38,6 +38,7 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/delay.h>
+#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 75e9e38..1b4fc92 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -28,6 +28,7 @@
 #include <linux/sysdev.h>
 #include <linux/ctype.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
 #include <asm/edac.h>
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 8e7b83f..e829c93 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -15,6 +15,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/moduleparam.h>
+#include <linux/freezer.h>
 #include <asm/atomic.h>
 
 #include "csr.h"
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index a0af97e..79dfb4b 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -23,6 +23,7 @@
 #include <linux/kthread.h>
 #include <linux/sched.h>	/* HZ */
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 /*#include <asm/io.h>*/
 
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index 211943f..5f1d403 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -35,6 +35,7 @@
 #include <linux/slab.h>
 #include <linux/kthread.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
 MODULE_DESCRIPTION("Serio abstraction core");
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index 13b953a..3d3bf16 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -24,6 +24,7 @@
 #include <linux/suspend.h>
 #include <linux/kthread.h>
 #include <linux/moduleparam.h>
+#include <linux/freezer.h>
 
 #include <asm/prom.h>
 #include <asm/machdep.h>
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index e63ea1c..c8558d4 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -42,7 +42,7 @@
 #include <linux/interrupt.h>
 #include <linux/device.h>
 #include <linux/sysdev.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/syscalls.h>
 #include <linux/cpu.h>
 #include <asm/prom.h>
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index ab3faa7..e947af9 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -34,6 +34,7 @@
 #include <linux/device.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <asm/prom.h>
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8cbf9c9..6c4345b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -39,10 +39,10 @@
 #include <linux/raid/bitmap.h>
 #include <linux/sysctl.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
-#include <linux/suspend.h>
 #include <linux/poll.h>
 #include <linux/mutex.h>
 #include <linux/ctype.h>
+#include <linux/freezer.h>
 
 #include <linux/init.h>
 
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index a2ab2ee..e859722 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -34,7 +34,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/list.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/jiffies.h>
 #include <asm/processor.h>
 
diff --git a/drivers/media/video/msp3400-driver.c b/drivers/media/video/msp3400-driver.c
index cf43df3..e1b56dc 100644
--- a/drivers/media/video/msp3400-driver.c
+++ b/drivers/media/video/msp3400-driver.c
@@ -56,7 +56,7 @@
 #include <media/tvaudio.h>
 #include <media/msp3400.h>
 #include <linux/kthread.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include "msp3400-driver.h"
 
 /* ---------------------------------------------------------------------- */
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index fcaef4b..d506dfa 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -29,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/smp_lock.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include <media/tvaudio.h>
 #include <media/v4l2-common.h>
diff --git a/drivers/media/video/video-buf-dvb.c b/drivers/media/video/video-buf-dvb.c
index f53edf1..fcc5467 100644
--- a/drivers/media/video/video-buf-dvb.c
+++ b/drivers/media/video/video-buf-dvb.c
@@ -20,7 +20,7 @@
 #include <linux/fs.h>
 #include <linux/kthread.h>
 #include <linux/file.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 
 #include <media/video-buf.h>
 #include <media/video-buf-dvb.h>
diff --git a/drivers/media/video/vivi.c b/drivers/media/video/vivi.c
index 3c8dc72..9986de5 100644
--- a/drivers/media/video/vivi.c
+++ b/drivers/media/video/vivi.c
@@ -36,6 +36,7 @@
 #include <media/v4l2-common.h>
 #include <linux/kthread.h>
 #include <linux/highmem.h>
+#include <linux/freezer.h>
 
 /* Wake up at about 30 fps */
 #define WAKE_NUMERATOR 30
diff --git a/drivers/mfd/ucb1x00-ts.c b/drivers/mfd/ucb1x00-ts.c
index 82938ad..ce1a481 100644
--- a/drivers/mfd/ucb1x00-ts.c
+++ b/drivers/mfd/ucb1x00-ts.c
@@ -28,7 +28,7 @@
 #include <linux/string.h>
 #include <linux/input.h>
 #include <linux/device.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
 
diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c
index 3b4c478..c14a746 100644
--- a/drivers/net/irda/stir4200.c
+++ b/drivers/net/irda/stir4200.c
@@ -50,6 +50,7 @@
 #include <linux/usb.h>
 #include <linux/crc32.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <net/irda/irda.h>
 #include <net/irda/irlap.h>
 #include <net/irda/irda_device.h>
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index efcdaf1..44a2270 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -49,6 +49,7 @@
 #include <asm/uaccess.h>
 #include <net/ieee80211.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include "airo.h"
 
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index f9cd831..606a467 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -29,6 +29,7 @@
 #include <linux/pci.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <asm/system.h>
 #include <asm/irq.h>
 
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index 81a6c83..81186f4 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -61,6 +61,7 @@
 #include <linux/dmi.h>
 #include <linux/delay.h>
 #include <linux/acpi.h>
+#include <linux/freezer.h>
 
 #include <asm/page.h>
 #include <asm/desc.h>
diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c
index f2d196f..dae4ef1 100644
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -64,6 +64,8 @@
 #include <linux/kthread.h>
 #include <linux/version.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
+
 #include <asm/unaligned.h>
 
 #include "usbatm.h"
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 77c05be..2651c2e 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -22,6 +22,7 @@
 #include <linux/usbdevice_fs.h>
 #include <linux/kthread.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <asm/semaphore.h>
 #include <asm/uaccess.h>
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 8b975d1..c98316c 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -250,7 +250,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/utsname.h>
 
 #include <linux/usb_ch9.h>
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index b401084..7064450 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -49,7 +49,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index de3e979..63c0724 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -31,6 +31,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include <asm/atomic.h>
 
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index f09a794..615df24 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/completion.h>
+#include <linux/freezer.h>
 #include "cell.h"
 #include "server.h"
 #include "volume.h"
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
index 65bc05a..694344e 100644
--- a/fs/afs/kafstimod.c
+++ b/fs/afs/kafstimod.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/completion.h>
+#include <linux/freezer.h>
 #include "cell.h"
 #include "volume.h"
 #include "kafstimod.h"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e6b5866..71bc87a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -34,6 +34,7 @@
 #include <linux/mempool.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 71f7791..2caca06 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -31,6 +31,7 @@
 #include <linux/delay.h>
 #include <linux/completion.h>
 #include <linux/pagevec.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include "cifspdu.h"
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index a8774be..10fff94 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -31,7 +31,7 @@
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/mm.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
 #include <linux/poison.h>
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5035601..44fc32b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -31,7 +31,7 @@
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/mm.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
 #include <linux/poison.h>
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 4a543e1..478a74e 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -66,6 +66,7 @@
 #include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/ctype.h>
+#include <linux/freezer.h>
 
 #include "intrep.h"
 #include "jffs_fm.h"
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index ff2a872..6eb3dae 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -16,6 +16,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/completion.h>
 #include <linux/sched.h>
+#include <linux/freezer.h>
 #include "nodelist.h"
 
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index b89c9ab..5065baa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,7 +67,7 @@
 #include <linux/kthread.h>
 #include <linux/buffer_head.h>		/* for sync_blockdev() */
 #include <linux/bio.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include "jfs_incore.h"
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 81f6f04..d558e51 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -46,7 +46,7 @@
 #include <linux/vmalloc.h>
 #include <linux/smp_lock.h>
 #include <linux/completion.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/kthread.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 3d84f60..50643b6 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -13,6 +13,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/utsname.h>
 #include <linux/smp_lock.h>
+#include <linux/freezer.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index eef4a0b..b971237 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
+#include <linux/freezer.h>
 
 STATIC kmem_zone_t *xfs_buf_zone;
 STATIC kmem_shaker_t xfs_buf_shake;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index de05abb..b93265b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -56,6 +56,7 @@
 #include <linux/mempool.h>
 #include <linux/writeback.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 
 STATIC struct quotactl_ops xfs_quotactl_operations;
 STATIC struct super_operations xfs_super_operations;
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
new file mode 100644
index 0000000..266373f
--- /dev/null
+++ b/include/linux/freezer.h
@@ -0,0 +1,84 @@
+/* Freezer declarations */
+
+#ifdef CONFIG_PM
+/*
+ * Check if a process has been frozen
+ */
+static inline int frozen(struct task_struct *p)
+{
+	return p->flags & PF_FROZEN;
+}
+
+/*
+ * Check if there is a request to freeze a process
+ */
+static inline int freezing(struct task_struct *p)
+{
+	return p->flags & PF_FREEZE;
+}
+
+/*
+ * Request that a process be frozen
+ * FIXME: SMP problem. We may not modify other process' flags!
+ */
+static inline void freeze(struct task_struct *p)
+{
+	p->flags |= PF_FREEZE;
+}
+
+/*
+ * Sometimes we may need to cancel the previous 'freeze' request
+ */
+static inline void do_not_freeze(struct task_struct *p)
+{
+	p->flags &= ~PF_FREEZE;
+}
+
+/*
+ * Wake up a frozen process
+ */
+static inline int thaw_process(struct task_struct *p)
+{
+	if (frozen(p)) {
+		p->flags &= ~PF_FROZEN;
+		wake_up_process(p);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * freezing is complete, mark process as frozen
+ */
+static inline void frozen_process(struct task_struct *p)
+{
+	p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
+}
+
+extern void refrigerator(void);
+extern int freeze_processes(void);
+extern void thaw_processes(void);
+
+static inline int try_to_freeze(void)
+{
+	if (freezing(current)) {
+		refrigerator();
+		return 1;
+	} else
+		return 0;
+}
+#else
+static inline int frozen(struct task_struct *p) { return 0; }
+static inline int freezing(struct task_struct *p) { return 0; }
+static inline void freeze(struct task_struct *p) { BUG(); }
+static inline int thaw_process(struct task_struct *p) { return 1; }
+static inline void frozen_process(struct task_struct *p) { BUG(); }
+
+static inline void refrigerator(void) {}
+static inline int freeze_processes(void) { BUG(); return 0; }
+static inline void thaw_processes(void) {}
+
+static inline int try_to_freeze(void) { return 0; }
+
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index acfd2e1..837a012 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1618,87 +1618,6 @@ extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
 
 extern void normalize_rt_tasks(void);
 
-#ifdef CONFIG_PM
-/*
- * Check if a process has been frozen
- */
-static inline int frozen(struct task_struct *p)
-{
-	return p->flags & PF_FROZEN;
-}
-
-/*
- * Check if there is a request to freeze a process
- */
-static inline int freezing(struct task_struct *p)
-{
-	return p->flags & PF_FREEZE;
-}
-
-/*
- * Request that a process be frozen
- * FIXME: SMP problem. We may not modify other process' flags!
- */
-static inline void freeze(struct task_struct *p)
-{
-	p->flags |= PF_FREEZE;
-}
-
-/*
- * Sometimes we may need to cancel the previous 'freeze' request
- */
-static inline void do_not_freeze(struct task_struct *p)
-{
-	p->flags &= ~PF_FREEZE;
-}
-
-/*
- * Wake up a frozen process
- */
-static inline int thaw_process(struct task_struct *p)
-{
-	if (frozen(p)) {
-		p->flags &= ~PF_FROZEN;
-		wake_up_process(p);
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * freezing is complete, mark process as frozen
- */
-static inline void frozen_process(struct task_struct *p)
-{
-	p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
-}
-
-extern void refrigerator(void);
-extern int freeze_processes(void);
-extern void thaw_processes(void);
-
-static inline int try_to_freeze(void)
-{
-	if (freezing(current)) {
-		refrigerator();
-		return 1;
-	} else
-		return 0;
-}
-#else
-static inline int frozen(struct task_struct *p) { return 0; }
-static inline int freezing(struct task_struct *p) { return 0; }
-static inline void freeze(struct task_struct *p) { BUG(); }
-static inline int thaw_process(struct task_struct *p) { return 1; }
-static inline void frozen_process(struct task_struct *p) { BUG(); }
-
-static inline void refrigerator(void) {}
-static inline int freeze_processes(void) { BUG(); return 0; }
-static inline void thaw_processes(void) {}
-
-static inline int try_to_freeze(void) { return 0; }
-
-#endif /* CONFIG_PM */
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 919a80c..2cfd7cb 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -6,6 +6,7 @@
 #include <linux/romfs_fs.h>
 #include <linux/initrd.h>
 #include <linux/sched.h>
+#include <linux/freezer.h>
 
 #include "do_mounts.h"
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6..d9b690a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
 #include <linux/netlink.h>
 #include <linux/selinux.h>
 #include <linux/inotify.h>
+#include <linux/freezer.h>
 
 #include "audit.h"
 
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f507923..53b3b57 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
 #include <linux/pm.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
+#include <linux/freezer.h>
 
 #include "power.h"
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 873228c..6096c71 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -18,6 +18,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/resume-trace.h>
+#include <linux/freezer.h>
 
 #include "power.h"
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2..29be608 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,6 +13,7 @@
 #include <linux/suspend.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/freezer.h>
 
 /* 
  * Timeout for stopping processes
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a63b25c..26c6694 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
+#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9d..015fc63 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/sysdev.h>
 #include <linux/timer.h>
+#include <linux/freezer.h>
 
 #include "rtmutex.h"
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701..12fdbef 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 8e19d27..bc972d7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/capability.h>
+#include <linux/freezer.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102f..8ce0900 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>	// Prototypes pdflush_operation()
 #include <linux/kthread.h>
 #include <linux/cpuset.h>
+#include <linux/freezer.h>
 
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2a6a79f..f6616e8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index dada34a..49effd9 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -13,6 +13,7 @@
 #include <linux/completion.h>
 #include <linux/spinlock.h>
 #include <linux/init.h>
+#include <linux/freezer.h>
 #include <rxrpc/krxiod.h>
 #include <rxrpc/transport.h>
 #include <rxrpc/peer.h>
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index cea4eb5..3ab0f77 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -27,6 +27,7 @@
 #include <rxrpc/call.h>
 #include <linux/udp.h>
 #include <linux/ip.h>
+#include <linux/freezer.h>
 #include <net/sock.h>
 #include "internal.h"
 
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 3e74669..9a9b613 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/completion.h>
+#include <linux/freezer.h>
 #include <rxrpc/rxrpc.h>
 #include <rxrpc/krxtimod.h>
 #include <asm/errno.h>
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 64ca1f6..1c68956 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -32,6 +32,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/file.h>
+#include <linux/freezer.h>
 #include <net/sock.h>
 #include <net/checksum.h>
 #include <net/ip.h>
-- 
cgit v0.10.2


From 32d50f57dab94d8c46566a903bbb633ee72fdcc2 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:25 -0800
Subject: [PATCH] swsusp: quieten Freezer if !CONFIG_PM_DEBUG

The freezer currently prints an '=' for every process that is frozen.  This
is pretty pointless, as the equals sign says nothing about which process is
frozen, and makes logs look messier (especially if there were a large
number of processes running).  All we really need to know is that we
started trying to freeze processes and what processes (if any) failed to
freeze, or that we succeeded.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 29be608..b0edfc6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,7 +40,6 @@ void refrigerator(void)
 	long save;
 	save = current->state;
 	pr_debug("%s entered refrigerator\n", current->comm);
-	printk("=");
 
 	frozen_process(current);
 	spin_lock_irq(&current->sighand->siglock);
-- 
cgit v0.10.2


From 14b5b7cfaa110b1d25b8f80b01a8c97cf2db30bc Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:26 -0800
Subject: [PATCH] swsusp: clean up whitespace in freezer output

Minor whitespace and formatting modifications for the freezer.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/process.c b/kernel/power/process.c
index b0edfc6..fedabad 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -86,7 +86,7 @@ int freeze_processes(void)
 	unsigned long start_time;
 	struct task_struct *g, *p;
 
-	printk( "Stopping tasks: " );
+	printk("Stopping tasks... ");
 	start_time = jiffies;
 	user_frozen = 0;
 	do {
@@ -134,21 +134,21 @@ int freeze_processes(void)
 	 * but it cleans up leftover PF_FREEZE requests.
 	 */
 	if (todo) {
-		printk( "\n" );
-		printk(KERN_ERR " stopping tasks timed out "
+		printk("\n");
+		printk(KERN_ERR "Stopping tasks timed out "
 			"after %d seconds (%d tasks remaining):\n",
 			TIMEOUT / HZ, todo);
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (freezeable(p) && !frozen(p))
-				printk(KERN_ERR "  %s\n", p->comm);
+				printk(KERN_ERR " %s\n", p->comm);
 			cancel_freezing(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		return todo;
 	}
 
-	printk( "|\n" );
+	printk("done.\n");
 	BUG_ON(in_atomic());
 	return 0;
 }
@@ -157,18 +157,18 @@ void thaw_processes(void)
 {
 	struct task_struct *g, *p;
 
-	printk( "Restarting tasks..." );
+	printk("Restarting tasks... ");
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		if (!freezeable(p))
 			continue;
 		if (!thaw_process(p))
-			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+			printk(KERN_INFO "Strange, %s not stopped\n", p->comm);
 	} while_each_thread(g, p);
 
 	read_unlock(&tasklist_lock);
 	schedule();
-	printk( " done\n" );
+	printk("done.\n");
 }
 
 EXPORT_SYMBOL(refrigerator);
-- 
cgit v0.10.2


From ff39593ad0ff7a79a3717edac6634407aa8200c2 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:28 -0800
Subject: [PATCH] swsusp: thaw userspace and kernel space separately

Modify process thawing so that we can thaw kernel space without thawing
userspace, and thaw kernelspace first.  This will be useful in later
patches, where I intend to get swsusp thawing kernel threads only before
seeking to free memory.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 266373f..294ebea 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -1,5 +1,8 @@
 /* Freezer declarations */
 
+#define FREEZER_KERNEL_THREADS 0
+#define FREEZER_ALL_THREADS 1
+
 #ifdef CONFIG_PM
 /*
  * Check if a process has been frozen
@@ -57,7 +60,8 @@ static inline void frozen_process(struct task_struct *p)
 
 extern void refrigerator(void);
 extern int freeze_processes(void);
-extern void thaw_processes(void);
+#define thaw_processes() do { thaw_some_processes(FREEZER_ALL_THREADS); } while(0)
+#define thaw_kernel_threads() do { thaw_some_processes(FREEZER_KERNEL_THREADS); } while(0)
 
 static inline int try_to_freeze(void)
 {
@@ -67,6 +71,9 @@ static inline int try_to_freeze(void)
 	} else
 		return 0;
 }
+
+extern void thaw_some_processes(int all);
+
 #else
 static inline int frozen(struct task_struct *p) { return 0; }
 static inline int freezing(struct task_struct *p) { return 0; }
diff --git a/kernel/power/process.c b/kernel/power/process.c
index fedabad..cba8a58 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -153,18 +153,29 @@ int freeze_processes(void)
 	return 0;
 }
 
-void thaw_processes(void)
+void thaw_some_processes(int all)
 {
 	struct task_struct *g, *p;
+	int pass = 0; /* Pass 0 = Kernel space, 1 = Userspace */
 
 	printk("Restarting tasks... ");
 	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		if (!freezeable(p))
-			continue;
-		if (!thaw_process(p))
-			printk(KERN_INFO "Strange, %s not stopped\n", p->comm);
-	} while_each_thread(g, p);
+	do {
+		do_each_thread(g, p) {
+			/*
+			 * is_user = 0 if kernel thread or borrowed mm,
+			 * 1 otherwise.
+			 */
+			int is_user = !!(p->mm && !(p->flags & PF_BORROWED_MM));
+			if (!freezeable(p) || (is_user != pass))
+				continue;
+			if (!thaw_process(p))
+				printk(KERN_INFO
+					"Strange, %s not stopped\n", p->comm);
+		} while_each_thread(g, p);
+
+		pass++;
+	} while (pass < 2 && all);
 
 	read_unlock(&tasklist_lock);
 	schedule();
-- 
cgit v0.10.2


From 2d4a34c9365c6e3f94a5b26ce296e1fce9b66c8b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:29 -0800
Subject: [PATCH] swsusp: Support i386 systems with PAE or without PSE

Make swsusp support i386 systems with PAE or without PSE.

This is done by creating temporary page tables located in resume-safe page
frames before the suspend image is restored in the same way as x86_64 does
it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Andi Kleen <ak@suse.de>
Cc: Dave Jones <davej@redhat.com>
Cc: Nigel Cunningham <ncunningham@linuxmail.org>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/power/Makefile b/arch/i386/power/Makefile
index 8cfa4e8..2de7bbf 100644
--- a/arch/i386/power/Makefile
+++ b/arch/i386/power/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_PM)		+= cpu.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o suspend.o
diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c
new file mode 100644
index 0000000..db5e98d
--- /dev/null
+++ b/arch/i386/power/suspend.c
@@ -0,0 +1,158 @@
+/*
+ * Suspend support specific for i386 - temporary page tables
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ */
+
+#include <linux/suspend.h>
+#include <linux/bootmem.h>
+
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+/* Defined in arch/i386/power/swsusp.S */
+extern int restore_image(void);
+
+/* Pointer to the temporary resume page tables */
+pgd_t *resume_pg_dir;
+
+/* The following three functions are based on the analogous code in
+ * arch/i386/mm/init.c
+ */
+
+/*
+ * Create a middle page table on a resume-safe page and put a pointer to it in
+ * the given global directory entry.  This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ */
+static pmd_t *resume_one_md_table_init(pgd_t *pgd)
+{
+	pud_t *pud;
+	pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+	pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC);
+	if (!pmd_table)
+		return NULL;
+
+	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+	pud = pud_offset(pgd, 0);
+
+	BUG_ON(pmd_table != pmd_offset(pud, 0));
+#else
+	pud = pud_offset(pgd, 0);
+	pmd_table = pmd_offset(pud, 0);
+#endif
+
+	return pmd_table;
+}
+
+/*
+ * Create a page table on a resume-safe page and place a pointer to it in
+ * a middle page directory entry.
+ */
+static pte_t *resume_one_page_table_init(pmd_t *pmd)
+{
+	if (pmd_none(*pmd)) {
+		pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
+		if (!page_table)
+			return NULL;
+
+		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+
+		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+
+		return page_table;
+	}
+
+	return pte_offset_kernel(pmd, 0);
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET.  The page tables are allocated out of resume-safe pages.
+ */
+static int resume_physical_mapping_init(pgd_t *pgd_base)
+{
+	unsigned long pfn;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	int pgd_idx, pmd_idx;
+
+	pgd_idx = pgd_index(PAGE_OFFSET);
+	pgd = pgd_base + pgd_idx;
+	pfn = 0;
+
+	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+		pmd = resume_one_md_table_init(pgd);
+		if (!pmd)
+			return -ENOMEM;
+
+		if (pfn >= max_low_pfn)
+			continue;
+
+		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
+			if (pfn >= max_low_pfn)
+				break;
+
+			/* Map with big pages if possible, otherwise create
+			 * normal page tables.
+			 * NOTE: We can mark everything as executable here
+			 */
+			if (cpu_has_pse) {
+				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
+				pfn += PTRS_PER_PTE;
+			} else {
+				pte_t *max_pte;
+
+				pte = resume_one_page_table_init(pmd);
+				if (!pte)
+					return -ENOMEM;
+
+				max_pte = pte + PTRS_PER_PTE;
+				for (; pte < max_pte; pte++, pfn++) {
+					if (pfn >= max_low_pfn)
+						break;
+
+					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+				}
+			}
+		}
+	}
+	return 0;
+}
+
+static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
+{
+#ifdef CONFIG_X86_PAE
+	int i;
+
+	/* Init entries of the first-level page table to the zero page */
+	for (i = 0; i < PTRS_PER_PGD; i++)
+		set_pgd(pg_dir + i,
+			__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+#endif
+}
+
+int swsusp_arch_resume(void)
+{
+	int error;
+
+	resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
+	if (!resume_pg_dir)
+		return -ENOMEM;
+
+	resume_init_first_level_page_table(resume_pg_dir);
+	error = resume_physical_mapping_init(resume_pg_dir);
+	if (error)
+		return error;
+
+	/* We have got enough memory and from now on we cannot recover */
+	restore_image();
+	return 0;
+}
diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S
index 8a2b50a..53662e0 100644
--- a/arch/i386/power/swsusp.S
+++ b/arch/i386/power/swsusp.S
@@ -28,8 +28,9 @@ ENTRY(swsusp_arch_suspend)
 	call swsusp_save
 	ret
 
-ENTRY(swsusp_arch_resume)
-	movl	$swsusp_pg_dir-__PAGE_OFFSET, %ecx
+ENTRY(restore_image)
+	movl	resume_pg_dir, %ecx
+	subl	$__PAGE_OFFSET, %ecx
 	movl	%ecx, %cr3
 
 	movl	restore_pblist, %edx
@@ -51,6 +52,10 @@ copy_loop:
 	.p2align 4,,7
 
 done:
+	/* go back to the original page tables */
+	movl	$swapper_pg_dir, %ecx
+	subl	$__PAGE_OFFSET, %ecx
+	movl	%ecx, %cr3
 	/* Flush TLB, including "global" things (vmalloc) */
 	movl	mmu_cr4_features, %eax
 	movl	%eax, %edx
diff --git a/include/asm-i386/suspend.h b/include/asm-i386/suspend.h
index 08be1e5..c1da5ca 100644
--- a/include/asm-i386/suspend.h
+++ b/include/asm-i386/suspend.h
@@ -6,18 +6,7 @@
 #include <asm/desc.h>
 #include <asm/i387.h>
 
-static inline int
-arch_prepare_suspend(void)
-{
-	/* If you want to make non-PSE machine work, turn off paging
-           in swsusp_arch_suspend. swsusp_pg_dir should have identity mapping, so
-           it could work...  */
-	if (!cpu_has_pse) {
-		printk(KERN_ERR "PSE is required for swsusp.\n");
-		return -EPERM;
-	}
-	return 0;
-}
+static inline int arch_prepare_suspend(void) { return 0; }
 
 /* image of the saved processor state */
 struct saved_context {
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068c..710ed08 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
+	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need ACPI or APM.
-- 
cgit v0.10.2


From 112cecb2cc0e7341db92281ba04b26c41bb8146d Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Wed, 6 Dec 2006 20:34:31 -0800
Subject: [PATCH] suspend: don't change cpus_allowed for task initiating the
 suspend

Don't modify the cpus_allowed of the task initiating the suspend.
_cpu_down() already makes sure that the task doing the suspend doesn't run
on dying cpu.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 272254f..9124669 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
 			goto out;
 		}
 	}
-	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
-	if (error) {
-		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
-		goto out;
-	}
+
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
-- 
cgit v0.10.2


From 0d3a9abe8ae055e1052295698bcd0722c92eff47 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:32 -0800
Subject: [PATCH] swsusp: Measure memory shrinking time

Make swsusp measure and print the time needed to shrink memory during the
suspend.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 3763343..3afa5db 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -171,3 +171,7 @@ extern int swsusp_read(void);
 extern int swsusp_write(void);
 extern void swsusp_close(void);
 extern int suspend_enter(suspend_state_t state);
+
+struct timeval;
+extern void swsusp_show_speed(struct timeval *, struct timeval *,
+				unsigned int, char *);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 52e70ca..dedf879 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -133,26 +133,6 @@ static int wait_on_bio_chain(struct bio **bio_chain)
 	return ret;
 }
 
-static void show_speed(struct timeval *start, struct timeval *stop,
-			unsigned nr_pages, char *msg)
-{
-	s64 elapsed_centisecs64;
-	int centisecs;
-	int k;
-	int kps;
-
-	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
-	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
-	centisecs = elapsed_centisecs64;
-	if (centisecs == 0)
-		centisecs = 1;	/* avoid div-by-zero */
-	k = nr_pages * (PAGE_SIZE / 1024);
-	kps = (k * 100) / centisecs;
-	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
-			centisecs / 100, centisecs % 100,
-			kps / 1000, (kps % 1000) / 10);
-}
-
 /*
  * Saving part
  */
@@ -375,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
 		error = err2;
 	if (!error)
 		printk("\b\b\b\bdone\n");
-	show_speed(&start, &stop, nr_to_write, "Wrote");
+	swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
 	return error;
 }
 
@@ -562,7 +542,7 @@ static int load_image(struct swap_map_handle *handle,
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
-	show_speed(&start, &stop, nr_to_read, "Read");
+	swsusp_show_speed(&start, &stop, nr_to_read, "Read");
 	return error;
 }
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 68de5c1..aa31432 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/highmem.h>
+#include <linux/time.h>
 
 #include "power.h"
 
@@ -164,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 }
 
 /**
+ *	swsusp_show_speed - print the time elapsed between two events represented by
+ *	@start and @stop
+ *
+ *	@nr_pages -	number of pages processed between @start and @stop
+ *	@msg -		introductory message to print
+ */
+
+void swsusp_show_speed(struct timeval *start, struct timeval *stop,
+			unsigned nr_pages, char *msg)
+{
+	s64 elapsed_centisecs64;
+	int centisecs;
+	int k;
+	int kps;
+
+	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+	centisecs = elapsed_centisecs64;
+	if (centisecs == 0)
+		centisecs = 1;	/* avoid div-by-zero */
+	k = nr_pages * (PAGE_SIZE / 1024);
+	kps = (k * 100) / centisecs;
+	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+			centisecs / 100, centisecs % 100,
+			kps / 1000, (kps % 1000) / 10);
+}
+
+/**
  *	swsusp_shrink_memory -  Try to free as much memory as needed
  *
  *	... but do not OOM-kill anyone
@@ -187,8 +216,10 @@ int swsusp_shrink_memory(void)
 	unsigned long pages = 0;
 	unsigned int i = 0;
 	char *p = "-\\|/";
+	struct timeval start, stop;
 
 	printk("Shrinking memory...  ");
+	do_gettimeofday(&start);
 	do {
 		long size, highmem_size;
 
@@ -222,7 +253,9 @@ int swsusp_shrink_memory(void)
 		}
 		printk("\b%c", p[i++%4]);
 	} while (tmp > 0);
+	do_gettimeofday(&stop);
 	printk("\bdone (%lu pages freed)\n", pages);
+	swsusp_show_speed(&start, &stop, pages, "Freed");
 
 	return 0;
 }
-- 
cgit v0.10.2


From 3eb1b3a40722cbb46631db373af66d13d1e7ac81 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:34 -0800
Subject: [PATCH] suspend to disk fails if gdb is suspended with a traced child

Fix http://bugzilla.kernel.org/show_bug.cgi?id=7534

Fix the freezing of processes so that it won't fail if there is a traced
process the parent of which has been stopped.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: maurice barnum <pixi+kbug@burble.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/process.c b/kernel/power/process.c
index cba8a58..1badb9a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -97,7 +97,9 @@ int freeze_processes(void)
 				continue;
 			if (frozen(p))
 				continue;
-			if (p->state == TASK_TRACED && frozen(p->parent)) {
+			if (p->state == TASK_TRACED &&
+			    (frozen(p->parent) ||
+			     p->parent->state == TASK_STOPPED)) {
 				cancel_freezing(p);
 				continue;
 			}
-- 
cgit v0.10.2


From a6d70980602e6f1869ebcdcbfaf55a0a5941583e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 6 Dec 2006 20:34:35 -0800
Subject: [PATCH] convert pm_sem to a mutex

The power management semaphore is only used as mutex, so convert it.

[akpm@osdl.org: fix rotten bug]
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 53b3b57..08d9e7a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -213,10 +213,10 @@ static int software_resume(void)
 {
 	int error;
 
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	if (!swsusp_resume_device) {
 		if (!strlen(resume_file)) {
-			up(&pm_sem);
+			mutex_unlock(&pm_mutex);
 			return -ENOENT;
 		}
 		swsusp_resume_device = name_to_dev_t(resume_file);
@@ -231,7 +231,7 @@ static int software_resume(void)
 		 * FIXME: If noresume is specified, we need to find the partition
 		 * and reset it back to normal swap space.
 		 */
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		return 0;
 	}
 
@@ -275,7 +275,7 @@ static int software_resume(void)
 	unprepare_processes();
  Done:
 	/* For success case, the suspend path will release the lock */
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	pr_debug("PM: Resume from disk failed.\n");
 	return 0;
 }
@@ -336,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
 		if (!strncmp(buf, pm_disk_modes[i], len)) {
 			mode = i;
@@ -360,7 +360,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 
 	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
 		 pm_disk_modes[mode]);
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	return error ? error : n;
 }
 
@@ -385,9 +385,9 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
 	if (maj != MAJOR(res) || min != MINOR(res))
 		goto out;
 
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	swsusp_resume_device = res;
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	printk("Attempting manual resume\n");
 	noresume = 0;
 	software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6096c71..751157b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -25,7 +25,7 @@
 /*This is just an arbitrary number */
 #define FREE_PAGE_NUMBER (100)
 
-DECLARE_MUTEX(pm_sem);
+DEFINE_MUTEX(pm_mutex);
 
 struct pm_ops *pm_ops;
 suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
@@ -37,9 +37,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
 
 void pm_set_ops(struct pm_ops * ops)
 {
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	pm_ops = ops;
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 }
 
 
@@ -183,7 +183,7 @@ static int enter_state(suspend_state_t state)
 
 	if (!valid_state(state))
 		return -ENODEV;
-	if (down_trylock(&pm_sem))
+	if (!mutex_trylock(&pm_mutex))
 		return -EBUSY;
 
 	if (state == PM_SUSPEND_DISK) {
@@ -201,7 +201,7 @@ static int enter_state(suspend_state_t state)
 	pr_debug("PM: Finishing wakeup.\n");
 	suspend_finish(state);
  Unlock:
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	return error;
 }
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 3afa5db..eb461b8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
 	return -EPERM;
 }
 #endif
-extern struct semaphore pm_sem;
+
+extern struct mutex pm_mutex;
+
 #define power_attr(_name) \
 static struct subsys_attribute _name##_attr = {	\
 	.attr	= {				\
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 26c6694..905dc26 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -79,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
 	free_all_swap_pages(data->swap, data->bitmap);
 	free_bitmap(data->bitmap);
 	if (data->frozen) {
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		thaw_processes();
 		enable_nonboot_cpus();
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 	}
 	atomic_inc(&device_available);
 	return 0;
@@ -144,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	case SNAPSHOT_FREEZE:
 		if (data->frozen)
 			break;
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		error = disable_nonboot_cpus();
 		if (!error) {
 			error = freeze_processes();
@@ -154,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 				error = -EBUSY;
 			}
 		}
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		if (!error)
 			data->frozen = 1;
 		break;
@@ -162,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	case SNAPSHOT_UNFREEZE:
 		if (!data->frozen)
 			break;
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		thaw_processes();
 		enable_nonboot_cpus();
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		data->frozen = 0;
 		break;
 
@@ -174,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = -EPERM;
 			break;
 		}
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		/* Free memory before shutting down devices. */
 		error = swsusp_shrink_memory();
 		if (!error) {
@@ -187,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			}
 			resume_console();
 		}
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		if (!error)
 			error = put_user(in_suspend, (unsigned int __user *)arg);
 		if (!error)
@@ -201,7 +201,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = -EPERM;
 			break;
 		}
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		pm_prepare_console();
 		suspend_console();
 		error = device_suspend(PMSG_PRETHAW);
@@ -211,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		}
 		resume_console();
 		pm_restore_console();
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		break;
 
 	case SNAPSHOT_FREE:
@@ -286,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			break;
 		}
 
-		if (down_trylock(&pm_sem)) {
+		if (!mutex_trylock(&pm_mutex)) {
 			error = -EBUSY;
 			break;
 		}
@@ -314,7 +314,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			pm_ops->finish(PM_SUSPEND_MEM);
 
 OutS3:
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		break;
 
 	case SNAPSHOT_PMOPS:
-- 
cgit v0.10.2


From a9b6f562f14dc28fb4b2415f0f275cede0abe9b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:37 -0800
Subject: [PATCH] swsusp: Untangle thaw_processes

Move the loop from thaw_processes() to a separate function and call it
independently for kernel threads and user space processes so that the order
of thawing tasks is clearly visible.

Drop thaw_kernel_threads() which is never used.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 294ebea..6e05e3e 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -1,8 +1,5 @@
 /* Freezer declarations */
 
-#define FREEZER_KERNEL_THREADS 0
-#define FREEZER_ALL_THREADS 1
-
 #ifdef CONFIG_PM
 /*
  * Check if a process has been frozen
@@ -60,8 +57,7 @@ static inline void frozen_process(struct task_struct *p)
 
 extern void refrigerator(void);
 extern int freeze_processes(void);
-#define thaw_processes() do { thaw_some_processes(FREEZER_ALL_THREADS); } while(0)
-#define thaw_kernel_threads() do { thaw_some_processes(FREEZER_KERNEL_THREADS); } while(0)
+extern void thaw_processes(void);
 
 static inline int try_to_freeze(void)
 {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 1badb9a..fd0ebb9 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -20,6 +20,8 @@
  */
 #define TIMEOUT	(20 * HZ)
 
+#define FREEZER_KERNEL_THREADS 0
+#define FREEZER_USER_SPACE 1
 
 static inline int freezeable(struct task_struct * p)
 {
@@ -79,6 +81,11 @@ static void cancel_freezing(struct task_struct *p)
 	}
 }
 
+static inline int is_user_space(struct task_struct *p)
+{
+	return p->mm && !(p->flags & PF_BORROWED_MM);
+}
+
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
@@ -103,10 +110,9 @@ int freeze_processes(void)
 				cancel_freezing(p);
 				continue;
 			}
-			if (p->mm && !(p->flags & PF_BORROWED_MM)) {
-				/* The task is a user-space one.
-				 * Freeze it unless there's a vfork completion
-				 * pending
+			if (is_user_space(p)) {
+				/* Freeze the task unless there is a vfork
+				 * completion pending
 				 */
 				if (!p->vfork_done)
 					freeze_process(p);
@@ -155,31 +161,30 @@ int freeze_processes(void)
 	return 0;
 }
 
-void thaw_some_processes(int all)
+static void thaw_tasks(int thaw_user_space)
 {
 	struct task_struct *g, *p;
-	int pass = 0; /* Pass 0 = Kernel space, 1 = Userspace */
 
-	printk("Restarting tasks... ");
 	read_lock(&tasklist_lock);
-	do {
-		do_each_thread(g, p) {
-			/*
-			 * is_user = 0 if kernel thread or borrowed mm,
-			 * 1 otherwise.
-			 */
-			int is_user = !!(p->mm && !(p->flags & PF_BORROWED_MM));
-			if (!freezeable(p) || (is_user != pass))
-				continue;
-			if (!thaw_process(p))
-				printk(KERN_INFO
-					"Strange, %s not stopped\n", p->comm);
-		} while_each_thread(g, p);
+	do_each_thread(g, p) {
+		if (!freezeable(p))
+			continue;
 
-		pass++;
-	} while (pass < 2 && all);
+		if (is_user_space(p) == !thaw_user_space)
+			continue;
 
+		if (!thaw_process(p))
+			printk(KERN_WARNING " Strange, %s not stopped\n",
+				p->comm );
+	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
+}
+
+void thaw_processes(void)
+{
+	printk("Restarting tasks ... ");
+	thaw_tasks(FREEZER_KERNEL_THREADS);
+	thaw_tasks(FREEZER_USER_SPACE);
 	schedule();
 	printk("done.\n");
 }
-- 
cgit v0.10.2


From 11b2ce2ba90f801e2a5ebba4e6b7da72d87f2b13 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:40 -0800
Subject: [PATCH] swsusp: Untangle freeze_processes

Move the loop from freeze_processes() to a separate function and call it
independently for user space processes and kernel threads so that the order
of freezing tasks is clearly visible.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/process.c b/kernel/power/process.c
index fd0ebb9..99eeb11 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -86,24 +86,23 @@ static inline int is_user_space(struct task_struct *p)
 	return p->mm && !(p->flags & PF_BORROWED_MM);
 }
 
-/* 0 = success, else # of processes that we failed to stop */
-int freeze_processes(void)
+static unsigned int try_to_freeze_tasks(int freeze_user_space)
 {
-	int todo, nr_user, user_frozen;
-	unsigned long start_time;
 	struct task_struct *g, *p;
+	unsigned long end_time;
+	unsigned int todo;
 
-	printk("Stopping tasks... ");
-	start_time = jiffies;
-	user_frozen = 0;
+	end_time = jiffies + TIMEOUT;
 	do {
-		nr_user = todo = 0;
+		todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (!freezeable(p))
 				continue;
+
 			if (frozen(p))
 				continue;
+
 			if (p->state == TASK_TRACED &&
 			    (frozen(p->parent) ||
 			     p->parent->state == TASK_STOPPED)) {
@@ -111,51 +110,76 @@ int freeze_processes(void)
 				continue;
 			}
 			if (is_user_space(p)) {
+				if (!freeze_user_space)
+					continue;
+
 				/* Freeze the task unless there is a vfork
 				 * completion pending
 				 */
 				if (!p->vfork_done)
 					freeze_process(p);
-				nr_user++;
 			} else {
-				/* Freeze only if the user space is frozen */
-				if (user_frozen)
-					freeze_process(p);
-				todo++;
+				if (freeze_user_space)
+					continue;
+
+				freeze_process(p);
 			}
+			todo++;
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
-		todo += nr_user;
-		if (!user_frozen && !nr_user) {
-			sys_sync();
-			start_time = jiffies;
-		}
-		user_frozen = !nr_user;
 		yield();			/* Yield is okay here */
-		if (todo && time_after(jiffies, start_time + TIMEOUT))
+		if (todo && time_after(jiffies, end_time))
 			break;
-	} while(todo);
+	} while (todo);
 
-	/* This does not unfreeze processes that are already frozen
-	 * (we have slightly ugly calling convention in that respect,
-	 * and caller must call thaw_processes() if something fails),
-	 * but it cleans up leftover PF_FREEZE requests.
-	 */
 	if (todo) {
+		/* This does not unfreeze processes that are already frozen
+		 * (we have slightly ugly calling convention in that respect,
+		 * and caller must call thaw_processes() if something fails),
+		 * but it cleans up leftover PF_FREEZE requests.
+		 */
 		printk("\n");
-		printk(KERN_ERR "Stopping tasks timed out "
-			"after %d seconds (%d tasks remaining):\n",
-			TIMEOUT / HZ, todo);
+		printk(KERN_ERR "Stopping %s timed out after %d seconds "
+				"(%d tasks refusing to freeze):\n",
+				freeze_user_space ? "user space processes" :
+					"kernel threads",
+				TIMEOUT / HZ, todo);
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
+			if (is_user_space(p) == !freeze_user_space)
+				continue;
+
 			if (freezeable(p) && !frozen(p))
 				printk(KERN_ERR " %s\n", p->comm);
+
 			cancel_freezing(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
-		return todo;
 	}
 
+	return todo;
+}
+
+/**
+ *	freeze_processes - tell processes to enter the refrigerator
+ *
+ *	Returns 0 on success, or the number of processes that didn't freeze,
+ *	although they were told to.
+ */
+int freeze_processes(void)
+{
+	unsigned int nr_unfrozen;
+
+	printk("Stopping tasks ... ");
+	nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
+	if (nr_unfrozen)
+		return nr_unfrozen;
+
+	sys_sync();
+	nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
+	if (nr_unfrozen)
+		return nr_unfrozen;
+
 	printk("done.\n");
 	BUG_ON(in_atomic());
 	return 0;
-- 
cgit v0.10.2


From 5b6d15de2d4c8149902a680a6cd1d3b26cd2e828 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:43 -0800
Subject: [PATCH] swsusp: Fix coding style in suspend.c

Fix coding style in suspend.c.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fd8251d..712f152 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -85,7 +85,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
 	return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
 }
 
-static struct page *alloc_image_page(gfp_t gfp_mask) {
+static struct page *alloc_image_page(gfp_t gfp_mask)
+{
 	struct page *page;
 
 	page = alloc_page(gfp_mask);
-- 
cgit v0.10.2


From 59a493350e7aefff7e262efa39e017517b31b8e8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:44 -0800
Subject: [PATCH] swsusp: Fix labels

Move all labels in the swsusp code to the second column, so that they won't
fool diff -p.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 08d9e7a..126dda6 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,9 +117,9 @@ static int prepare_processes(void)
 		return 0;
 
 	platform_finish();
-thaw:
+ thaw:
 	thaw_processes();
-enable_cpus:
+ enable_cpus:
 	enable_nonboot_cpus();
 	pm_restore_console();
 	return error;
@@ -392,7 +392,7 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
 	noresume = 0;
 	software_resume();
 	ret = n;
-out:
+ out:
 	return ret;
 }
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 712f152..c024606 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -411,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 	memory_bm_position_reset(bm);
 	return 0;
 
-Free:
+ Free:
 	bm->p_list = ca.chain;
 	memory_bm_free(bm, PG_UNSAFE_CLEAR);
 	return -ENOMEM;
@@ -557,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 	memory_bm_position_reset(bm);
 	return BM_END_OF_MAP;
 
-Return_pfn:
+ Return_pfn:
 	bm->cur.chunk = chunk;
 	bm->cur.bit = bit;
 	return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -964,7 +964,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 	}
 	return 0;
 
-Free:
+ Free:
 	swsusp_free();
 	return -ENOMEM;
 }
@@ -1540,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	}
 	return 0;
 
-Free:
+ Free:
 	swsusp_free();
 	return error;
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index dedf879..f133d4a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -301,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		handle->cur_swap = offset;
 		handle->k = 0;
 	}
-out:
+ out:
 	return error;
 }
 
@@ -429,7 +429,7 @@ int swsusp_write(void)
 	if (error)
 		free_all_swap_pages(root_swap, handle.bitmap);
 	release_swap_writer(&handle);
-out:
+ out:
 	swsusp_close();
 	return error;
 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index aa31432..31aa039 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -288,7 +288,7 @@ int swsusp_suspend(void)
 	 * that suspended with irqs off ... no overall powerup.
 	 */
 	device_power_up();
-Enable_irqs:
+ Enable_irqs:
 	local_irq_enable();
 	return error;
 }
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 905dc26..069732e 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -313,7 +313,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		if (pm_ops->finish)
 			pm_ops->finish(PM_SUSPEND_MEM);
 
-OutS3:
+ OutS3:
 		mutex_unlock(&pm_mutex);
 		break;
 
-- 
cgit v0.10.2


From 06df6a5c181f462c71ddcc20ff6c7ea0bec18ec8 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 6 Dec 2006 20:34:46 -0800
Subject: [PATCH] s2ram debugging documentation

Linus posted quite nice TRACE_RESUME how-to, and I think it is too nice to
be hidden in archives of mailing list, so I turned it into Documentation
piece.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
new file mode 100644
index 0000000..b05f512
--- /dev/null
+++ b/Documentation/power/s2ram.txt
@@ -0,0 +1,56 @@
+			How to get s2ram working
+			~~~~~~~~~~~~~~~~~~~~~~~~
+			2006 Linus Torvalds
+			2006 Pavel Machek
+
+1) Check suspend.sf.net, program s2ram there has long whitelist of
+   "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+   video.txt. Perhaps problem is as simple as broken module, and
+   simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+		      Using TRACE_RESUME
+		      ~~~~~~~~~~~~~~~~~~
+
+I've been working at making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this:
+
+	#!/bin/sh
+	sync
+	echo 1 > /sys/power/pm_trace
+	echo mem > /sys/power/state
+
+   to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+   holding the power button down, and look at the dmesg output for things
+   like
+
+	Magic number: 4:156:725
+	hash matches drivers/base/power/resume.c:28
+	hash matches device 0000:01:00.0
+
+   which means that the last trace event was just before trying to resume
+   device 0000:01:00.0. Then figure out what driver is controlling that
+   device (lspci and /sys/devices/pci* is your friend), and see if you can
+   fix it, disable it, or trace into its resume function.
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
-- 
cgit v0.10.2


From 2d87595ea628ea58415ba4638c553a8c2fbd90e2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:47 -0800
Subject: [PATCH] PM: Fix swsusp debug mode testproc

The 'testproc' swsusp debug mode thaws tasks twice in a row, which is _very_
confusing.  Fix that.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 126dda6..6180379 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -151,7 +151,7 @@ int pm_suspend_disk(void)
 		return error;
 
 	if (pm_disk_mode == PM_DISK_TESTPROC)
-		goto Thaw;
+		return 0;
 
 	suspend_console();
 	error = device_suspend(PMSG_FREEZE);
-- 
cgit v0.10.2


From 5045cfc103566878228ca36d05a0ae0076673e5a Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 6 Dec 2006 20:34:48 -0800
Subject: [PATCH] swsusp: kill write-only variable

Cleanup write-only variable, suggested by D Binderman.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 6180379..0b00f56 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -58,12 +58,10 @@ static inline int platform_prepare(void)
 
 static void power_down(suspend_disk_method_t mode)
 {
-	int error = 0;
-
 	switch(mode) {
 	case PM_DISK_PLATFORM:
 		kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
-		error = pm_ops->enter(PM_SUSPEND_DISK);
+		pm_ops->enter(PM_SUSPEND_DISK);
 		break;
 	case PM_DISK_SHUTDOWN:
 		kernel_power_off();
-- 
cgit v0.10.2


From 341a595850dac1b0503df34260257d71b4fdf72c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:49 -0800
Subject: [PATCH] Support for freezeable workqueues

Make it possible to create a workqueue the worker thread of which will be
frozen during suspend, along with other kernel threads.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Cc: David Chinner <dgc@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 4a3ea83..f0cb1df 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -147,9 +147,11 @@ struct execute_work {
 
 
 extern struct workqueue_struct *__create_workqueue(const char *name,
-						    int singlethread);
-#define create_workqueue(name) __create_workqueue((name), 0)
-#define create_singlethread_workqueue(name) __create_workqueue((name), 1)
+						    int singlethread,
+						    int freezeable);
+#define create_workqueue(name) __create_workqueue((name), 0, 0)
+#define create_freezeable_workqueue(name) __create_workqueue((name), 0, 1)
+#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8d1e7cb..2945b09 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,7 @@
 #include <linux/kthread.h>
 #include <linux/hardirq.h>
 #include <linux/mempolicy.h>
+#include <linux/freezer.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +56,8 @@ struct cpu_workqueue_struct {
 	struct task_struct *thread;
 
 	int run_depth;		/* Detect run_workqueue() recursion depth */
+
+	int freezeable;		/* Freeze the thread during suspend */
 } ____cacheline_aligned;
 
 /*
@@ -265,7 +268,8 @@ static int worker_thread(void *__cwq)
 	struct k_sigaction sa;
 	sigset_t blocked;
 
-	current->flags |= PF_NOFREEZE;
+	if (!cwq->freezeable)
+		current->flags |= PF_NOFREEZE;
 
 	set_user_nice(current, -5);
 
@@ -288,6 +292,9 @@ static int worker_thread(void *__cwq)
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
+		if (cwq->freezeable)
+			try_to_freeze();
+
 		add_wait_queue(&cwq->more_work, &wait);
 		if (list_empty(&cwq->worklist))
 			schedule();
@@ -364,7 +371,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
 static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
-						   int cpu)
+						   int cpu, int freezeable)
 {
 	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	struct task_struct *p;
@@ -374,6 +381,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
 	cwq->thread = NULL;
 	cwq->insert_sequence = 0;
 	cwq->remove_sequence = 0;
+	cwq->freezeable = freezeable;
 	INIT_LIST_HEAD(&cwq->worklist);
 	init_waitqueue_head(&cwq->more_work);
 	init_waitqueue_head(&cwq->work_done);
@@ -389,7 +397,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
 }
 
 struct workqueue_struct *__create_workqueue(const char *name,
-					    int singlethread)
+					    int singlethread, int freezeable)
 {
 	int cpu, destroy = 0;
 	struct workqueue_struct *wq;
@@ -409,7 +417,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	mutex_lock(&workqueue_mutex);
 	if (singlethread) {
 		INIT_LIST_HEAD(&wq->list);
-		p = create_workqueue_thread(wq, singlethread_cpu);
+		p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
 		if (!p)
 			destroy = 1;
 		else
@@ -417,7 +425,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	} else {
 		list_add(&wq->list, &workqueues);
 		for_each_online_cpu(cpu) {
-			p = create_workqueue_thread(wq, cpu);
+			p = create_workqueue_thread(wq, cpu, freezeable);
 			if (p) {
 				kthread_bind(p, cpu);
 				wake_up_process(p);
@@ -667,7 +675,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		mutex_lock(&workqueue_mutex);
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
-			if (!create_workqueue_thread(wq, hotcpu)) {
+			if (!create_workqueue_thread(wq, hotcpu, 0)) {
 				printk("workqueue for %i failed\n", hotcpu);
 				return NOTIFY_BAD;
 			}
-- 
cgit v0.10.2


From 58e14b148ddb56f0bf999965d6279932ed4a00bc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:50 -0800
Subject: [PATCH] Use freezeable workqueues in XFS

Make the workqueues used by XFS freezeable, so their worker threads don't
submit any I/O after the suspend image has been created.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Cc: David Chinner <dgc@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b971237..4fb01ff 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1827,11 +1827,11 @@ xfs_buf_init(void)
 	if (!xfs_buf_zone)
 		goto out_free_trace_buf;
 
-	xfslogd_workqueue = create_workqueue("xfslogd");
+	xfslogd_workqueue = create_freezeable_workqueue("xfslogd");
 	if (!xfslogd_workqueue)
 		goto out_free_buf_zone;
 
-	xfsdatad_workqueue = create_workqueue("xfsdatad");
+	xfsdatad_workqueue = create_freezeable_workqueue("xfsdatad");
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
-- 
cgit v0.10.2


From 8bcbdf603bc4bf24c2bcfa071871afb03dd3ae80 Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Wed, 6 Dec 2006 20:34:51 -0800
Subject: [PATCH] m68k: replace kmalloc+memset with kzalloc

Replace kmalloc+memset with kzalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/m68k/amiga/chipram.c b/arch/m68k/amiga/chipram.c
index de1304c..fa015d8 100644
--- a/arch/m68k/amiga/chipram.c
+++ b/arch/m68k/amiga/chipram.c
@@ -52,10 +52,9 @@ void *amiga_chip_alloc(unsigned long size, const char *name)
 #ifdef DEBUG
     printk("amiga_chip_alloc: allocate %ld bytes\n", size);
 #endif
-    res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+    res = kzalloc(sizeof(struct resource), GFP_KERNEL);
     if (!res)
 	return NULL;
-    memset(res, 0, sizeof(struct resource));
     res->name = name;
 
     if (allocate_resource(&chipram_res, res, size, 0, UINT_MAX, PAGE_SIZE, NULL, NULL) < 0) {
diff --git a/arch/m68k/atari/hades-pci.c b/arch/m68k/atari/hades-pci.c
index 6ca57b6..bee2b14 100644
--- a/arch/m68k/atari/hades-pci.c
+++ b/arch/m68k/atari/hades-pci.c
@@ -375,10 +375,9 @@ struct pci_bus_info * __init init_hades_pci(void)
 	 * Allocate memory for bus info structure.
 	 */
 
-	bus = kmalloc(sizeof(struct pci_bus_info), GFP_KERNEL);
+	bus = kzalloc(sizeof(struct pci_bus_info), GFP_KERNEL);
 	if (!bus)
 		return NULL;
-	memset(bus, 0, sizeof(struct pci_bus_info));
 
 	/*
 	 * Claim resources. The m68k has no separate I/O space, both
-- 
cgit v0.10.2


From 54f9a398e18a49e302e2187fa694043250391d80 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Wed, 6 Dec 2006 20:34:52 -0800
Subject: [PATCH] uml: include stddef.h correctly

We were not including stddef.h in files that used offsetof.

One file was also including linux/stddef.h for no perciptible reason.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/um/sys-i386/ldt.c b/arch/um/sys-i386/ldt.c
index e299ee5..49057d8 100644
--- a/arch/um/sys-i386/ldt.c
+++ b/arch/um/sys-i386/ldt.c
@@ -3,7 +3,6 @@
  * Licensed under the GPL
  */
 
-#include "linux/stddef.h"
 #include "linux/sched.h"
 #include "linux/slab.h"
 #include "linux/types.h"
diff --git a/arch/um/sys-i386/ptrace_user.c b/arch/um/sys-i386/ptrace_user.c
index 5f3cc66..01212c8 100644
--- a/arch/um/sys-i386/ptrace_user.c
+++ b/arch/um/sys-i386/ptrace_user.c
@@ -4,9 +4,9 @@
  */
 
 #include <stdio.h>
+#include <stddef.h>
 #include <errno.h>
 #include <unistd.h>
-#include <linux/stddef.h>
 #include "ptrace_user.h"
 /* Grr, asm/user.h includes asm/ptrace.h, so has to follow ptrace_user.h */
 #include <asm/user.h>
diff --git a/arch/um/sys-i386/user-offsets.c b/arch/um/sys-i386/user-offsets.c
index 6f4ef2b..447306b 100644
--- a/arch/um/sys-i386/user-offsets.c
+++ b/arch/um/sys-i386/user-offsets.c
@@ -2,7 +2,7 @@
 #include <signal.h>
 #include <asm/ptrace.h>
 #include <asm/user.h>
-#include <linux/stddef.h>
+#include <stddef.h>
 #include <sys/poll.h>
 
 #define DEFINE(sym, val) \
-- 
cgit v0.10.2


From 7b65fee21c6bff68711b48e0aa1cfd42b3198312 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Wed, 6 Dec 2006 20:34:53 -0800
Subject: [PATCH] uml: include asm/page.h in order to get PAGE_SHIFT

Include the proper header to get a definition of PAGE_SHIFT.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/um/include/sysdep-i386/stub.h b/arch/um/include/sysdep-i386/stub.h
index b492b12..4fffae7 100644
--- a/arch/um/include/sysdep-i386/stub.h
+++ b/arch/um/include/sysdep-i386/stub.h
@@ -9,6 +9,7 @@
 #include <sys/mman.h>
 #include <asm/ptrace.h>
 #include <asm/unistd.h>
+#include <asm/page.h>
 #include "stub-data.h"
 #include "kern_constants.h"
 #include "uml-config.h"
-- 
cgit v0.10.2


From e46962fdd28f8b30b465e507b657627aa4c1a409 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Wed, 6 Dec 2006 20:34:54 -0800
Subject: [PATCH] uml: size register files correctly

We were using the wrong symbol to size register files.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/um/include/sysdep-i386/ptrace.h b/arch/um/include/sysdep-i386/ptrace.h
index 6670cc9..52b398b 100644
--- a/arch/um/include/sysdep-i386/ptrace.h
+++ b/arch/um/include/sysdep-i386/ptrace.h
@@ -75,7 +75,7 @@ union uml_pt_regs {
 #endif
 #ifdef UML_CONFIG_MODE_SKAS
 	struct skas_regs {
-		unsigned long regs[HOST_FRAME_SIZE];
+		unsigned long regs[MAX_REG_NR];
 		unsigned long fp[HOST_FP_SIZE];
 		unsigned long xfp[HOST_XFP_SIZE];
                 struct faultinfo faultinfo;
diff --git a/arch/um/include/sysdep-x86_64/ptrace.h b/arch/um/include/sysdep-x86_64/ptrace.h
index 617bb9ef..66cb400 100644
--- a/arch/um/include/sysdep-x86_64/ptrace.h
+++ b/arch/um/include/sysdep-x86_64/ptrace.h
@@ -108,7 +108,7 @@ union uml_pt_regs {
 		 * file size, while i386 uses FRAME_SIZE.  Therefore, we need
 		 * to use UM_FRAME_SIZE here instead of HOST_FRAME_SIZE.
 		 */
-		unsigned long regs[UM_FRAME_SIZE];
+		unsigned long regs[MAX_REG_NR];
 		unsigned long fp[HOST_FP_SIZE];
                 struct faultinfo faultinfo;
 		long syscall;
-- 
cgit v0.10.2


From 8210fd2a9fe4b36e99ab777a1a81eb47b703c235 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Wed, 6 Dec 2006 20:34:55 -0800
Subject: [PATCH] uml: use get_random_bytes() after random pool is seeded

When the UML network driver generates random MACs for its devices, it was
possible for a number of UMLs to get the same MACs because the ethernet
initialization was done before the random pool was properly seeded.

This patch moves the initialization later so that it gets better randomness.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/um/drivers/daemon_kern.c b/arch/um/drivers/daemon_kern.c
index 8243869..9c2e7a7 100644
--- a/arch/um/drivers/daemon_kern.c
+++ b/arch/um/drivers/daemon_kern.c
@@ -98,4 +98,4 @@ static int register_daemon(void)
 	return 0;
 }
 
-__initcall(register_daemon);
+late_initcall(register_daemon);
diff --git a/arch/um/drivers/mcast_kern.c b/arch/um/drivers/mcast_kern.c
index c090fbd..52ccb7b 100644
--- a/arch/um/drivers/mcast_kern.c
+++ b/arch/um/drivers/mcast_kern.c
@@ -127,4 +127,4 @@ static int register_mcast(void)
 	return 0;
 }
 
-__initcall(register_mcast);
+late_initcall(register_mcast);
diff --git a/arch/um/drivers/pcap_kern.c b/arch/um/drivers/pcap_kern.c
index 6e1ef85..e67362a 100644
--- a/arch/um/drivers/pcap_kern.c
+++ b/arch/um/drivers/pcap_kern.c
@@ -109,4 +109,4 @@ static int register_pcap(void)
 	return 0;
 }
 
-__initcall(register_pcap);
+late_initcall(register_pcap);
diff --git a/arch/um/drivers/slip_kern.c b/arch/um/drivers/slip_kern.c
index 788da54..25634bd 100644
--- a/arch/um/drivers/slip_kern.c
+++ b/arch/um/drivers/slip_kern.c
@@ -95,4 +95,4 @@ static int register_slip(void)
 	return 0;
 }
 
-__initcall(register_slip);
+late_initcall(register_slip);
diff --git a/arch/um/drivers/slirp_kern.c b/arch/um/drivers/slirp_kern.c
index ae322e1..b3ed8fb 100644
--- a/arch/um/drivers/slirp_kern.c
+++ b/arch/um/drivers/slirp_kern.c
@@ -119,4 +119,4 @@ static int register_slirp(void)
 	return 0;
 }
 
-__initcall(register_slirp);
+late_initcall(register_slirp);
diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c
index 16385e2..7054182 100644
--- a/arch/um/os-Linux/drivers/ethertap_kern.c
+++ b/arch/um/os-Linux/drivers/ethertap_kern.c
@@ -105,4 +105,4 @@ static int register_ethertap(void)
 	return 0;
 }
 
-__initcall(register_ethertap);
+late_initcall(register_ethertap);
diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c
index 0edbac6..76570a2 100644
--- a/arch/um/os-Linux/drivers/tuntap_kern.c
+++ b/arch/um/os-Linux/drivers/tuntap_kern.c
@@ -90,4 +90,4 @@ static int register_tuntap(void)
 	return 0;
 }
 
-__initcall(register_tuntap);
+late_initcall(register_tuntap);
-- 
cgit v0.10.2


From eef88d16a2cb641d9915bfdf6377e70fccec9fde Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:34:55 -0800
Subject: [PATCH] fix v850 compilation

More fallout of the post 2.6.19-rc1 IRQ changes...

      CC      init/main.o
    In file included from
    /home/bunk/linux/kernel-2.6/linux-2.6.19-rc6-mm2/include/linux/rtc.h:102,
                     from
    /home/bunk/linux/kernel-2.6/linux-2.6.19-rc6-mm2/include/linux/efi.h:19,
                     from
    /home/bunk/linux/kernel-2.6/linux-2.6.19-rc6-mm2/init/main.c:43:
    /home/bunk/linux/kernel-2.6/linux-2.6.19-rc6-mm2/include/linux/interrupt.h:67:
    error: conflicting types for 'irq_handler_t'
    include2/asm/irq.h:49: error: previous declaration of 'irq_handler_t' was here

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h
index 1bf096d..88687c1 100644
--- a/include/asm-v850/irq.h
+++ b/include/asm-v850/irq.h
@@ -46,8 +46,6 @@ extern void
 init_irq_handlers (int base_irq, int num, int interval,
 		   struct hw_interrupt_type *irq_type);
 
-typedef void (*irq_handler_t)(int irq, void *data, struct pt_regs *regs);
-
 /* Handle interrupt IRQ.  REGS are the registers at the time of ther
    interrupt.  */
 extern unsigned int handle_irq (int irq, struct pt_regs *regs);
-- 
cgit v0.10.2


From 9d827c9e8a9d8592167ac3fdc3a50544c86302b1 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:34:58 -0800
Subject: [PATCH] cciss: version change

Change the cciss version number to 3.6.14 to reflect the following
functionality changes added by the rest of the set.  They include:

 - Support to fire up on any HP RAID class controller
 - Increase nr_cmds to 512 for most controllers by adding it to the product table
 - PCI subsystem ID fix fix was pulled
 - Disable DMA prefetch for the P600 on IPF platforms
 - Change from 512 to 2048 sector_size for performance
 - Fix in cciss_open for consistency
 - Remove the no longer used revalidate_allvol function
 - Bug fix for busy configuring
 - Support for more than 16 logical volumes
 - Cleanups in cciss_interrupt_mode
 - Fix for iostats, it's been broken for several kernel releases

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 4105c3b..0af6f35 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -47,14 +47,15 @@
 #include <linux/completion.h>
 
 #define CCISS_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin))
-#define DRIVER_NAME "HP CISS Driver (v 3.6.10)"
-#define DRIVER_VERSION CCISS_DRIVER_VERSION(3,6,10)
+#define DRIVER_NAME "HP CISS Driver (v 3.6.14)"
+#define DRIVER_VERSION CCISS_DRIVER_VERSION(3,6,14)
 
 /* Embedded module documentation macros - see modules.h */
 MODULE_AUTHOR("Hewlett-Packard Company");
-MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 3.6.10");
+MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 3.6.14");
 MODULE_SUPPORTED_DEVICE("HP SA5i SA5i+ SA532 SA5300 SA5312 SA641 SA642 SA6400"
 			" SA6i P600 P800 P400 P400i E200 E200i E500");
+MODULE_VERSION("3.6.14");
 MODULE_LICENSE("GPL");
 
 #include "cciss_cmd.h"
-- 
cgit v0.10.2


From 4ff9a9a4baff2627d7bcf65d0ec07d647bc1ad29 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:00 -0800
Subject: [PATCH] cciss: reference driver support

Add the support to fire up on any HP RAID class device that has a valid cciss
signature.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 0af6f35..d1f65de 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -83,6 +83,8 @@ static const struct pci_device_id cciss_pci_device_id[] = {
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSD,     0x103C, 0x3214},
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSD,     0x103C, 0x3215},
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSC,     0x103C, 0x3233},
+	{PCI_VENDOR_ID_HP,     PCI_ANY_ID,	PCI_ANY_ID, PCI_ANY_ID,
+		PCI_CLASS_STORAGE_RAID << 8, 0xffff << 8, 0},
 	{0,}
 };
 
@@ -112,6 +114,7 @@ static struct board_type products[] = {
 	{0x3214103C, "Smart Array E200i", &SA5_access},
 	{0x3215103C, "Smart Array E200i", &SA5_access},
 	{0x3233103C, "Smart Array E500", &SA5_access},
+	{0xFFFF103C, "Unknown Smart Array", &SA5_access},
 };
 
 /* How long to wait (in milliseconds) for board to go into simple mode */
@@ -2960,13 +2963,6 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 			break;
 		}
 	}
-	if (i == ARRAY_SIZE(products)) {
-		printk(KERN_WARNING "cciss: Sorry, I don't know how"
-		       " to access the Smart Array controller %08lx\n",
-		       (unsigned long)board_id);
-		err = -ENODEV;
-		goto err_out_free_res;
-	}
 	if ((readb(&c->cfgtable->Signature[0]) != 'C') ||
 	    (readb(&c->cfgtable->Signature[1]) != 'I') ||
 	    (readb(&c->cfgtable->Signature[2]) != 'S') ||
@@ -2975,6 +2971,26 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 		err = -ENODEV;
 		goto err_out_free_res;
 	}
+	/* We didn't find the controller in our list. We know the
+	 * signature is valid. If it's an HP device let's try to
+	 * bind to the device and fire it up. Otherwise we bail.
+	 */
+	if (i == ARRAY_SIZE(products)) {
+		if (subsystem_vendor_id == PCI_VENDOR_ID_HP) {
+			c->product_name = products[i-1].product_name;
+			c->access = *(products[i-1].access);
+			printk(KERN_WARNING "cciss: This is an unknown "
+				"Smart Array controller.\n"
+				"cciss: Please update to the latest driver "
+				"available from www.hp.com.\n");
+		} else {
+			printk(KERN_WARNING "cciss: Sorry, I don't know how"
+				" to access the Smart Array controller %08lx\n"
+					, (unsigned long)board_id);
+			err = -ENODEV;
+			goto err_out_free_res;
+		}
+	}
 #ifdef CONFIG_X86
 	{
 		/* Need to enable prefetch in the SCSI core for 6400 in x86 */
-- 
cgit v0.10.2


From f880632f963c3611d096d9373d16663c076310c7 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:01 -0800
Subject: [PATCH] cciss: increase number of commands on controller

Remove #define NR_CMDS and replace it w/hba[i]->nr_cmds.  Most Smart Array
controllers can support up to 1024 commands but the E200 family can only
support 128.  To prevent annoying "fifo full" messages we define nr_cmds on a
per controller basis by adding it the product table.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d1f65de..0f976aa 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -93,28 +93,29 @@ MODULE_DEVICE_TABLE(pci, cciss_pci_device_id);
 /*  board_id = Subsystem Device ID & Vendor ID
  *  product = Marketing Name for the board
  *  access = Address of the struct of function pointers
+ *  nr_cmds = Number of commands supported by controller
  */
 static struct board_type products[] = {
-	{0x40700E11, "Smart Array 5300", &SA5_access},
-	{0x40800E11, "Smart Array 5i", &SA5B_access},
-	{0x40820E11, "Smart Array 532", &SA5B_access},
-	{0x40830E11, "Smart Array 5312", &SA5B_access},
-	{0x409A0E11, "Smart Array 641", &SA5_access},
-	{0x409B0E11, "Smart Array 642", &SA5_access},
-	{0x409C0E11, "Smart Array 6400", &SA5_access},
-	{0x409D0E11, "Smart Array 6400 EM", &SA5_access},
-	{0x40910E11, "Smart Array 6i", &SA5_access},
-	{0x3225103C, "Smart Array P600", &SA5_access},
-	{0x3223103C, "Smart Array P800", &SA5_access},
-	{0x3234103C, "Smart Array P400", &SA5_access},
-	{0x3235103C, "Smart Array P400i", &SA5_access},
-	{0x3211103C, "Smart Array E200i", &SA5_access},
-	{0x3212103C, "Smart Array E200", &SA5_access},
-	{0x3213103C, "Smart Array E200i", &SA5_access},
-	{0x3214103C, "Smart Array E200i", &SA5_access},
-	{0x3215103C, "Smart Array E200i", &SA5_access},
-	{0x3233103C, "Smart Array E500", &SA5_access},
-	{0xFFFF103C, "Unknown Smart Array", &SA5_access},
+	{0x40700E11, "Smart Array 5300", &SA5_access, 512},
+	{0x40800E11, "Smart Array 5i", &SA5B_access, 512},
+	{0x40820E11, "Smart Array 532", &SA5B_access, 512},
+	{0x40830E11, "Smart Array 5312", &SA5B_access, 512},
+	{0x409A0E11, "Smart Array 641", &SA5_access, 512},
+	{0x409B0E11, "Smart Array 642", &SA5_access, 512},
+	{0x409C0E11, "Smart Array 6400", &SA5_access, 512},
+	{0x409D0E11, "Smart Array 6400 EM", &SA5_access, 512},
+	{0x40910E11, "Smart Array 6i", &SA5_access, 512},
+	{0x3225103C, "Smart Array P600", &SA5_access, 512},
+	{0x3223103C, "Smart Array P800", &SA5_access, 512},
+	{0x3234103C, "Smart Array P400", &SA5_access, 512},
+	{0x3235103C, "Smart Array P400i", &SA5_access, 512},
+	{0x3211103C, "Smart Array E200i", &SA5_access, 120},
+	{0x3212103C, "Smart Array E200", &SA5_access, 120},
+	{0x3213103C, "Smart Array E200i", &SA5_access, 120},
+	{0x3214103C, "Smart Array E200i", &SA5_access, 120},
+	{0x3215103C, "Smart Array E200i", &SA5_access, 120},
+	{0x3233103C, "Smart Array E500", &SA5_access, 512},
+	{0xFFFF103C, "Unknown Smart Array", &SA5_access, 120},
 };
 
 /* How long to wait (in milliseconds) for board to go into simple mode */
@@ -125,7 +126,6 @@ static struct board_type products[] = {
 #define MAX_CMD_RETRIES 3
 
 #define READ_AHEAD 	 1024
-#define NR_CMDS		 384	/* #commands that can be outstanding */
 #define MAX_CTLR	32
 
 /* Originally cciss driver only supports 8 major numbers */
@@ -404,8 +404,8 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool)
 	} else {		/* get it out of the controllers pool */
 
 		do {
-			i = find_first_zero_bit(h->cmd_pool_bits, NR_CMDS);
-			if (i == NR_CMDS)
+			i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
+			if (i == h->nr_cmds)
 				return NULL;
 		} while (test_and_set_bit
 			 (i & (BITS_PER_LONG - 1),
@@ -1247,7 +1247,7 @@ static void cciss_check_queues(ctlr_info_t *h)
 	 * in case the interrupt we serviced was from an ioctl and did not
 	 * free any new commands.
 	 */
-	if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS)
+	if ((find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds)) == h->nr_cmds)
 		return;
 
 	/* We have room on the queue for more commands.  Now we need to queue
@@ -1266,7 +1266,7 @@ static void cciss_check_queues(ctlr_info_t *h)
 		/* check to see if we have maxed out the number of commands
 		 * that can be placed on the queue.
 		 */
-		if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) {
+		if ((find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds)) == h->nr_cmds) {
 			if (curr_queue == start_queue) {
 				h->next_to_run =
 				    (start_queue + 1) % (h->highest_lun + 1);
@@ -2140,7 +2140,7 @@ static int add_sendcmd_reject(__u8 cmd, int ctlr, unsigned long complete)
 
 	/* We've sent down an abort or reset, but something else
 	   has completed */
-	if (srl->ncompletions >= (NR_CMDS + 2)) {
+	if (srl->ncompletions >= (hba[ctlr]->nr_cmds + 2)) {
 		/* Uh oh.  No room to save it for later... */
 		printk(KERN_WARNING "cciss%d: Sendcmd: Invalid command addr, "
 		       "reject list overflow, command lost!\n", ctlr);
@@ -2677,7 +2677,7 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id)
 			a1 = a;
 			if ((a & 0x04)) {
 				a2 = (a >> 3);
-				if (a2 >= NR_CMDS) {
+				if (a2 >= h->nr_cmds) {
 					printk(KERN_WARNING
 					       "cciss: controller cciss%d failed, stopping.\n",
 					       h->ctlr);
@@ -2960,6 +2960,7 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 		if (board_id == products[i].board_id) {
 			c->product_name = products[i].product_name;
 			c->access = *(products[i].access);
+			c->nr_cmds = products[i].nr_cmds;
 			break;
 		}
 	}
@@ -2979,6 +2980,7 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 		if (subsystem_vendor_id == PCI_VENDOR_ID_HP) {
 			c->product_name = products[i-1].product_name;
 			c->access = *(products[i-1].access);
+			c->nr_cmds = products[i-1].nr_cmds;
 			printk(KERN_WARNING "cciss: This is an unknown "
 				"Smart Array controller.\n"
 				"cciss: Please update to the latest driver "
@@ -3286,15 +3288,15 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	       hba[i]->intr[SIMPLE_MODE_INT], dac ? "" : " not");
 
 	hba[i]->cmd_pool_bits =
-	    kmalloc(((NR_CMDS + BITS_PER_LONG -
+	    kmalloc(((hba[i]->nr_cmds + BITS_PER_LONG -
 		      1) / BITS_PER_LONG) * sizeof(unsigned long), GFP_KERNEL);
 	hba[i]->cmd_pool = (CommandList_struct *)
 	    pci_alloc_consistent(hba[i]->pdev,
-		    NR_CMDS * sizeof(CommandList_struct),
+		    hba[i]->nr_cmds * sizeof(CommandList_struct),
 		    &(hba[i]->cmd_pool_dhandle));
 	hba[i]->errinfo_pool = (ErrorInfo_struct *)
 	    pci_alloc_consistent(hba[i]->pdev,
-		    NR_CMDS * sizeof(ErrorInfo_struct),
+		    hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
 		    &(hba[i]->errinfo_pool_dhandle));
 	if ((hba[i]->cmd_pool_bits == NULL)
 	    || (hba[i]->cmd_pool == NULL)
@@ -3305,7 +3307,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 #ifdef CONFIG_CISS_SCSI_TAPE
 	hba[i]->scsi_rejects.complete =
 	    kmalloc(sizeof(hba[i]->scsi_rejects.complete[0]) *
-		    (NR_CMDS + 5), GFP_KERNEL);
+		    (hba[i]->nr_cmds + 5), GFP_KERNEL);
 	if (hba[i]->scsi_rejects.complete == NULL) {
 		printk(KERN_ERR "cciss: out of memory");
 		goto clean4;
@@ -3319,7 +3321,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	/* command and error info recs zeroed out before
 	   they are used */
 	memset(hba[i]->cmd_pool_bits, 0,
-	       ((NR_CMDS + BITS_PER_LONG -
+	       ((hba[i]->nr_cmds + BITS_PER_LONG -
 		 1) / BITS_PER_LONG) * sizeof(unsigned long));
 
 #ifdef CCISS_DEBUG
@@ -3388,11 +3390,11 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	kfree(hba[i]->cmd_pool_bits);
 	if (hba[i]->cmd_pool)
 		pci_free_consistent(hba[i]->pdev,
-				    NR_CMDS * sizeof(CommandList_struct),
+				    hba[i]->nr_cmds * sizeof(CommandList_struct),
 				    hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle);
 	if (hba[i]->errinfo_pool)
 		pci_free_consistent(hba[i]->pdev,
-				    NR_CMDS * sizeof(ErrorInfo_struct),
+				    hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
 				    hba[i]->errinfo_pool,
 				    hba[i]->errinfo_pool_dhandle);
 	free_irq(hba[i]->intr[SIMPLE_MODE_INT], hba[i]);
@@ -3459,9 +3461,9 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
 		}
 	}
 
-	pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof(CommandList_struct),
+	pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(CommandList_struct),
 			    hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle);
-	pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof(ErrorInfo_struct),
+	pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
 			    hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
 	kfree(hba[i]->cmd_pool_bits);
 #ifdef CONFIG_CISS_SCSI_TAPE
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 562235c..0d765f9 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -60,6 +60,7 @@ struct ctlr_info
 	__u32	board_id;
 	void __iomem *vaddr;
 	unsigned long paddr;
+	int 	nr_cmds; /* Number of commands allowed on this controller */
 	CfgTable_struct __iomem *cfgtable;
 	int	interrupts_enabled;
 	int	major;
@@ -282,6 +283,7 @@ struct board_type {
 	__u32	board_id;
 	char	*product_name;
 	struct access_method *access;
+	int nr_cmds; /* Max cmds this kind of ctlr can handle. */
 };
 
 #define CCISS_LOCK(i)	(&hba[i]->lock)
-- 
cgit v0.10.2


From de9239167158c0210c5b9a709d67cea1b6f8ae56 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:03 -0800
Subject: [PATCH] cciss: fix pci ssid for the E500 controller

Change the SSID on the E500 as a workaround for a firmware bug.  It looks like
the original patch was backed out between rc2 and rc4.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 0f976aa..4899ab2 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -82,7 +82,7 @@ static const struct pci_device_id cciss_pci_device_id[] = {
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSD,     0x103C, 0x3213},
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSD,     0x103C, 0x3214},
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSD,     0x103C, 0x3215},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSC,     0x103C, 0x3233},
+	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSC,     0x103C, 0x3237},
 	{PCI_VENDOR_ID_HP,     PCI_ANY_ID,	PCI_ANY_ID, PCI_ANY_ID,
 		PCI_CLASS_STORAGE_RAID << 8, 0xffff << 8, 0},
 	{0,}
@@ -114,7 +114,7 @@ static struct board_type products[] = {
 	{0x3213103C, "Smart Array E200i", &SA5_access, 120},
 	{0x3214103C, "Smart Array E200i", &SA5_access, 120},
 	{0x3215103C, "Smart Array E200i", &SA5_access, 120},
-	{0x3233103C, "Smart Array E500", &SA5_access, 512},
+	{0x3237103C, "Smart Array E500", &SA5_access, 512},
 	{0xFFFF103C, "Unknown Smart Array", &SA5_access, 120},
 };
 
-- 
cgit v0.10.2


From f92e2f5f889803306e50c06e17ee330403e91b8d Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:04 -0800
Subject: [PATCH] cciss: disable DMA prefetch on P600

Unconditionally disable DMA prefetch on the P600 controller.  An ASIC bug may
result in prefetching beyond the end of physical memory.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 4899ab2..a17223c 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3003,6 +3003,17 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 	}
 #endif
 
+	/* Disabling DMA prefetch for the P600
+	 * An ASIC bug may result in a prefetch beyond
+	 * physical memory.
+	 */
+	if(board_id == 0x3225103C) {
+		__u32 dma_prefetch;
+		dma_prefetch = readl(c->vaddr + I2O_DMA1_CFG);
+		dma_prefetch |= 0x8000;
+		writel(dma_prefetch, c->vaddr + I2O_DMA1_CFG);
+	}
+
 #ifdef CCISS_DEBUG
 	printk("Trying to put board into Simple mode\n");
 #endif				/* CCISS_DEBUG */
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index 4af7c4c..b2147cc 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -55,6 +55,7 @@
 #define I2O_INT_MASK            0x34
 #define I2O_IBPOST_Q            0x40
 #define I2O_OBPOST_Q            0x44
+#define I2O_DMA1_CFG		0x214
 
 //Configuration Table
 #define CFGTBL_ChangeReq        0x00000001l
-- 
cgit v0.10.2


From 92c4231aef720bd5e1d634d2f7335f31277318da Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:06 -0800
Subject: [PATCH] cciss: set sector_size to 2048 for performance

Change the blk_queue_max_sectors from 512 to 2048.  This helps increase
performance.

[akpm@osdl.org: s/sector_size/max_sectors/]
Signed-off-by: Mike Miller <mike.miller@hp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index a17223c..e61279f 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -269,6 +269,7 @@ static int cciss_proc_get_info(char *buffer, char **start, off_t offset,
 		       "Firmware Version: %c%c%c%c\n"
 		       "IRQ: %d\n"
 		       "Logical drives: %d\n"
+		       "Max sectors: %d\n"
 		       "Current Q depth: %d\n"
 		       "Current # commands on controller: %d\n"
 		       "Max Q depth since init: %d\n"
@@ -279,7 +280,9 @@ static int cciss_proc_get_info(char *buffer, char **start, off_t offset,
 		       (unsigned long)h->board_id,
 		       h->firm_ver[0], h->firm_ver[1], h->firm_ver[2],
 		       h->firm_ver[3], (unsigned int)h->intr[SIMPLE_MODE_INT],
-		       h->num_luns, h->Qdepth, h->commands_outstanding,
+		       h->num_luns,
+		       h->cciss_max_sectors,
+		       h->Qdepth, h->commands_outstanding,
 		       h->maxQsinceinit, h->max_outstanding, h->maxSG);
 
 	pos += size;
@@ -1395,7 +1398,7 @@ static void cciss_update_drive_info(int ctlr, int drv_index)
 		/* This is a limit in the driver and could be eliminated. */
 		blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES);
 
-		blk_queue_max_sectors(disk->queue, 512);
+		blk_queue_max_sectors(disk->queue, h->cciss_max_sectors);
 
 		blk_queue_softirq_done(disk->queue, cciss_softirq_done);
 
@@ -3347,6 +3350,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	hba[i]->access.set_intr_mask(hba[i], CCISS_INTR_ON);
 
 	cciss_procinit(i);
+
+	hba[i]->cciss_max_sectors = 2048;
+
 	hba[i]->busy_initializing = 0;
 
 	for (j = 0; j < NWD; j++) {	/* mfm */
@@ -3371,7 +3377,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		/* This is a limit in the driver and could be eliminated. */
 		blk_queue_max_phys_segments(q, MAXSGENTRIES);
 
-		blk_queue_max_sectors(q, 512);
+		blk_queue_max_sectors(q, hba[i]->cciss_max_sectors);
 
 		blk_queue_softirq_done(q, cciss_softirq_done);
 
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 0d765f9..c3df673 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -77,6 +77,7 @@ struct ctlr_info
 	unsigned int intr[4];
 	unsigned int msix_vector;
 	unsigned int msi_vector;
+	int 	cciss_max_sectors;
 	BYTE	cciss_read;
 	BYTE	cciss_write;
 	BYTE	cciss_read_capacity;
-- 
cgit v0.10.2


From 7a06f789e0a1b46e4ed2a68f885cbe5ff74a34d6 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:08 -0800
Subject: [PATCH] cciss: change cciss_open for consistency

Change our open to test for drv->heads like we do in other places in the
driver.  Mostly for consistency.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e61279f..c834305 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -494,7 +494,7 @@ static int cciss_open(struct inode *inode, struct file *filep)
 	 * but I'm already using way to many device nodes to claim another one
 	 * for "raw controller".
 	 */
-	if (drv->nr_blocks == 0) {
+	if (drv->heads == 0) {
 		if (iminor(inode) != 0) {	/* not node 0? */
 			/* if not node 0 make sure it is a partition = 0 */
 			if (iminor(inode) & 0x0f) {
-- 
cgit v0.10.2


From 3833a748aa75dd39494bb861ab018216b0a2c14e Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:10 -0800
Subject: [PATCH] cciss: remove unused revalidate_allvol function

Remove the no longer used revalidate_allvol function.  It was replaced by
rebuild_lun_table.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index c834305..70cf932 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -141,7 +141,6 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 		       unsigned int cmd, unsigned long arg);
 static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
-static int revalidate_allvol(ctlr_info_t *host);
 static int cciss_revalidate(struct gendisk *disk);
 static int rebuild_lun_table(ctlr_info_t *h, struct gendisk *del_disk);
 static int deregister_disk(struct gendisk *disk, drive_info_struct *drv,
@@ -857,9 +856,7 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 		}
 
 	case CCISS_REVALIDVOLS:
-		if (bdev != bdev->bd_contains || drv != host->drv)
-			return -ENXIO;
-		return revalidate_allvol(host);
+		return rebuild_lun_table(host, NULL);
 
 	case CCISS_GETLUNINFO:{
 			LogvolInfo_struct luninfo;
@@ -1159,75 +1156,6 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 	}
 }
 
-/*
- * revalidate_allvol is for online array config utilities.  After a
- * utility reconfigures the drives in the array, it can use this function
- * (through an ioctl) to make the driver zap any previous disk structs for
- * that controller and get new ones.
- *
- * Right now I'm using the getgeometry() function to do this, but this
- * function should probably be finer grained and allow you to revalidate one
- * particular logical volume (instead of all of them on a particular
- * controller).
- */
-static int revalidate_allvol(ctlr_info_t *host)
-{
-	int ctlr = host->ctlr, i;
-	unsigned long flags;
-
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-	if (host->usage_count > 1) {
-		spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-		printk(KERN_WARNING "cciss: Device busy for volume"
-		       " revalidation (usage=%d)\n", host->usage_count);
-		return -EBUSY;
-	}
-	host->usage_count++;
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-
-	for (i = 0; i < NWD; i++) {
-		struct gendisk *disk = host->gendisk[i];
-		if (disk) {
-			request_queue_t *q = disk->queue;
-
-			if (disk->flags & GENHD_FL_UP)
-				del_gendisk(disk);
-			if (q)
-				blk_cleanup_queue(q);
-		}
-	}
-
-	/*
-	 * Set the partition and block size structures for all volumes
-	 * on this controller to zero.  We will reread all of this data
-	 */
-	memset(host->drv, 0, sizeof(drive_info_struct)
-	       * CISS_MAX_LUN);
-	/*
-	 * Tell the array controller not to give us any interrupts while
-	 * we check the new geometry.  Then turn interrupts back on when
-	 * we're done.
-	 */
-	host->access.set_intr_mask(host, CCISS_INTR_OFF);
-	cciss_getgeometry(ctlr);
-	host->access.set_intr_mask(host, CCISS_INTR_ON);
-
-	/* Loop through each real device */
-	for (i = 0; i < NWD; i++) {
-		struct gendisk *disk = host->gendisk[i];
-		drive_info_struct *drv = &(host->drv[i]);
-		/* we must register the controller even if no disks exist */
-		/* this is for the online array utilities */
-		if (!drv->heads && i)
-			continue;
-		blk_queue_hardsect_size(drv->queue, drv->block_size);
-		set_capacity(disk, drv->nr_blocks);
-		add_disk(disk);
-	}
-	host->usage_count--;
-	return 0;
-}
-
 static inline void complete_buffers(struct bio *bio, int status)
 {
 	while (bio) {
-- 
cgit v0.10.2


From 799202cbd0ef6a201446d99fcbd78b9f0bda6ae5 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:12 -0800
Subject: [PATCH] cciss: add support for 1024 logical volumes

Add the support for a large number of logical volumes.  We will soon have
hardware that support up to 1024 logical volumes.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 70cf932..c99cb7e 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1315,6 +1315,11 @@ static void cciss_update_drive_info(int ctlr, int drv_index)
 	/* if it's the controller it's already added */
 	if (drv_index) {
 		disk->queue = blk_init_queue(do_cciss_request, &h->lock);
+		sprintf(disk->disk_name, "cciss/c%dd%d", ctlr, drv_index);
+		disk->major = h->major;
+		disk->first_minor = drv_index << NWD_SHIFT;
+		disk->fops = &cciss_fops;
+		disk->private_data = &h->drv[drv_index];
 
 		/* Set up queue information */
 		disk->queue->backing_dev_info.ra_pages = READ_AHEAD;
@@ -1393,11 +1398,6 @@ static int rebuild_lun_table(ctlr_info_t *h, struct gendisk *del_disk)
 
 	/* Set busy_configuring flag for this operation */
 	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
-	if (h->num_luns >= CISS_MAX_LUN) {
-		spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
-		return -EINVAL;
-	}
-
 	if (h->busy_configuring) {
 		spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
 		return -EBUSY;
@@ -1430,17 +1430,8 @@ static int rebuild_lun_table(ctlr_info_t *h, struct gendisk *del_disk)
 					      0, 0, TYPE_CMD);
 
 		if (return_code == IO_OK) {
-			listlength |=
-			    (0xff & (unsigned int)(ld_buff->LUNListLength[0]))
-			    << 24;
-			listlength |=
-			    (0xff & (unsigned int)(ld_buff->LUNListLength[1]))
-			    << 16;
-			listlength |=
-			    (0xff & (unsigned int)(ld_buff->LUNListLength[2]))
-			    << 8;
-			listlength |=
-			    0xff & (unsigned int)(ld_buff->LUNListLength[3]);
+			listlength =
+				be32_to_cpu(*(__u32 *) ld_buff->LUNListLength);
 		} else {	/* reading number of logical volumes failed */
 			printk(KERN_WARNING "cciss: report logical volume"
 			       " command failed\n");
@@ -1491,6 +1482,14 @@ static int rebuild_lun_table(ctlr_info_t *h, struct gendisk *del_disk)
 				if (drv_index == -1)
 					goto freeret;
 
+				/*Check if the gendisk needs to be allocated */
+				if (!h->gendisk[drv_index]){
+					h->gendisk[drv_index] = alloc_disk(1 << NWD_SHIFT);
+					if (!h->gendisk[drv_index]){
+						printk(KERN_ERR "cciss: could not allocate new disk %d\n", drv_index);
+						goto mem_msg;
+					}
+				}
 			}
 			h->drv[drv_index].LunID = lunid;
 			cciss_update_drive_info(ctlr, drv_index);
@@ -1528,6 +1527,7 @@ static int rebuild_lun_table(ctlr_info_t *h, struct gendisk *del_disk)
 static int deregister_disk(struct gendisk *disk, drive_info_struct *drv,
 			   int clear_all)
 {
+	int i;
 	ctlr_info_t *h = get_host(disk);
 
 	if (!capable(CAP_SYS_RAWIO))
@@ -1551,9 +1551,35 @@ static int deregister_disk(struct gendisk *disk, drive_info_struct *drv,
 				del_gendisk(disk);
 			if (q) {
 				blk_cleanup_queue(q);
+				/* Set drv->queue to NULL so that we do not try
+				 * to call blk_start_queue on this queue in the
+				 * interrupt handler
+				 */
 				drv->queue = NULL;
 			}
+			/* If clear_all is set then we are deleting the logical
+			 * drive, not just refreshing its info.  For drives
+			 * other than disk 0 we will call put_disk.  We do not
+			 * do this for disk 0 as we need it to be able to
+			 * configure the controller.
+			*/
+			if (clear_all){
+				/* This isn't pretty, but we need to find the
+				 * disk in our array and NULL our the pointer.
+				 * This is so that we will call alloc_disk if
+				 * this index is used again later.
+				*/
+				for (i=0; i < CISS_MAX_LUN; i++){
+					if(h->gendisk[i] == disk){
+						h->gendisk[i] = NULL;
+						break;
+					}
+				}
+				put_disk(disk);
+			}
 		}
+	} else {
+		set_capacity(disk, 0);
 	}
 
 	--h->num_luns;
@@ -3119,13 +3145,7 @@ geo_inq:
 /* Returns -1 if no free entries are left.  */
 static int alloc_cciss_hba(void)
 {
-	struct gendisk *disk[NWD];
-	int i, n;
-	for (n = 0; n < NWD; n++) {
-		disk[n] = alloc_disk(1 << NWD_SHIFT);
-		if (!disk[n])
-			goto out;
-	}
+	int i;
 
 	for (i = 0; i < MAX_CTLR; i++) {
 		if (!hba[i]) {
@@ -3133,20 +3153,18 @@ static int alloc_cciss_hba(void)
 			p = kzalloc(sizeof(ctlr_info_t), GFP_KERNEL);
 			if (!p)
 				goto Enomem;
-			for (n = 0; n < NWD; n++)
-				p->gendisk[n] = disk[n];
+			p->gendisk[0] = alloc_disk(1 << NWD_SHIFT);
+			if (!p->gendisk[0])
+				goto Enomem;
 			hba[i] = p;
 			return i;
 		}
 	}
 	printk(KERN_WARNING "cciss: This driver supports a maximum"
 	       " of %d controllers.\n", MAX_CTLR);
-	goto out;
-      Enomem:
+	return -1;
+Enomem:
 	printk(KERN_ERR "cciss: out of memory.\n");
-      out:
-	while (n--)
-		put_disk(disk[n]);
 	return -1;
 }
 
@@ -3156,7 +3174,7 @@ static void free_hba(int i)
 	int n;
 
 	hba[i] = NULL;
-	for (n = 0; n < NWD; n++)
+	for (n = 0; n < CISS_MAX_LUN; n++)
 		put_disk(p->gendisk[n]);
 	kfree(p);
 }
@@ -3169,9 +3187,8 @@ static void free_hba(int i)
 static int __devinit cciss_init_one(struct pci_dev *pdev,
 				    const struct pci_device_id *ent)
 {
-	request_queue_t *q;
 	int i;
-	int j;
+	int j = 0;
 	int rc;
 	int dac;
 
@@ -3283,16 +3300,29 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 
 	hba[i]->busy_initializing = 0;
 
-	for (j = 0; j < NWD; j++) {	/* mfm */
+	do {
 		drive_info_struct *drv = &(hba[i]->drv[j]);
 		struct gendisk *disk = hba[i]->gendisk[j];
+		request_queue_t *q;
+
+		/* Check if the disk was allocated already */
+		if (!disk){
+			hba[i]->gendisk[j] = alloc_disk(1 << NWD_SHIFT);
+			disk = hba[i]->gendisk[j];
+		}
+
+		/* Check that the disk was able to be allocated */
+		if (!disk) {
+			printk(KERN_ERR "cciss: unable to allocate memory for disk %d\n", j);
+			goto clean4;
+		}
 
 		q = blk_init_queue(do_cciss_request, &hba[i]->lock);
 		if (!q) {
 			printk(KERN_ERR
 			       "cciss:  unable to allocate queue for disk %d\n",
 			       j);
-			break;
+			goto clean4;
 		}
 		drv->queue = q;
 
@@ -3324,7 +3354,8 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		blk_queue_hardsect_size(q, drv->block_size);
 		set_capacity(disk, drv->nr_blocks);
 		add_disk(disk);
-	}
+		j++;
+	} while (j <= hba[i]->highest_lun);
 
 	return 1;
 
@@ -3347,6 +3378,15 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	unregister_blkdev(hba[i]->major, hba[i]->devname);
       clean1:
 	hba[i]->busy_initializing = 0;
+	/* cleanup any queues that may have been initialized */
+	for (j=0; j <= hba[i]->highest_lun; j++){
+		drive_info_struct *drv = &(hba[i]->drv[j]);
+		if (drv->queue)
+			blk_cleanup_queue(drv->queue);
+	}
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
 	free_hba(i);
 	return -1;
 }
@@ -3394,7 +3434,7 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
 	remove_proc_entry(hba[i]->devname, proc_cciss);
 
 	/* remove it from the disk list */
-	for (j = 0; j < NWD; j++) {
+	for (j = 0; j < CISS_MAX_LUN; j++) {
 		struct gendisk *disk = hba[i]->gendisk[j];
 		if (disk) {
 			request_queue_t *q = disk->queue;
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index c3df673..b70988d 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -6,7 +6,6 @@
 #include "cciss_cmd.h"
 
 
-#define NWD		16
 #define NWD_SHIFT	4
 #define MAX_PART	(1 << NWD_SHIFT)
 
@@ -112,7 +111,7 @@ struct ctlr_info
 	int			next_to_run;
 
 	// Disk structures we need to pass back
-	struct gendisk   *gendisk[NWD];
+	struct gendisk   *gendisk[CISS_MAX_LUN];
 #ifdef CONFIG_CISS_SCSI_TAPE
 	void *scsi_ctlr; /* ptr to structure containing scsi related stuff */
 	/* list of block side commands the scsi error handling sucked up */
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index b2147cc..43bf559 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -89,7 +89,7 @@ typedef union _u64bit
 //###########################################################################
 //STRUCTURES
 //###########################################################################
-#define CISS_MAX_LUN	16	
+#define CISS_MAX_LUN	1024
 #define CISS_MAX_PHYS_LUN	1024
 // SCSI-3 Cmmands 
 
diff --git a/include/linux/cciss_ioctl.h b/include/linux/cciss_ioctl.h
index 6e27f42..cb57c30 100644
--- a/include/linux/cciss_ioctl.h
+++ b/include/linux/cciss_ioctl.h
@@ -80,7 +80,7 @@ typedef __u32 DriverVer_type;
 #define HWORD __u16
 #define DWORD __u32
 
-#define CISS_MAX_LUN	16	
+#define CISS_MAX_LUN	1024
 
 #define LEVEL2LUN   1   // index into Target(x) structure, due to byte swapping
 #define LEVEL3LUN   0
-- 
cgit v0.10.2


From 1ecb9c0f3c9ef6af77c39a9f584940691847ccf4 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Wed, 6 Dec 2006 20:35:13 -0800
Subject: [PATCH] cciss: cleanup cciss_interrupt mode

A pretty simple cleanup for cciss_interrupt_mode.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index c99cb7e..892e092 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2788,23 +2788,21 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *c,
 		if (err > 0) {
 			printk(KERN_WARNING "cciss: only %d MSI-X vectors "
 			       "available\n", err);
+			goto default_int_mode;
 		} else {
 			printk(KERN_WARNING "cciss: MSI-X init failed %d\n",
 			       err);
+			goto default_int_mode;
 		}
 	}
 	if (pci_find_capability(pdev, PCI_CAP_ID_MSI)) {
 		if (!pci_enable_msi(pdev)) {
-			c->intr[SIMPLE_MODE_INT] = pdev->irq;
 			c->msi_vector = 1;
-			return;
 		} else {
 			printk(KERN_WARNING "cciss: MSI init failed\n");
-			c->intr[SIMPLE_MODE_INT] = pdev->irq;
-			return;
 		}
 	}
-      default_int_mode:
+default_int_mode:
 #endif				/* CONFIG_PCI_MSI */
 	/* if we get here we're going to use the default interrupt mode */
 	c->intr[SIMPLE_MODE_INT] = pdev->irq;
-- 
cgit v0.10.2


From 5127d002f9769ba6b1691de78dd3a5c14635e183 Mon Sep 17 00:00:00 2001
From: Suzuki Kp <suzuki@in.ibm.com>
Date: Wed, 6 Dec 2006 20:35:14 -0800
Subject: [PATCH] fix rescan_partitions to return errors properly

The current rescan_partition implementation ignores the errors that comes from
the lower layer.  It reports success for unknown partitions as well as I/O
error cases while reading the partition information.

The unknown partition is not (and will not be) considered as an error in the
kernel, since there are legal users of it (e.g, members of a RAID5 MD Device
or a new disk which is not partitioned at all ).  Changing this behaviour
would scare the user about a serious problem with their disk and is not
recommended.  Thus for both "unknown partitions" to the Linux (eg., DEC
VMS,Novell Netware) and the legal users of NULL partition, would still be
reported as "SUCCESS".

The patch attached here, scares the user about something which he does need to
worry about.  i.e, returning -EIO on disk I/O errors while reading the
partition information.

Signed-off-by: Suzuki K P <suzuki@in.ibm.com>
Cc: Erik Mouw <erik@harddisk-recovery.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6fb4b61..0b6113b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -177,7 +177,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	else if (warn_no_part)
 		printk(" unable to read partition table\n");
 	kfree(state);
-	return NULL;
+	return ERR_PTR(res);
 }
 
 /*
@@ -494,6 +494,8 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		disk->fops->revalidate_disk(disk);
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
+	if (IS_ERR(state))	/* I/O error reading the partition table */
+		return PTR_ERR(state);
 	for (p = 1; p < state->limit; p++) {
 		sector_t size = state->parts[p].size;
 		sector_t from = state->parts[p].from;
-- 
cgit v0.10.2


From 57881dd9df40b76dc7fc6a0d13fd75f337accb32 Mon Sep 17 00:00:00 2001
From: Suzuki K P <suzuki@in.ibm.com>
Date: Wed, 6 Dec 2006 20:35:16 -0800
Subject: [PATCH] Fix check_partition routines

check_partition() stops its probe once it hits an I/O error from the
partition checkers.  This would prevent the actual partition checker
getting a chance to verify the partition.

So this patch lets check_partition() continue probing untill it hits a
success while recording the I/O error which might have been reported by the
checking routines.

Also, it does some cleanup of the partition methods for ibm, atari and
amiga to return -1 upon hitting an I/O error.

Signed-off-by: Suzuki K P <suzuki@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 3068528..9917a8c 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -43,6 +43,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 			if (warn_no_part)
 				printk("Dev %s: unable to read RDB block %d\n",
 				       bdevname(bdev, b), blk);
+			res = -1;
 			goto rdb_done;
 		}
 		if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK))
@@ -79,6 +80,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 			if (warn_no_part)
 				printk("Dev %s: unable to read partition block %d\n",
 				       bdevname(bdev, b), blk);
+			res = -1;
 			goto rdb_done;
 		}
 		pb  = (struct PartitionBlock *)data;
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 192a6ad..1f3572d 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -88,7 +88,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 			if (!xrs) {
 				printk (" block %ld read failed\n", partsect);
 				put_dev_sector(sect);
-				return 0;
+				return -1;
 			}
 
 			/* ++roman: sanity check: bit 0 of flg field must be set */
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0b6113b..1901137 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -153,7 +153,7 @@ static struct parsed_partitions *
 check_partition(struct gendisk *hd, struct block_device *bdev)
 {
 	struct parsed_partitions *state;
-	int i, res;
+	int i, res, err;
 
 	state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
@@ -165,13 +165,24 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 		sprintf(state->name, "p");
 
 	state->limit = hd->minors;
-	i = res = 0;
+	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
 		res = check_part[i++](state, bdev);
+		if (res < 0) {
+			/* We have hit an I/O error which we don't report now.
+		 	* But record it, and let the others do their job.
+		 	*/
+			err = res;
+			res = 0;
+		}
+
 	}
 	if (res > 0)
 		return state;
+	if (!err)
+	/* The partition is unrecognized. So report I/O errors if there were any */
+		res = err;
 	if (!res)
 		printk(" unknown partition table\n");
 	else if (warn_no_part)
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index d352a73..9f7ad42 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -43,7 +43,7 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 int
 ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int blocksize, offset, size;
+	int blocksize, offset, size,res;
 	loff_t i_size;
 	dasd_information_t *info;
 	struct hd_geometry *geo;
@@ -56,15 +56,16 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char *data;
 	Sector sect;
 
+	res = 0;
 	blocksize = bdev_hardsect_size(bdev);
 	if (blocksize <= 0)
-		return 0;
+		goto out_exit;
 	i_size = i_size_read(bdev->bd_inode);
 	if (i_size == 0)
-		return 0;
+		goto out_exit;
 
 	if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL)
-		goto out_noinfo;
+		goto out_exit;
 	if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
 		goto out_nogeo;
 	if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL)
@@ -72,7 +73,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 	if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
 	    ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
-		goto out_noioctl;
+		goto out_freeall;
 
 	/*
 	 * Get volume label, extract name and type.
@@ -92,6 +93,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	EBCASC(type, 4);
 	EBCASC(name, 6);
 
+	res = 1;
+
 	/*
 	 * Three different types: CMS1, VOL1 and LNX1/unlabeled
 	 */
@@ -156,6 +159,9 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 			counter++;
 			blk++;
 		}
+		if (!data)
+		/* Are we not supposed to report this ? */
+			goto out_readerr;
 	} else {
 		/*
 		 * Old style LNX1 or unlabeled disk
@@ -171,18 +177,17 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	printk("\n");
-	kfree(label);
-	kfree(geo);
-	kfree(info);
-	return 1;
+	goto out_freeall;
+
 
 out_readerr:
-out_noioctl:
+	res = -1;
+out_freeall:
 	kfree(label);
 out_nolab:
 	kfree(geo);
 out_nogeo:
 	kfree(info);
-out_noinfo:
-	return 0;
+out_exit:
+	return res;
 }
-- 
cgit v0.10.2


From 238b8721a554a33a451a3f13bdb5be8fe5cfc927 Mon Sep 17 00:00:00 2001
From: Peter Korsgaard <jacmet@sunsite.dk>
Date: Wed, 6 Dec 2006 20:35:17 -0800
Subject: [PATCH] serial uartlite driver

Add a driver for the Xilinx uartlite serial controller used in boards with
the PPC405 core in the Xilinx V2P/V4 fpgas.

The hardware is very simple (baudrate/start/stopbits fixed and no break
support).  See the datasheet for details:

	http://www.xilinx.com/bvdocs/ipcenter/data_sheet/opb_uartlite.pdf

See http://thread.gmane.org/gmane.linux.serial/1237/ for the email thread.

Signed-off-by: Peter Korsgaard <jacmet@sunsite.dk>
Acked-by: Olof Johansson <olof@lixom.net>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 5dff268..fa1bba8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3454,6 +3454,12 @@ W:	http://oss.sgi.com/projects/xfs
 T:	git git://oss.sgi.com:8090/xfs/xfs-2.6
 S:	Supported
 
+XILINX UARTLITE SERIAL DRIVER
+P:	Peter Korsgaard
+M:	jacmet@sunsite.dk
+L:	linux-serial@vger.kernel.org
+S:	Maintained
+
 X86 3-LEVEL PAGING (PAE) SUPPORT
 P:	Ingo Molnar
 M:	mingo@redhat.com
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 0b71e7d..e936c91 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -511,6 +511,25 @@ config SERIAL_IMX_CONSOLE
 	  your boot loader (lilo or loadlin) about how to pass options to the
 	  kernel at boot time.)
 
+config SERIAL_UARTLITE
+	tristate "Xilinx uartlite serial port support"
+	depends on PPC32
+	select SERIAL_CORE
+	help
+	  Say Y here if you want to use the Xilinx uartlite serial controller.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called uartlite.ko.
+
+config SERIAL_UARTLITE_CONSOLE
+	bool "Support for console on Xilinx uartlite serial port"
+	depends on SERIAL_UARTLITE=y
+	select SERIAL_CORE_CONSOLE
+	help
+	  Say Y here if you wish to use a Xilinx uartlite as the system
+	  console (the system console is the device which receives all kernel
+	  messages and warnings and which allows logins in single user mode).
+
 config SERIAL_SUNCORE
 	bool
 	depends on SPARC
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index b4d8a7c..0dba001 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -55,4 +55,5 @@ obj-$(CONFIG_SERIAL_VR41XX) += vr41xx_siu.o
 obj-$(CONFIG_SERIAL_SGI_IOC4) += ioc4_serial.o
 obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_serial.o
 obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o
+obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o
 obj-$(CONFIG_SERIAL_NETX) += netx-serial.o
diff --git a/drivers/serial/uartlite.c b/drivers/serial/uartlite.c
new file mode 100644
index 0000000..8369065
--- /dev/null
+++ b/drivers/serial/uartlite.c
@@ -0,0 +1,505 @@
+/*
+ * uartlite.c: Serial driver for Xilinx uartlite serial controller
+ *
+ * Peter Korsgaard <jacmet@sunsite.dk>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/console.h>
+#include <linux/serial.h>
+#include <linux/serial_core.h>
+#include <linux/tty.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <asm/io.h>
+
+#define ULITE_MAJOR		204
+#define ULITE_MINOR		187
+#define ULITE_NR_UARTS		4
+
+/* For register details see datasheet:
+   http://www.xilinx.com/bvdocs/ipcenter/data_sheet/opb_uartlite.pdf
+*/
+#define ULITE_RX		0x00
+#define ULITE_TX		0x04
+#define ULITE_STATUS		0x08
+#define ULITE_CONTROL		0x0c
+
+#define ULITE_REGION		16
+
+#define ULITE_STATUS_RXVALID	0x01
+#define ULITE_STATUS_RXFULL	0x02
+#define ULITE_STATUS_TXEMPTY	0x04
+#define ULITE_STATUS_TXFULL	0x08
+#define ULITE_STATUS_IE		0x10
+#define ULITE_STATUS_OVERRUN	0x20
+#define ULITE_STATUS_FRAME	0x40
+#define ULITE_STATUS_PARITY	0x80
+
+#define ULITE_CONTROL_RST_TX	0x01
+#define ULITE_CONTROL_RST_RX	0x02
+#define ULITE_CONTROL_IE	0x10
+
+
+static struct uart_port ports[ULITE_NR_UARTS];
+
+static int ulite_receive(struct uart_port *port, int stat)
+{
+	struct tty_struct *tty = port->info->tty;
+	unsigned char ch = 0;
+	char flag = TTY_NORMAL;
+
+	if ((stat & (ULITE_STATUS_RXVALID | ULITE_STATUS_OVERRUN
+		     | ULITE_STATUS_FRAME)) == 0)
+		return 0;
+
+	/* stats */
+	if (stat & ULITE_STATUS_RXVALID) {
+		port->icount.rx++;
+		ch = readb(port->membase + ULITE_RX);
+
+		if (stat & ULITE_STATUS_PARITY)
+			port->icount.parity++;
+	}
+
+	if (stat & ULITE_STATUS_OVERRUN)
+		port->icount.overrun++;
+
+	if (stat & ULITE_STATUS_FRAME)
+		port->icount.frame++;
+
+
+	/* drop byte with parity error if IGNPAR specificed */
+	if (stat & port->ignore_status_mask & ULITE_STATUS_PARITY)
+		stat &= ~ULITE_STATUS_RXVALID;
+
+	stat &= port->read_status_mask;
+
+	if (stat & ULITE_STATUS_PARITY)
+		flag = TTY_PARITY;
+
+
+	stat &= ~port->ignore_status_mask;
+
+	if (stat & ULITE_STATUS_RXVALID)
+		tty_insert_flip_char(tty, ch, flag);
+
+	if (stat & ULITE_STATUS_FRAME)
+		tty_insert_flip_char(tty, 0, TTY_FRAME);
+
+	if (stat & ULITE_STATUS_OVERRUN)
+		tty_insert_flip_char(tty, 0, TTY_OVERRUN);
+
+	return 1;
+}
+
+static int ulite_transmit(struct uart_port *port, int stat)
+{
+	struct circ_buf *xmit  = &port->info->xmit;
+
+	if (stat & ULITE_STATUS_TXFULL)
+		return 0;
+
+	if (port->x_char) {
+		writeb(port->x_char, port->membase + ULITE_TX);
+		port->x_char = 0;
+		port->icount.tx++;
+		return 1;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+		return 0;
+
+	writeb(xmit->buf[xmit->tail], port->membase + ULITE_TX);
+	xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE-1);
+	port->icount.tx++;
+
+	/* wake up */
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	return 1;
+}
+
+static irqreturn_t ulite_isr(int irq, void *dev_id)
+{
+	struct uart_port *port = (struct uart_port *)dev_id;
+	int busy;
+
+	do {
+		int stat = readb(port->membase + ULITE_STATUS);
+		busy  = ulite_receive(port, stat);
+		busy |= ulite_transmit(port, stat);
+	} while (busy);
+
+	tty_flip_buffer_push(port->info->tty);
+
+	return IRQ_HANDLED;
+}
+
+static unsigned int ulite_tx_empty(struct uart_port *port)
+{
+	unsigned long flags;
+	unsigned int ret;
+
+	spin_lock_irqsave(&port->lock, flags);
+	ret = readb(port->membase + ULITE_STATUS);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return ret & ULITE_STATUS_TXEMPTY ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int ulite_get_mctrl(struct uart_port *port)
+{
+	return TIOCM_CTS | TIOCM_DSR | TIOCM_CAR;
+}
+
+static void ulite_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	/* N/A */
+}
+
+static void ulite_stop_tx(struct uart_port *port)
+{
+	/* N/A */
+}
+
+static void ulite_start_tx(struct uart_port *port)
+{
+	ulite_transmit(port, readb(port->membase + ULITE_STATUS));
+}
+
+static void ulite_stop_rx(struct uart_port *port)
+{
+	/* don't forward any more data (like !CREAD) */
+	port->ignore_status_mask = ULITE_STATUS_RXVALID | ULITE_STATUS_PARITY
+		| ULITE_STATUS_FRAME | ULITE_STATUS_OVERRUN;
+}
+
+static void ulite_enable_ms(struct uart_port *port)
+{
+	/* N/A */
+}
+
+static void ulite_break_ctl(struct uart_port *port, int ctl)
+{
+	/* N/A */
+}
+
+static int ulite_startup(struct uart_port *port)
+{
+	int ret;
+
+	ret = request_irq(port->irq, ulite_isr,
+			  IRQF_DISABLED | IRQF_SAMPLE_RANDOM, "uartlite", port);
+	if (ret)
+		return ret;
+
+	writeb(ULITE_CONTROL_RST_RX | ULITE_CONTROL_RST_TX,
+	       port->membase + ULITE_CONTROL);
+	writeb(ULITE_CONTROL_IE, port->membase + ULITE_CONTROL);
+
+	return 0;
+}
+
+static void ulite_shutdown(struct uart_port *port)
+{
+	writeb(0, port->membase + ULITE_CONTROL);
+	readb(port->membase + ULITE_CONTROL); /* dummy */
+	free_irq(port->irq, port);
+}
+
+static void ulite_set_termios(struct uart_port *port, struct termios *termios,
+			      struct termios *old)
+{
+	unsigned long flags;
+	unsigned int baud;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	port->read_status_mask = ULITE_STATUS_RXVALID | ULITE_STATUS_OVERRUN
+		| ULITE_STATUS_TXFULL;
+
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |=
+			ULITE_STATUS_PARITY | ULITE_STATUS_FRAME;
+
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |= ULITE_STATUS_PARITY
+			| ULITE_STATUS_FRAME | ULITE_STATUS_OVERRUN;
+
+	/* ignore all characters if CREAD is not set */
+	if ((termios->c_cflag & CREAD) == 0)
+		port->ignore_status_mask |=
+			ULITE_STATUS_RXVALID | ULITE_STATUS_PARITY
+			| ULITE_STATUS_FRAME | ULITE_STATUS_OVERRUN;
+
+	/* update timeout */
+	baud = uart_get_baud_rate(port, termios, old, 0, 460800);
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static const char *ulite_type(struct uart_port *port)
+{
+	return port->type == PORT_UARTLITE ? "uartlite" : NULL;
+}
+
+static void ulite_release_port(struct uart_port *port)
+{
+	release_mem_region(port->mapbase, ULITE_REGION);
+	iounmap(port->membase);
+	port->membase = 0;
+}
+
+static int ulite_request_port(struct uart_port *port)
+{
+	if (!request_mem_region(port->mapbase, ULITE_REGION, "uartlite")) {
+		dev_err(port->dev, "Memory region busy\n");
+		return -EBUSY;
+	}
+
+	port->membase = ioremap(port->mapbase, ULITE_REGION);
+	if (!port->membase) {
+		dev_err(port->dev, "Unable to map registers\n");
+		release_mem_region(port->mapbase, ULITE_REGION);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void ulite_config_port(struct uart_port *port, int flags)
+{
+	ulite_request_port(port);
+	port->type = PORT_UARTLITE;
+}
+
+static int ulite_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+	/* we don't want the core code to modify any port params */
+	return -EINVAL;
+}
+
+static struct uart_ops ulite_ops = {
+	.tx_empty	= ulite_tx_empty,
+	.set_mctrl	= ulite_set_mctrl,
+	.get_mctrl	= ulite_get_mctrl,
+	.stop_tx	= ulite_stop_tx,
+	.start_tx	= ulite_start_tx,
+	.stop_rx	= ulite_stop_rx,
+	.enable_ms	= ulite_enable_ms,
+	.break_ctl	= ulite_break_ctl,
+	.startup	= ulite_startup,
+	.shutdown	= ulite_shutdown,
+	.set_termios	= ulite_set_termios,
+	.type		= ulite_type,
+	.release_port	= ulite_release_port,
+	.request_port	= ulite_request_port,
+	.config_port	= ulite_config_port,
+	.verify_port	= ulite_verify_port
+};
+
+#ifdef CONFIG_SERIAL_UARTLITE_CONSOLE
+static void ulite_console_wait_tx(struct uart_port *port)
+{
+	int i;
+
+	/* wait up to 10ms for the character(s) to be sent */
+	for (i = 0; i < 10000; i++) {
+		if (readb(port->membase + ULITE_STATUS) & ULITE_STATUS_TXEMPTY)
+			break;
+		udelay(1);
+	}
+}
+
+static void ulite_console_putchar(struct uart_port *port, int ch)
+{
+	ulite_console_wait_tx(port);
+	writeb(ch, port->membase + ULITE_TX);
+}
+
+static void ulite_console_write(struct console *co, const char *s,
+				unsigned int count)
+{
+	struct uart_port *port = &ports[co->index];
+	unsigned long flags;
+	unsigned int ier;
+	int locked = 1;
+
+	if (oops_in_progress) {
+		locked = spin_trylock_irqsave(&port->lock, flags);
+	} else
+		spin_lock_irqsave(&port->lock, flags);
+
+	/* save and disable interrupt */
+	ier = readb(port->membase + ULITE_STATUS) & ULITE_STATUS_IE;
+	writeb(0, port->membase + ULITE_CONTROL);
+
+	uart_console_write(port, s, count, ulite_console_putchar);
+
+	ulite_console_wait_tx(port);
+
+	/* restore interrupt state */
+	if (ier)
+		writeb(ULITE_CONTROL_IE, port->membase + ULITE_CONTROL);
+
+	if (locked)
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static int __init ulite_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	int baud = 9600;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	if (co->index < 0 || co->index >= ULITE_NR_UARTS)
+		return -EINVAL;
+
+	port = &ports[co->index];
+
+	/* not initialized yet? */
+	if (!port->membase)
+		return -ENODEV;
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver ulite_uart_driver;
+
+static struct console ulite_console = {
+	.name	= "ttyUL",
+	.write	= ulite_console_write,
+	.device	= uart_console_device,
+	.setup	= ulite_console_setup,
+	.flags	= CON_PRINTBUFFER,
+	.index	= -1, /* Specified on the cmdline (e.g. console=ttyUL0 ) */
+	.data	= &ulite_uart_driver,
+};
+
+static int __init ulite_console_init(void)
+{
+	register_console(&ulite_console);
+	return 0;
+}
+
+console_initcall(ulite_console_init);
+
+#endif /* CONFIG_SERIAL_UARTLITE_CONSOLE */
+
+static struct uart_driver ulite_uart_driver = {
+	.owner		= THIS_MODULE,
+	.driver_name	= "uartlite",
+	.dev_name	= "ttyUL",
+	.major		= ULITE_MAJOR,
+	.minor		= ULITE_MINOR,
+	.nr		= ULITE_NR_UARTS,
+#ifdef CONFIG_SERIAL_UARTLITE_CONSOLE
+	.cons		= &ulite_console,
+#endif
+};
+
+static int __devinit ulite_probe(struct platform_device *pdev)
+{
+	struct resource *res, *res2;
+	struct uart_port *port;
+
+	if (pdev->id < 0 || pdev->id >= ULITE_NR_UARTS)
+		return -EINVAL;
+
+	if (ports[pdev->id].membase)
+		return -EBUSY;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	res2 = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!res2)
+		return -ENODEV;
+
+	port = &ports[pdev->id];
+
+	port->fifosize	= 16;
+	port->regshift	= 2;
+	port->iotype	= UPIO_MEM;
+	port->iobase	= 1; /* mark port in use */
+	port->mapbase	= res->start;
+	port->membase	= 0;
+	port->ops	= &ulite_ops;
+	port->irq	= res2->start;
+	port->flags	= UPF_BOOT_AUTOCONF;
+	port->dev	= &pdev->dev;
+	port->type	= PORT_UNKNOWN;
+	port->line	= pdev->id;
+
+	uart_add_one_port(&ulite_uart_driver, port);
+	platform_set_drvdata(pdev, port);
+
+	return 0;
+}
+
+static int ulite_remove(struct platform_device *pdev)
+{
+	struct uart_port *port = platform_get_drvdata(pdev);
+
+	platform_set_drvdata(pdev, NULL);
+
+	if (port)
+		uart_remove_one_port(&ulite_uart_driver, port);
+
+	/* mark port as free */
+	port->membase = 0;
+
+	return 0;
+}
+
+static struct platform_driver ulite_platform_driver = {
+	.probe	= ulite_probe,
+	.remove	= ulite_remove,
+	.driver	= {
+		   .owner = THIS_MODULE,
+		   .name  = "uartlite",
+		   },
+};
+
+int __init ulite_init(void)
+{
+	int ret;
+
+	ret = uart_register_driver(&ulite_uart_driver);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&ulite_platform_driver);
+	if (ret)
+		uart_unregister_driver(&ulite_uart_driver);
+
+	return ret;
+}
+
+void __exit ulite_exit(void)
+{
+	platform_driver_unregister(&ulite_platform_driver);
+	uart_unregister_driver(&ulite_uart_driver);
+}
+
+module_init(ulite_init);
+module_exit(ulite_exit);
+
+MODULE_AUTHOR("Peter Korsgaard <jacmet@sunsite.dk>");
+MODULE_DESCRIPTION("Xilinx uartlite serial driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 463ab95..8276721 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -132,6 +132,8 @@
 
 #define PORT_S3C2412	73
 
+/* Xilinx uartlite */
+#define PORT_UARTLITE	74
 
 #ifdef __KERNEL__
 
-- 
cgit v0.10.2


From 8684265412518858c48a56c2f0aa86f280978b74 Mon Sep 17 00:00:00 2001
From: Amol Lad <amol@verismonetworks.com>
Date: Wed, 6 Dec 2006 20:35:19 -0800
Subject: [PATCH] ioremap balanced with iounmap for
 drivers/char/rio/rio_linux.c

Signed-off-by: Amol Lad <amol@verismonetworks.com>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/rio/rio_linux.c b/drivers/char/rio/rio_linux.c
index 7ac68cb..e79b2ed 100644
--- a/drivers/char/rio/rio_linux.c
+++ b/drivers/char/rio/rio_linux.c
@@ -1026,6 +1026,7 @@ static int __init rio_init(void)
 			found++;
 		} else {
 			iounmap(p->RIOHosts[p->RIONumHosts].Caddr);
+			p->RIOHosts[p->RIONumHosts].Caddr = NULL;
 		}
 	}
 
@@ -1078,6 +1079,7 @@ static int __init rio_init(void)
 			found++;
 		} else {
 			iounmap(p->RIOHosts[p->RIONumHosts].Caddr);
+			p->RIOHosts[p->RIONumHosts].Caddr = NULL;
 		}
 #else
 		printk(KERN_ERR "Found an older RIO PCI card, but the driver is not " "compiled to support it.\n");
@@ -1117,8 +1119,10 @@ static int __init rio_init(void)
 				}
 			}
 
-			if (!okboard)
+			if (!okboard) {
 				iounmap(hp->Caddr);
+				hp->Caddr = NULL;
+			}
 		}
 	}
 
@@ -1188,6 +1192,8 @@ static void __exit rio_exit(void)
 		}
 		/* It is safe/allowed to del_timer a non-active timer */
 		del_timer(&hp->timer);
+		if (hp->Caddr)
+			iounmap(hp->Caddr);
 		if (hp->Type == RIO_PCI)
 			pci_dev_put(hp->pdev);
 	}
-- 
cgit v0.10.2


From 41bdabbb6d951cf4a9fbfe33783749b87662b02d Mon Sep 17 00:00:00 2001
From: Amol Lad <amol@verismonetworks.com>
Date: Wed, 6 Dec 2006 20:35:21 -0800
Subject: [PATCH] ioremap balanced with iounmap for drivers/char/moxa.c

Signed-off-by: Amol Lad <amol@verismonetworks.com>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index 2d025a9..8b31695 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -498,9 +498,12 @@ static void __exit moxa_exit(void)
 		printk("Couldn't unregister MOXA Intellio family serial driver\n");
 	put_tty_driver(moxaDriver);
 
-	for (i = 0; i < MAX_BOARDS; i++)
+	for (i = 0; i < MAX_BOARDS; i++) {
+		if (moxaBaseAddr[i])
+			iounmap(moxaBaseAddr[i]);
 		if (moxa_boards[i].busType == MOXA_BUS_TYPE_PCI)
 			pci_dev_put(moxa_boards[i].pciInfo.pdev);
+	}
 
 	if (verbose)
 		printk("Done\n");
-- 
cgit v0.10.2


From aa8a8d664828c7184a2e775fb50611324ef21b5c Mon Sep 17 00:00:00 2001
From: Amol Lad <amol@verismonetworks.com>
Date: Wed, 6 Dec 2006 20:35:22 -0800
Subject: [PATCH] ioremap balanced with iounmap for drivers/char/istallion.c

Signed-off-by: Amol Lad <amol@verismonetworks.com>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index bd9195e..8f59194 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -3476,6 +3476,8 @@ static int stli_initecp(stlibrd_t *brdp)
 	if (sig.magic != cpu_to_le32(ECP_MAGIC))
 	{
 		release_region(brdp->iobase, brdp->iosize);
+		iounmap(brdp->membase);
+		brdp->membase = NULL;
 		return -ENODEV;
 	}
 
@@ -3632,6 +3634,8 @@ static int stli_initonb(stlibrd_t *brdp)
 	    sig.magic3 != cpu_to_le16(ONB_MAGIC3))
 	{
 		release_region(brdp->iobase, brdp->iosize);
+		iounmap(brdp->membase);
+		brdp->membase = NULL;
 		return -ENODEV;
 	}
 
-- 
cgit v0.10.2


From b9d85b08c689dbf54b9943a02f73cb54c2b0fccf Mon Sep 17 00:00:00 2001
From: Amol Lad <amol@verismonetworks.com>
Date: Wed, 6 Dec 2006 20:35:23 -0800
Subject: [PATCH] sound/oss/btaudio.c: ioremap balanced with iounmap

ioremap must be balanced by an iounmap and failing to do so can result
in a memory leak.

Signed-off-by: Amol Lad <amol@verismonetworks.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/sound/oss/btaudio.c b/sound/oss/btaudio.c
index 6ad3841..ad7210a 100644
--- a/sound/oss/btaudio.c
+++ b/sound/oss/btaudio.c
@@ -1020,6 +1020,7 @@ static int __devinit btaudio_probe(struct pci_dev *pci_dev,
  fail2:
         free_irq(bta->irq,bta);	
  fail1:
+	iounmap(bta->mmio);
 	kfree(bta);
  fail0:
 	release_mem_region(pci_resource_start(pci_dev,0),
@@ -1051,6 +1052,7 @@ static void __devexit btaudio_remove(struct pci_dev *pci_dev)
         free_irq(bta->irq,bta);
 	release_mem_region(pci_resource_start(pci_dev,0),
 			   pci_resource_len(pci_dev,0));
+	iounmap(bta->mmio);
 
 	/* remove from linked list */
 	if (bta == btaudios) {
-- 
cgit v0.10.2


From ed07536ed6731775219c1df7fa26a7588753e693 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:35:24 -0800
Subject: [PATCH] lockdep: annotate nfs/nfsd in-kernel sockets

Stick NFS sockets in their own class to avoid some lockdep warnings.  NFS
sockets are never exposed to user-space, and will hence not trigger certain
code paths that would otherwise pose deadlock scenarios.

[akpm@osdl.org: cleanups]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Dickson <SteveD@redhat.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Acked-by: Neil Brown <neilb@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
[ Fixed patch corruption by quilt, pointed out by Peter Zijlstra ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/net/sock.h b/include/net/sock.h
index 730899c..03684e7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -746,6 +746,25 @@ static inline int sk_stream_wmem_schedule(struct sock *sk, int size)
  */
 #define sock_owned_by_user(sk)	((sk)->sk_lock.owner)
 
+/*
+ * Macro so as to not evaluate some arguments when
+ * lockdep is not enabled.
+ *
+ * Mark both the sk_lock and the sk_lock.slock as a
+ * per-address-family lock class.
+ */
+#define sock_lock_init_class_and_name(sk, sname, skey, name, key) 	\
+do {									\
+	sk->sk_lock.owner = NULL;					\
+	init_waitqueue_head(&sk->sk_lock.wq);				\
+	spin_lock_init(&(sk)->sk_lock.slock);				\
+	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
+			sizeof((sk)->sk_lock));				\
+	lockdep_set_class_and_name(&(sk)->sk_lock.slock,		\
+		       	(skey), (sname));				\
+	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
+} while (0)
+
 extern void FASTCALL(lock_sock_nested(struct sock *sk, int subclass));
 
 static inline void lock_sock(struct sock *sk)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c9fefdb..e33f620 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2645,6 +2645,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 	}
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
 
 static void print_held_locks_bug(struct task_struct *curr)
 {
diff --git a/net/core/sock.c b/net/core/sock.c
index 4a432da..0ed5b4f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -810,24 +810,11 @@ lenout:
  */
 static void inline sock_lock_init(struct sock *sk)
 {
-	spin_lock_init(&sk->sk_lock.slock);
-	sk->sk_lock.owner = NULL;
-	init_waitqueue_head(&sk->sk_lock.wq);
-	/*
-	 * Make sure we are not reinitializing a held lock:
-	 */
-	debug_check_no_locks_freed((void *)&sk->sk_lock, sizeof(sk->sk_lock));
-
-	/*
-	 * Mark both the sk_lock and the sk_lock.slock as a
-	 * per-address-family lock class:
-	 */
-	lockdep_set_class_and_name(&sk->sk_lock.slock,
-				   af_family_slock_keys + sk->sk_family,
-				   af_family_slock_key_strings[sk->sk_family]);
-	lockdep_init_map(&sk->sk_lock.dep_map,
-			 af_family_key_strings[sk->sk_family],
-			 af_family_keys + sk->sk_family, 0);
+	sock_lock_init_class_and_name(sk,
+			af_family_slock_key_strings[sk->sk_family],
+			af_family_slock_keys + sk->sk_family,
+			af_family_key_strings[sk->sk_family],
+			af_family_keys + sk->sk_family);
 }
 
 /**
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1c68956..99f54fb 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -85,6 +85,35 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req);
  */
 static int svc_conn_age_period = 6*60;
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key svc_key[2];
+static struct lock_class_key svc_slock_key[2];
+
+static inline void svc_reclassify_socket(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	BUG_ON(sk->sk_lock.owner != NULL);
+	switch (sk->sk_family) {
+	case AF_INET:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
+		    &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]);
+		break;
+
+	case AF_INET6:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
+		    &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]);
+		break;
+
+	default:
+		BUG();
+	}
+}
+#else
+static inline void svc_reclassify_socket(struct socket *sock)
+{
+}
+#endif
+
 /*
  * Queue up an idle server thread.  Must have pool->sp_lock held.
  * Note: this is really a stack rather than a queue, so that we only
@@ -1557,6 +1586,8 @@ svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
 	if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0)
 		return error;
 
+	svc_reclassify_socket(sock);
+
 	if (type == SOCK_STREAM)
 		sock->sk->sk_reuse = 1; /* allow address reuse */
 	error = kernel_bind(sock, (struct sockaddr *) sin,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index cfe3c15..2fc4a31 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1058,6 +1058,35 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
 	return err;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key xs_key[2];
+static struct lock_class_key xs_slock_key[2];
+
+static inline void xs_reclassify_socket(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	BUG_ON(sk->sk_lock.owner != NULL);
+	switch (sk->sk_family) {
+	case AF_INET:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET-NFS",
+			&xs_slock_key[0], "sk_lock-AF_INET-NFS", &xs_key[0]);
+		break;
+
+	case AF_INET6:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFS",
+			&xs_slock_key[1], "sk_lock-AF_INET6-NFS", &xs_key[1]);
+		break;
+
+	default:
+		BUG();
+	}
+}
+#else
+static inline void xs_reclassify_socket(struct socket *sock)
+{
+}
+#endif
+
 /**
  * xs_udp_connect_worker - set up a UDP socket
  * @work: RPC transport to connect
@@ -1081,6 +1110,7 @@ static void xs_udp_connect_worker(struct work_struct *work)
 		dprintk("RPC:      can't create UDP transport socket (%d).\n", -err);
 		goto out;
 	}
+	xs_reclassify_socket(sock);
 
 	if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
 		sock_release(sock);
@@ -1165,6 +1195,7 @@ static void xs_tcp_connect_worker(struct work_struct *work)
 			dprintk("RPC:      can't create TCP transport socket (%d).\n", -err);
 			goto out;
 		}
+		xs_reclassify_socket(sock);
 
 		if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
 			sock_release(sock);
-- 
cgit v0.10.2


From 317a40ac2237732aba531eee2c7b5e39dd40e959 Mon Sep 17 00:00:00 2001
From: Stas Sergeev <stsp@aknet.ru>
Date: Wed, 6 Dec 2006 20:35:25 -0800
Subject: [PATCH] honour MNT_NOEXEC for access()

Make access(X_OK) take the "noexec" mount option into account.

Signed-off-by: Stas Sergeev <stsp@aknet.ru>
Cc: Jakub Jelinek <jakub@redhat.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/namei.c b/fs/namei.c
index 28d49b3..61f99c1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -249,9 +249,11 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
 
 	/*
 	 * MAY_EXEC on regular files requires special handling: We override
-	 * filesystem execute permissions if the mode bits aren't set.
+	 * filesystem execute permissions if the mode bits aren't set or
+	 * the fs is mounted with the "noexec" flag.
 	 */
-	if ((mask & MAY_EXEC) && S_ISREG(mode) && !(mode & S_IXUGO))
+	if ((mask & MAY_EXEC) && S_ISREG(mode) && (!(mode & S_IXUGO) ||
+			(nd && nd->mnt && (nd->mnt->mnt_flags & MNT_NOEXEC))))
 		return -EACCES;
 
 	/* Ordinary permission routines do not understand MAY_APPEND. */
-- 
cgit v0.10.2


From e4fca01ea2b41c41a82f4ca3537f6ebc237adde5 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 6 Dec 2006 20:35:27 -0800
Subject: [PATCH] ext2: fsid for statvfs

Update ext2_statfs to return an FSID that is a 64 bit XOR of the 128 bit
filesystem UUID as suggested by Andreas Dilger.  See the following Bugzilla
entry for details:

  http://bugzilla.kernel.org/show_bug.cgi?id=136

Cc: Andreas Dilger <adilger@clusterfs.com>
Cc: Stephen Tweedie <sct@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 3aafb1d..255cef5 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1090,8 +1090,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_super_block *es = sbi->s_es;
 	unsigned long overhead;
 	int i;
+	u64 fsid;
 
 	if (test_opt (sb, MINIX_DF))
 		overhead = 0;
@@ -1104,7 +1106,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 		 * All of the blocks before first_data_block are
 		 * overhead
 		 */
-		overhead = le32_to_cpu(sbi->s_es->s_first_data_block);
+		overhead = le32_to_cpu(es->s_first_data_block);
 
 		/*
 		 * Add the overhead attributed to the superblock and
@@ -1125,14 +1127,18 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 
 	buf->f_type = EXT2_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead;
+	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
 	buf->f_bfree = ext2_count_free_blocks(sb);
-	buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count);
-	if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count))
+	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
+	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
-	buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count);
-	buf->f_ffree = ext2_count_free_inodes (sb);
+	buf->f_files = le32_to_cpu(es->s_inodes_count);
+	buf->f_ffree = ext2_count_free_inodes(sb);
 	buf->f_namelen = EXT2_NAME_LEN;
+	fsid = le64_to_cpup((void *)es->s_uuid) ^
+	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 50ee0a32b192902e32a2b596df7ec3496c4bf485 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 6 Dec 2006 20:35:28 -0800
Subject: [PATCH] ext3: fsid for statvfs

Update ext3_statfs to return an FSID that is a 64 bit XOR of the 128 bit
filesystem UUID as suggested by Andreas Dilger.  See the following Bugzilla
entry for details:

  http://bugzilla.kernel.org/show_bug.cgi?id=136

Cc: Andreas Dilger <adilger@clusterfs.com>
Cc: Stephen Tweedie <sct@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9856565..8ab1981 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2387,6 +2387,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct ext3_super_block *es = sbi->s_es;
 	ext3_fsblk_t overhead;
 	int i;
+	u64 fsid;
 
 	if (test_opt (sb, MINIX_DF))
 		overhead = 0;
@@ -2433,6 +2434,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT3_NAME_LEN;
+	fsid = le64_to_cpup((void *)es->s_uuid) ^
+	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 960cc398a7a2acfe455b2ec33c64dc6018c83aab Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 6 Dec 2006 20:35:29 -0800
Subject: [PATCH] ext4: fsid for statvfs

Update ext4_statfs to return an FSID that is a 64 bit XOR of the 128 bit
filesystem UUID as suggested by Andreas Dilger.  See the following Bugzilla
entry for details:

  http://bugzilla.kernel.org/show_bug.cgi?id=136

Cc: Andreas Dilger <adilger@clusterfs.com>
Cc: Stephen Tweedie <sct@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f2e8c4a..2ede7e2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2460,6 +2460,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct ext4_super_block *es = sbi->s_es;
 	ext4_fsblk_t overhead;
 	int i;
+	u64 fsid;
 
 	if (test_opt (sb, MINIX_DF))
 		overhead = 0;
@@ -2506,6 +2507,10 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT4_NAME_LEN;
+	fsid = le64_to_cpup((void *)es->s_uuid) ^
+	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 07354a00901d103085e4376b7df0aad264c1836a Mon Sep 17 00:00:00 2001
From: "Adam B. Jerome" <abj@novell.com>
Date: Wed, 6 Dec 2006 20:35:30 -0800
Subject: [PATCH] /proc/kallsyms reports lower-case types for some non-exported
 symbols

This patch addresses incorrect symbol type information reported through
/proc/kallsyms.  A lowercase character should designate the symbol as local
(or non-exported).  An uppercase character should designate the symbol as
global (or external).

Without this patch, some non-exported symbols are incorrectly assigned an
upper-case designation in /proc/kallsyms.  This patch corrects this
condition by converting non-exported symbols types to lower case when
appropriate and eliminates the superfluous upcase_if_global function

Signed-off-by: Adam B. Jerome <abj@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e3..54befe3 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/sched.h>	/* for cond_resched */
 #include <linux/mm.h>
+#include <linux/ctype.h>
 
 #include <asm/sections.h>
 
@@ -301,13 +302,6 @@ struct kallsym_iter
 	char name[KSYM_NAME_LEN+1];
 };
 
-/* Only label it "global" if it is exported. */
-static void upcase_if_global(struct kallsym_iter *iter)
-{
-	if (is_exported(iter->name, iter->owner))
-		iter->type += 'A' - 'a';
-}
-
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
 	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	if (iter->owner == NULL)
 		return 0;
 
-	upcase_if_global(iter);
+	/* Label it "global" if it is exported, "local" if not exported. */
+	iter->type = is_exported(iter->name, iter->owner)
+		? toupper(iter->type) : tolower(iter->type);
+
 	return 1;
 }
 
-- 
cgit v0.10.2


From 3889b26bebd3e3cf5a3b95da683bab2f6462133d Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 6 Dec 2006 20:35:31 -0800
Subject: [PATCH] I2O: more error checking

i2o_scsi: handle sysfs failure

i2o_device:
 * convert i2o_device_add() to return integer error code
   rather than pointer.  Fortunately -nobody- checks the return code of
   this function, so changing has nil impact.
 * handle errors thrown by device_register()

More work in i2o_device remains.

Signed-off-by: Jeff Garzik <jeff@garzik.org>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index ee18305..55757af 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -214,18 +214,17 @@ static struct i2o_device *i2o_device_alloc(void)
  *	Allocate a new I2O device and initialize it with the LCT entry. The
  *	device is appended to the device list of the controller.
  *
- *	Returns a pointer to the I2O device on success or negative error code
- *	on failure.
+ *	Returns zero on success, or a -ve errno.
  */
-static struct i2o_device *i2o_device_add(struct i2o_controller *c,
-					 i2o_lct_entry * entry)
+static int i2o_device_add(struct i2o_controller *c, i2o_lct_entry *entry)
 {
 	struct i2o_device *i2o_dev, *tmp;
+	int rc;
 
 	i2o_dev = i2o_device_alloc();
 	if (IS_ERR(i2o_dev)) {
 		printk(KERN_ERR "i2o: unable to allocate i2o device\n");
-		return i2o_dev;
+		return PTR_ERR(i2o_dev);
 	}
 
 	i2o_dev->lct_data = *entry;
@@ -236,7 +235,9 @@ static struct i2o_device *i2o_device_add(struct i2o_controller *c,
 	i2o_dev->iop = c;
 	i2o_dev->device.parent = &c->device;
 
-	device_register(&i2o_dev->device);
+	rc = device_register(&i2o_dev->device);
+	if (rc)
+		goto err;
 
 	list_add_tail(&i2o_dev->list, &c->devices);
 
@@ -270,7 +271,11 @@ static struct i2o_device *i2o_device_add(struct i2o_controller *c,
 
 	pr_debug("i2o: device %s added\n", i2o_dev->device.bus_id);
 
-	return i2o_dev;
+	return 0;
+
+err:
+	kfree(i2o_dev);
+	return rc;
 }
 
 /**
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 6ebf382..d5f93b1 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -220,7 +220,7 @@ static int i2o_scsi_probe(struct device *dev)
 	u32 id = -1;
 	u64 lun = -1;
 	int channel = -1;
-	int i;
+	int i, rc;
 
 	i2o_shost = i2o_scsi_get_host(c);
 	if (!i2o_shost)
@@ -304,14 +304,20 @@ static int i2o_scsi_probe(struct device *dev)
 		return PTR_ERR(scsi_dev);
 	}
 
-	sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj,
-			  "scsi");
+	rc = sysfs_create_link(&i2o_dev->device.kobj,
+			       &scsi_dev->sdev_gendev.kobj, "scsi");
+	if (rc)
+		goto err;
 
 	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %ld\n",
 		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
 		 (long unsigned int)le64_to_cpu(lun));
 
 	return 0;
+
+err:
+	scsi_remove_device(scsi_dev);
+	return rc;
 };
 
 static const char *i2o_scsi_info(struct Scsi_Host *SChost)
-- 
cgit v0.10.2


From bfc7ee207078e8ca51264355805e6f56b485be4b Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 6 Dec 2006 20:35:33 -0800
Subject: [PATCH] PNP: handle sysfs errors

Signed-off-by: Jeff Garzik <jeff@garzik.org>
Cc: Adam Belay <ambx1@neo.rr.com>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/pnp/card.c b/drivers/pnp/card.c
index 227600c..91c047a 100644
--- a/drivers/pnp/card.c
+++ b/drivers/pnp/card.c
@@ -164,9 +164,17 @@ static DEVICE_ATTR(card_id,S_IRUGO,pnp_show_card_ids,NULL);
 
 static int pnp_interface_attach_card(struct pnp_card *card)
 {
-	device_create_file(&card->dev,&dev_attr_name);
-	device_create_file(&card->dev,&dev_attr_card_id);
+	int rc = device_create_file(&card->dev,&dev_attr_name);
+	if (rc) return rc;
+
+	rc = device_create_file(&card->dev,&dev_attr_card_id);
+	if (rc) goto err_name;
+
 	return 0;
+
+err_name:
+	device_remove_file(&card->dev,&dev_attr_name);
+	return rc;
 }
 
 /**
@@ -306,16 +314,20 @@ found:
 	down_write(&dev->dev.bus->subsys.rwsem);
 	dev->card_link = clink;
 	dev->dev.driver = &drv->link.driver;
-	if (pnp_bus_type.probe(&dev->dev)) {
-		dev->dev.driver = NULL;
-		dev->card_link = NULL;
-		up_write(&dev->dev.bus->subsys.rwsem);
-		return NULL;
-	}
-	device_bind_driver(&dev->dev);
+	if (pnp_bus_type.probe(&dev->dev))
+		goto err_out;
+	if (device_bind_driver(&dev->dev))
+		goto err_out;
+
 	up_write(&dev->dev.bus->subsys.rwsem);
 
 	return dev;
+
+err_out:
+	dev->dev.driver = NULL;
+	dev->card_link = NULL;
+	up_write(&dev->dev.bus->subsys.rwsem);
+	return NULL;
 }
 
 /**
diff --git a/drivers/pnp/interface.c b/drivers/pnp/interface.c
index 9d8b415..ac9fcd4 100644
--- a/drivers/pnp/interface.c
+++ b/drivers/pnp/interface.c
@@ -461,8 +461,19 @@ static DEVICE_ATTR(id,S_IRUGO,pnp_show_current_ids,NULL);
 
 int pnp_interface_attach_device(struct pnp_dev *dev)
 {
-	device_create_file(&dev->dev,&dev_attr_options);
-	device_create_file(&dev->dev,&dev_attr_resources);
-	device_create_file(&dev->dev,&dev_attr_id);
+	int rc = device_create_file(&dev->dev,&dev_attr_options);
+	if (rc) goto err;
+	rc = device_create_file(&dev->dev,&dev_attr_resources);
+	if (rc) goto err_opt;
+	rc = device_create_file(&dev->dev,&dev_attr_id);
+	if (rc) goto err_res;
+
 	return 0;
+
+err_res:
+	device_remove_file(&dev->dev,&dev_attr_resources);
+err_opt:
+	device_remove_file(&dev->dev,&dev_attr_options);
+err:
+	return rc;
 }
-- 
cgit v0.10.2


From 91046a8a693823d434f0aa70419c48ebeb8e1b11 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 6 Dec 2006 20:35:34 -0800
Subject: [PATCH] RTC: handle sysfs errors

Signed-off-by: Jeff Garzik <jeff@garzik.org>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/rtc/rtc-ds1672.c b/drivers/rtc/rtc-ds1672.c
index 67e816a..dfef163 100644
--- a/drivers/rtc/rtc-ds1672.c
+++ b/drivers/rtc/rtc-ds1672.c
@@ -237,17 +237,22 @@ static int ds1672_probe(struct i2c_adapter *adapter, int address, int kind)
 	/* read control register */
 	err = ds1672_get_control(client, &control);
 	if (err)
-		goto exit_detach;
+		goto exit_devreg;
 
 	if (control & DS1672_REG_CONTROL_EOSC)
 		dev_warn(&client->dev, "Oscillator not enabled. "
 					"Set time to enable.\n");
 
 	/* Register sysfs hooks */
-	device_create_file(&client->dev, &dev_attr_control);
+	err = device_create_file(&client->dev, &dev_attr_control);
+	if (err)
+		goto exit_devreg;
 
 	return 0;
 
+exit_devreg:
+	rtc_device_unregister(rtc);
+
 exit_detach:
 	i2c_detach_client(client);
 
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c
index a44fe4e..9e1bb3a 100644
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -238,11 +238,19 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
 
 	i2c_set_clientdata(client, rtc);
 
-	device_create_file(&client->dev, &dev_attr_trim);
-	device_create_file(&client->dev, &dev_attr_osc);
+	err = device_create_file(&client->dev, &dev_attr_trim);
+	if (err) goto exit_devreg;
+	err = device_create_file(&client->dev, &dev_attr_osc);
+	if (err) goto exit_trim;
 
 	return 0;
 
+exit_trim:
+	device_remove_file(&client->dev, &dev_attr_trim);
+
+exit_devreg:
+	rtc_device_unregister(rtc);
+
 exit_detach:
 	i2c_detach_client(client);
 
diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c
index 6ef9c62..f50a1b8 100644
--- a/drivers/rtc/rtc-test.c
+++ b/drivers/rtc/rtc-test.c
@@ -123,11 +123,18 @@ static int test_probe(struct platform_device *plat_dev)
 		err = PTR_ERR(rtc);
 		return err;
 	}
-	device_create_file(&plat_dev->dev, &dev_attr_irq);
+
+	err = device_create_file(&plat_dev->dev, &dev_attr_irq);
+	if (err)
+		goto err;
 
 	platform_set_drvdata(plat_dev, rtc);
 
 	return 0;
+
+err:
+	rtc_device_unregister(rtc);
+	return err;
 }
 
 static int __devexit test_remove(struct platform_device *plat_dev)
diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c
index 522c697..9a67487 100644
--- a/drivers/rtc/rtc-x1205.c
+++ b/drivers/rtc/rtc-x1205.c
@@ -562,11 +562,19 @@ static int x1205_probe(struct i2c_adapter *adapter, int address, int kind)
 	else
 		dev_err(&client->dev, "couldn't read status\n");
 
-	device_create_file(&client->dev, &dev_attr_atrim);
-	device_create_file(&client->dev, &dev_attr_dtrim);
+	err = device_create_file(&client->dev, &dev_attr_atrim);
+	if (err) goto exit_devreg;
+	err = device_create_file(&client->dev, &dev_attr_dtrim);
+	if (err) goto exit_atrim;
 
 	return 0;
 
+exit_atrim:
+	device_remove_file(&client->dev, &dev_attr_atrim);
+
+exit_devreg:
+	rtc_device_unregister(rtc);
+
 exit_detach:
 	i2c_detach_client(client);
 
-- 
cgit v0.10.2


From e45f4676108d19ae93918f06cb6731c86108341a Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 6 Dec 2006 20:35:34 -0800
Subject: [PATCH] sound/oss/emu10k1: handle userspace copy errors

Propagate copy_to/from_user() errors back through callers.

Signed-off-by: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/sound/oss/emu10k1/audio.c b/sound/oss/emu10k1/audio.c
index 86dd239..49f902f 100644
--- a/sound/oss/emu10k1/audio.c
+++ b/sound/oss/emu10k1/audio.c
@@ -111,9 +111,15 @@ static ssize_t emu10k1_audio_read(struct file *file, char __user *buffer, size_t
 
 		if ((bytestocopy >= wiinst->buffer.fragment_size)
 		    || (bytestocopy >= count)) {
+			int rc;
+
 			bytestocopy = min_t(u32, bytestocopy, count);
 
-			emu10k1_wavein_xferdata(wiinst, (u8 __user *)buffer, &bytestocopy);
+			rc = emu10k1_wavein_xferdata(wiinst,
+						     (u8 __user *)buffer,
+						     &bytestocopy);
+			if (rc)
+				return rc;
 
 			count -= bytestocopy;
 			buffer += bytestocopy;
diff --git a/sound/oss/emu10k1/cardwi.c b/sound/oss/emu10k1/cardwi.c
index 8bbf44b..060d1be 100644
--- a/sound/oss/emu10k1/cardwi.c
+++ b/sound/oss/emu10k1/cardwi.c
@@ -304,11 +304,12 @@ void emu10k1_wavein_getxfersize(struct wiinst *wiinst, u32 * size)
 	}
 }
 
-static void copy_block(u8 __user *dst, u8 * src, u32 str, u32 len, u8 cov)
+static int copy_block(u8 __user *dst, u8 * src, u32 str, u32 len, u8 cov)
 {
-	if (cov == 1)
-		__copy_to_user(dst, src + str, len);
-	else {
+	if (cov == 1) {
+		if (__copy_to_user(dst, src + str, len))
+			return -EFAULT;
+	} else {
 		u8 byte;
 		u32 i;
 
@@ -316,22 +317,26 @@ static void copy_block(u8 __user *dst, u8 * src, u32 str, u32 len, u8 cov)
 
 		for (i = 0; i < len; i++) {
 			byte = src[2 * i] ^ 0x80;
-			__copy_to_user(dst + i, &byte, 1);
+			if (__copy_to_user(dst + i, &byte, 1))
+				return -EFAULT;
 		}
 	}
+
+	return 0;
 }
 
-void emu10k1_wavein_xferdata(struct wiinst *wiinst, u8 __user *data, u32 * size)
+int emu10k1_wavein_xferdata(struct wiinst *wiinst, u8 __user *data, u32 * size)
 {
 	struct wavein_buffer *buffer = &wiinst->buffer;
 	u32 sizetocopy, sizetocopy_now, start;
 	unsigned long flags;
+	int ret;
 
 	sizetocopy = min_t(u32, buffer->size, *size);
 	*size = sizetocopy;
 
 	if (!sizetocopy)
-		return;
+		return 0;
 
 	spin_lock_irqsave(&wiinst->lock, flags);
 	start = buffer->pos;
@@ -345,11 +350,17 @@ void emu10k1_wavein_xferdata(struct wiinst *wiinst, u8 __user *data, u32 * size)
 	if (sizetocopy > sizetocopy_now) {
 		sizetocopy -= sizetocopy_now;
 
-		copy_block(data, buffer->addr, start, sizetocopy_now, buffer->cov);
-		copy_block(data + sizetocopy_now, buffer->addr, 0, sizetocopy, buffer->cov);
+		ret = copy_block(data, buffer->addr, start, sizetocopy_now,
+				 buffer->cov);
+		if (ret == 0)
+			ret = copy_block(data + sizetocopy_now, buffer->addr, 0,
+					 sizetocopy, buffer->cov);
 	} else {
-		copy_block(data, buffer->addr, start, sizetocopy, buffer->cov);
+		ret = copy_block(data, buffer->addr, start, sizetocopy,
+				 buffer->cov);
 	}
+
+	return ret;
 }
 
 void emu10k1_wavein_update(struct emu10k1_card *card, struct wiinst *wiinst)
diff --git a/sound/oss/emu10k1/cardwi.h b/sound/oss/emu10k1/cardwi.h
index 15cfb9b..e82029b 100644
--- a/sound/oss/emu10k1/cardwi.h
+++ b/sound/oss/emu10k1/cardwi.h
@@ -83,7 +83,7 @@ void emu10k1_wavein_close(struct emu10k1_wavedevice *);
 void emu10k1_wavein_start(struct emu10k1_wavedevice *);
 void emu10k1_wavein_stop(struct emu10k1_wavedevice *);
 void emu10k1_wavein_getxfersize(struct wiinst *, u32 *);
-void emu10k1_wavein_xferdata(struct wiinst *, u8 __user *, u32 *);
+int emu10k1_wavein_xferdata(struct wiinst *, u8 __user *, u32 *);
 int emu10k1_wavein_setformat(struct emu10k1_wavedevice *, struct wave_format *);
 void emu10k1_wavein_update(struct emu10k1_card *, struct wiinst *);
 
diff --git a/sound/oss/emu10k1/passthrough.c b/sound/oss/emu10k1/passthrough.c
index 4e3baca..6d21d43 100644
--- a/sound/oss/emu10k1/passthrough.c
+++ b/sound/oss/emu10k1/passthrough.c
@@ -162,12 +162,15 @@ ssize_t emu10k1_pt_write(struct file *file, const char __user *buffer, size_t co
 
 		DPD(3, "prepend size %d, prepending %d bytes\n", pt->prepend_size, needed);
 		if (count < needed) {
-			copy_from_user(pt->buf + pt->prepend_size, buffer, count);
+			if (copy_from_user(pt->buf + pt->prepend_size,
+					   buffer, count))
+				return -EFAULT;
 			pt->prepend_size += count;
 			DPD(3, "prepend size now %d\n", pt->prepend_size);
 			return count;
 		}
-		copy_from_user(pt->buf + pt->prepend_size, buffer, needed);
+		if (copy_from_user(pt->buf + pt->prepend_size, buffer, needed))
+			return -EFAULT;
 		r = pt_putblock(wave_dev, (u16 *) pt->buf, nonblock);
 		if (r)
 			return r;
@@ -178,7 +181,8 @@ ssize_t emu10k1_pt_write(struct file *file, const char __user *buffer, size_t co
 	blocks_copied = 0;
 	while (blocks > 0) {
 		u16 __user *bufptr = (u16 __user *) buffer + (bytes_copied/2);
-		copy_from_user(pt->buf, bufptr, PT_BLOCKSIZE);
+		if (copy_from_user(pt->buf, bufptr, PT_BLOCKSIZE))
+			return -EFAULT;
 		r = pt_putblock(wave_dev, (u16 *)pt->buf, nonblock);
 		if (r) {
 			if (bytes_copied)
@@ -193,7 +197,8 @@ ssize_t emu10k1_pt_write(struct file *file, const char __user *buffer, size_t co
 	i = count - bytes_copied;
 	if (i) {
 		pt->prepend_size = i;
-		copy_from_user(pt->buf, buffer + bytes_copied, i);
+		if (copy_from_user(pt->buf, buffer + bytes_copied, i))
+			return -EFAULT;
 		bytes_copied += i;
 		DPD(3, "filling prepend buffer with %d bytes", i);
 	}
-- 
cgit v0.10.2


From 89fc9a1a79725c3e5c3b66cb6bd2c7d9eeab29fa Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 6 Dec 2006 20:35:35 -0800
Subject: [PATCH] SPI: improve sysfs compiler complaint handling

Signed-off-by: Jeff Garzik <jeff@garzik.org>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 09f2c74..1a3c963 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -447,7 +447,9 @@ static int __unregister(struct device *dev, void *unused)
  */
 void spi_unregister_master(struct spi_master *master)
 {
-	(void) device_for_each_child(master->cdev.dev, NULL, __unregister);
+	int dummy;
+
+	dummy = device_for_each_child(master->cdev.dev, NULL, __unregister);
 	class_device_unregister(&master->cdev);
 }
 EXPORT_SYMBOL_GPL(spi_unregister_master);
-- 
cgit v0.10.2


From 48ed214d10ae3c3999af938970f7b5b58df77be3 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@linux01.gwdg.de>
Date: Wed, 6 Dec 2006 20:35:37 -0800
Subject: [PATCH] constify inode accessors

Change the signature of i_size_read(), IMINOR() and IMAJOR() because they,
or the functions they call, will never modify the argument.

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 94b831b..d791bae 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -636,7 +636,7 @@ extern void inode_double_unlock(struct inode *inode1, struct inode *inode2);
  * cmpxchg8b without the need of the lock prefix). For SMP compiles
  * and 64bit archs it makes no difference if preempt is enabled or not.
  */
-static inline loff_t i_size_read(struct inode *inode)
+static inline loff_t i_size_read(const struct inode *inode)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	loff_t i_size;
@@ -679,12 +679,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size)
 #endif
 }
 
-static inline unsigned iminor(struct inode *inode)
+static inline unsigned iminor(const struct inode *inode)
 {
 	return MINOR(inode->i_rdev);
 }
 
-static inline unsigned imajor(struct inode *inode)
+static inline unsigned imajor(const struct inode *inode)
 {
 	return MAJOR(inode->i_rdev);
 }
-- 
cgit v0.10.2


From e9168c189fd54171124b5d25644024d99869e6a8 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 6 Dec 2006 20:35:38 -0800
Subject: [PATCH] fuse: update userspace interface to version 7.8

Add a flag to the RELEASE message which specifies that a FLUSH operation
should be performed as well.  This interface update is needed for the FreeBSD
port, and doesn't actually touch the Linux implementation at all.

Also rename the unused 'flush_flags' in the FLUSH message to 'unused'.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 9fc48a6..7633632 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -15,7 +15,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 7
+#define FUSE_KERNEL_MINOR_VERSION 8
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -92,6 +92,11 @@ struct fuse_file_lock {
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
 
+/**
+ * Release flags
+ */
+#define FUSE_RELEASE_FLUSH	(1 << 0)
+
 enum fuse_opcode {
 	FUSE_LOOKUP	   = 1,
 	FUSE_FORGET	   = 2,  /* no reply */
@@ -205,12 +210,13 @@ struct fuse_open_out {
 struct fuse_release_in {
 	__u64	fh;
 	__u32	flags;
-	__u32	padding;
+	__u32	release_flags;
+	__u64	lock_owner;
 };
 
 struct fuse_flush_in {
 	__u64	fh;
-	__u32	flush_flags;
+	__u32	unused;
 	__u32	padding;
 	__u64	lock_owner;
 };
-- 
cgit v0.10.2


From bdcf25080438ba71bb24b885e7c102de72c25c9d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 6 Dec 2006 20:35:41 -0800
Subject: [PATCH] fuse: minor cleanup in fuse_dentry_revalidate

Remove unneeded code from fuse_dentry_revalidate().  This made some sense
while the validity time could wrap around, but now it's a very obvious no-op.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c71a6c0..677f3ed 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -141,9 +141,6 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		struct fuse_req *forget_req;
 		struct dentry *parent;
 
-		/* Doesn't hurt to "reset" the validity timeout */
-		fuse_invalidate_entry_cache(entry);
-
 		/* For negative dentries, always do a fresh lookup */
 		if (!inode)
 			return 0;
-- 
cgit v0.10.2


From d6392f873f1d09974d5c92c52715fa422ad7c625 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 6 Dec 2006 20:35:44 -0800
Subject: [PATCH] fuse: add support for block device based filesystems

I never intended this, but people started using fuse to implement block device
based "real" filesystems (ntfs-3g, zfs).

The following four patches add better support for these kinds of filesystems.
Unlike "normal" fuse filesystems, using this feature should require superuser
privileges (enforced by the fusermount utility).

Thanks to Szabolcs Szakacsits for the input and testing.

This patch adds a 'fuseblk' filesystem type, which is only different from the
'fuse' filesystem type in how the 'dev_name' mount argument is interpreted.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
index 3d74477..fd17dce 100644
--- a/Documentation/filesystems/fuse.txt
+++ b/Documentation/filesystems/fuse.txt
@@ -51,6 +51,22 @@ homepage:
 
   http://fuse.sourceforge.net/
 
+Filesystem type
+~~~~~~~~~~~~~~~
+
+The filesystem type given to mount(2) can be one of the following:
+
+'fuse'
+
+  This is the usual way to mount a FUSE filesystem.  The first
+  argument of the mount system call may contain an arbitrary string,
+  which is not interpreted by the kernel.
+
+'fuseblk'
+
+  The filesystem is block device based.  The first argument of the
+  mount system call is interpreted as the name of the device.
+
 Mount options
 ~~~~~~~~~~~~~
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2bdc652..38cf97d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -591,6 +591,14 @@ static int fuse_get_sb(struct file_system_type *fs_type,
 	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 
+static int fuse_get_sb_blk(struct file_system_type *fs_type,
+			   int flags, const char *dev_name,
+			   void *raw_data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super,
+			   mnt);
+}
+
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
@@ -598,6 +606,14 @@ static struct file_system_type fuse_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
+static struct file_system_type fuseblk_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fuseblk",
+	.get_sb		= fuse_get_sb_blk,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
 static decl_subsys(fuse, NULL, NULL);
 static decl_subsys(connections, NULL, NULL);
 
@@ -617,24 +633,34 @@ static int __init fuse_fs_init(void)
 
 	err = register_filesystem(&fuse_fs_type);
 	if (err)
-		printk("fuse: failed to register filesystem\n");
-	else {
-		fuse_inode_cachep = kmem_cache_create("fuse_inode",
-						      sizeof(struct fuse_inode),
-						      0, SLAB_HWCACHE_ALIGN,
-						      fuse_inode_init_once, NULL);
-		if (!fuse_inode_cachep) {
-			unregister_filesystem(&fuse_fs_type);
-			err = -ENOMEM;
-		}
-	}
+		goto out;
+
+	err = register_filesystem(&fuseblk_fs_type);
+	if (err)
+		goto out_unreg;
+
+	fuse_inode_cachep = kmem_cache_create("fuse_inode",
+					      sizeof(struct fuse_inode),
+					      0, SLAB_HWCACHE_ALIGN,
+					      fuse_inode_init_once, NULL);
+	err = -ENOMEM;
+	if (!fuse_inode_cachep)
+		goto out_unreg2;
+
+	return 0;
 
+ out_unreg2:
+	unregister_filesystem(&fuseblk_fs_type);
+ out_unreg:
+	unregister_filesystem(&fuse_fs_type);
+ out:
 	return err;
 }
 
 static void fuse_fs_cleanup(void)
 {
 	unregister_filesystem(&fuse_fs_type);
+	unregister_filesystem(&fuseblk_fs_type);
 	kmem_cache_destroy(fuse_inode_cachep);
 }
 
-- 
cgit v0.10.2


From d809161402e9f99aefe8848c4e701597ac367269 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 6 Dec 2006 20:35:48 -0800
Subject: [PATCH] fuse: add blksize option

Add 'blksize' option for block device based filesystems.  During
initialization this is used to set the block size on the device and the super
block.  The default block size is 512bytes.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
index fd17dce..345392c 100644
--- a/Documentation/filesystems/fuse.txt
+++ b/Documentation/filesystems/fuse.txt
@@ -110,6 +110,11 @@ Mount options
   The default is infinite.  Note that the size of read requests is
   limited anyway to 32 pages (which is 128kbyte on i386).
 
+'blksize=N'
+
+  Set the block size for the filesystem.  The default is 512.  This
+  option is only valid for 'fuseblk' type mounts.
+
 Control filesystem
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 38cf97d..1baaaeb2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -39,6 +39,7 @@ struct fuse_mount_data {
 	unsigned group_id_present : 1;
 	unsigned flags;
 	unsigned max_read;
+	unsigned blksize;
 };
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
@@ -274,6 +275,7 @@ enum {
 	OPT_DEFAULT_PERMISSIONS,
 	OPT_ALLOW_OTHER,
 	OPT_MAX_READ,
+	OPT_BLKSIZE,
 	OPT_ERR
 };
 
@@ -285,14 +287,16 @@ static match_table_t tokens = {
 	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_MAX_READ,			"max_read=%u"},
+	{OPT_BLKSIZE,			"blksize=%u"},
 	{OPT_ERR,			NULL}
 };
 
-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 {
 	char *p;
 	memset(d, 0, sizeof(struct fuse_mount_data));
 	d->max_read = ~0;
+	d->blksize = 512;
 
 	while ((p = strsep(&opt, ",")) != NULL) {
 		int token;
@@ -345,6 +349,12 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			d->max_read = value;
 			break;
 
+		case OPT_BLKSIZE:
+			if (!is_bdev || match_int(&args[0], &value))
+				return 0;
+			d->blksize = value;
+			break;
+
 		default:
 			return 0;
 		}
@@ -500,15 +510,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	struct dentry *root_dentry;
 	struct fuse_req *init_req;
 	int err;
+	int is_bdev = sb->s_bdev != NULL;
 
 	if (sb->s_flags & MS_MANDLOCK)
 		return -EINVAL;
 
-	if (!parse_fuse_opt((char *) data, &d))
+	if (!parse_fuse_opt((char *) data, &d, is_bdev))
 		return -EINVAL;
 
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	if (is_bdev) {
+		if (!sb_set_blocksize(sb, d.blksize))
+			return -EINVAL;
+	} else {
+		sb->s_blocksize = PAGE_CACHE_SIZE;
+		sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	}
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-- 
cgit v0.10.2


From b2d2272fae1e1df26ec8f93a6d5baea891dcce37 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 6 Dec 2006 20:35:51 -0800
Subject: [PATCH] fuse: add bmap support

Add support for the BMAP operation for block device based filesystems.  This
is needed to support swap-files and lilo.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 677f3ed..1cabdb2 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1024,6 +1024,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_SIZE) {
 		unsigned long limit;
 		is_truncate = 1;
+		if (IS_SWAPFILE(inode))
+			return -ETXTBSY;
 		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 		if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
 			send_sig(SIGXFSZ, current, 0);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 763a50d..128f79c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -754,6 +754,42 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 	return err;
 }
 
+static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_bmap_in inarg;
+	struct fuse_bmap_out outarg;
+	int err;
+
+	if (!inode->i_sb->s_bdev || fc->no_bmap)
+		return 0;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return 0;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.block = block;
+	inarg.blocksize = inode->i_sb->s_blocksize;
+	req->in.h.opcode = FUSE_BMAP;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS)
+		fc->no_bmap = 1;
+
+	return err ? 0 : outarg.block;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -787,6 +823,7 @@ static const struct address_space_operations fuse_file_aops  = {
 	.commit_write	= fuse_commit_write,
 	.readpages	= fuse_readpages,
 	.set_page_dirty	= fuse_set_page_dirty,
+	.bmap		= fuse_bmap,
 };
 
 void fuse_init_file_inode(struct inode *inode)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 91edb89..58d482d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -339,6 +339,9 @@ struct fuse_conn {
 	/** Is interrupt not implemented by fs? */
 	unsigned no_interrupt : 1;
 
+	/** Is bmap not implemented by fs? */
+	unsigned no_bmap : 1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 7633632..162a754 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -132,6 +132,7 @@ enum fuse_opcode {
 	FUSE_ACCESS        = 34,
 	FUSE_CREATE        = 35,
 	FUSE_INTERRUPT     = 36,
+	FUSE_BMAP          = 37,
 };
 
 /* The read buffer is required to be at least 8k, but may be much larger */
@@ -302,6 +303,16 @@ struct fuse_interrupt_in {
 	__u64	unique;
 };
 
+struct fuse_bmap_in {
+	__u64	block;
+	__u32	blocksize;
+	__u32	padding;
+};
+
+struct fuse_bmap_out {
+	__u64	block;
+};
+
 struct fuse_in_header {
 	__u32	len;
 	__u32	opcode;
-- 
cgit v0.10.2


From 0ec7ca41f6f0f74a394a7d686bc0ee8afef84887 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 6 Dec 2006 20:35:52 -0800
Subject: [PATCH] fuse: add DESTROY operation

Add a DESTROY operation for block device based filesystems.  With the help of
this operation, such a filesystem can flush dirty data to the device
synchronously before the umount returns.

This is needed in situations where the filesystem is assumed to be clean
immediately after unmount (e.g.  ejecting removable media).

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 58d482d..b98b20d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -298,6 +298,9 @@ struct fuse_conn {
 	    reply, before any other request, and never cleared */
 	unsigned conn_error : 1;
 
+	/** Connection successful.  Only set in INIT */
+	unsigned conn_init : 1;
+
 	/** Do readpages asynchronously?  Only set in INIT */
 	unsigned async_read : 1;
 
@@ -368,6 +371,9 @@ struct fuse_conn {
 
 	/** Key for lock owner ID scrambling */
 	u32 scramble_key[4];
+
+	/** Reserved request for the DESTROY message */
+	struct fuse_req *destroy_req;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1baaaeb2..437d61c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -206,10 +206,23 @@ static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
 		fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
 }
 
+static void fuse_send_destroy(struct fuse_conn *fc)
+{
+	struct fuse_req *req = fc->destroy_req;
+	if (req && fc->conn_init) {
+		fc->destroy_req = NULL;
+		req->in.h.opcode = FUSE_DESTROY;
+		req->force = 1;
+		request_send(fc, req);
+		fuse_put_request(fc, req);
+	}
+}
+
 static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
+	fuse_send_destroy(fc);
 	spin_lock(&fc->lock);
 	fc->connected = 0;
 	fc->blocked = 0;
@@ -410,6 +423,8 @@ static struct fuse_conn *new_conn(void)
 void fuse_conn_put(struct fuse_conn *fc)
 {
 	if (atomic_dec_and_test(&fc->count)) {
+		if (fc->destroy_req)
+			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
 		kfree(fc);
 	}
@@ -466,6 +481,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+		fc->conn_init = 1;
 	}
 	fuse_put_request(fc, req);
 	fc->blocked = 0;
@@ -563,6 +579,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!init_req)
 		goto err_put_root;
 
+	if (is_bdev) {
+		fc->destroy_req = fuse_request_alloc();
+		if (!fc->destroy_req)
+			goto err_put_root;
+	}
+
 	mutex_lock(&fuse_mutex);
 	err = -EINVAL;
 	if (file->private_data)
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 162a754..534744e 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -133,6 +133,7 @@ enum fuse_opcode {
 	FUSE_CREATE        = 35,
 	FUSE_INTERRUPT     = 36,
 	FUSE_BMAP          = 37,
+	FUSE_DESTROY       = 38,
 };
 
 /* The read buffer is required to be at least 8k, but may be much larger */
-- 
cgit v0.10.2


From 875d95ec9eb69ffb334116fb44d04d9a64dcbfbb Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 6 Dec 2006 20:35:54 -0800
Subject: [PATCH] fuse: fix compile without CONFIG_BLOCK

Randy Dunlap wote:
> Should FUSE depend on BLOCK?  Without that and with BLOCK=n, I get:
>
> inode.c:(.text+0x3acc5): undefined reference to `sb_set_blocksize'
> inode.c:(.text+0x3a393): undefined reference to `get_sb_bdev'
> fs/built-in.o:(.data+0xd718): undefined reference to `kill_block_super

Most fuse filesystems work fine without block device support, so I
think a better solution is to disable the 'fuseblk' filesystem type if
BLOCK=n.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 437d61c..12450d2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -535,8 +535,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		return -EINVAL;
 
 	if (is_bdev) {
+#ifdef CONFIG_BLOCK
 		if (!sb_set_blocksize(sb, d.blksize))
 			return -EINVAL;
+#endif
 	} else {
 		sb->s_blocksize = PAGE_CACHE_SIZE;
 		sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -629,6 +631,14 @@ static int fuse_get_sb(struct file_system_type *fs_type,
 	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 
+static struct file_system_type fuse_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fuse",
+	.get_sb		= fuse_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+#ifdef CONFIG_BLOCK
 static int fuse_get_sb_blk(struct file_system_type *fs_type,
 			   int flags, const char *dev_name,
 			   void *raw_data, struct vfsmount *mnt)
@@ -637,13 +647,6 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type,
 			   mnt);
 }
 
-static struct file_system_type fuse_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "fuse",
-	.get_sb		= fuse_get_sb,
-	.kill_sb	= kill_anon_super,
-};
-
 static struct file_system_type fuseblk_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuseblk",
@@ -652,6 +655,26 @@ static struct file_system_type fuseblk_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+static inline int register_fuseblk(void)
+{
+	return register_filesystem(&fuseblk_fs_type);
+}
+
+static inline void unregister_fuseblk(void)
+{
+	unregister_filesystem(&fuseblk_fs_type);
+}
+#else
+static inline int register_fuseblk(void)
+{
+	return 0;
+}
+
+static inline void unregister_fuseblk(void)
+{
+}
+#endif
+
 static decl_subsys(fuse, NULL, NULL);
 static decl_subsys(connections, NULL, NULL);
 
@@ -673,7 +696,7 @@ static int __init fuse_fs_init(void)
 	if (err)
 		goto out;
 
-	err = register_filesystem(&fuseblk_fs_type);
+	err = register_fuseblk();
 	if (err)
 		goto out_unreg;
 
@@ -688,7 +711,7 @@ static int __init fuse_fs_init(void)
 	return 0;
 
  out_unreg2:
-	unregister_filesystem(&fuseblk_fs_type);
+	unregister_fuseblk();
  out_unreg:
 	unregister_filesystem(&fuse_fs_type);
  out:
@@ -698,7 +721,7 @@ static int __init fuse_fs_init(void)
 static void fuse_fs_cleanup(void)
 {
 	unregister_filesystem(&fuse_fs_type);
-	unregister_filesystem(&fuseblk_fs_type);
+	unregister_fuseblk();
 	kmem_cache_destroy(fuse_inode_cachep);
 }
 
-- 
cgit v0.10.2


From e59e2ae2c29700117a54e85c106017c24837119f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:35:59 -0800
Subject: [PATCH] SysRq-X: show blocked tasks

Add SysRq-X support: show blocked (TASK_UNINTERRUPTIBLE) tasks only.

Useful for debugging IO stalls.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index c64f5bc..05810c8 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -182,6 +182,18 @@ static struct sysrq_key_op sysrq_showstate_op = {
 	.enable_mask	= SYSRQ_ENABLE_DUMP,
 };
 
+static void sysrq_handle_showstate_blocked(int key, struct tty_struct *tty)
+{
+	show_state_filter(TASK_UNINTERRUPTIBLE);
+}
+static struct sysrq_key_op sysrq_showstate_blocked_op = {
+	.handler	= sysrq_handle_showstate_blocked,
+	.help_msg	= "showBlockedTasks",
+	.action_msg	= "Show Blocked State",
+	.enable_mask	= SYSRQ_ENABLE_DUMP,
+};
+
+
 static void sysrq_handle_showmem(int key, struct tty_struct *tty)
 {
 	show_mem();
@@ -304,7 +316,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
 	/* May be assigned at init time by SMP VOYAGER */
 	NULL,				/* v */
 	NULL,				/* w */
-	NULL,				/* x */
+	&sysrq_showstate_blocked_op,	/* x */
 	NULL,				/* y */
 	NULL				/* z */
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 837a012..0a90cef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -194,7 +194,16 @@ extern void init_idle(struct task_struct *idle, int cpu);
 
 extern cpumask_t nohz_cpu_mask;
 
-extern void show_state(void);
+/*
+ * Only dump TASK_* tasks. (-1 for all tasks)
+ */
+extern void show_state_filter(unsigned long state_filter);
+
+static inline void show_state(void)
+{
+	show_state_filter(-1);
+}
+
 extern void show_regs(struct pt_regs *);
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index 12fdbef..1848e28 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4804,7 +4804,7 @@ static void show_task(struct task_struct *p)
 		show_stack(p, NULL);
 }
 
-void show_state(void)
+void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
 
@@ -4824,11 +4824,16 @@ void show_state(void)
 		 * console might take alot of time:
 		 */
 		touch_nmi_watchdog();
-		show_task(p);
+		if (p->state & state_filter)
+			show_task(p);
 	} while_each_thread(g, p);
 
 	read_unlock(&tasklist_lock);
-	debug_show_all_locks();
+	/*
+	 * Only show locks if all tasks are dumped:
+	 */
+	if (state_filter == -1)
+		debug_show_all_locks();
 }
 
 /**
-- 
cgit v0.10.2


From 593be07ae8f6f4a1b1b98813fabb155328f8bc0c Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 6 Dec 2006 20:36:01 -0800
Subject: [PATCH] file: kill unnecessary timer in fdtable_defer

free_fdtable_rc() schedules timer to reschedule fddef->wq if
schedule_work() on it returns 0.  However, schedule_work() guarantees that
the target work is executed at least once after the scheduling regardless
of its return value.  0 return simply means that the work was already
pending and thus no further action was required.

Another problem is that it used contant '5' as @expires argument to
mod_timer().

Kill unnecessary fddef->timer.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/file.c b/fs/file.c
index 3787e82..51aef67 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -21,7 +21,6 @@
 struct fdtable_defer {
 	spinlock_t lock;
 	struct work_struct wq;
-	struct timer_list timer;
 	struct fdtable *next;
 };
 
@@ -75,22 +74,6 @@ static void __free_fdtable(struct fdtable *fdt)
 	kfree(fdt);
 }
 
-static void fdtable_timer(unsigned long data)
-{
-	struct fdtable_defer *fddef = (struct fdtable_defer *)data;
-
-	spin_lock(&fddef->lock);
-	/*
-	 * If someone already emptied the queue return.
-	 */
-	if (!fddef->next)
-		goto out;
-	if (!schedule_work(&fddef->wq))
-		mod_timer(&fddef->timer, 5);
-out:
-	spin_unlock(&fddef->lock);
-}
-
 static void free_fdtable_work(struct work_struct *work)
 {
 	struct fdtable_defer *f =
@@ -144,13 +127,8 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 		spin_lock(&fddef->lock);
 		fdt->next = fddef->next;
 		fddef->next = fdt;
-		/*
-		 * vmallocs are handled from the workqueue context.
-		 * If the per-cpu workqueue is running, then we
-		 * defer work scheduling through a timer.
-		 */
-		if (!schedule_work(&fddef->wq))
-			mod_timer(&fddef->timer, 5);
+		/* vmallocs are handled from the workqueue context */
+		schedule_work(&fddef->wq);
 		spin_unlock(&fddef->lock);
 		put_cpu_var(fdtable_defer_list);
 	}
@@ -354,9 +332,6 @@ static void __devinit fdtable_defer_list_init(int cpu)
 	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
 	spin_lock_init(&fddef->lock);
 	INIT_WORK(&fddef->wq, free_fdtable_work);
-	init_timer(&fddef->timer);
-	fddef->timer.data = (unsigned long)fddef;
-	fddef->timer.function = fdtable_timer;
 	fddef->next = NULL;
 }
 
-- 
cgit v0.10.2


From 49033c81845f1bd2a69e53485f20d55a0e7bace5 Mon Sep 17 00:00:00 2001
From: Filipe <filipe@icewall.org>
Date: Wed, 6 Dec 2006 20:36:04 -0800
Subject: [PATCH] io/storage: Documentation update to as-iosched.txt

Documentation update, adding references to CFQ scheduler and to another
document about selecting IO Schedulers.

Signed-off-by: Filipe Lautert <filipe@icewall.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/block/as-iosched.txt b/Documentation/block/as-iosched.txt
index e2a66f8..a598fe1 100644
--- a/Documentation/block/as-iosched.txt
+++ b/Documentation/block/as-iosched.txt
@@ -24,8 +24,10 @@ very similar behavior to the deadline IO scheduler.
 Selecting IO schedulers
 -----------------------
 To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
-'noop' and 'as' (the default) are also available. IO schedulers are assigned
-globally at boot time only presently.
+'noop', 'as' and 'cfq' (the default) are also available. IO schedulers are
+assigned globally at boot time only presently. It's also possible to change
+the IO scheduler for a determined device on the fly, as described in
+Documentation/block/switching-sched.txt.
 
 
 Anticipatory IO scheduler Policies
-- 
cgit v0.10.2


From 4cf303487d5dddaace2daca8437c555f3f0bc1aa Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 6 Dec 2006 20:36:06 -0800
Subject: [PATCH] Export pm_suspend for the shared APM emulation

The new shared APM emulation just like its ARM and MIPS predecessors uses
pm_suspend() which was only exported on SH.  Move export to close to it's
definition where it really should be anyway.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/sh/kernel/sh_ksyms.c b/arch/sh/kernel/sh_ksyms.c
index c706f3b..ceee791 100644
--- a/arch/sh/kernel/sh_ksyms.c
+++ b/arch/sh/kernel/sh_ksyms.c
@@ -99,10 +99,6 @@ EXPORT_SYMBOL(__down_trylock);
 EXPORT_SYMBOL(synchronize_irq);
 #endif
 
-#ifdef CONFIG_PM
-EXPORT_SYMBOL(pm_suspend);
-#endif
-
 EXPORT_SYMBOL(csum_partial);
 #ifdef CONFIG_IPV6
 EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 751157b..500eb87 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/kobject.h>
 #include <linux/string.h>
@@ -230,7 +231,7 @@ int pm_suspend(suspend_state_t state)
 	return -EINVAL;
 }
 
-
+EXPORT_SYMBOL(pm_suspend);
 
 decl_subsys(power,NULL,NULL);
 
-- 
cgit v0.10.2


From 87b4126f10cce2d49687df227f6228fa5a9ac6c6 Mon Sep 17 00:00:00 2001
From: Suzuki K P <suzuki@in.ibm.com>
Date: Wed, 6 Dec 2006 20:36:10 -0800
Subject: [PATCH] fix reiserfs bad path release panic

One of our test team hit a reiserfs_panic while running fsstress tests on
2.6.19-rc1.  The message looks like :

  REISERFS: panic(device Null superblock):
  reiserfs[5676]: assertion !(p->path_length != 1 ) failed at
  fs/reiserfs/stree.c:397:reiserfs_check_path: path not properly relsed.

The backtrace looked :

  kernel BUG in reiserfs_panic at fs/reiserfs/prints.c:361!
	.reiserfs_check_path+0x58/0x74
	.reiserfs_get_block+0x1444/0x1508
	.__block_prepare_write+0x1c8/0x558
	.block_prepare_write+0x34/0x64
	.reiserfs_prepare_write+0x118/0x1d0
	.generic_file_buffered_write+0x314/0x82c
	.__generic_file_aio_write_nolock+0x350/0x3e0
	.__generic_file_write_nolock+0x78/0xb0
	.generic_file_write+0x60/0xf0
	.reiserfs_file_write+0x198/0x2038
	.vfs_write+0xd0/0x1b4
	.sys_write+0x4c/0x8c
	syscall_exit+0x0/0x4

Upon debugging I found that the restart_transaction was not releasing
the path if the th->refcount was > 1.

/*static*/
int restart_transaction(struct reiserfs_transaction_handle *th,
                           			struct inode *inode, struct path *path)
{
	[...]

         /* we cannot restart while nested */
         if (th->t_refcount > 1) { <<- Path is not released in this case!
                 return 0;
         }

         pathrelse(path); <<- Path released here.
	[...]

This could happen in such a situation :

In reiserfs/inode.c: reiserfs_get_block() ::

      if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
          /* restart the transaction to give the journal a chance to free
           ** some blocks.  releases the path, so we have to go back to
           ** research if we succeed on the second try
           */
          SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;

        -->>  retval = restart_transaction(th, inode, &path); <<--

  We are supposed to release the path, no matter we succeed or fail. But
if the th->refcount is > 1, the path is still valid. And,

          if (retval)
                   goto failure;
          repeat =
              _allocate_block(th, block, inode,
                             &allocated_block_nr, NULL, create);

If the above allocate_block fails with NO_DISK_SPACE or QUOTA_EXCEEDED,
we would have path which is not released.

         if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
                   goto research;
         }
         if (repeat == QUOTA_EXCEEDED)
                   retval = -EDQUOT;
         else
                   retval = -ENOSPC;
         goto failure;
	[...]

       failure:
	[...]
         reiserfs_check_path(&path); << Panics here !

Attached here is a patch which could fix the issue.

fix reiserfs/inode.c : restart_transaction() to release the path in all
cases.

The restart_transaction() doesn't release the path when the the journal
handle has a refcount > 1.  This would trigger a reiserfs_panic() if we
encounter an -ENOSPC / -EDQUOT in reiserfs_get_block().

Signed-off-by: Suzuki K P <suzuki@in.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Cc: <reiserfs-dev@namesys.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9c69bca..a625688 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -216,11 +216,12 @@ static int file_capable(struct inode *inode, long block)
 	BUG_ON(!th->t_trans_id);
 	BUG_ON(!th->t_refcount);
 
+	pathrelse(path);
+
 	/* we cannot restart while nested */
 	if (th->t_refcount > 1) {
 		return 0;
 	}
-	pathrelse(path);
 	reiserfs_update_sd(th, inode);
 	err = journal_end(th, s, len);
 	if (!err) {
-- 
cgit v0.10.2


From 23a1b2a78705caa2ecaccf8422a1e22eaca59574 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 6 Dec 2006 20:36:12 -0800
Subject: [PATCH] via82cxxx: handle error condition properly

Jeff noted that the via driver returned an error to an unsigned int in a
a case where errors are not permitted. Move the check down earlier so we
can handle it properly. Not as pretty but it works this way and avoids
hacking up ugly stuff in the legacy ide core.

Signed-off-by: Alan Cox <alan@redhat.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index eb7ab11..61f1a96 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -282,11 +282,11 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const
 	 * Find the ISA bridge to see how good the IDE is.
 	 */
 	via_config = via_config_find(&isa);
-	if (!via_config->id) {
-		printk(KERN_WARNING "VP_IDE: Unknown VIA SouthBridge, disabling DMA.\n");
-		pci_dev_put(isa);
-		return -ENODEV;
-	}
+
+	/* We checked this earlier so if it fails here deeep badness
+	   is involved */
+
+	BUG_ON(!via_config->id);
 
 	/*
 	 * Setup or disable Clk66 if appropriate
@@ -494,6 +494,17 @@ static ide_pci_device_t via82cxxx_chipsets[] __devinitdata = {
 
 static int __devinit via_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	struct pci_dev *isa = NULL;
+	struct via_isa_bridge *via_config;
+	/*
+	 * Find the ISA bridge and check we know what it is.
+	 */
+	via_config = via_config_find(&isa);
+	pci_dev_put(isa);
+	if (!via_config->id) {
+		printk(KERN_WARNING "VP_IDE: Unknown VIA SouthBridge, disabling DMA.\n");
+		return -ENODEV;
+	}
 	return ide_setup_pci_device(dev, &via82cxxx_chipsets[id->driver_data]);
 }
 
-- 
cgit v0.10.2


From 9a2239b1174bdf0952a21ed328cd74240d2dd173 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:36:13 -0800
Subject: [PATCH] lockdep: fix ide/proc interaction

  rmmod/3080 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
   (proc_subdir_lock){--..}, at: [<c04a33b0>] remove_proc_entry+0x40/0x191

  and this task is already holding:
   (ide_lock){++..}, at: [<c05651a2>] ide_unregister_subdriver+0x39/0xc8
  which would create a new lock dependency:
   (ide_lock){++..} -> (proc_subdir_lock){--..}

  but this new dependency connects a hard-irq-safe lock:
   (ide_lock){++..}
  ... which became hard-irq-safe at:
    [<c043c458>] lock_acquire+0x4b/0x6b
    [<c06129d7>] _spin_lock_irqsave+0x22/0x32
    [<c0567870>] ide_intr+0x17/0x1a9
    [<c044eb31>] handle_IRQ_event+0x20/0x4d
    [<c044ebf2>] __do_IRQ+0x94/0xef
    [<c0406771>] do_IRQ+0x9e/0xbd

  to a hard-irq-unsafe lock:
   (proc_subdir_lock){--..}
  ... which became hard-irq-unsafe at:
  ...  [<c043c458>] lock_acquire+0x4b/0x6b
    [<c06126ab>] _spin_lock+0x19/0x28
    [<c04a32f2>] xlate_proc_name+0x1b/0x99
    [<c04a3547>] proc_create+0x46/0xdf
    [<c04a3642>] create_proc_entry+0x62/0xa5
    [<c07c1972>] proc_misc_init+0x1c/0x1d2
    [<c07c1844>] proc_root_init+0x4c/0xe9
    [<c07ad703>] start_kernel+0x294/0x3b3

Move ide_remove_proc_entries() out from under ide_lock; there is nothing
that indicates that this is needed.

In specific, the call to ide_add_proc_entries() is unprotected, and there
is nothing else in the file using the respective ->proc fields. Also the
lock order around destroy_proc_ide_interface() suggests this.

Alan sayeth:

  proc_ide_write_settings walks the setting list under ide_setting_sem, read
  ditto.  remove_proc_entry is doing proc side housekeeping.

  Looks fine to me, although that old code is such a mess anything could be
  going on.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 287a662..1689076 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -973,8 +973,8 @@ ide_settings_t *ide_find_setting_by_name (ide_drive_t *drive, char *name)
  *	@drive: drive
  *
  *	Automatically remove all the driver specific settings for this
- *	drive. This function may sleep and must not be called from IRQ
- *	context. The caller must hold ide_setting_sem.
+ *	drive. This function may not be called from IRQ context. The
+ *	caller must hold ide_setting_sem.
  */
  
 static void auto_remove_settings (ide_drive_t *drive)
@@ -1874,11 +1874,22 @@ void ide_unregister_subdriver(ide_drive_t *drive, ide_driver_t *driver)
 {
 	unsigned long flags;
 	
-	down(&ide_setting_sem);
-	spin_lock_irqsave(&ide_lock, flags);
 #ifdef CONFIG_PROC_FS
 	ide_remove_proc_entries(drive->proc, driver->proc);
 #endif
+	down(&ide_setting_sem);
+	spin_lock_irqsave(&ide_lock, flags);
+	/*
+	 * ide_setting_sem protects the settings list
+	 * ide_lock protects the use of settings
+	 *
+	 * so we need to hold both, ide_settings_sem because we want to
+	 * modify the settings list, and ide_lock because we cannot take
+	 * a setting out that is being used.
+	 *
+	 * OTOH both ide_{read,write}_setting are only ever used under
+	 * ide_setting_sem.
+	 */
 	auto_remove_settings(drive);
 	spin_unlock_irqrestore(&ide_lock, flags);
 	up(&ide_setting_sem);
-- 
cgit v0.10.2


From 5ec68b2e310437e99c297ba04e1afc5297aa6de1 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@linux01.gwdg.de>
Date: Wed, 6 Dec 2006 20:36:14 -0800
Subject: [PATCH] pull in necessary header files for cdev.h

linux/cdev.h uses struct kobject and other structs and should therefore
include them.  Currently, a module either needs to add the missing includes
itself, or, in case a module includes other headers already, needs to put
<linux/cdev.h> last, which goes against a alphabetically-sorted include
list.

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index ee5f53f..f309b00 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -2,6 +2,10 @@
 #define _LINUX_CDEV_H
 #ifdef __KERNEL__
 
+#include <linux/kobject.h>
+#include <linux/kdev_t.h>
+#include <linux/list.h>
+
 struct cdev {
 	struct kobject kobj;
 	struct module *owner;
-- 
cgit v0.10.2


From 696040670a12f66b17a839011f96d9ca376f688b Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 6 Dec 2006 20:36:15 -0800
Subject: [PATCH] cpuset: minor code refinements

A couple of minor code simplifications to the kernel/cpuset.c code.  No
functional change.  Just a little less code and a little more readable.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38..bd1e89c4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	}
 
 	/* Remaining checks don't apply to root cpuset */
-	if ((par = cur->parent) == NULL)
+	if (cur == &top_cpuset)
 		return 0;
 
+	par = cur->parent;
+
 	/* We must be a subset of our parent cpuset */
 	if (!is_cpuset_subset(trial, par))
 		return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
 	mutex_lock(&callback_mutex);
-	if (turning_on)
-		set_bit(bit, &cs->flags);
-	else
-		clear_bit(bit, &cs->flags);
+	cs->flags = trialcs.flags;
 	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
-- 
cgit v0.10.2


From ed2908f31398049c4371de9b100700e80704e95f Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 6 Dec 2006 20:36:16 -0800
Subject: [PATCH] Remove superfluous lock_super() in extN xattr code

lock_super() is unnecessary for setting super-block feature flags.  Use the
provided *_SET_COMPAT_FEATURE() macros as well.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index af52a7f..247efd0 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -342,12 +342,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
-	lock_super(sb);
-	EXT2_SB(sb)->s_es->s_feature_compat |=
-		cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR);
+	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	unlock_super(sb);
 }
 
 /*
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index f86f248..99857a40 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -459,14 +459,11 @@ static void ext3_xattr_update_super_block(handle_t *handle,
 	if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
-	lock_super(sb);
 	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
-		EXT3_SB(sb)->s_es->s_feature_compat |=
-			cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
+		EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
 		sb->s_dirt = 1;
 		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	}
-	unlock_super(sb);
 }
 
 /*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 63233cd..dc969c3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -459,14 +459,11 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
-	lock_super(sb);
 	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
-		EXT4_SB(sb)->s_es->s_feature_compat |=
-			cpu_to_le32(EXT4_FEATURE_COMPAT_EXT_ATTR);
+		EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
 		sb->s_dirt = 1;
 		ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
 	}
-	unlock_super(sb);
 }
 
 /*
-- 
cgit v0.10.2


From 4740d387f3cb9e63f48f2488815b38a2c92755c8 Mon Sep 17 00:00:00 2001
From: Hans-Christian Egtvedt <hcegtvedt@atmel.com>
Date: Wed, 6 Dec 2006 20:36:17 -0800
Subject: [PATCH] spi: correct bus_num and buffer bug in spi core

Correct the following in driver/spi/spi.c in function spi_busnum_to_master:

 * must allow bus_num 0, the if is really not needed.
 * correct the name buffer which is too small for bus_num >= 10000. It

should be 9 bytes big, not 8.

Signed-off-by: Hans-Christian Egtvedt <hcegtvedt@atmel.com>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 1a3c963..7d215ea 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -465,15 +465,13 @@ EXPORT_SYMBOL_GPL(spi_unregister_master);
  */
 struct spi_master *spi_busnum_to_master(u16 bus_num)
 {
-	if (bus_num) {
-		char			name[8];
-		struct kobject		*bus;
-
-		snprintf(name, sizeof name, "spi%u", bus_num);
-		bus = kset_find_obj(&spi_master_class.subsys.kset, name);
-		if (bus)
-			return container_of(bus, struct spi_master, cdev.kobj);
-	}
+	char			name[9];
+	struct kobject		*bus;
+
+	snprintf(name, sizeof name, "spi%u", bus_num);
+	bus = kset_find_obj(&spi_master_class.subsys.kset, name);
+	if (bus)
+		return container_of(bus, struct spi_master, cdev.kobj);
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(spi_busnum_to_master);
-- 
cgit v0.10.2


From 3bd0f6943520e459659d10f3282285e43d3990f1 Mon Sep 17 00:00:00 2001
From: Hans-Christian Egtvedt <hcegtvedt@atmel.com>
Date: Wed, 6 Dec 2006 20:36:19 -0800
Subject: [PATCH] spi: set kset of master class dev explicitly

<quote Imre Deak from Thu, 12 Jan 2006 21:18:54 +0200>
  In order for spi_busnum_to_master to work spi master devices must be linked
  into the spi_master_class.subsys.kset list.  At the moment the default
  class_obj_subsys.kset is used and we can't enumerate the master devices.
</quote>

Signed-off-by: Hans-Christian Egtvedt <hcegtvedt@atmel.com>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 7d215ea..270e621 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -366,6 +366,7 @@ spi_alloc_master(struct device *dev, unsigned size)
 
 	class_device_initialize(&master->cdev);
 	master->cdev.class = &spi_master_class;
+	kobj_set_kset_s(&master->cdev, spi_master_class.subsys);
 	master->cdev.dev = get_device(dev);
 	spi_master_set_devdata(master, &master[1]);
 
-- 
cgit v0.10.2


From f4330002d11f032559954cbff68a5cad95b6d27f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 6 Dec 2006 20:36:20 -0800
Subject: [PATCH] paride: rename pi_register() and pi_unregister()

We're about to change the semantics of pi_register()'s return value, so
rename it to something else first, so that any unconverted code reliaby
breaks.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/paride/aten.c b/drivers/block/paride/aten.c
index c4d696d..3539099 100644
--- a/drivers/block/paride/aten.c
+++ b/drivers/block/paride/aten.c
@@ -149,12 +149,12 @@ static struct pi_protocol aten = {
 
 static int __init aten_init(void)
 {
-	return pi_register(&aten)-1;
+	return paride_register(&aten)-1;
 }
 
 static void __exit aten_exit(void)
 {
-	pi_unregister( &aten );
+	paride_unregister( &aten );
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c
index d462ff6..b0df16d 100644
--- a/drivers/block/paride/bpck.c
+++ b/drivers/block/paride/bpck.c
@@ -464,12 +464,12 @@ static struct pi_protocol bpck = {
 
 static int __init bpck_init(void)
 {
-	return pi_register(&bpck)-1;
+	return paride_register(&bpck)-1;
 }
 
 static void __exit bpck_exit(void)
 {
-	pi_unregister(&bpck);
+	paride_unregister(&bpck);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index 41a237c..7c04f74 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -265,12 +265,12 @@ static int __init bpck6_init(void)
 	printk(KERN_INFO "bpck6: Copyright 2001 by Micro Solutions, Inc., DeKalb IL. USA\n");
 	if(verbose)
 		printk(KERN_DEBUG "bpck6: verbose debug enabled.\n");
-	return pi_register(&bpck6) - 1;  
+	return paride_register(&bpck6) - 1;
 }
 
 static void __exit bpck6_exit(void)
 {
-	pi_unregister(&bpck6);
+	paride_unregister(&bpck6);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/comm.c b/drivers/block/paride/comm.c
index 43d6135..c3d4076 100644
--- a/drivers/block/paride/comm.c
+++ b/drivers/block/paride/comm.c
@@ -205,12 +205,12 @@ static struct pi_protocol comm = {
 
 static int __init comm_init(void)
 {
-	return pi_register(&comm)-1;
+	return paride_register(&comm)-1;
 }
 
 static void __exit comm_exit(void)
 {
-	pi_unregister(&comm);
+	paride_unregister(&comm);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/dstr.c b/drivers/block/paride/dstr.c
index 04d53bf..2e4219e 100644
--- a/drivers/block/paride/dstr.c
+++ b/drivers/block/paride/dstr.c
@@ -220,12 +220,12 @@ static struct pi_protocol dstr = {
 
 static int __init dstr_init(void)
 {
-	return pi_register(&dstr)-1;
+	return paride_register(&dstr)-1;
 }
 
 static void __exit dstr_exit(void)
 {
-	pi_unregister(&dstr);
+	paride_unregister(&dstr);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/epat.c b/drivers/block/paride/epat.c
index 55d1c0a..5ed4fc1 100644
--- a/drivers/block/paride/epat.c
+++ b/drivers/block/paride/epat.c
@@ -327,12 +327,12 @@ static int __init epat_init(void)
 #ifdef CONFIG_PARIDE_EPATC8
 	epatc8 = 1;
 #endif
-	return pi_register(&epat)-1;
+	return paride_register(&epat)-1;
 }
 
 static void __exit epat_exit(void)
 {
-	pi_unregister(&epat);
+	paride_unregister(&epat);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/epia.c b/drivers/block/paride/epia.c
index 0f2e0c2..649766c 100644
--- a/drivers/block/paride/epia.c
+++ b/drivers/block/paride/epia.c
@@ -303,12 +303,12 @@ static struct pi_protocol epia = {
 
 static int __init epia_init(void)
 {
-	return pi_register(&epia)-1;
+	return paride_register(&epia)-1;
 }
 
 static void __exit epia_exit(void)
 {
-	pi_unregister(&epia);
+	paride_unregister(&epia);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/fit2.c b/drivers/block/paride/fit2.c
index e0f0691..7a16249 100644
--- a/drivers/block/paride/fit2.c
+++ b/drivers/block/paride/fit2.c
@@ -138,12 +138,12 @@ static struct pi_protocol fit2 = {
 
 static int __init fit2_init(void)
 {
-	return pi_register(&fit2)-1;
+	return paride_register(&fit2)-1;
 }
 
 static void __exit fit2_exit(void)
 {
-	pi_unregister(&fit2);
+	paride_unregister(&fit2);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/fit3.c b/drivers/block/paride/fit3.c
index 15400e7..c701577 100644
--- a/drivers/block/paride/fit3.c
+++ b/drivers/block/paride/fit3.c
@@ -198,12 +198,12 @@ static struct pi_protocol fit3 = {
 
 static int __init fit3_init(void)
 {
-	return pi_register(&fit3)-1;
+	return paride_register(&fit3)-1;
 }
 
 static void __exit fit3_exit(void)
 {
-	pi_unregister(&fit3);
+	paride_unregister(&fit3);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/friq.c b/drivers/block/paride/friq.c
index 5ea2904..e9edfa2 100644
--- a/drivers/block/paride/friq.c
+++ b/drivers/block/paride/friq.c
@@ -263,12 +263,12 @@ static struct pi_protocol friq = {
 
 static int __init friq_init(void)
 {
-	return pi_register(&friq)-1;
+	return paride_register(&friq)-1;
 }
 
 static void __exit friq_exit(void)
 {
-	pi_unregister(&friq);
+	paride_unregister(&friq);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/frpw.c b/drivers/block/paride/frpw.c
index 56b3824..407f821 100644
--- a/drivers/block/paride/frpw.c
+++ b/drivers/block/paride/frpw.c
@@ -300,12 +300,12 @@ static struct pi_protocol frpw = {
 
 static int __init frpw_init(void)
 {
-	return pi_register(&frpw)-1;
+	return paride_register(&frpw)-1;
 }
 
 static void __exit frpw_exit(void)
 {
-	pi_unregister(&frpw);
+	paride_unregister(&frpw);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/kbic.c b/drivers/block/paride/kbic.c
index d983bce..a563281 100644
--- a/drivers/block/paride/kbic.c
+++ b/drivers/block/paride/kbic.c
@@ -283,13 +283,13 @@ static struct pi_protocol k971 = {
 
 static int __init kbic_init(void)
 {
-	return (pi_register(&k951)||pi_register(&k971))-1;
+	return (paride_register(&k951)||paride_register(&k971))-1;
 }
 
 static void __exit kbic_exit(void)
 {
-	pi_unregister(&k951);
-	pi_unregister(&k971);
+	paride_unregister(&k951);
+	paride_unregister(&k971);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/ktti.c b/drivers/block/paride/ktti.c
index 6c7edbf..a9d85be 100644
--- a/drivers/block/paride/ktti.c
+++ b/drivers/block/paride/ktti.c
@@ -115,12 +115,12 @@ static struct pi_protocol ktti = {
 
 static int __init ktti_init(void)
 {
-	return pi_register(&ktti)-1;
+	return paride_register(&ktti)-1;
 }
 
 static void __exit ktti_exit(void)
 {
-	pi_unregister(&ktti);
+	paride_unregister(&ktti);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/on20.c b/drivers/block/paride/on20.c
index 9f8e010..cc7e9f59 100644
--- a/drivers/block/paride/on20.c
+++ b/drivers/block/paride/on20.c
@@ -140,12 +140,12 @@ static struct pi_protocol on20 = {
 
 static int __init on20_init(void)
 {
-	return pi_register(&on20)-1;
+	return paride_register(&on20)-1;
 }
 
 static void __exit on20_exit(void)
 {
-	pi_unregister(&on20);
+	paride_unregister(&on20);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/on26.c b/drivers/block/paride/on26.c
index 0f833ca..8755ce4 100644
--- a/drivers/block/paride/on26.c
+++ b/drivers/block/paride/on26.c
@@ -306,12 +306,12 @@ static struct pi_protocol on26 = {
 
 static int __init on26_init(void)
 {
-	return pi_register(&on26)-1;
+	return paride_register(&on26)-1;
 }
 
 static void __exit on26_exit(void)
 {
-	pi_unregister(&on26);
+	paride_unregister(&on26);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c
index 4b258f7..55cabfb 100644
--- a/drivers/block/paride/paride.c
+++ b/drivers/block/paride/paride.c
@@ -229,7 +229,7 @@ static int pi_test_proto(PIA * pi, char *scratch, int verbose)
 	return res;
 }
 
-int pi_register(PIP * pr)
+int paride_register(PIP * pr)
 {
 	int k;
 
@@ -252,9 +252,9 @@ int pi_register(PIP * pr)
 	return 1;
 }
 
-EXPORT_SYMBOL(pi_register);
+EXPORT_SYMBOL(paride_register);
 
-void pi_unregister(PIP * pr)
+void paride_unregister(PIP * pr)
 {
 	if (!pr)
 		return;
@@ -265,7 +265,7 @@ void pi_unregister(PIP * pr)
 	protocols[pr->index] = NULL;
 }
 
-EXPORT_SYMBOL(pi_unregister);
+EXPORT_SYMBOL(paride_unregister);
 
 static int pi_register_parport(PIA * pi, int verbose)
 {
diff --git a/drivers/block/paride/paride.h b/drivers/block/paride/paride.h
index c6d98ef..2bddbf4 100644
--- a/drivers/block/paride/paride.h
+++ b/drivers/block/paride/paride.h
@@ -163,8 +163,8 @@ struct pi_protocol {
 
 typedef struct pi_protocol PIP;
 
-extern int pi_register( PIP * );
-extern void pi_unregister ( PIP * );
+extern int paride_register( PIP * );
+extern void paride_unregister ( PIP * );
 
 #endif /* __DRIVERS_PARIDE_H__ */
 /* end of paride.h */
-- 
cgit v0.10.2


From b4178ab58aa81f4ed3c75c48940682fe3b45d880 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 6 Dec 2006 20:36:21 -0800
Subject: [PATCH] paride_register(): shuffle return values

paride_register() returns 1 on success, 0 on failure and module init
code looks like

	static int __init foo_init(void)
	{
		return paride_register(&foo) - 1;
	}

which is not what one get used to. Converted to usual 0/-E convention.

In case of kbic driver, unwind registration. It was just

	return (paride_register(&k951)||paride_register(&k971))-1;

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/paride/aten.c b/drivers/block/paride/aten.c
index 3539099..2695465 100644
--- a/drivers/block/paride/aten.c
+++ b/drivers/block/paride/aten.c
@@ -149,7 +149,7 @@ static struct pi_protocol aten = {
 
 static int __init aten_init(void)
 {
-	return paride_register(&aten)-1;
+	return paride_register(&aten);
 }
 
 static void __exit aten_exit(void)
diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c
index b0df16d..4f27e73 100644
--- a/drivers/block/paride/bpck.c
+++ b/drivers/block/paride/bpck.c
@@ -464,7 +464,7 @@ static struct pi_protocol bpck = {
 
 static int __init bpck_init(void)
 {
-	return paride_register(&bpck)-1;
+	return paride_register(&bpck);
 }
 
 static void __exit bpck_exit(void)
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index 7c04f74..da6b5d2 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -265,7 +265,7 @@ static int __init bpck6_init(void)
 	printk(KERN_INFO "bpck6: Copyright 2001 by Micro Solutions, Inc., DeKalb IL. USA\n");
 	if(verbose)
 		printk(KERN_DEBUG "bpck6: verbose debug enabled.\n");
-	return paride_register(&bpck6) - 1;
+	return paride_register(&bpck6);
 }
 
 static void __exit bpck6_exit(void)
diff --git a/drivers/block/paride/comm.c b/drivers/block/paride/comm.c
index c3d4076..9bcd354 100644
--- a/drivers/block/paride/comm.c
+++ b/drivers/block/paride/comm.c
@@ -205,7 +205,7 @@ static struct pi_protocol comm = {
 
 static int __init comm_init(void)
 {
-	return paride_register(&comm)-1;
+	return paride_register(&comm);
 }
 
 static void __exit comm_exit(void)
diff --git a/drivers/block/paride/dstr.c b/drivers/block/paride/dstr.c
index 2e4219e..accc5c7 100644
--- a/drivers/block/paride/dstr.c
+++ b/drivers/block/paride/dstr.c
@@ -220,7 +220,7 @@ static struct pi_protocol dstr = {
 
 static int __init dstr_init(void)
 {
-	return paride_register(&dstr)-1;
+	return paride_register(&dstr);
 }
 
 static void __exit dstr_exit(void)
diff --git a/drivers/block/paride/epat.c b/drivers/block/paride/epat.c
index 5ed4fc1..1bcdff7 100644
--- a/drivers/block/paride/epat.c
+++ b/drivers/block/paride/epat.c
@@ -327,7 +327,7 @@ static int __init epat_init(void)
 #ifdef CONFIG_PARIDE_EPATC8
 	epatc8 = 1;
 #endif
-	return paride_register(&epat)-1;
+	return paride_register(&epat);
 }
 
 static void __exit epat_exit(void)
diff --git a/drivers/block/paride/epia.c b/drivers/block/paride/epia.c
index 649766c..fb0e782 100644
--- a/drivers/block/paride/epia.c
+++ b/drivers/block/paride/epia.c
@@ -303,7 +303,7 @@ static struct pi_protocol epia = {
 
 static int __init epia_init(void)
 {
-	return paride_register(&epia)-1;
+	return paride_register(&epia);
 }
 
 static void __exit epia_exit(void)
diff --git a/drivers/block/paride/fit2.c b/drivers/block/paride/fit2.c
index 7a16249..3812837 100644
--- a/drivers/block/paride/fit2.c
+++ b/drivers/block/paride/fit2.c
@@ -138,7 +138,7 @@ static struct pi_protocol fit2 = {
 
 static int __init fit2_init(void)
 {
-	return paride_register(&fit2)-1;
+	return paride_register(&fit2);
 }
 
 static void __exit fit2_exit(void)
diff --git a/drivers/block/paride/fit3.c b/drivers/block/paride/fit3.c
index c701577..275d269 100644
--- a/drivers/block/paride/fit3.c
+++ b/drivers/block/paride/fit3.c
@@ -198,7 +198,7 @@ static struct pi_protocol fit3 = {
 
 static int __init fit3_init(void)
 {
-	return paride_register(&fit3)-1;
+	return paride_register(&fit3);
 }
 
 static void __exit fit3_exit(void)
diff --git a/drivers/block/paride/friq.c b/drivers/block/paride/friq.c
index e9edfa2..4f2ba24 100644
--- a/drivers/block/paride/friq.c
+++ b/drivers/block/paride/friq.c
@@ -263,7 +263,7 @@ static struct pi_protocol friq = {
 
 static int __init friq_init(void)
 {
-	return paride_register(&friq)-1;
+	return paride_register(&friq);
 }
 
 static void __exit friq_exit(void)
diff --git a/drivers/block/paride/frpw.c b/drivers/block/paride/frpw.c
index 407f821..c3cde36 100644
--- a/drivers/block/paride/frpw.c
+++ b/drivers/block/paride/frpw.c
@@ -300,7 +300,7 @@ static struct pi_protocol frpw = {
 
 static int __init frpw_init(void)
 {
-	return paride_register(&frpw)-1;
+	return paride_register(&frpw);
 }
 
 static void __exit frpw_exit(void)
diff --git a/drivers/block/paride/kbic.c b/drivers/block/paride/kbic.c
index a563281..35999c4 100644
--- a/drivers/block/paride/kbic.c
+++ b/drivers/block/paride/kbic.c
@@ -283,7 +283,15 @@ static struct pi_protocol k971 = {
 
 static int __init kbic_init(void)
 {
-	return (paride_register(&k951)||paride_register(&k971))-1;
+	int rv;
+
+	rv = paride_register(&k951);
+	if (rv < 0)
+		return rv;
+	rv = paride_register(&k971);
+	if (rv < 0)
+		paride_unregister(&k951);
+	return rv;
 }
 
 static void __exit kbic_exit(void)
diff --git a/drivers/block/paride/ktti.c b/drivers/block/paride/ktti.c
index a9d85be..117ab0e 100644
--- a/drivers/block/paride/ktti.c
+++ b/drivers/block/paride/ktti.c
@@ -115,7 +115,7 @@ static struct pi_protocol ktti = {
 
 static int __init ktti_init(void)
 {
-	return paride_register(&ktti)-1;
+	return paride_register(&ktti);
 }
 
 static void __exit ktti_exit(void)
diff --git a/drivers/block/paride/on20.c b/drivers/block/paride/on20.c
index cc7e9f59..0173697 100644
--- a/drivers/block/paride/on20.c
+++ b/drivers/block/paride/on20.c
@@ -140,7 +140,7 @@ static struct pi_protocol on20 = {
 
 static int __init on20_init(void)
 {
-	return paride_register(&on20)-1;
+	return paride_register(&on20);
 }
 
 static void __exit on20_exit(void)
diff --git a/drivers/block/paride/on26.c b/drivers/block/paride/on26.c
index 8755ce4..95ba256 100644
--- a/drivers/block/paride/on26.c
+++ b/drivers/block/paride/on26.c
@@ -306,7 +306,7 @@ static struct pi_protocol on26 = {
 
 static int __init on26_init(void)
 {
-	return paride_register(&on26)-1;
+	return paride_register(&on26);
 }
 
 static void __exit on26_exit(void)
diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c
index 55cabfb..e4c55e0 100644
--- a/drivers/block/paride/paride.c
+++ b/drivers/block/paride/paride.c
@@ -237,19 +237,19 @@ int paride_register(PIP * pr)
 		if (protocols[k] && !strcmp(pr->name, protocols[k]->name)) {
 			printk("paride: %s protocol already registered\n",
 			       pr->name);
-			return 0;
+			return -1;
 		}
 	k = 0;
 	while ((k < MAX_PROTOS) && (protocols[k]))
 		k++;
 	if (k == MAX_PROTOS) {
 		printk("paride: protocol table full\n");
-		return 0;
+		return -1;
 	}
 	protocols[k] = pr;
 	pr->index = k;
 	printk("paride: %s registered as protocol %d\n", pr->name, k);
-	return 1;
+	return 0;
 }
 
 EXPORT_SYMBOL(paride_register);
-- 
cgit v0.10.2


From 910b1b2e6d7d10e1c3bffdd12a90ec82f535f9a5 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@o2.pl>
Date: Thu, 7 Dec 2006 10:45:25 +0100
Subject: [PATCH] lockdep: internal locking fixes

Here are mainly some lockdep returns with 0 with unlocking fixes.

Signed-off-by: Jarek Poplawski <jarkao2@o2.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e33f620..f76a24f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -237,8 +237,10 @@ e	   locks. */
 	trace->max_entries = trace->nr_entries;
 
 	nr_stack_trace_entries += trace->nr_entries;
-	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
+	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) {
+		__raw_spin_unlock(&hash_lock);
 		return 0;
+	}
 
 	if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
 		__raw_spin_unlock(&hash_lock);
@@ -474,7 +476,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 		return 0;
 
 	entry->class = this;
-	save_trace(&entry->trace);
+	if (!save_trace(&entry->trace))
+		return 0;
 
 	/*
 	 * Since we never remove from the dependency list, the list can
@@ -562,8 +565,12 @@ static noinline int print_circular_bug_tail(void)
 	if (debug_locks_silent)
 		return 0;
 
+	/* hash_lock unlocked by the header */
+	__raw_spin_lock(&hash_lock);
 	this.class = check_source->class;
-	save_trace(&this.trace);
+	if (!save_trace(&this.trace))
+		return 0;
+	__raw_spin_unlock(&hash_lock);
 	print_circular_bug_entry(&this, 0);
 
 	printk("\nother info that might help us debug this:\n\n");
@@ -966,14 +973,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 			       &prev->class->locks_after, next->acquire_ip);
 	if (!ret)
 		return 0;
-	/*
-	 * Return value of 2 signals 'dependency already added',
-	 * in that case we dont have to add the backlink either.
-	 */
-	if (ret == 2)
-		return 2;
+
 	ret = add_lock_to_list(next->class, prev->class,
 			       &next->class->locks_before, next->acquire_ip);
+	if (!ret)
+		return 0;
 
 	/*
 	 * Debugging printouts:
@@ -1025,7 +1029,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 * added:
 		 */
 		if (hlock->read != 2) {
-			check_prev_add(curr, hlock, next);
+			if (!check_prev_add(curr, hlock, next))
+				return 0;
 			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
-- 
cgit v0.10.2


From b23984d0a12a4821b2e9712c71550f321eb88bb5 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@o2.pl>
Date: Wed, 6 Dec 2006 20:36:23 -0800
Subject: [PATCH] lockdep: misc fixes in lockdep.c

 - numeric string size replaced with constant in print_lock_name and
   print_lockdep_cache,

 - return on null pointer in print_lock_dependencies,

 - one more lockdep return with 0 with unlocking fix in mark_lock.

Signed-off-by: Jarek Poplawski <jarkao2@o2.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f76a24f..300d61b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -359,7 +359,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[128], c1, c2, c3, c4;
+	char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
 	const char *name;
 
 	get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -381,7 +381,7 @@ static void print_lock_name(struct lock_class *class)
 static void print_lockdep_cache(struct lockdep_map *lock)
 {
 	const char *name;
-	char str[128];
+	char str[KSYM_NAME_LEN + 1];
 
 	name = lock->name;
 	if (!name)
@@ -451,7 +451,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
 	print_lock_class_header(class, depth);
 
 	list_for_each_entry(entry, &class->locks_after, entry) {
-		DEBUG_LOCKS_WARN_ON(!entry->class);
+		if (DEBUG_LOCKS_WARN_ON(!entry->class))
+			return;
+
 		print_lock_dependencies(entry->class, depth + 1);
 
 		printk("%*s ... acquired at:\n",depth,"");
@@ -1733,6 +1735,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		debug_atomic_dec(&nr_unused_locks);
 		break;
 	default:
+		__raw_spin_unlock(&hash_lock);
 		debug_locks_off();
 		WARN_ON(1);
 		return 0;
-- 
cgit v0.10.2


From 59287c0913cc9a6c75712a775f6c1c1ef418ef3b Mon Sep 17 00:00:00 2001
From: Marcus Meissner <meissner@suse.de>
Date: Wed, 6 Dec 2006 20:36:24 -0800
Subject: [PATCH] binfmt_elf: randomize PIE binaries (2nd try)

Randomizes -pie compiled binaries from 64k (0x10000) up to ELF_ET_DYN_BASE.

0 -> 64k is excluded to allow NULL ptr accesses to fail.

Signed-off-by: Marcus Meissner <meissner@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index cc72bb4..b2efbae 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -856,7 +856,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * default mmap base, as well as whatever program they
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
-			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
+			if (current->flags & PF_RANDOMIZE)
+				load_bias = randomize_range(0x10000,
+							    ELF_ET_DYN_BASE,
+							    0);
+			else
+				load_bias = ELF_ET_DYN_BASE;
+			load_bias = ELF_PAGESTART(load_bias - vaddr);
 		}
 
 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-- 
cgit v0.10.2


From 40b851348fe9bf49c26025b34261d25142269b60 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 6 Dec 2006 20:36:26 -0800
Subject: [PATCH] handle ext3 directory corruption better

I've been using Steve Grubb's purely evil "fsfuzzer" tool, at
http://people.redhat.com/sgrubb/files/fsfuzzer-0.4.tar.gz

Basically it makes a filesystem, splats some random bits over it, then
tries to mount it and do some simple filesystem actions.

At best, the filesystem catches the corruption gracefully.  At worst,
things spin out of control.

As you might guess, we found a couple places in ext3 where things spin out
of control :)

First, we had a corrupted directory that was never checked for
consistency...  it was corrupt, and pointed to another bad "entry" of
length 0.  The for() loop looped forever, since the length of
ext3_next_entry(de) was 0, and we kept looking at the same pointer over and
over and over and over...  I modeled this check and subsequent action on
what is done for other directory types in ext3_readdir...

(adding this check adds some computational expense; I am testing a followup
patch to reduce the number of times we check and re-check these directory
entries, in all cases.  Thanks for the idea, Andreas).

Next we had a root directory inode which had a corrupted size, claimed to
be > 200M on a 4M filesystem.  There was only really 1 block in the
directory, but because the size was so large, readdir kept coming back for
more, spewing thousands of printk's along the way.

Per Andreas' suggestion, if we're in this read error condition and we're
trying to read an offset which is greater than i_blocks worth of bytes,
stop trying, and break out of the loop.

With these two changes fsfuzz test survives quite well on ext3.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index d0b54f3..5a9313e 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -154,6 +154,9 @@ static int ext3_readdir(struct file * filp,
 			ext3_error (sb, "ext3_readdir",
 				"directory #%lu contains a hole at offset %lu",
 				inode->i_ino, (unsigned long)filp->f_pos);
+			/* corrupt size?  Maybe no more blocks to read */
+			if (filp->f_pos > inode->i_blocks << 9)
+				break;
 			filp->f_pos += sb->s_blocksize - offset;
 			continue;
 		}
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 906731a..60d2f9d 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -552,6 +552,15 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 					   dir->i_sb->s_blocksize -
 					   EXT3_DIR_REC_LEN(0));
 	for (; de < top; de = ext3_next_entry(de)) {
+		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+						+((char *)de - bh->b_data))) {
+			/* On error, skip the f_pos to the next block. */
+			dir_file->f_pos = (dir_file->f_pos |
+					(dir->i_sb->s_blocksize - 1)) + 1;
+			brelse (bh);
+			return count;
+		}
 		ext3fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
 		    ((hinfo->hash == start_hash) &&
-- 
cgit v0.10.2


From e6c4021190c828d7fa24a464db589f86c6708341 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 6 Dec 2006 20:36:28 -0800
Subject: [PATCH] handle ext4 directory corruption better

I've been using Steve Grubb's purely evil "fsfuzzer" tool, at
http://people.redhat.com/sgrubb/files/fsfuzzer-0.4.tar.gz

Basically it makes a filesystem, splats some random bits over it, then
tries to mount it and do some simple filesystem actions.

At best, the filesystem catches the corruption gracefully.  At worst,
things spin out of control.

As you might guess, we found a couple places in ext4 where things spin out
of control :)

First, we had a corrupted directory that was never checked for
consistency...  it was corrupt, and pointed to another bad "entry" of
length 0.  The for() loop looped forever, since the length of
ext4_next_entry(de) was 0, and we kept looking at the same pointer over and
over and over and over...  I modeled this check and subsequent action on
what is done for other directory types in ext4_readdir...

(adding this check adds some computational expense; I am testing a followup
patch to reduce the number of times we check and re-check these directory
entries, in all cases.  Thanks for the idea, Andreas).

Next we had a root directory inode which had a corrupted size, claimed to
be > 200M on a 4M filesystem.  There was only really 1 block in the
directory, but because the size was so large, readdir kept coming back for
more, spewing thousands of printk's along the way.

Per Andreas' suggestion, if we're in this read error condition and we're
trying to read an offset which is greater than i_blocks worth of bytes,
stop trying, and break out of the loop.

With these two changes fsfuzz test survives quite well on ext4.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f859578..f2ed3e7 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -153,6 +153,9 @@ static int ext4_readdir(struct file * filp,
 			ext4_error (sb, "ext4_readdir",
 				"directory #%lu contains a hole at offset %lu",
 				inode->i_ino, (unsigned long)filp->f_pos);
+			/* corrupt size?  Maybe no more blocks to read */
+			if (filp->f_pos > inode->i_blocks << 9)
+				break;
 			filp->f_pos += sb->s_blocksize - offset;
 			continue;
 		}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8b1bd03..859990e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -552,6 +552,15 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 					   dir->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(0));
 	for (; de < top; de = ext4_next_entry(de)) {
+		if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+					(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+						+((char *)de - bh->b_data))) {
+			/* On error, skip the f_pos to the next block. */
+			dir_file->f_pos = (dir_file->f_pos |
+					(dir->i_sb->s_blocksize - 1)) + 1;
+			brelse (bh);
+			return count;
+		}
 		ext4fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
 		    ((hinfo->hash == start_hash) &&
-- 
cgit v0.10.2


From 3316eaa31e638d21dfa4a81a3322f8898981c591 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:36:29 -0800
Subject: [PATCH] tifm: fix NULL ptr and style

Fix sparse NULL warning;
  drivers/misc/tifm_core.c:223:17: warning: Using plain integer as NULL pointer

Fix style while there.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/misc/tifm_core.c b/drivers/misc/tifm_core.c
index ee32613..d61df5c 100644
--- a/drivers/misc/tifm_core.c
+++ b/drivers/misc/tifm_core.c
@@ -219,8 +219,9 @@ static int tifm_device_remove(struct device *dev)
 	struct tifm_driver *drv = fm_dev->drv;
 
 	if (drv) {
-		if (drv->remove) drv->remove(fm_dev);
-		fm_dev->drv = 0;
+		if (drv->remove)
+			drv->remove(fm_dev);
+		fm_dev->drv = NULL;
 	}
 
 	put_device(dev);
-- 
cgit v0.10.2


From 736c4b8572ac24b1e6fd58d00872305a120ac700 Mon Sep 17 00:00:00 2001
From: Mika Kukkonen <mikukkon@gmail.com>
Date: Wed, 6 Dec 2006 20:36:29 -0800
Subject: [PATCH] Function v9fs_get_idpool returns int, not u32 as called twice
 in fs/9p/vfs_inode.c

Function v9fs_get_idpool returns int, not u32.  Actually it returns -1 on
errors, and these two callers check if the value is smaller than 0, which
was caught by gcc with extra warning flags.  Compile tested only but should
be OK, as the value computed in v9fs_get_idpool() is also int.

Signed-of-by: Mika Kukkonen <mikukkon@iki.fi>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@lanl.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5241c60..18f26cd 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -256,7 +256,7 @@ static int
 v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, u32 perm,
 	u8 mode, char *extension, u32 *fidp, struct v9fs_qid *qid, u32 *iounit)
 {
-	u32 fid;
+	int fid;
 	int err;
 	struct v9fs_fcall *fcall;
 
@@ -310,7 +310,7 @@ static struct v9fs_fid*
 v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
 {
 	int err;
-	u32 nfid;
+	int nfid;
 	struct v9fs_fid *ret;
 	struct v9fs_fcall *fcall;
 
-- 
cgit v0.10.2


From fec1d0115240593b39898289e6e1413ea6e44a84 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 6 Dec 2006 20:36:34 -0800
Subject: [PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit

The CLONE_CHILD_CLEARTID flag is used by NPTL to have its threads
communicate via memory/futex when they exit, so pthread_join can
synchronize using a simple futex wait.  The word of user memory where NPTL
stores a thread's own TID is what it passes; this gets reset to zero at
thread exit.

It is not desireable to touch this user memory when threads are dying due
to a fatal signal.  A core dump is more usefully representative of the
dying program state if the threads live at the time of the crash have their
NPTL data structures unperturbed.  The userland expectation of
CLONE_CHILD_CLEARTID has only ever been that it works for a thread making
an _exit system call.

This problem was identified by Ernie Petrides <petrides@redhat.com>.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Ernie Petrides <petrides@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/fork.c b/kernel/fork.c
index 2cf74ed..f37980d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -448,7 +448,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->vfork_done = NULL;
 		complete(vfork_done);
 	}
-	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+
+	/*
+	 * If we're exiting normally, clear a user-space tid field if
+	 * requested.  We leave this alone when dying by signal, to leave
+	 * the value intact in a core dump, and to save the unnecessary
+	 * trouble otherwise.  Userland only wants this done for a sys_exit.
+	 */
+	if (tsk->clear_child_tid
+	    && !(tsk->flags & PF_SIGNALED)
+	    && atomic_read(&mm->mm_users) > 1) {
 		u32 __user * tidptr = tsk->clear_child_tid;
 		tsk->clear_child_tid = NULL;
 
-- 
cgit v0.10.2


From 841d5fb7c75260f76ae682648b28a3dca724940d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 6 Dec 2006 20:36:35 -0800
Subject: [PATCH] binfmt: fix uaccess handling

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b2efbae..68e20d5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -243,8 +243,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	if (interp_aout) {
 		argv = sp + 2;
 		envp = argv + argc + 1;
-		__put_user((elf_addr_t)(unsigned long)argv, sp++);
-		__put_user((elf_addr_t)(unsigned long)envp, sp++);
+		if (__put_user((elf_addr_t)(unsigned long)argv, sp++) ||
+		    __put_user((elf_addr_t)(unsigned long)envp, sp++))
+			return -EFAULT;
 	} else {
 		argv = sp;
 		envp = argv + argc + 1;
@@ -254,7 +255,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	p = current->mm->arg_end = current->mm->arg_start;
 	while (argc-- > 0) {
 		size_t len;
-		__put_user((elf_addr_t)p, argv++);
+		if (__put_user((elf_addr_t)p, argv++))
+			return -EFAULT;
 		len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES);
 		if (!len || len > PAGE_SIZE*MAX_ARG_PAGES)
 			return 0;
@@ -265,7 +267,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	current->mm->arg_end = current->mm->env_start = p;
 	while (envc-- > 0) {
 		size_t len;
-		__put_user((elf_addr_t)p, envp++);
+		if (__put_user((elf_addr_t)p, envp++))
+			return -EFAULT;
 		len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES);
 		if (!len || len > PAGE_SIZE*MAX_ARG_PAGES)
 			return 0;
-- 
cgit v0.10.2


From 7116e994b47f3988389be4ceee67dac64b56e0d0 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 6 Dec 2006 20:36:36 -0800
Subject: [PATCH] compat: fix uaccess handling

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/compat.c b/fs/compat.c
index 7aef541..a7e3f16 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1144,7 +1144,9 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		typeof(lastdirent->d_off) d_off = file->f_pos;
-		__put_user_unaligned(d_off, &lastdirent->d_off);
+		error = -EFAULT;
+		if (__put_user_unaligned(d_off, &lastdirent->d_off))
+			goto out_putf;
 		error = count - buf.count;
 	}
 
@@ -1611,14 +1613,14 @@ int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 		nr &= ~1UL;
 		while (nr) {
 			unsigned long h, l;
-			__get_user(l, ufdset);
-			__get_user(h, ufdset+1);
+			if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
+				return -EFAULT;
 			ufdset += 2;
 			*fdset++ = h << 32 | l;
 			nr -= 2;
 		}
-		if (odd)
-			__get_user(*fdset, ufdset);
+		if (odd && __get_user(*fdset, ufdset))
+			return -EFAULT;
 	} else {
 		/* Tricky, must clear full unsigned long in the
 		 * kernel fdset at the end, this makes sure that
@@ -1630,14 +1632,14 @@ int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 }
 
 static
-void compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
-			unsigned long *fdset)
+int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+		      unsigned long *fdset)
 {
 	unsigned long odd;
 	nr = ROUND_UP(nr, __COMPAT_NFDBITS);
 
 	if (!ufdset)
-		return;
+		return 0;
 
 	odd = nr & 1UL;
 	nr &= ~1UL;
@@ -1645,13 +1647,14 @@ void compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 		unsigned long h, l;
 		l = *fdset++;
 		h = l >> 32;
-		__put_user(l, ufdset);
-		__put_user(h, ufdset+1);
+		if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
+			return -EFAULT;
 		ufdset += 2;
 		nr -= 2;
 	}
-	if (odd)
-		__put_user(*fdset, ufdset);
+	if (odd && __put_user(*fdset, ufdset))
+		return -EFAULT;
+	return 0;
 }
 
 
@@ -1726,10 +1729,10 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 		ret = 0;
 	}
 
-	compat_set_fd_set(n, inp, fds.res_in);
-	compat_set_fd_set(n, outp, fds.res_out);
-	compat_set_fd_set(n, exp, fds.res_ex);
-
+	if (compat_set_fd_set(n, inp, fds.res_in) ||
+	    compat_set_fd_set(n, outp, fds.res_out) ||
+	    compat_set_fd_set(n, exp, fds.res_ex))
+		ret = -EFAULT;
 out:
 	kfree(bits);
 out_nofds:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a91f262..6e31776 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -211,8 +211,10 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned lon
 	up_native =
 		compat_alloc_user_space(sizeof(struct video_still_picture));
 
-	put_user(compat_ptr(fp), &up_native->iFrame);
-	put_user(size, &up_native->size);
+	err =  put_user(compat_ptr(fp), &up_native->iFrame);
+	err |= put_user(size, &up_native->size);
+	if (err)
+		return -EFAULT;
 
 	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
 
@@ -236,8 +238,10 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned
 	err |= get_user(length, &up->length);
 
 	up_native = compat_alloc_user_space(sizeof(struct video_spu_palette));
-	put_user(compat_ptr(palp), &up_native->palette);
-	put_user(length, &up_native->length);
+	err  = put_user(compat_ptr(palp), &up_native->palette);
+	err |= put_user(length, &up_native->length);
+	if (err)
+		return -EFAULT;
 
 	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
 
@@ -2043,16 +2047,19 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
         struct serial_struct ss;
         mm_segment_t oldseg = get_fs();
         __u32 udata;
+	unsigned int base;
 
         if (cmd == TIOCSSERIAL) {
                 if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
                         return -EFAULT;
                 if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
 			return -EFAULT;
-                __get_user(udata, &ss32->iomem_base);
+                if (__get_user(udata, &ss32->iomem_base))
+			return -EFAULT;
                 ss.iomem_base = compat_ptr(udata);
-                __get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift);
-                __get_user(ss.port_high, &ss32->port_high);
+                if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
+		    __get_user(ss.port_high, &ss32->port_high))
+			return -EFAULT;
                 ss.iomap_base = 0UL;
         }
         set_fs(KERNEL_DS);
@@ -2063,12 +2070,12 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
                         return -EFAULT;
                 if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
 			return -EFAULT;
-                __put_user((unsigned long)ss.iomem_base  >> 32 ?
-                            0xffffffff : (unsigned)(unsigned long)ss.iomem_base,
-                            &ss32->iomem_base);
-                __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift);
-                __put_user(ss.port_high, &ss32->port_high);
-
+		base = (unsigned long)ss.iomem_base  >> 32 ?
+			0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
+		if (__put_user(base, &ss32->iomem_base) ||
+		    __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
+		    __put_user(ss.port_high, &ss32->port_high))
+			return -EFAULT;
         }
         return err;
 }
-- 
cgit v0.10.2


From 064b022c7adb2d853378078a9dc141f8288d1c73 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 6 Dec 2006 20:36:37 -0800
Subject: [PATCH] profile: fix uaccess handling

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/profile.c b/kernel/profile.c
index f940b46..15b012df 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,7 +442,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	read = 0;
 
 	while (p < sizeof(unsigned int) && count > 0) {
-		put_user(*((char *)(&sample_step)+p),buf);
+		if (put_user(*((char *)(&sample_step)+p),buf))
+			return -EFAULT;
 		buf++; p++; count--; read++;
 	}
 	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
-- 
cgit v0.10.2


From d3b8b6e5f20031890e09a8eab72fd596d2e2227d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:36:38 -0800
Subject: [PATCH] kconfig: PRINTK_TIME depends on PRINTK

Make PRINTK_TIME depend on PRINTK.  Only display/offer it if PRINTK is
enabled.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d367910..b75fed7 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1,6 +1,7 @@
 
 config PRINTK_TIME
 	bool "Show timing information on printks"
+	depends on PRINTK
 	help
 	  Selecting this option causes timing information to be
 	  included in printk output.  This allows you to measure
-- 
cgit v0.10.2


From c140e110019f25ffa1c6f3f365b0c9103d0b8475 Mon Sep 17 00:00:00 2001
From: Ryan Underwood <nemesis@icequake.net>
Date: Wed, 6 Dec 2006 20:36:38 -0800
Subject: [PATCH] parport_pc: Add support for OX16PCI952 parallel port

Add support for the parallel port (implemented as separate PCI function) on
the Oxford Semiconductor OX16PCI952.

Signed-off-by: Ryan Underwood <nemesis@icequake.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 39c9664..5749500f 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -2747,6 +2747,7 @@ enum parport_pc_pci_cards {
 	titan_1284p2,
 	avlab_1p,
 	avlab_2p,
+	oxsemi_952,
 	oxsemi_954,
 	oxsemi_840,
 	aks_0100,
@@ -2822,6 +2823,7 @@ static struct parport_pc_pci {
 	/* avlab_2p		*/	{ 2, { { 0, 1}, { 2, 3 },} },
 	/* The Oxford Semi cards are unusual: 954 doesn't support ECP,
 	 * and 840 locks up if you write 1 to bit 2! */
+	/* oxsemi_952 */		{ 1, { { 0, 1 }, } },
 	/* oxsemi_954 */		{ 1, { { 0, -1 }, } },
 	/* oxsemi_840 */		{ 1, { { 0, -1 }, } },
 	/* aks_0100 */                  { 1, { { 0, -1 }, } },
@@ -2895,6 +2897,8 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	/* PCI_VENDOR_ID_AVLAB/Intek21 has another bunch of cards ...*/
 	{ 0x14db, 0x2120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1p}, /* AFAVLAB_TK9902 */
 	{ 0x14db, 0x2121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_2p},
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI952PP,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_952 },
 	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954PP,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_954 },
 	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_12PCI840,
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c09da1e..dcdb90f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1864,6 +1864,7 @@
 #define PCI_DEVICE_ID_OXSEMI_16PCI95N	0x9511
 #define PCI_DEVICE_ID_OXSEMI_16PCI954PP	0x9513
 #define PCI_DEVICE_ID_OXSEMI_16PCI952	0x9521
+#define PCI_DEVICE_ID_OXSEMI_16PCI952PP	0x9523
 
 #define PCI_VENDOR_ID_SAMSUNG		0x144d
 
-- 
cgit v0.10.2


From 20aa7b21b1cbd1aa3fbf5fc14da5f7484a61a824 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:36:40 -0800
Subject: [PATCH] probe_kernel_address() needs to do set_fs()

probe_kernel_address() purports to be generic, only it forgot to select
KERNEL_DS, so it presently won't work right on all architectures.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 67918c2..76c3fe3 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -65,14 +65,22 @@ static inline unsigned long __copy_from_user_nocache(void *to,
  * do_page_fault() doesn't attempt to take mmap_sem.  This makes
  * probe_kernel_address() suitable for use within regions where the caller
  * already holds mmap_sem, or other locks which nest inside mmap_sem.
+ * This must be a macro because __get_user() needs to know the types of the
+ * args.
+ *
+ * We don't include enough header files to be able to do the set_fs().  We
+ * require that the probe_kernel_address() caller will do that.
  */
 #define probe_kernel_address(addr, retval)		\
 	({						\
 		long ret;				\
+		mm_segment_t old_fs = get_fs();		\
 							\
+		set_fs(KERNEL_DS);			\
 		pagefault_disable();			\
 		ret = __get_user(retval, addr);		\
 		pagefault_enable();			\
+		set_fs(old_fs);				\
 		ret;					\
 	})
 
-- 
cgit v0.10.2


From 138ae6631a3d6f86851dd53686fa88295d1398bd Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:36:41 -0800
Subject: [PATCH] slab: use probe_kernel_address()

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/slab.c b/mm/slab.c
index 7b8e5d6..86f5d6e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,12 @@
 #include	<linux/module.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
+#include	<linux/uaccess.h>
 #include	<linux/nodemask.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
 #include	<linux/rtmutex.h>
 
-#include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
 #include	<asm/tlbflush.h>
 #include	<asm/page.h>
@@ -2124,7 +2124,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each_entry(pc, &cache_chain, next) {
-		mm_segment_t old_fs = get_fs();
 		char tmp;
 		int res;
 
@@ -2133,9 +2132,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		 * destroy its slab cache and no-one else reuses the vmalloc
 		 * area of the module.  Print a warning.
 		 */
-		set_fs(KERNEL_DS);
-		res = __get_user(tmp, pc->name);
-		set_fs(old_fs);
+		res = probe_kernel_address(pc->name, tmp);
 		if (res) {
 			printk("SLAB: cache with size %d has lost its name\n",
 			       pc->buffer_size);
-- 
cgit v0.10.2


From 8bca98cabf6db738b06d6f3b6d4b6c5f2a5cb7b6 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 6 Dec 2006 20:36:43 -0800
Subject: [PATCH] paride: return proper error code

This patch makes module init return proper value instead of -1 (-EPERM).

Cc: Tim Waugh <tim@cyberelk.net>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index ac5ba46..c852eed 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -912,12 +912,12 @@ static int __init pcd_init(void)
 	int unit;
 
 	if (disable)
-		return -1;
+		return -EINVAL;
 
 	pcd_init_units();
 
 	if (pcd_detect())
-		return -1;
+		return -ENODEV;
 
 	/* get the atapi capabilities page */
 	pcd_probe_capabilities();
@@ -925,7 +925,7 @@ static int __init pcd_init(void)
 	if (register_blkdev(major, name)) {
 		for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
 			put_disk(cd->disk);
-		return -1;
+		return -EBUSY;
 	}
 
 	pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock);
@@ -933,7 +933,7 @@ static int __init pcd_init(void)
 		unregister_blkdev(major, name);
 		for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
 			put_disk(cd->disk);
-		return -1;
+		return -ENOMEM;
 	}
 
 	for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 1a9dee1..7cdaa19 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -933,25 +933,25 @@ static int __init pf_init(void)
 	int unit;
 
 	if (disable)
-		return -1;
+		return -EINVAL;
 
 	pf_init_units();
 
 	if (pf_detect())
-		return -1;
+		return -ENODEV;
 	pf_busy = 0;
 
 	if (register_blkdev(major, name)) {
 		for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
 			put_disk(pf->disk);
-		return -1;
+		return -EBUSY;
 	}
 	pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock);
 	if (!pf_queue) {
 		unregister_blkdev(major, name);
 		for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
 			put_disk(pf->disk);
-		return -1;
+		return -ENOMEM;
 	}
 
 	blk_queue_max_phys_segments(pf_queue, cluster);
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 13f998a..9970aed 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -646,14 +646,14 @@ static int __init pg_init(void)
 	int err;
 
 	if (disable){
-		err = -1;
+		err = -EINVAL;
 		goto out;
 	}
 
 	pg_init_units();
 
 	if (pg_detect()) {
-		err = -1;
+		err = -ENODEV;
 		goto out;
 	}
 
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 35fb266..c902b25 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -946,12 +946,12 @@ static int __init pt_init(void)
 	int err;
 
 	if (disable) {
-		err = -1;
+		err = -EINVAL;
 		goto out;
 	}
 
 	if (pt_detect()) {
-		err = -1;
+		err = -ENODEV;
 		goto out;
 	}
 
-- 
cgit v0.10.2


From 38da288b8ba2b07b4e07165027e650b61d7c8ffc Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Wed, 6 Dec 2006 20:36:46 -0800
Subject: [PATCH] read_cache_pages() cleanup

Use put_pages_list() instead of opencoding it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/readahead.c b/mm/readahead.c
index 23cb61a..a386f2b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -148,13 +148,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 		if (!pagevec_add(&lru_pvec, page))
 			__pagevec_lru_add(&lru_pvec);
 		if (ret) {
-			while (!list_empty(pages)) {
-				struct page *victim;
-
-				victim = list_to_page(pages);
-				list_del(&victim->lru);
-				page_cache_release(victim);
-			}
+			put_pages_list(pages);
 			break;
 		}
 	}
-- 
cgit v0.10.2


From 128fb95650b3273a8dc9ba5514b6fe7db8ea30bf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:50 -0800
Subject: [PATCH] taskstats_exit_alloc: optimize/simplify

If there are no listeners, every task does unneeded kmem_cache alloc/free on
exit. We don't need listeners->sem for 'if (!list_empty())' check. Yes, we may
have a false positive, but this doesn't differ from the case when the listener
is unregistered after we drop the semaphore. So we don't need to do allocation
beforehand.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f5f9201..d9d7c35 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -416,7 +416,6 @@ err:
 void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 {
 	struct listener_list *listeners;
-	struct taskstats *tmp;
 	/*
 	 * This is the cpu on which the task is exiting currently and will
 	 * be the one for which the exit event is sent, even if the cpu
@@ -424,19 +423,11 @@ void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 	 */
 	*mycpu = raw_smp_processor_id();
 
-	*ptidstats = NULL;
-	tmp = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-	if (!tmp)
-		return;
-
 	listeners = &per_cpu(listener_array, *mycpu);
-	down_read(&listeners->sem);
-	if (!list_empty(&listeners->list)) {
-		*ptidstats = tmp;
-		tmp = NULL;
-	}
-	up_read(&listeners->sem);
-	kfree(tmp);
+
+	*ptidstats = NULL;
+	if (!list_empty(&listeners->list))
+		*ptidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 }
 
 /* Send pid data out on exit */
-- 
cgit v0.10.2


From 115085ea0794c0f339be8f9d25505c7f9861d824 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:51 -0800
Subject: [PATCH] taskstats: cleanup do_exit() path

do_exit:
	taskstats_exit_alloc()
	...
	taskstats_exit_send()
	taskstats_exit_free()

I think this is not good, let it be a single function exported to the core
kernel, taskstats_exit(), which does alloc + send + free itself.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index ce8a912..f1261a5 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -15,12 +15,6 @@
 extern struct kmem_cache *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
 
-static inline void taskstats_exit_free(struct taskstats *tidstats)
-{
-	if (tidstats)
-		kmem_cache_free(taskstats_cache, tidstats);
-}
-
 static inline void taskstats_tgid_init(struct signal_struct *sig)
 {
 	sig->stats = NULL;
@@ -54,17 +48,10 @@ static inline void taskstats_tgid_free(struct signal_struct *sig)
 		kmem_cache_free(taskstats_cache, sig->stats);
 }
 
-extern void taskstats_exit_alloc(struct taskstats **, unsigned int *);
-extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int);
+extern void taskstats_exit(struct task_struct *, int group_dead);
 extern void taskstats_init_early(void);
 #else
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
-{}
-static inline void taskstats_exit_free(struct taskstats *ptidstats)
-{}
-static inline void taskstats_exit_send(struct task_struct *tsk,
-				       struct taskstats *tidstats,
-				       int group_dead, unsigned int cpu)
+static inline void taskstats_exit(struct task_struct *tsk, int group_dead)
 {}
 static inline void taskstats_tgid_init(struct signal_struct *sig)
 {}
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4..4e3f919 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,9 +850,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
-	struct taskstats *tidstats;
 	int group_dead;
-	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -890,8 +888,6 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats, &mycpu);
-
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
 		update_hiwater_rss(tsk->mm);
@@ -911,8 +907,8 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
-	taskstats_exit_free(tidstats);
+
+	taskstats_exit(tsk, group_dead);
 
 	exit_mm(tsk);
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d9d7c35..2654886 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -119,10 +119,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 /*
  * Send taskstats data in @skb to listeners registered for @cpu's exit data
  */
-static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+static void send_cpu_listeners(struct sk_buff *skb,
+					struct listener_list *listeners)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	struct listener_list *listeners;
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
@@ -135,7 +135,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 	}
 
 	rc = 0;
-	listeners = &per_cpu(listener_array, cpu);
 	down_read(&listeners->sem);
 	list_for_each_entry(s, &listeners->list, list) {
 		skb_next = NULL;
@@ -413,28 +412,12 @@ err:
 	return rc;
 }
 
-void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
-{
-	struct listener_list *listeners;
-	/*
-	 * This is the cpu on which the task is exiting currently and will
-	 * be the one for which the exit event is sent, even if the cpu
-	 * on which this function is running changes later.
-	 */
-	*mycpu = raw_smp_processor_id();
-
-	listeners = &per_cpu(listener_array, *mycpu);
-
-	*ptidstats = NULL;
-	if (!list_empty(&listeners->list))
-		*ptidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-}
-
 /* Send pid data out on exit */
-void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			int group_dead, unsigned int mycpu)
+void taskstats_exit(struct task_struct *tsk, int group_dead)
 {
 	int rc;
+	struct listener_list *listeners;
+	struct taskstats *tidstats;
 	struct sk_buff *rep_skb;
 	void *reply;
 	size_t size;
@@ -458,12 +441,17 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 		fill_tgid_exit(tsk);
 	}
 
+	listeners = &__raw_get_cpu_var(listener_array);
+	if (list_empty(&listeners->list))
+		return;
+
+	tidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 	if (!tidstats)
 		return;
 
 	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
-		goto ret;
+		goto free_stats;
 
 	rc = fill_pid(tsk->pid, tsk, tidstats);
 	if (rc < 0)
@@ -492,15 +480,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	nla_nest_end(rep_skb, na);
 
 send:
-	send_cpu_listeners(rep_skb, mycpu);
+	send_cpu_listeners(rep_skb, listeners);
+free_stats:
+	kmem_cache_free(taskstats_cache, tidstats);
 	return;
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
 err_skb:
 	nlmsg_free(rep_skb);
-ret:
-	return;
+	goto free_stats;
 }
 
 static struct genl_ops taskstats_ops = {
-- 
cgit v0.10.2


From 34ec12349c8a9505adc59d72f92b4595bc2483ff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:52 -0800
Subject: [PATCH] taskstats: cleanup ->signal->stats allocation

Allocate ->signal->stats on demand in taskstats_exit(), this allows us to
remove taskstats_tgid_alloc() (the last non-trivial inline) from taskstat's
public interface.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index f1261a5..7e9680f 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -20,28 +20,6 @@ static inline void taskstats_tgid_init(struct signal_struct *sig)
 	sig->stats = NULL;
 }
 
-static inline void taskstats_tgid_alloc(struct task_struct *tsk)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct taskstats *stats;
-
-	if (sig->stats != NULL)
-		return;
-
-	/* No problem if kmem_cache_zalloc() fails */
-	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (!sig->stats) {
-		sig->stats = stats;
-		stats = NULL;
-	}
-	spin_unlock_irq(&tsk->sighand->siglock);
-
-	if (stats)
-		kmem_cache_free(taskstats_cache, stats);
-}
-
 static inline void taskstats_tgid_free(struct signal_struct *sig)
 {
 	if (sig->stats)
@@ -55,8 +33,6 @@ static inline void taskstats_exit(struct task_struct *tsk, int group_dead)
 {}
 static inline void taskstats_tgid_init(struct signal_struct *sig)
 {}
-static inline void taskstats_tgid_alloc(struct task_struct *tsk)
-{}
 static inline void taskstats_tgid_free(struct signal_struct *sig)
 {}
 static inline void taskstats_init_early(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index f37980d..6588381 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -847,7 +847,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
-		taskstats_tgid_alloc(current);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2654886..7d793d6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -412,6 +412,30 @@ err:
 	return rc;
 }
 
+static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct taskstats *stats;
+
+	if (sig->stats || thread_group_empty(tsk))
+		goto ret;
+
+	/* No problem if kmem_cache_zalloc() fails */
+	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!sig->stats) {
+		sig->stats = stats;
+		stats = NULL;
+	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	if (stats)
+		kmem_cache_free(taskstats_cache, stats);
+ret:
+	return sig->stats;
+}
+
 /* Send pid data out on exit */
 void taskstats_exit(struct task_struct *tsk, int group_dead)
 {
@@ -433,7 +457,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	is_thread_group = (tsk->signal->stats != NULL);
+	is_thread_group = !!taskstats_tgid_alloc(tsk);
 	if (is_thread_group) {
 		/* PID + STATS + TGID + STATS */
 		size = 2 * size;
-- 
cgit v0.10.2


From 68062b86fc0f480b806d270a8278709a5a41bb67 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:53 -0800
Subject: [PATCH] taskstats: factor out reply assembling

Introduce mk_reply() helper which does all nla_put()s on reply.

Saves 453 bytes and a preparation for the next patch.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 7d793d6..d2a4133 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -343,6 +343,25 @@ static int parse(struct nlattr *na, cpumask_t *mask)
 	return ret;
 }
 
+static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *stats)
+{
+	struct nlattr *na;
+	int aggr;
+
+	aggr = TASKSTATS_TYPE_AGGR_TGID;
+	if (type == TASKSTATS_TYPE_PID)
+		aggr = TASKSTATS_TYPE_AGGR_PID;
+
+	na = nla_nest_start(skb, aggr);
+	NLA_PUT_U32(skb, type, pid);
+	NLA_PUT_TYPE(skb, struct taskstats, TASKSTATS_TYPE_STATS, *stats);
+	nla_nest_end(skb, na);
+
+	return 0;
+nla_put_failure:
+	return -1;
+}
+
 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
@@ -350,7 +369,6 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	struct taskstats stats;
 	void *reply;
 	size_t size;
-	struct nlattr *na;
 	cpumask_t mask;
 
 	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
@@ -382,27 +400,21 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 		if (rc < 0)
 			goto err;
 
-		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
-		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
-		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-				stats);
+		if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid, &stats))
+			goto nla_put_failure;
 	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
 		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
 		rc = fill_tgid(tgid, NULL, &stats);
 		if (rc < 0)
 			goto err;
 
-		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
-		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
-		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-				stats);
+		if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid, &stats))
+			goto nla_put_failure;
 	} else {
 		rc = -EINVAL;
 		goto err;
 	}
 
-	nla_nest_end(rep_skb, na);
-
 	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
@@ -446,7 +458,6 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	void *reply;
 	size_t size;
 	int is_thread_group;
-	struct nlattr *na;
 
 	if (!family_registered)
 		return;
@@ -481,27 +492,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (rc < 0)
 		goto err_skb;
 
-	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
-	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
-	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tidstats);
-	nla_nest_end(rep_skb, na);
-
-	if (!is_thread_group)
-		goto send;
+	if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid, tidstats))
+		goto nla_put_failure;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (!group_dead)
+	if (!is_thread_group || !group_dead)
 		goto send;
 
-	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
-	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
-	/* No locking needed for tsk->signal->stats since group is dead */
-	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tsk->signal->stats);
-	nla_nest_end(rep_skb, na);
+	if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid, tsk->signal->stats))
+		goto nla_put_failure;
 
 send:
 	send_cpu_listeners(rep_skb, listeners);
-- 
cgit v0.10.2


From 51de4d90852ba4cfa5743594ec4a7f158b52dc43 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:54 -0800
Subject: [PATCH] taskstats: use nla_reserve() for reply assembling

Currently taskstats_user_cmd()/taskstats_exit() do:

	1) allocate stats
	2) fill stats
	3) make a temporary copy on stack (236 bytes)
	4) copy that copy to skb
	5) free stats

With the help of nla_reserve() we can operate on skb->data directly,
thus avoiding all these steps except 2).

So, before this patch:

	// copy *stats to skb->data
	int mk_reply(skb, ..., struct taskstats *stats);

	fill_pid(stats);
	mk_reply(skb, ..., stats);

After:
	// return a pointer to skb->data
	struct taskstats *mk_reply(skb, ...);

	stat = mk_reply(skb, ...);
	fill_pid(stats);

Shrinks taskatsks.o by 162 bytes.

A stupid benchmark (send one million TASKSTATS_CMD_ATTR_PID) shows the

		real user sys
	before:
		4.02 0.06 3.96
		4.02 0.04 3.98
		4.02 0.04 3.97
	after:
		3.86 0.08 3.78
		3.88 0.10 3.77
		3.89 0.09 3.80

but this looks suspiciously good.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d2a4133..b0aad99 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -185,6 +185,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
 	} else
 		get_task_struct(tsk);
 
+	memset(stats, 0, sizeof(*stats));
 	/*
 	 * Each accounting subsystem adds calls to its functions to
 	 * fill in relevant parts of struct taskstsats as follows
@@ -227,6 +228,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
 
 	if (first->signal->stats)
 		memcpy(stats, first->signal->stats, sizeof(*stats));
+	else
+		memset(stats, 0, sizeof(*stats));
 
 	tsk = first;
 	do {
@@ -343,9 +346,9 @@ static int parse(struct nlattr *na, cpumask_t *mask)
 	return ret;
 }
 
-static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *stats)
+static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 {
-	struct nlattr *na;
+	struct nlattr *na, *ret;
 	int aggr;
 
 	aggr = TASKSTATS_TYPE_AGGR_TGID;
@@ -353,20 +356,23 @@ static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *st
 		aggr = TASKSTATS_TYPE_AGGR_PID;
 
 	na = nla_nest_start(skb, aggr);
-	NLA_PUT_U32(skb, type, pid);
-	NLA_PUT_TYPE(skb, struct taskstats, TASKSTATS_TYPE_STATS, *stats);
+	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+		goto err;
+	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+	if (!ret)
+		goto err;
 	nla_nest_end(skb, na);
 
-	return 0;
-nla_put_failure:
-	return -1;
+	return nla_data(ret);
+err:
+	return NULL;
 }
 
 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
 	struct sk_buff *rep_skb;
-	struct taskstats stats;
+	struct taskstats *stats;
 	void *reply;
 	size_t size;
 	cpumask_t mask;
@@ -389,36 +395,36 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	memset(&stats, 0, sizeof(stats));
 	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
 		return rc;
 
+	rc = -EINVAL;
 	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
 		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
-		rc = fill_pid(pid, NULL, &stats);
-		if (rc < 0)
-			goto err;
+		stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
+		if (!stats)
+			goto nla_err;
 
-		if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid, &stats))
-			goto nla_put_failure;
+		rc = fill_pid(pid, NULL, stats);
+		if (rc < 0)
+			goto nla_err;
 	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
 		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
-		rc = fill_tgid(tgid, NULL, &stats);
-		if (rc < 0)
-			goto err;
+		stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+		if (!stats)
+			goto nla_err;
 
-		if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid, &stats))
-			goto nla_put_failure;
-	} else {
-		rc = -EINVAL;
+		rc = fill_tgid(tgid, NULL, stats);
+		if (rc < 0)
+			goto nla_err;
+	} else
 		goto err;
-	}
 
 	return send_reply(rep_skb, info->snd_pid);
 
-nla_put_failure:
-	rc = genlmsg_cancel(rep_skb, reply);
+nla_err:
+	genlmsg_cancel(rep_skb, reply);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
@@ -453,7 +459,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 {
 	int rc;
 	struct listener_list *listeners;
-	struct taskstats *tidstats;
+	struct taskstats *stats;
 	struct sk_buff *rep_skb;
 	void *reply;
 	size_t size;
@@ -480,20 +486,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (list_empty(&listeners->list))
 		return;
 
-	tidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-	if (!tidstats)
-		return;
-
 	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
-		goto free_stats;
+		return;
 
-	rc = fill_pid(tsk->pid, tsk, tidstats);
-	if (rc < 0)
-		goto err_skb;
+	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
+	if (!stats)
+		goto nla_err;
 
-	if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid, tidstats))
-		goto nla_put_failure;
+	rc = fill_pid(tsk->pid, tsk, stats);
+	if (rc < 0)
+		goto nla_err;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -501,20 +504,19 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (!is_thread_group || !group_dead)
 		goto send;
 
-	if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid, tsk->signal->stats))
-		goto nla_put_failure;
+	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
+	if (!stats)
+		goto nla_err;
+
+	memcpy(stats, tsk->signal->stats, sizeof(*stats));
 
 send:
 	send_cpu_listeners(rep_skb, listeners);
-free_stats:
-	kmem_cache_free(taskstats_cache, tidstats);
 	return;
 
-nla_put_failure:
+nla_err:
 	genlmsg_cancel(rep_skb, reply);
-err_skb:
 	nlmsg_free(rep_skb);
-	goto free_stats;
 }
 
 static struct genl_ops taskstats_ops = {
-- 
cgit v0.10.2


From 37167485302c8876cb0303af113696e88c2945aa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:55 -0800
Subject: [PATCH] taskstats: cleanup reply assembling

Thomas Graf wrote:
>
> nla_nest_start() may return NULL, either rely on prepare_reply() to be
> correct and BUG() on failure or do proper error handling for all
> functions.

nla_put() in taskstat.c can fail only if the 'size' argument of alloc_skb()
was not right. This is a kernel bug, we should not hide it. So add 'BUG()'
on error path and check for 'na == NULL'.

> genlmsg_cancel() is only required in error paths for dumping
> procedures.

So we can remove 'genlmsg_cancel()' calls and 'void *reply' (saves 227 bytes).

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b0aad99..4c3476f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -69,7 +69,7 @@ enum actions {
 };
 
 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
-			void **replyp, size_t size)
+				size_t size)
 {
 	struct sk_buff *skb;
 	void *reply;
@@ -94,7 +94,6 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	}
 
 	*skbp = skb;
-	*replyp = reply;
 	return 0;
 }
 
@@ -351,11 +350,13 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 	struct nlattr *na, *ret;
 	int aggr;
 
-	aggr = TASKSTATS_TYPE_AGGR_TGID;
-	if (type == TASKSTATS_TYPE_PID)
-		aggr = TASKSTATS_TYPE_AGGR_PID;
+	aggr = (type == TASKSTATS_TYPE_PID)
+			? TASKSTATS_TYPE_AGGR_PID
+			: TASKSTATS_TYPE_AGGR_TGID;
 
 	na = nla_nest_start(skb, aggr);
+	if (!na)
+		goto err;
 	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
 		goto err;
 	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
@@ -373,7 +374,6 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	int rc = 0;
 	struct sk_buff *rep_skb;
 	struct taskstats *stats;
-	void *reply;
 	size_t size;
 	cpumask_t mask;
 
@@ -395,7 +395,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
 	if (rc < 0)
 		return rc;
 
@@ -404,27 +404,24 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
 		stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
 		if (!stats)
-			goto nla_err;
+			goto err;
 
 		rc = fill_pid(pid, NULL, stats);
 		if (rc < 0)
-			goto nla_err;
+			goto err;
 	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
 		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
 		stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
 		if (!stats)
-			goto nla_err;
+			goto err;
 
 		rc = fill_tgid(tgid, NULL, stats);
 		if (rc < 0)
-			goto nla_err;
+			goto err;
 	} else
 		goto err;
 
 	return send_reply(rep_skb, info->snd_pid);
-
-nla_err:
-	genlmsg_cancel(rep_skb, reply);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
@@ -461,7 +458,6 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	struct listener_list *listeners;
 	struct taskstats *stats;
 	struct sk_buff *rep_skb;
-	void *reply;
 	size_t size;
 	int is_thread_group;
 
@@ -486,17 +482,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (list_empty(&listeners->list))
 		return;
 
-	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
 	if (rc < 0)
 		return;
 
 	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
 	if (!stats)
-		goto nla_err;
+		goto err;
 
 	rc = fill_pid(tsk->pid, tsk, stats);
 	if (rc < 0)
-		goto nla_err;
+		goto err;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -506,16 +502,14 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 
 	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
 	if (!stats)
-		goto nla_err;
+		goto err;
 
 	memcpy(stats, tsk->signal->stats, sizeof(*stats));
 
 send:
 	send_cpu_listeners(rep_skb, listeners);
 	return;
-
-nla_err:
-	genlmsg_cancel(rep_skb, reply);
+err:
 	nlmsg_free(rep_skb);
 }
 
-- 
cgit v0.10.2


From 9774a1f54f173ad18e816496c8979f1bf8ef666a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 6 Dec 2006 20:36:56 -0800
Subject: [PATCH] Compile-time check re world-writeable module params

One of the mistakes a module_param() user can make is to supply default
value of module parameter as the last argument.  module_param() accepts
permissions instead.  If default value is, say, 3 (-------wx), parameter
becomes world-writeable.

So far, the only remedy was to apply grep(1) and read drivers submitted
to -mm. BTDT.

With this patch applied, compiler will finally do some job.

*) bounds checking on permissions
*) world-writeable bit checking on permissions
*) compile breakage if checks trigger

First version of this check (only "& 2" part) directly caught 4 out of 7
places during my last grep.

    Subject: Neverending module_param() bugs
    [X] drivers/acpi/sbs.c:101:module_param(capacity_mode, int, CAPACITY_UNIT);
    [X] drivers/acpi/sbs.c:102:module_param(update_mode, int, UPDATE_MODE);
    [ ] drivers/acpi/sbs.c:103:module_param(update_info_mode, int, UPDATE_INFO_MODE);
    [ ] drivers/acpi/sbs.c:104:module_param(update_time, int, UPDATE_TIME);
    [ ] drivers/acpi/sbs.c:105:module_param(update_time2, int, UPDATE_TIME2);
    [X] drivers/char/watchdog/sbc8360.c:203:module_param(timeout, int, 27);
    [X] drivers/media/video/tuner-simple.c:13:module_param(offset, int, 0666);

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 7c0c2c1..4a189da 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -63,6 +63,9 @@ struct kparam_array
    not there, read bits mean it's readable, write bits mean it's
    writable. */
 #define __module_param_call(prefix, name, set, get, arg, perm)		\
+	/* Default value instead of permissions? */			\
+	static int __param_perm_check_##name __attribute__((unused)) =	\
+	BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2));	\
 	static char __param_str_##name[] = prefix #name;		\
 	static struct kernel_param const __param_##name			\
 	__attribute_used__						\
-- 
cgit v0.10.2


From f89d75f224dc530f1c173d9093f75865345840a1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:36:59 -0800
Subject: [PATCH] lockdep: annotate bcsp driver

    =============================================
    [ INFO: possible recursive locking detected ]
    2.6.18-1.2699.fc6 #1
    ---------------------------------------------
    swapper/0 is trying to acquire lock:
     (&list->lock#3){+...}, at: [<c05ad307>] skb_dequeue+0x12/0x43

    but task is already holding lock:
     (&list->lock#3){+...}, at: [<df98cd79>] bcsp_dequeue+0x6a/0x11e [hci_uart]

Two different list locks nest, annotate so.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c
index d0cface..5e2c318 100644
--- a/drivers/bluetooth/hci_bcsp.c
+++ b/drivers/bluetooth/hci_bcsp.c
@@ -330,7 +330,7 @@ static struct sk_buff *bcsp_dequeue(struct hci_uart *hu)
 	   reliable packet if the number of packets sent but not yet ack'ed
 	   is < than the winsize */
 
-	spin_lock_irqsave(&bcsp->unack.lock, flags);
+	spin_lock_irqsave_nested(&bcsp->unack.lock, flags, SINGLE_DEPTH_NESTING);
 
 	if (bcsp->unack.qlen < BCSP_TXWINSIZE && (skb = skb_dequeue(&bcsp->rel)) != NULL) {
 		struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, bt_cb(skb)->pkt_type);
@@ -696,7 +696,7 @@ static void bcsp_timed_event(unsigned long arg)
 
 	BT_DBG("hu %p retransmitting %u pkts", hu, bcsp->unack.qlen);
 
-	spin_lock_irqsave(&bcsp->unack.lock, flags);
+	spin_lock_irqsave_nested(&bcsp->unack.lock, flags, SINGLE_DEPTH_NESTING);
 
 	while ((skb = __skb_dequeue_tail(&bcsp->unack)) != NULL) {
 		bcsp->msgq_txseq = (bcsp->msgq_txseq - 1) & 0x07;
-- 
cgit v0.10.2


From e0980dafa329d33bb88edc8a3ef9fab4e070590c Mon Sep 17 00:00:00 2001
From: Paul B Schroeder <pschroeder@uplogix.com>
Date: Wed, 6 Dec 2006 20:37:03 -0800
Subject: [PATCH] Exar quad port serial

This is on our "Envoy" boxes which we have, according to the documentation, an
"Exar ST16C554/554D Quad UART with 16-byte Fifo's".  The box also has two
other "on-board" serial ports and a modem chip.

The two on-board serial UARTs were being detected along with the first two
Exar UARTs.  The last two Exar UARTs were not showing up and neither was the
modem.

This patch was the only way I could the kernel to see beyond the standard four
serial ports and get all four of the Exar UARTs to show up.

[akpm@osdl.org: build fix]
Signed-off-by:  Paul B Schroeder <pschroeder@uplogix.com>
Cc: Lennart Sorensen <lsorense@csclub.uwaterloo.ca>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/serial/8250_exar_st16c554.c b/drivers/serial/8250_exar_st16c554.c
new file mode 100644
index 0000000..567143a
--- /dev/null
+++ b/drivers/serial/8250_exar_st16c554.c
@@ -0,0 +1,52 @@
+/*
+ *  linux/drivers/serial/8250_exar.c
+ *
+ *  Written by Paul B Schroeder < pschroeder "at" uplogix "dot" com >
+ *  Based on 8250_boca.
+ *
+ *  Copyright (C) 2005 Russell King.
+ *  Data taken from include/asm-i386/serial.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/serial_8250.h>
+
+#define PORT(_base,_irq)				\
+	{						\
+		.iobase		= _base,		\
+		.irq		= _irq,			\
+		.uartclk	= 1843200,		\
+		.iotype		= UPIO_PORT,		\
+		.flags		= UPF_BOOT_AUTOCONF,	\
+	}
+
+static struct plat_serial8250_port exar_data[] = {
+	PORT(0x100, 5),
+	PORT(0x108, 5),
+	PORT(0x110, 5),
+	PORT(0x118, 5),
+	{ },
+};
+
+static struct platform_device exar_device = {
+	.name			= "serial8250",
+	.id			= PLAT8250_DEV_EXAR_ST16C554,
+	.dev			= {
+		.platform_data	= exar_data,
+	},
+};
+
+static int __init exar_init(void)
+{
+	return platform_device_register(&exar_device);
+}
+
+module_init(exar_init);
+
+MODULE_AUTHOR("Paul B Schroeder");
+MODULE_DESCRIPTION("8250 serial probe module for Exar cards");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index e936c91..fc12d5d 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -210,6 +210,17 @@ config SERIAL_8250_BOCA
 	  To compile this driver as a module, choose M here: the module
 	  will be called 8250_boca.
 
+config SERIAL_8250_EXAR_ST16C554
+	tristate "Support Exar ST16C554/554D Quad UART"
+	depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS
+	help
+	  The Uplogix Envoy TU301 uses this Exar Quad UART.  If you are
+	  tinkering with your Envoy TU301, or have a machine with this UART,
+	  say Y here.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called 8250_exar_st16c554.
+
 config SERIAL_8250_HUB6
 	tristate "Support Hub6 cards"
 	depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 0dba001..df3632c 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SERIAL_8250_CONSOLE) += 8250_early.o
 obj-$(CONFIG_SERIAL_8250_FOURPORT) += 8250_fourport.o
 obj-$(CONFIG_SERIAL_8250_ACCENT) += 8250_accent.o
 obj-$(CONFIG_SERIAL_8250_BOCA) += 8250_boca.o
+obj-$(CONFIG_SERIAL_8250_EXAR_ST16C554) += 8250_exar_st16c554.o
 obj-$(CONFIG_SERIAL_8250_HUB6) += 8250_hub6.o
 obj-$(CONFIG_SERIAL_8250_MCA) += 8250_mca.o
 obj-$(CONFIG_SERIAL_8250_AU1X00) += 8250_au1x00.o
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 8e96814..71310d8 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -41,6 +41,7 @@ enum {
 	PLAT8250_DEV_FOURPORT,
 	PLAT8250_DEV_ACCENT,
 	PLAT8250_DEV_BOCA,
+	PLAT8250_DEV_EXAR_ST16C554,
 	PLAT8250_DEV_HUB6,
 	PLAT8250_DEV_MCA,
 	PLAT8250_DEV_AU1X00,
-- 
cgit v0.10.2


From 4a6e617a4bec9fb2ee4a16cf59565b2af5049e12 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 6 Dec 2006 20:37:04 -0800
Subject: [PATCH] fs/*: trivial vsnprintf() conversion

It would very lame to get buffer overflow via one of the following.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index e5a205c..5023351 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -36,7 +36,7 @@ void __adfs_error(struct super_block *sb, const char *function, const char *fmt,
 	va_list args;
 
 	va_start(args, fmt);
-	vsprintf(error_buf, fmt, args);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
 	va_end(args);
 
 	printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n",
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index ccd624e..f4de4b9 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -445,7 +445,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
 	va_list	 args;
 
 	va_start(args,fmt);
-	vsprintf(ErrorBuffer,fmt,args);
+	vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
 	va_end(args);
 
 	printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id,
@@ -461,7 +461,7 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
 	va_list	 args;
 
 	va_start(args,fmt);
-	vsprintf(ErrorBuffer,fmt,args);
+	vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
 	va_end(args);
 
 	printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index ca3e191..846ac8f 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -93,7 +93,7 @@ void jfs_error(struct super_block *sb, const char * function, ...)
 	va_list args;
 
 	va_start(args, function);
-	vsprintf(error_buf, function, args);
+	vsnprintf(error_buf, sizeof(error_buf), function, args);
 	va_end(args);
 
 	printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0524f8a..4bf3954 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1674,7 +1674,7 @@ void __ocfs2_error(struct super_block *sb,
 	va_list args;
 
 	va_start(args, fmt);
-	vsprintf(error_buf, fmt, args);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
 	va_end(args);
 
 	/* Not using mlog here because we want to show the actual
@@ -1695,7 +1695,7 @@ void __ocfs2_abort(struct super_block* sb,
 	va_list args;
 
 	va_start(args, fmt);
-	vsprintf(error_buf, fmt, args);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
 	va_end(args);
 
 	printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 397f54a..1dbc295 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1709,7 +1709,7 @@ void udf_error(struct super_block *sb, const char *function,
 		sb->s_dirt = 1;
 	}
 	va_start(args, fmt);
-	vsprintf(error_buf, fmt, args);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
 	va_end(args);
 	printk (KERN_CRIT "UDF-fs error (device %s): %s: %s\n",
 		sb->s_id, function, error_buf);
@@ -1721,7 +1721,7 @@ void udf_warning(struct super_block *sb, const char *function,
 	va_list args;
 
 	va_start (args, fmt);
-	vsprintf(error_buf, fmt, args);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
 	va_end(args);
 	printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n",
 		sb->s_id, function, error_buf);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 0b18d2c..8a8e938 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -224,7 +224,7 @@ void ufs_error (struct super_block * sb, const char * function,
 		sb->s_flags |= MS_RDONLY;
 	}
 	va_start (args, fmt);
-	vsprintf (error_buf, fmt, args);
+	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
 	va_end (args);
 	switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) {
 	case UFS_MOUNT_ONERROR_PANIC:
@@ -255,7 +255,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 		sb->s_dirt = 1;
 	}
 	va_start (args, fmt);
-	vsprintf (error_buf, fmt, args);
+	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
 	va_end (args);
 	sb->s_flags |= MS_RDONLY;
 	printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n",
@@ -268,7 +268,7 @@ void ufs_warning (struct super_block * sb, const char * function,
 	va_list args;
 
 	va_start (args, fmt);
-	vsprintf (error_buf, fmt, args);
+	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
 	va_end (args);
 	printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n",
 		sb->s_id, function, error_buf);
-- 
cgit v0.10.2


From 352d94d040053d93bf1cf4acb4be9635e69d9200 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 6 Dec 2006 20:37:04 -0800
Subject: [PATCH] hpfs: bring hpfs_error() into shape

 - switch to error message buffer in .bss
 - missing va_end() (htf it worked before?)
 - use vsnprintf()
 - rename variables to understandable "fmt", "args".
 - "const char *fmt", yes.
 - add __attribute__((format ...

Still, put that coffee down before reading more.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 32ab51e..1c07aa8 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -317,7 +317,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
 
 /* super.c */
 
-void hpfs_error(struct super_block *, char *, ...);
+void hpfs_error(struct super_block *, const char *, ...)
+	__attribute__((format (printf, 2, 3)));
 int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
 unsigned hpfs_count_one_bitmap(struct super_block *, secno);
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 34d68e2..d4abc1a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -46,21 +46,17 @@ static void unmark_dirty(struct super_block *s)
 }
 
 /* Filesystem error... */
+static char err_buf[1024];
 
-#define ERR_BUF_SIZE 1024
-
-void hpfs_error(struct super_block *s, char *m,...)
+void hpfs_error(struct super_block *s, const char *fmt, ...)
 {
-	char *buf;
-	va_list l;
-	va_start(l, m);
-	if (!(buf = kmalloc(ERR_BUF_SIZE, GFP_KERNEL)))
-		printk("HPFS: No memory for error message '%s'\n",m);
-	else if (vsprintf(buf, m, l) >= ERR_BUF_SIZE)
-		printk("HPFS: Grrrr... Kernel memory corrupted ... going on, but it'll crash very soon :-(\n");
-	printk("HPFS: filesystem error: ");
-	if (buf) printk("%s", buf);
-	else printk("%s\n",m);
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(err_buf, sizeof(err_buf), fmt, args);
+	va_end(args);
+
+	printk("HPFS: filesystem error: %s", err_buf);
 	if (!hpfs_sb(s)->sb_was_error) {
 		if (hpfs_sb(s)->sb_err == 2) {
 			printk("; crashing the system because you wanted it\n");
@@ -76,7 +72,6 @@ void hpfs_error(struct super_block *s, char *m,...)
 		} else if (s->s_flags & MS_RDONLY) printk("; going on - but anything won't be destroyed because it's read-only\n");
 		else printk("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n");
 	} else printk("\n");
-	kfree(buf);
 	hpfs_sb(s)->sb_was_error = 1;
 }
 
-- 
cgit v0.10.2


From 18debbbcce1306f0bbb1c71cf587fd90413acab6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:37:05 -0800
Subject: [PATCH] hpfs: fix printk format warnings

Fix hpfs printk warnings:

  fs/hpfs/dir.c:87: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'long unsigned int'
  fs/hpfs/dir.c:147: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'long int'
  fs/hpfs/dir.c:148: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'long int'
  fs/hpfs/dnode.c:537: warning: format '%08x' expects type 'unsigned int', but argument 5 has type 'long unsigned int'
  fs/hpfs/dnode.c:854: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'loff_t'
  fs/hpfs/ea.c:247: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'long unsigned int'
  fs/hpfs/inode.c:254: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'long unsigned int'
  fs/hpfs/map.c:129: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'ino_t'
  fs/hpfs/map.c:135: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'ino_t'
  fs/hpfs/map.c:140: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'ino_t'
  fs/hpfs/map.c:147: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'ino_t'
  fs/hpfs/map.c:154: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'ino_t'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index ecc9180..594f9c4 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -84,7 +84,8 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		if (!fno->dirflag) {
 			e = 1;
-			hpfs_error(inode->i_sb, "not a directory, fnode %08x",inode->i_ino);
+			hpfs_error(inode->i_sb, "not a directory, fnode %08lx",
+					(unsigned long)inode->i_ino);
 		}
 		if (hpfs_inode->i_dno != fno->u.external[0].disk_secno) {
 			e = 1;
@@ -144,8 +145,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		if (de->first || de->last) {
 			if (hpfs_sb(inode->i_sb)->sb_chk) {
-				if (de->first && !de->last && (de->namelen != 2 || de ->name[0] != 1 || de->name[1] != 1)) hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08x", old_pos);
-				if (de->last && (de->namelen != 1 || de ->name[0] != 255)) hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08x", old_pos);
+				if (de->first && !de->last && (de->namelen != 2
+				    || de ->name[0] != 1 || de->name[1] != 1))
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos);
+				if (de->last && (de->namelen != 1 || de ->name[0] != 255))
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos);
 			}
 			hpfs_brelse4(&qbh);
 			goto again;
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 229ff2f..fe83c2b 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -533,10 +533,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
 			struct buffer_head *bh;
 			struct dnode *d1;
 			struct quad_buffer_head qbh1;
-			if (hpfs_sb(i->i_sb)->sb_chk) if (up != i->i_ino) {
-				hpfs_error(i->i_sb, "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08x", dno, up, i->i_ino);
+			if (hpfs_sb(i->i_sb)->sb_chk)
+			    if (up != i->i_ino) {
+				hpfs_error(i->i_sb,
+					"bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx",
+					dno, up, (unsigned long)i->i_ino);
 				return;
-			}
+			    }
 			if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) {
 				d1->up = up;
 				d1->root_dnode = 1;
@@ -851,7 +854,9 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
 	/* Going to the next dirent */
 	if ((d = de_next_de(de)) < dnode_end_de(dnode)) {
 		if (!(++*posp & 077)) {
-			hpfs_error(inode->i_sb, "map_pos_dirent: pos crossed dnode boundary; pos = %08x", *posp);
+			hpfs_error(inode->i_sb,
+				"map_pos_dirent: pos crossed dnode boundary; pos = %08llx",
+				(unsigned long long)*posp);
 			goto bail;
 		}
 		/* We're going down the tree */
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 66339dc..547a838 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -243,8 +243,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data
 		fnode->ea_offs = 0xc4;
 	}
 	if (fnode->ea_offs < 0xc4 || fnode->ea_offs + fnode->acl_size_s + fnode->ea_size_s > 0x200) {
-		hpfs_error(s, "fnode %08x: ea_offs == %03x, ea_size_s == %03x",
-			inode->i_ino, fnode->ea_offs, fnode->ea_size_s);
+		hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x",
+			(unsigned long)inode->i_ino,
+			fnode->ea_offs, fnode->ea_size_s);
 		return;
 	}
 	if ((fnode->ea_size_s || !fnode->ea_size_l) &&
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 7faef85..85d3e1d 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -251,7 +251,10 @@ void hpfs_write_inode_nolock(struct inode *i)
 			de->file_size = 0;
 			hpfs_mark_4buffers_dirty(&qbh);
 			hpfs_brelse4(&qbh);
-		} else hpfs_error(i->i_sb, "directory %08x doesn't have '.' entry", i->i_ino);
+		} else
+			hpfs_error(i->i_sb,
+				"directory %08lx doesn't have '.' entry",
+				(unsigned long)i->i_ino);
 	}
 	mark_buffer_dirty(bh);
 	brelse(bh);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 0fecdac..c472458 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -126,32 +126,40 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea
 			struct extended_attribute *ea;
 			struct extended_attribute *ea_end;
 			if (fnode->magic != FNODE_MAGIC) {
-				hpfs_error(s, "bad magic on fnode %08x", ino);
+				hpfs_error(s, "bad magic on fnode %08lx",
+					(unsigned long)ino);
 				goto bail;
 			}
 			if (!fnode->dirflag) {
 				if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes !=
 				    (fnode->btree.internal ? 12 : 8)) {
-					hpfs_error(s, "bad number of nodes in fnode %08x", ino);
+					hpfs_error(s,
+					   "bad number of nodes in fnode %08lx",
+					    (unsigned long)ino);
 					goto bail;
 				}
 				if (fnode->btree.first_free !=
 				    8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 8 : 12)) {
-					hpfs_error(s, "bad first_free pointer in fnode %08x", ino);
+					hpfs_error(s,
+					    "bad first_free pointer in fnode %08lx",
+					    (unsigned long)ino);
 					goto bail;
 				}
 			}
 			if (fnode->ea_size_s && ((signed int)fnode->ea_offs < 0xc4 ||
 			   (signed int)fnode->ea_offs + fnode->acl_size_s + fnode->ea_size_s > 0x200)) {
-				hpfs_error(s, "bad EA info in fnode %08x: ea_offs == %04x ea_size_s == %04x",
-					ino, fnode->ea_offs, fnode->ea_size_s);
+				hpfs_error(s,
+					"bad EA info in fnode %08lx: ea_offs == %04x ea_size_s == %04x",
+					(unsigned long)ino,
+					fnode->ea_offs, fnode->ea_size_s);
 				goto bail;
 			}
 			ea = fnode_ea(fnode);
 			ea_end = fnode_end_ea(fnode);
 			while (ea != ea_end) {
 				if (ea > ea_end) {
-					hpfs_error(s, "bad EA in fnode %08x", ino);
+					hpfs_error(s, "bad EA in fnode %08lx",
+						(unsigned long)ino);
 					goto bail;
 				}
 				ea = next_ea(ea);
-- 
cgit v0.10.2


From 5ac29e62be2a581ec77953eca64d85ddeef488f0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 6 Dec 2006 20:37:06 -0800
Subject: [PATCH] drivers/cdrom/*: trivial vsnprintf() conversion

Fixing sbpcd.c baroque error printing in process.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/cdrom/optcd.c b/drivers/cdrom/optcd.c
index 25032d7..3541690 100644
--- a/drivers/cdrom/optcd.c
+++ b/drivers/cdrom/optcd.c
@@ -101,7 +101,7 @@ static void debug(int debug_this, const char* fmt, ...)
 		return;
 
 	va_start(args, fmt);
-	vsprintf(s, fmt, args);
+	vsnprintf(s, sizeof(s), fmt, args);
 	printk(KERN_DEBUG "optcd: %s\n", s);
 	va_end(args);
 }
diff --git a/drivers/cdrom/sbpcd.c b/drivers/cdrom/sbpcd.c
index ba50e5a..a1283b1 100644
--- a/drivers/cdrom/sbpcd.c
+++ b/drivers/cdrom/sbpcd.c
@@ -770,11 +770,10 @@ static void msg(int level, const char *fmt, ...)
 	
 	msgnum++;
 	if (msgnum>99) msgnum=0;
-	sprintf(buf, MSG_LEVEL "%s-%d [%02d]:  ", major_name, current_drive - D_S, msgnum);
 	va_start(args, fmt);
-	vsprintf(&buf[18], fmt, args);
+	vsnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
-	printk(buf);
+	printk(MSG_LEVEL "%s-%d [%02d]:  %s", major_name, current_drive - D_S, msgnum, buf);
 #if KLOGD_PAUSE
 	sbp_sleep(KLOGD_PAUSE); /* else messages get lost */
 #endif /* KLOGD_PAUSE */ 
-- 
cgit v0.10.2


From dc168427e6250a5a24c59f34afed6538092dab42 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@sw.ru>
Date: Wed, 6 Dec 2006 20:37:07 -0800
Subject: [PATCH] VFS: extra check inside dentry_unhash()

d_count check after dget() is always true.

Signed-off-by:	Vasily Averin <vvs@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/namei.c b/fs/namei.c
index 61f99c1..db1bca2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1998,8 +1998,7 @@ asmlinkage long sys_mkdir(const char __user *pathname, int mode)
 void dentry_unhash(struct dentry *dentry)
 {
 	dget(dentry);
-	if (atomic_read(&dentry->d_count))
-		shrink_dcache_parent(dentry);
+	shrink_dcache_parent(dentry);
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count) == 2)
-- 
cgit v0.10.2


From 5d469ec0f40d65b2a0a704402990a43b2dafe197 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Wed, 6 Dec 2006 20:37:08 -0800
Subject: [PATCH] Correct misc_register return code handling in several drivers

Clean up several code points in which the return code from misc_register is
not handled properly.

Several modules failed to deregister various hooks when misc_register fails,
and this patch cleans them up.  Also there are a few modules that legitimately
don't care about the failure status of misc register.  These drivers however
unilaterally call misc_deregister on module unload.

Since misc_register doesn't initialize the list_head in the init_routine if it
fails, the deregister operation is at risk for oopsing when list_del is
called.  The initial solution was to manually init the list in the miscdev
structure in each of those modules, but the consensus in this thread was to
consolodate and do that universally inside misc_register.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Kylene Jo Hall <kjhall@us.ibm.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Olaf Hering <olh@suse.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 7a484fc..7e975f6 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -199,6 +199,8 @@ int misc_register(struct miscdevice * misc)
 	dev_t dev;
 	int err = 0;
 
+	INIT_LIST_HEAD(&misc->list);
+
 	down(&misc_sem);
 	list_for_each_entry(c, &misc_list, list) {
 		if (c->minor == misc->minor) {
diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index 22b9905..c091603 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -680,7 +680,7 @@ static int __init mmtimer_init(void)
 	if (sn_rtc_cycles_per_second < 100000) {
 		printk(KERN_ERR "%s: unable to determine clock frequency\n",
 		       MMTIMER_NAME);
-		return -1;
+		goto out1;
 	}
 
 	mmtimer_femtoperiod = ((unsigned long)1E15 + sn_rtc_cycles_per_second /
@@ -689,13 +689,13 @@ static int __init mmtimer_init(void)
 	if (request_irq(SGI_MMTIMER_VECTOR, mmtimer_interrupt, IRQF_PERCPU, MMTIMER_NAME, NULL)) {
 		printk(KERN_WARNING "%s: unable to allocate interrupt.",
 			MMTIMER_NAME);
-		return -1;
+		goto out1;
 	}
 
 	if (misc_register(&mmtimer_miscdev)) {
 		printk(KERN_ERR "%s: failed to register device\n",
 		       MMTIMER_NAME);
-		return -1;
+		goto out2;
 	}
 
 	/* Get max numbered node, calculate slots needed */
@@ -709,16 +709,18 @@ static int __init mmtimer_init(void)
 	if (timers == NULL) {
 		printk(KERN_ERR "%s: failed to allocate memory for device\n",
 				MMTIMER_NAME);
-		return -1;
+		goto out3;
 	}
 
+	memset(timers,0,(sizeof(mmtimer_t *)*maxn));
+
 	/* Allocate mmtimer_t's for each online node */
 	for_each_online_node(node) {
 		timers[node] = kmalloc_node(sizeof(mmtimer_t)*NUM_COMPARATORS, GFP_KERNEL, node);
 		if (timers[node] == NULL) {
 			printk(KERN_ERR "%s: failed to allocate memory for device\n",
 				MMTIMER_NAME);
-			return -1;
+			goto out4;
 		}
 		for (i=0; i< NUM_COMPARATORS; i++) {
 			mmtimer_t * base = timers[node] + i;
@@ -739,6 +741,17 @@ static int __init mmtimer_init(void)
 	       sn_rtc_cycles_per_second/(unsigned long)1E6);
 
 	return 0;
+
+out4:
+	for_each_online_node(node) {
+		kfree(timers[node]);
+	}
+out3:
+	misc_deregister(&mmtimer_miscdev);
+out2:
+	free_irq(SGI_MMTIMER_VECTOR, NULL);
+out1:
+	return -1;
 }
 
 module_init(mmtimer_init);
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index 774fa86..33e1f66 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -1155,6 +1155,7 @@ struct tpm_chip *tpm_register_hardware(struct device *dev, const struct tpm_vend
 
 	if (sysfs_create_group(&dev->kobj, chip->vendor.attr_group)) {
 		list_del(&chip->list);
+		misc_deregister(&chip->vendor.miscdev);
 		put_device(dev);
 		clear_bit(chip->dev_num, dev_mask);
 		kfree(chip);
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c
index ab4da79..31d5a13 100644
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -695,7 +695,9 @@ static int __init hp_sdc_rtc_init(void)
 
 	if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr)))
 		return ret;
-	misc_register(&hp_sdc_rtc_dev);
+	if (misc_register(&hp_sdc_rtc_dev) != 0)
+		printk(KERN_INFO "Could not register misc. dev for i8042 rtc\n");
+
         create_proc_read_entry ("driver/rtc", 0, NULL,
 				hp_sdc_rtc_read_proc, NULL);
 
diff --git a/drivers/macintosh/apm_emu.c b/drivers/macintosh/apm_emu.c
index 1293876..8862a83 100644
--- a/drivers/macintosh/apm_emu.c
+++ b/drivers/macintosh/apm_emu.c
@@ -529,7 +529,8 @@ static int __init apm_emu_init(void)
 	if (apm_proc)
 		apm_proc->owner = THIS_MODULE;
 
-	misc_register(&apm_device);
+	if (misc_register(&apm_device) != 0)
+		printk(KERN_INFO "Could not create misc. device for apm\n");
 
 	pmu_register_sleep_notifier(&apm_sleep_notifier);
 
-- 
cgit v0.10.2


From 8f63fdbbd6de7d734c036948bf7c4b2bebe3ad99 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Wed, 6 Dec 2006 20:37:09 -0800
Subject: [PATCH] More list debugging context

Print the other (hopefully) known good pointer when list_head debugging
too, which may yield additional clues.

Also fix for 80-columns to win akpm brownie points.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/lib/list_debug.c b/lib/list_debug.c
index 7ba9d82..4350ba9 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -21,13 +21,15 @@ void __list_add(struct list_head *new,
 			      struct list_head *next)
 {
 	if (unlikely(next->prev != prev)) {
-		printk(KERN_ERR "list_add corruption. next->prev should be %p, but was %p\n",
-			prev, next->prev);
+		printk(KERN_ERR "list_add corruption. next->prev should be "
+			"prev (%p), but was %p. (next=%p).\n",
+			prev, next->prev, next);
 		BUG();
 	}
 	if (unlikely(prev->next != next)) {
-		printk(KERN_ERR "list_add corruption. prev->next should be %p, but was %p\n",
-			next, prev->next);
+		printk(KERN_ERR "list_add corruption. prev->next should be "
+			"next (%p), but was %p. (prev=%p).\n",
+			next, prev->next, prev);
 		BUG();
 	}
 	next->prev = new;
-- 
cgit v0.10.2


From 22f2e280179946b8be1e2205b8654f2cb4abbf64 Mon Sep 17 00:00:00 2001
From: Derek Fults <dfults@sgi.com>
Date: Wed, 6 Dec 2006 20:37:11 -0800
Subject: [PATCH] get_options to allow a hypenated range for isolcpus

This allows a hyphenated range of positive numbers in the string passed
to command line helper function, get_options.

Currently the command line option "isolcpus=" takes as its argument a
list of cpus.

Format: <cpu number>,...,<cpu number>
Valid values of <cpu_number>  include all cpus, 0 to "number of CPUs in
system - 1". This can get extremely long when isolating the majority of
cpus on a large system.  The kernel isolcpus code would not need any
changing to use this feature.  To use it, the change would be in the
command line format for 'isolcpus='
Format:
<cpu number>,...,<cpu number>
or
<cpu number>-<cpu number>  (must be a positive range in ascending
order.)
or a mixture
<cpu number>,...,<cpu number>-<cpu number>

Signed-off-by: Derek Fults <dfults@sgi.com>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1e183bd..8fe6b83 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -714,7 +714,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			Format: <RDP>,<reset>,<pci_scan>,<verbosity>
 
 	isolcpus=	[KNL,SMP] Isolate CPUs from the general scheduler.
-			Format: <cpu number>,...,<cpu number>
+			Format:
+			<cpu number>,...,<cpu number>
+			or
+			<cpu number>-<cpu number>  (must be a positive range in ascending order)
+			or a mixture
+			<cpu number>,...,<cpu number>-<cpu number>
 			This option can be used to specify one or more CPUs
 			to isolate from the general SMP balancing and scheduling
 			algorithms. The only way to move a process onto or off
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 0331ed8..8a5b530 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -16,6 +16,23 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 
+/*
+ *	If a hyphen was found in get_option, this will handle the
+ *	range of numbers, M-N.  This will expand the range and insert
+ *	the values[M, M+1, ..., N] into the ints array in get_options.
+ */
+
+static int get_range(char **str, int *pint)
+{
+	int x, inc_counter, upper_range;
+
+	(*str)++;
+	upper_range = simple_strtol((*str), NULL, 0);
+	inc_counter = upper_range - *pint;
+	for (x = *pint; x < upper_range; x++)
+		*pint++ = x;
+	return inc_counter;
+}
 
 /**
  *	get_option - Parse integer from an option string
@@ -29,6 +46,7 @@
  *	0 : no int in string
  *	1 : int found, no subsequent comma
  *	2 : int found including a subsequent comma
+ *	3 : hyphen found to denote a range
  */
 
 int get_option (char **str, int *pint)
@@ -44,6 +62,8 @@ int get_option (char **str, int *pint)
 		(*str)++;
 		return 2;
 	}
+	if (**str == '-')
+		return 3;
 
 	return 1;
 }
@@ -55,7 +75,8 @@ int get_option (char **str, int *pint)
  *	@ints: integer array
  *
  *	This function parses a string containing a comma-separated
- *	list of integers.  The parse halts when the array is
+ *	list of integers, a hyphen-separated range of _positive_ integers,
+ *	or a combination of both.  The parse halts when the array is
  *	full, or when no more numbers can be retrieved from the
  *	string.
  *
@@ -72,6 +93,18 @@ char *get_options(const char *str, int nints, int *ints)
 		res = get_option ((char **)&str, ints + i);
 		if (res == 0)
 			break;
+		if (res == 3) {
+			int range_nums;
+			range_nums = get_range((char **)&str, ints + i);
+			if (range_nums < 0)
+				break;
+			/*
+			 * Decrement the result by one to leave out the
+			 * last number in the range.  The next iteration
+			 * will handle the upper number in the range
+			 */
+			i += (range_nums - 1);
+		}
 		i++;
 		if (res == 1)
 			break;
-- 
cgit v0.10.2


From 072330584404392dae44cd0793ac9b316cff045b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:37:12 -0800
Subject: [PATCH] vfs_getattr(): remove dead code

As Mikulas points out, (1 << anything) won't be evaluating to zero.  This code
is long-dead.

Cc: Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/stat.c b/fs/stat.c
index bca07eb..a0ebfc7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -51,13 +51,6 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 		return inode->i_op->getattr(mnt, dentry, stat);
 
 	generic_fillattr(inode, stat);
-	if (!stat->blksize) {
-		struct super_block *s = inode->i_sb;
-		unsigned blocks;
-		blocks = (stat->size+s->s_blocksize-1) >> s->s_blocksize_bits;
-		stat->blocks = (s->s_blocksize / 512) * blocks;
-		stat->blksize = s->s_blocksize;
-	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From 3a229b39eb8497ae5f8077f81f7c8c3e1aacd624 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:37:14 -0800
Subject: [PATCH] ext3: uninline large functions

Saves nearly 4kbytes on x86.

Cc: Arnaldo Carvalho de Melo <acme@mandriva.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile
index 704cd44..e77766a 100644
--- a/fs/ext3/Makefile
+++ b/fs/ext3/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_EXT3_FS) += ext3.o
 
 ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-	   ioctl.o namei.o super.o symlink.o hash.o resize.o
+	   ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
 
 ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
new file mode 100644
index 0000000..e1f91fd
--- /dev/null
+++ b/fs/ext3/ext3_jbd.c
@@ -0,0 +1,59 @@
+/*
+ * Interface between ext3 and JBD
+ */
+
+#include <linux/ext3_jbd.h>
+
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = journal_get_undo_access(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = journal_get_write_access(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_forget(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = journal_forget(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+				unsigned long blocknr, struct buffer_head *bh)
+{
+	int err = journal_revoke(handle, blocknr, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_get_create_access(const char *where,
+				handle_t *handle, struct buffer_head *bh)
+{
+	int err = journal_get_create_access(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext3_journal_dirty_metadata(const char *where,
+				handle_t *handle, struct buffer_head *bh)
+{
+	int err = journal_dirty_metadata(handle, bh);
+	if (err)
+		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
diff --git a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h
index ce0e610..8c43b13 100644
--- a/include/linux/ext3_jbd.h
+++ b/include/linux/ext3_jbd.h
@@ -109,74 +109,32 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
  * been done yet.
  */
 
-void ext3_journal_abort_handle(const char *caller, const char *err_fn,
-		struct buffer_head *bh, handle_t *handle, int err);
-
-static inline int
-__ext3_journal_get_undo_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
+static inline void ext3_journal_release_buffer(handle_t *handle,
+						struct buffer_head *bh)
 {
-	int err = journal_get_undo_access(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
+	journal_release_buffer(handle, bh);
 }
 
-static inline int
-__ext3_journal_get_write_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
-{
-	int err = journal_get_write_access(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+void ext3_journal_abort_handle(const char *caller, const char *err_fn,
+		struct buffer_head *bh, handle_t *handle, int err);
 
-static inline void
-ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
-{
-	journal_release_buffer(handle, bh);
-}
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+				struct buffer_head *bh);
 
-static inline int
-__ext3_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh)
-{
-	int err = journal_forget(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+				struct buffer_head *bh);
 
-static inline int
-__ext3_journal_revoke(const char *where, handle_t *handle,
-		      unsigned long blocknr, struct buffer_head *bh)
-{
-	int err = journal_revoke(handle, blocknr, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext3_journal_forget(const char *where, handle_t *handle,
+				struct buffer_head *bh);
 
-static inline int
-__ext3_journal_get_create_access(const char *where,
-				 handle_t *handle, struct buffer_head *bh)
-{
-	int err = journal_get_create_access(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+				unsigned long blocknr, struct buffer_head *bh);
 
-static inline int
-__ext3_journal_dirty_metadata(const char *where,
-			      handle_t *handle, struct buffer_head *bh)
-{
-	int err = journal_dirty_metadata(handle, bh);
-	if (err)
-		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext3_journal_get_create_access(const char *where,
+				handle_t *handle, struct buffer_head *bh);
 
+int __ext3_journal_dirty_metadata(const char *where,
+				handle_t *handle, struct buffer_head *bh);
 
 #define ext3_journal_get_undo_access(handle, bh) \
 	__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
-- 
cgit v0.10.2


From 8984d137df669a6e94dbce7b87095e4ce80b9e67 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:37:15 -0800
Subject: [PATCH] ext4: uninline large functions

Saves nearly 4kbytes on x86.

Cc: Arnaldo Carvalho de Melo <acme@mandriva.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a6acb96..ae6e7e5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,7 +5,8 @@
 obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
 
 ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-	   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
+		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+		   ext4_jbd2.o
 
 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
new file mode 100644
index 0000000..d6afe4e
--- /dev/null
+++ b/fs/ext4/ext4_jbd2.c
@@ -0,0 +1,59 @@
+/*
+ * Interface between ext4 and JBD
+ */
+
+#include <linux/ext4_jbd2.h>
+
+int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = jbd2_journal_get_undo_access(handle, bh);
+	if (err)
+		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext4_journal_get_write_access(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = jbd2_journal_get_write_access(handle, bh);
+	if (err)
+		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext4_journal_forget(const char *where, handle_t *handle,
+				struct buffer_head *bh)
+{
+	int err = jbd2_journal_forget(handle, bh);
+	if (err)
+		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext4_journal_revoke(const char *where, handle_t *handle,
+				ext4_fsblk_t blocknr, struct buffer_head *bh)
+{
+	int err = jbd2_journal_revoke(handle, blocknr, bh);
+	if (err)
+		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext4_journal_get_create_access(const char *where,
+				handle_t *handle, struct buffer_head *bh)
+{
+	int err = jbd2_journal_get_create_access(handle, bh);
+	if (err)
+		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
+
+int __ext4_journal_dirty_metadata(const char *where,
+				handle_t *handle, struct buffer_head *bh)
+{
+	int err = jbd2_journal_dirty_metadata(handle, bh);
+	if (err)
+		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+	return err;
+}
diff --git a/include/linux/ext4_jbd2.h b/include/linux/ext4_jbd2.h
index 72dd631..d716e63 100644
--- a/include/linux/ext4_jbd2.h
+++ b/include/linux/ext4_jbd2.h
@@ -114,74 +114,32 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
  * been done yet.
  */
 
-void ext4_journal_abort_handle(const char *caller, const char *err_fn,
-		struct buffer_head *bh, handle_t *handle, int err);
-
-static inline int
-__ext4_journal_get_undo_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
+static inline void ext4_journal_release_buffer(handle_t *handle,
+						struct buffer_head *bh)
 {
-	int err = jbd2_journal_get_undo_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
+	jbd2_journal_release_buffer(handle, bh);
 }
 
-static inline int
-__ext4_journal_get_write_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
-{
-	int err = jbd2_journal_get_write_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+void ext4_journal_abort_handle(const char *caller, const char *err_fn,
+		struct buffer_head *bh, handle_t *handle, int err);
 
-static inline void
-ext4_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
-{
-	jbd2_journal_release_buffer(handle, bh);
-}
+int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
+				struct buffer_head *bh);
 
-static inline int
-__ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh)
-{
-	int err = jbd2_journal_forget(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext4_journal_get_write_access(const char *where, handle_t *handle,
+				struct buffer_head *bh);
 
-static inline int
-__ext4_journal_revoke(const char *where, handle_t *handle,
-		      ext4_fsblk_t blocknr, struct buffer_head *bh)
-{
-	int err = jbd2_journal_revoke(handle, blocknr, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext4_journal_forget(const char *where, handle_t *handle,
+				struct buffer_head *bh);
 
-static inline int
-__ext4_journal_get_create_access(const char *where,
-				 handle_t *handle, struct buffer_head *bh)
-{
-	int err = jbd2_journal_get_create_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext4_journal_revoke(const char *where, handle_t *handle,
+				ext4_fsblk_t blocknr, struct buffer_head *bh);
 
-static inline int
-__ext4_journal_dirty_metadata(const char *where,
-			      handle_t *handle, struct buffer_head *bh)
-{
-	int err = jbd2_journal_dirty_metadata(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
-	return err;
-}
+int __ext4_journal_get_create_access(const char *where,
+				handle_t *handle, struct buffer_head *bh);
 
+int __ext4_journal_dirty_metadata(const char *where,
+				handle_t *handle, struct buffer_head *bh);
 
 #define ext4_journal_get_undo_access(handle, bh) \
 	__ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
-- 
cgit v0.10.2


From f6337e2af42f9ea9ce296cfa18966dcf4f045f7d Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Wed, 6 Dec 2006 20:37:17 -0800
Subject: [PATCH] i2lib unused variable cleanup

  In file included from drivers/char/ip2/ip2main.c:285:
    drivers/char/ip2/i2lib.c: In function `i2Output':
    drivers/char/ip2/i2lib.c:1019: warning: unused variable `rc'

Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ip2/i2lib.c b/drivers/char/ip2/i2lib.c
index c213fdb..7804576 100644
--- a/drivers/char/ip2/i2lib.c
+++ b/drivers/char/ip2/i2lib.c
@@ -1016,7 +1016,6 @@ i2Output(i2ChanStrPtr pCh, const char *pSource, int count)
 	unsigned short channel;
 	unsigned short stuffIndex;
 	unsigned long flags;
-	int rc = 0;
 
 	int bailout = 10;
 
-- 
cgit v0.10.2


From 2e591bbc0d563e12f5a260fbbca0df7d5810910e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 6 Dec 2006 20:37:19 -0800
Subject: [PATCH] Make initramfs printk a warning on incorrect cpio type

It turns out that the "-c" option of cpio is highly unportable even between
distros let alone unix variants, and may actually make the wrong type of
cpio archive.  I just wasted quite some time on this, and the kernel can
detect this and warn about it (it's __init memory so it gets thrown away
and thus there is no runtime overhead)

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/init/initramfs.c b/init/initramfs.c
index d28c109..85f0403 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -182,6 +182,10 @@ static int __init do_collect(void)
 
 static int __init do_header(void)
 {
+	if (memcmp(collected, "070707", 6)==0) {
+		error("incorrect cpio method used: use -H newc option");
+		return 1;
+	}
 	if (memcmp(collected, "070701", 6)) {
 		error("no cpio magic");
 		return 1;
-- 
cgit v0.10.2


From 8bb0269160df2a60764013994d0bc5165406cf4a Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.org.uk>
Date: Wed, 6 Dec 2006 20:37:20 -0800
Subject: [PATCH] corrupted cramfs filesystems cause kernel oops

Steve Grubb's fzfuzzer tool (http://people.redhat.com/sgrubb/files/
fsfuzzer-0.6.tar.gz) generates corrupt Cramfs filesystems which cause
Cramfs to kernel oops in cramfs_uncompress_block().  The cause of the oops
is an unchecked corrupted block length field read by cramfs_readpage().

This patch adds a sanity check to cramfs_readpage() which checks that the
block length field is sensible.  The (PAGE_CACHE_SIZE << 1) size check is
intentional, even though the uncompressed data is not going to be larger
than PAGE_CACHE_SIZE, gzip sometimes generates compressed data larger than
the original source data.  Mkcramfs checks that the compressed size is
always less than or equal to PAGE_CACHE_SIZE << 1.  Of course Cramfs could
use the original uncompressed data in this case, but it doesn't.

Signed-off-by: Phillip Lougher <phillip@lougher.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a624c3e..0509ced 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -481,6 +481,8 @@ static int cramfs_readpage(struct file *file, struct page * page)
 		pgdata = kmap(page);
 		if (compr_len == 0)
 			; /* hole */
+		else if (compr_len > (PAGE_CACHE_SIZE << 1))
+			printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len);
 		else {
 			mutex_lock(&read_mutex);
 			bytes_filled = cramfs_uncompress_block(pgdata,
-- 
cgit v0.10.2


From a4c410f00f7ca4bd448b0d63f6f882fd244dc991 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:37:21 -0800
Subject: [PATCH] lockdep: print current locks on in_atomic warnings

Add debug_show_held_locks(current) to __might_sleep() and schedule(); this
makes finding the offending lock leak easier.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/sched.c b/kernel/sched.c
index 1848e28..343e179 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3333,6 +3333,7 @@ asmlinkage void __sched schedule(void)
 		printk(KERN_ERR "BUG: scheduling while atomic: "
 			"%s/0x%08x/%d\n",
 			current->comm, preempt_count(), current->pid);
+		debug_show_held_locks(current);
 		dump_stack();
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6872,6 +6873,7 @@ void __might_sleep(char *file, int line)
 				" context at %s:%d\n", file, line);
 		printk("in_atomic():%d, irqs_disabled():%d\n",
 			in_atomic(), irqs_disabled());
+		debug_show_held_locks(current);
 		dump_stack();
 	}
 #endif
-- 
cgit v0.10.2


From 6cfd76a26d9fe2ba54b9d496a48c1d9285e5c5ed Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:37:22 -0800
Subject: [PATCH] lockdep: name some old style locks

Name some of the remaning 'old_style_spin_init' locks

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe9c5e8..3124f1b 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -452,7 +452,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =			SPIN_LOCK_UNLOCKED,
+		.lock =			__SPIN_LOCK_UNLOCKED(die.lock),
 		.lock_owner =		-1,
 		.lock_owner_depth =	0
 	};
diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h
index bc598d6..041906f 100644
--- a/include/asm-i386/rwsem.h
+++ b/include/asm-i386/rwsem.h
@@ -75,8 +75,8 @@ struct rw_semaphore {
 
 
 #define __RWSEM_INITIALIZER(name) \
-{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \
-	__RWSEM_DEP_MAP_INIT(name) }
+{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
+  LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
 
 #define DECLARE_RWSEM(name) \
 	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 33c5daa..733790d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -73,7 +73,7 @@
 extern struct nsproxy init_nsproxy;
 #define INIT_NSPROXY(nsproxy) {						\
 	.count		= ATOMIC_INIT(1),				\
-	.nslock		= SPIN_LOCK_UNLOCKED,				\
+	.nslock		= __SPIN_LOCK_UNLOCKED(nsproxy.nslock),		\
 	.uts_ns		= &init_uts_ns,					\
 	.namespace	= NULL,						\
 	INIT_IPC_NS(ipc_ns)						\
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 27c48da..b2b91c4 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -94,7 +94,7 @@ do {							\
 
 #define __MUTEX_INITIALIZER(lockname) \
 		{ .count = ATOMIC_INIT(1) \
-		, .wait_lock = SPIN_LOCK_UNLOCKED \
+		, .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
 		, .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
 		__DEBUG_MUTEX_INITIALIZER(lockname) \
 		__DEP_MAP_MUTEX_INITIALIZER(lockname) }
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 5d41dee..b0090e9 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -63,7 +63,7 @@ struct hrtimer_sleeper;
 #endif
 
 #define __RT_MUTEX_INITIALIZER(mutexname) \
-	{ .wait_lock = SPIN_LOCK_UNLOCKED \
+	{ .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
 	, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \
 	, .owner = NULL \
 	__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index ae1fcad..813cee1 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -44,7 +44,8 @@ struct rw_semaphore {
 #endif
 
 #define __RWSEM_INITIALIZER(name) \
-{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
+{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \
+  __RWSEM_DEP_MAP_INIT(name) }
 
 #define DECLARE_RWSEM(name) \
 	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index f399c13..0746c3b 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -222,7 +222,7 @@ struct rpc_wait_queue {
 
 #ifndef RPC_DEBUG
 # define RPC_WAITQ_INIT(var,qname) { \
-		.lock = SPIN_LOCK_UNLOCKED, \
+		.lock = __SPIN_LOCK_UNLOCKED(var.lock), \
 		.tasks = { \
 			[0] = LIST_HEAD_INIT(var.tasks[0]), \
 			[1] = LIST_HEAD_INIT(var.tasks[1]), \
@@ -231,7 +231,7 @@ struct rpc_wait_queue {
 	}
 #else
 # define RPC_WAITQ_INIT(var,qname) { \
-		.lock = SPIN_LOCK_UNLOCKED, \
+		.lock = __SPIN_LOCK_UNLOCKED(var.lock), \
 		.tasks = { \
 			[0] = LIST_HEAD_INIT(var.tasks[0]), \
 			[1] = LIST_HEAD_INIT(var.tasks[1]), \
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca..dc12db8 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
 	struct timer_list	timer;
 };
 
-static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+static struct acct_glbs acct_globals __cacheline_aligned =
+	{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
 
 /*
  * Called whenever the timer says to check the free space.
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a681912..aff1f0f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
 		.chip = &no_irq_chip,
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
-		.lock = SPIN_LOCK_UNLOCKED,
+		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
 #ifdef CONFIG_SMP
 		.affinity = CPU_MASK_ALL
 #endif
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index ee9bb15..c7bb5f7 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -119,7 +119,8 @@ EXPORT_SYMBOL(svc_auth_unregister);
 #define	DN_HASHMASK	(DN_HASHMAX-1)
 
 static struct hlist_head	auth_domain_table[DN_HASHMAX];
-static spinlock_t	auth_domain_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t	auth_domain_lock =
+	__SPIN_LOCK_UNLOCKED(auth_domain_lock);
 
 void auth_domain_put(struct auth_domain *dom)
 {
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 32150cf..b6f8680 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(key_session_mutex);
 struct key_user root_key_user = {
 	.usage		= ATOMIC_INIT(3),
 	.consq		= LIST_HEAD_INIT(root_key_user.consq),
-	.lock		= SPIN_LOCK_UNLOCKED,
+	.lock		= __SPIN_LOCK_UNLOCKED(root_key_user.lock),
 	.nkeys		= ATOMIC_INIT(2),
 	.nikeys		= ATOMIC_INIT(2),
 	.uid		= 0,
-- 
cgit v0.10.2


From 70888bd5b70579e278d5ef1b7e1ec6a420d38b9e Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@sw.ru>
Date: Wed, 6 Dec 2006 20:37:23 -0800
Subject: [PATCH] Documentation: remount_fs() needs lock_kernel

Fixed long-lived typo: remount_fs() needs BKL

Signed-off-by: Vasily Averin <vvs@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index eb1a6ca..790ef6f 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -124,7 +124,7 @@ sync_fs:		no	no	read
 write_super_lockfs:	?
 unlockfs:		?
 statfs:			no	no	no
-remount_fs:		no	yes	maybe		(see below)
+remount_fs:		yes	yes	maybe		(see below)
 clear_inode:		no
 umount_begin:		yes	no	no
 show_options:		no				(vfsmount->sem)
-- 
cgit v0.10.2


From ece8a684c75df215320b4155944979e3f78c5c93 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:37:24 -0800
Subject: [PATCH] sleep profiling

Implement prof=sleep profiling.  TASK_UNINTERRUPTIBLE sleeps will be taken
as a profile hit, and every millisecond spent sleeping causes a profile-hit
for the call site that initiated the sleep.

Sample readprofile output on i386:

   306 ps2_sendbyte                               1.3973
   432 call_usermodehelper_keys                   1.9548
   484 ps2_command                                0.6453
   790 __driver_attach                            4.7879
  1593 msleep                                    44.2500
  3976 sync_buffer                               64.1290
  4076 do_lookup                                 12.4648
  8587 sync_page                                122.6714
 20820 total                                      0.0067

(NOTE: architectures need to check whether get_wchan() can be called from
deep within the wakeup path.)

akpm: we need to mark more functions __sched.  lock_sock(), msleep(), others..

akpm: the contention in do_lookup() is a surprise.  Presumably doing disk
reads for directory contents while holding i_mutex.

[akpm@osdl.org: various fixes]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8fe6b83..2a40d9f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1294,6 +1294,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			Param: "schedule" - profile schedule points.
 			Param: <number> - step/bucket size as a power of 2 for
 				statistical time based profiling.
+			Param: "sleep" - profile D-state sleeping (millisecs)
 
 	processor.max_cstate=	[HW,ACPI]
 			Limit processor to maximum C-state
diff --git a/include/linux/profile.h b/include/linux/profile.h
index acce53f..5670b34 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -6,10 +6,15 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
+#include <linux/cache.h>
+
 #include <asm/errno.h>
 
+extern int prof_on __read_mostly;
+
 #define CPU_PROFILING	1
 #define SCHED_PROFILING	2
+#define SLEEP_PROFILING	3
 
 struct proc_dir_entry;
 struct pt_regs;
@@ -18,7 +23,24 @@ struct notifier_block;
 /* init basic kernel profiler */
 void __init profile_init(void);
 void profile_tick(int);
-void profile_hit(int, void *);
+
+/*
+ * Add multiple profiler hits to a given address:
+ */
+void profile_hits(int, void *ip, unsigned int nr_hits);
+
+/*
+ * Single profiler hit:
+ */
+static inline void profile_hit(int type, void *ip)
+{
+	/*
+	 * Speedup for the common (no profiling enabled) case:
+	 */
+	if (unlikely(prof_on == type))
+		profile_hits(type, ip, 1);
+}
+
 #ifdef CONFIG_PROC_FS
 void create_prof_cpu_mask(struct proc_dir_entry *);
 #else
diff --git a/kernel/profile.c b/kernel/profile.c
index 15b012df..04fd84e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
 
 static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
-static int prof_on __read_mostly;
+int prof_on __read_mostly;
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
 static int __init profile_setup(char * str)
 {
 	static char __initdata schedstr[] = "schedule";
+	static char __initdata sleepstr[] = "sleep";
 	int par;
 
-	if (!strncmp(str, schedstr, strlen(schedstr))) {
+	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+		prof_on = SLEEP_PROFILING;
+		if (str[strlen(sleepstr)] == ',')
+			str += strlen(sleepstr) + 1;
+		if (get_option(&str, &par))
+			prof_shift = par;
+		printk(KERN_INFO
+			"kernel sleep profiling enabled (shift: %ld)\n",
+			prof_shift);
+	} else if (!strncmp(str, sleepstr, strlen(sleepstr))) {
 		prof_on = SCHED_PROFILING;
 		if (str[strlen(schedstr)] == ',')
 			str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
  * positions to which hits are accounted during short intervals (e.g.
  * several seconds) is usually very small. Exclusion from buffer
  * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING profile_hit() may be called from process context).
+ * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
+ * process context).
  * The hash function is meant to be lightweight as opposed to strong,
  * and was vaguely inspired by ppc64 firmware-supported inverted
  * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
 	mutex_unlock(&profile_flip_mutex);
 }
 
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
 	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
 	int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
 		put_cpu();
 		return;
 	}
+	/*
+	 * We buffer the global profiler buffer into a per-CPU
+	 * queue and thus reduce the number of global (and possibly
+	 * NUMA-alien) accesses. The write-queue is self-coalescing:
+	 */
 	local_irq_save(flags);
 	do {
 		for (j = 0; j < PROFILE_GRPSZ; ++j) {
 			if (hits[i + j].pc == pc) {
-				hits[i + j].hits++;
+				hits[i + j].hits += nr_hits;
 				goto out;
 			} else if (!hits[i + j].hits) {
 				hits[i + j].pc = pc;
-				hits[i + j].hits = 1;
+				hits[i + j].hits = nr_hits;
 				goto out;
 			}
 		}
 		i = (i + secondary) & (NR_PROFILE_HIT - 1);
 	} while (i != primary);
-	atomic_inc(&prof_buffer[pc]);
+
+	/*
+	 * Add the current hit(s) and flush the write-queue out
+	 * to the global buffer:
+	 */
+	atomic_add(nr_hits, &prof_buffer[pc]);
 	for (i = 0; i < NR_PROFILE_HIT; ++i) {
 		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
 		hits[i].pc = hits[i].hits = 0;
@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
 
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
 	unsigned long pc;
 
 	if (prof_on != type || !prof_buffer)
 		return;
 	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
-	atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
+	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 }
 #endif /* !CONFIG_SMP */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 343e179..75a005e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 	}
 #endif
 
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (now - p->timestamp) >> 20);
+	}
+
 	if (!rt_task(p))
 		p->prio = recalc_task_prio(p, now);
 
-- 
cgit v0.10.2


From 6fb50ea79cb869667adaa71ed32cc15dd73986de Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:37:25 -0800
Subject: [PATCH] ext4_ext_split(): remove dead code

The Coverity checker noted that this was dead code, since in all places
above in this function, "err" is immediately checked.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2608dce..1442ccb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -800,9 +800,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	}
 
 	/* insert new index */
-	if (err)
-		goto cleanup;
-
 	err = ext4_ext_insert_index(handle, inode, path + at,
 				    le32_to_cpu(border), newblock);
 
-- 
cgit v0.10.2


From d5abe669172f20a4129a711de0f250a4e07db298 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:37:26 -0800
Subject: [PATCH] debug: workqueue locking sanity

Workqueue functions should not leak locks, assert so, printing the
last function ran.

Use macros in lockdep.h to avoid include dependency pains.

[akpm@osdl.org: build fix]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 819f08f..da19aeb 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -243,6 +243,8 @@ extern void lock_release(struct lockdep_map *lock, int nested,
 
 # define INIT_LOCKDEP				.lockdep_recursion = 0,
 
+#define lockdep_depth(tsk)	((tsk)->lockdep_depth)
+
 #else /* !LOCKDEP */
 
 static inline void lockdep_off(void)
@@ -277,6 +279,9 @@ static inline int lockdep_internal(void)
  * The class key takes no space if lockdep is disabled:
  */
 struct lock_class_key { };
+
+#define lockdep_depth(tsk)	(0)
+
 #endif /* !LOCKDEP */
 
 #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2945b09..5484d6e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -30,6 +30,8 @@
 #include <linux/hardirq.h>
 #include <linux/mempolicy.h>
 #include <linux/freezer.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -253,6 +255,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 			work_release(work);
 		f(work);
 
+		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+			printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+					"%s/0x%08x/%d\n",
+					current->comm, preempt_count(),
+				       	current->pid);
+			printk(KERN_ERR "    last function: ");
+			print_symbol("%s\n", (unsigned long)f);
+			debug_show_held_locks(current);
+			dump_stack();
+		}
+
 		spin_lock_irqsave(&cwq->lock, flags);
 		cwq->remove_sequence++;
 		wake_up(&cwq->work_done);
-- 
cgit v0.10.2


From 40fcfc87222e2e8af6379ec366f0cb2a411570cd Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 6 Dec 2006 20:37:27 -0800
Subject: [PATCH] HZ: 300Hz support

Fix two things.  Firstly the unit is "Hz" not "HZ".  Secondly it is useful
to have 300Hz support when doing multimedia work.  250 is fine for us in
Europe but the US frame rate is 30fps (29.99 blah for pedants).  300 gives
us a tick divisible by both 25 and 30, and for interlace work 50 and 60.
It's also giving similar performance to 250Hz.

I'd argue we should remove 250 and add 300, but that might be excess
disruption for now.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c3..4af15802 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
 	default HZ_250
 	help
 	 Allows the configuration of the timer frequency. It is customary
-	 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+	 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
 	 beneficial for servers and NUMA systems that do not need to have
 	 a fast response for user interaction and that may experience bus
 	 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
 	config HZ_100
 		bool "100 HZ"
 	help
-	  100 HZ is a typical choice for servers, SMP and NUMA systems
+	  100 Hz is a typical choice for servers, SMP and NUMA systems
 	  with lots of processors that may show reduced performance if
 	  too many timer interrupts are occurring.
 
 	config HZ_250
 		bool "250 HZ"
 	help
-	 250 HZ is a good compromise choice allowing server performance
+	 250 Hz is a good compromise choice allowing server performance
 	 while also showing good interactive responsiveness even
-	 on SMP and NUMA systems.
+	 on SMP and NUMA systems. If you are going to be using NTSC video
+	 or multimedia, selected 300Hz instead.
+
+	config HZ_300
+		bool "300 HZ"
+	help
+	 300 Hz is a good compromise choice allowing server performance
+	 while also showing good interactive responsiveness even
+	 on SMP and NUMA systems and exactly dividing by both PAL and
+	 NTSC frame rates for video and multimedia work.
 
 	config HZ_1000
 		bool "1000 HZ"
 	help
-	 1000 HZ is the preferred choice for desktop systems and other
+	 1000 Hz is the preferred choice for desktop systems and other
 	 systems requiring fast interactive responses to events.
 
 endchoice
@@ -42,5 +51,6 @@ config HZ
 	int
 	default 100 if HZ_100
 	default 250 if HZ_250
+	default 300 if HZ_300
 	default 1000 if HZ_1000
 
-- 
cgit v0.10.2


From 0d75565f1e8f098b80a34ccf70db450f60618ec8 Mon Sep 17 00:00:00 2001
From: Kristian Kielhofner <kris@krisk.org>
Date: Wed, 6 Dec 2006 20:37:28 -0800
Subject: [PATCH] PCEngines WRAP LED Support

A driver for the PCEngines WRAP boards (http://www.pcengines.ch), which are
very similar to the Soekris net4801 (same NS SC1100 geode reference
design).

The LEDs on the WRAP are on different GPIO lines and I have modified and
copied the net48xx error led support for this.  It also includes support
for an "extra" led (in addition to error).  The three LEDs on the WRAP are
at GPIO lines 2,3,18 (WRAP LEDs from left to right).  This driver gives
access to the second and third LEDs by twiddling GPIO lines 3 & 18.

Because these boards are so similar to the net48xx, I basically sed-ed that
driver to form the basis for leds-wrap.c.  The only changes from
leds-net48xx.c are:

 - #define WRAP_EXTRA_LED_GPIO

 - name changes

 - duplicate relevant sections to provide support for the "extra" led

 - reverse the various *_led_set values.  The WRAP is "backwards" from the
   net48xx, and these needed to be updated for that.

[akpm@osdl.org: build fix]
Signed-off-by: Kristian Kielhofner <kris@krisk.org>
Acked-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 9c39b98..176142c 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -76,6 +76,12 @@ config LEDS_NET48XX
 	  This option enables support for the Soekris net4801 and net4826 error
 	  LED.
 
+config LEDS_WRAP
+	tristate "LED Support for the WRAP series LEDs"
+	depends on LEDS_CLASS && SCx200_GPIO
+	help
+	  This option enables support for the PCEngines WRAP programmable LEDs.
+
 comment "LED Triggers"
 
 config LEDS_TRIGGERS
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 6aa2aed..500de3dc 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_LEDS_TOSA)			+= leds-tosa.o
 obj-$(CONFIG_LEDS_S3C24XX)		+= leds-s3c24xx.o
 obj-$(CONFIG_LEDS_AMS_DELTA)		+= leds-ams-delta.o
 obj-$(CONFIG_LEDS_NET48XX)		+= leds-net48xx.o
+obj-$(CONFIG_LEDS_WRAP)			+= leds-wrap.o
 
 # LED Triggers
 obj-$(CONFIG_LEDS_TRIGGER_TIMER)	+= ledtrig-timer.o
diff --git a/drivers/leds/leds-wrap.c b/drivers/leds/leds-wrap.c
new file mode 100644
index 0000000..27fb2d8
--- /dev/null
+++ b/drivers/leds/leds-wrap.c
@@ -0,0 +1,142 @@
+/*
+ * LEDs driver for PCEngines WRAP
+ *
+ * Copyright (C) 2006 Kristian Kielhofner <kris@krisk.org>
+ *
+ * Based on leds-net48xx.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/leds.h>
+#include <linux/err.h>
+#include <asm/io.h>
+#include <linux/scx200_gpio.h>
+
+#define DRVNAME "wrap-led"
+#define WRAP_ERROR_LED_GPIO	3
+#define	WRAP_EXTRA_LED_GPIO	18
+
+static struct platform_device *pdev;
+
+static void wrap_error_led_set(struct led_classdev *led_cdev,
+		enum led_brightness value)
+{
+	if (value)
+		scx200_gpio_set_low(WRAP_ERROR_LED_GPIO);
+	else
+		scx200_gpio_set_high(WRAP_ERROR_LED_GPIO);
+}
+
+static void wrap_extra_led_set(struct led_classdev *led_cdev,
+		enum led_brightness value)
+{
+	if (value)
+		scx200_gpio_set_low(WRAP_EXTRA_LED_GPIO);
+	else
+		scx200_gpio_set_high(WRAP_EXTRA_LED_GPIO);
+}
+
+static struct led_classdev wrap_error_led = {
+	.name		= "wrap:error",
+	.brightness_set	= wrap_error_led_set,
+};
+
+static struct led_classdev wrap_extra_led = {
+	.name           = "wrap:extra",
+	.brightness_set = wrap_extra_led_set,
+};
+
+#ifdef CONFIG_PM
+static int wrap_led_suspend(struct platform_device *dev,
+		pm_message_t state)
+{
+	led_classdev_suspend(&wrap_error_led);
+	led_classdev_suspend(&wrap_extra_led);
+	return 0;
+}
+
+static int wrap_led_resume(struct platform_device *dev)
+{
+	led_classdev_resume(&wrap_error_led);
+	led_classdev_resume(&wrap_extra_led);
+	return 0;
+}
+#else
+#define wrap_led_suspend NULL
+#define wrap_led_resume NULL
+#endif
+
+static int wrap_led_probe(struct platform_device *pdev)
+{
+	int ret;
+
+	ret = led_classdev_register(&pdev->dev, &wrap_error_led);
+	if (ret == 0) {
+		ret = led_classdev_register(&pdev->dev, &wrap_extra_led);
+		if (ret < 0)
+			led_classdev_unregister(&wrap_error_led);
+	}
+	return ret;
+}
+
+static int wrap_led_remove(struct platform_device *pdev)
+{
+	led_classdev_unregister(&wrap_error_led);
+	led_classdev_unregister(&wrap_extra_led);
+	return 0;
+}
+
+static struct platform_driver wrap_led_driver = {
+	.probe		= wrap_led_probe,
+	.remove		= wrap_led_remove,
+	.suspend	= wrap_led_suspend,
+	.resume		= wrap_led_resume,
+	.driver		= {
+		.name		= DRVNAME,
+		.owner		= THIS_MODULE,
+	},
+};
+
+static int __init wrap_led_init(void)
+{
+	int ret;
+
+	if (!scx200_gpio_present()) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ret = platform_driver_register(&wrap_led_driver);
+	if (ret < 0)
+		goto out;
+
+	pdev = platform_device_register_simple(DRVNAME, -1, NULL, 0);
+	if (IS_ERR(pdev)) {
+		ret = PTR_ERR(pdev);
+		platform_driver_unregister(&wrap_led_driver);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+static void __exit wrap_led_exit(void)
+{
+	platform_device_unregister(pdev);
+	platform_driver_unregister(&wrap_led_driver);
+}
+
+module_init(wrap_led_init);
+module_exit(wrap_led_exit);
+
+MODULE_AUTHOR("Kristian Kielhofner <kris@krisk.org>");
+MODULE_DESCRIPTION("PCEngines WRAP LED driver");
+MODULE_LICENSE("GPL");
+
-- 
cgit v0.10.2


From 28ec24e23229ae3d333f8d7f0e6b31fa8ea7bf46 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:37:29 -0800
Subject: [PATCH] driver/base/memory.c: handle errors properly

Do proper error-checking and propagation in drivers/base/memory.c, hence fix
__must_check warnings.

Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c6b7d9c..74b9679 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -290,9 +290,8 @@ static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
 
 static int block_size_init(void)
 {
-	sysfs_create_file(&memory_sysdev_class.kset.kobj,
-		&class_attr_block_size_bytes.attr);
-	return 0;
+	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
+				&class_attr_block_size_bytes.attr);
 }
 
 /*
@@ -323,12 +322,14 @@ static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
 
 static int memory_probe_init(void)
 {
-	sysfs_create_file(&memory_sysdev_class.kset.kobj,
-		&class_attr_probe.attr);
-	return 0;
+	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
+				&class_attr_probe.attr);
 }
 #else
-#define memory_probe_init(...)	do {} while (0)
+static inline int memory_probe_init(void)
+{
+	return 0;
+}
 #endif
 
 /*
@@ -431,9 +432,12 @@ int __init memory_dev_init(void)
 {
 	unsigned int i;
 	int ret;
+	int err;
 
 	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
 	ret = sysdev_class_register(&memory_sysdev_class);
+	if (ret)
+		goto out;
 
 	/*
 	 * Create entries for memory sections that were found
@@ -442,11 +446,19 @@ int __init memory_dev_init(void)
 	for (i = 0; i < NR_MEM_SECTIONS; i++) {
 		if (!valid_section_nr(i))
 			continue;
-		add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
+		err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
+		if (!ret)
+			ret = err;
 	}
 
-	memory_probe_init();
-	block_size_init();
-
+	err = memory_probe_init();
+	if (!ret)
+		ret = err;
+	err = block_size_init();
+	if (!ret)
+		ret = err;
+out:
+	if (ret)
+		printk(KERN_ERR "%s() failed: %d\n", __FUNCTION__, ret);
 	return ret;
 }
-- 
cgit v0.10.2


From f5738ceed46782aea7663d62cb6398eb05fc4ce0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 6 Dec 2006 20:37:29 -0800
Subject: [PATCH] remove kernel syscalls

The last thing we agreed on was to remove the macros entirely for 2.6.19,
on all architectures. Unfortunately, I think nobody actually _did_ that,
so they are still there.

[akpm@osdl.org: x86_64 fix]
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Schafer <gschafer@zip.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 92546c1..630036c 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -42,6 +42,7 @@
 #include <asm/topology.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __syscall_clobber "r11","rcx","memory"
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
diff --git a/include/asm-alpha/unistd.h b/include/asm-alpha/unistd.h
index 2cabbd4..84313d1 100644
--- a/include/asm-alpha/unistd.h
+++ b/include/asm-alpha/unistd.h
@@ -387,188 +387,6 @@
 
 #define NR_SYSCALLS			447
 
-#if defined(__GNUC__)
-
-#define _syscall_return(type)						\
-	return (_sc_err ? errno = _sc_ret, _sc_ret = -1L : 0), (type) _sc_ret
-
-#define _syscall_clobbers						\
-	"$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8",			\
-	"$22", "$23", "$24", "$25", "$27", "$28" 			\
-
-#define _syscall0(type, name)						\
-type name(void)								\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		__asm__("callsys # %0 %1 %2"				\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0)					\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall1(type,name,type1,arg1)					\
-type name(type1 arg1)							\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		__asm__("callsys # %0 %1 %2 %3"				\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16)			\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1,type2 arg2)					\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		__asm__("callsys # %0 %1 %2 %3 %4"			\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17)		\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1,type2 arg2,type3 arg3)				\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5"			\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18)					\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4)		 \
-{									 \
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		_sc_19 = (long) (arg4);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5 %6"		\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18), "1"(_sc_19)			\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5)							 \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5)	\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-		register long _sc_20 __asm__("$20");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		_sc_19 = (long) (arg4);					\
-		_sc_20 = (long) (arg5);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5 %6 %7"		\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18), "1"(_sc_19), "r"(_sc_20)		\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5,type6,arg6)					 \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5, type6 arg6)\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-		register long _sc_20 __asm__("$20");			\
-		register long _sc_21 __asm__("$21");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		_sc_19 = (long) (arg4);					\
-		_sc_20 = (long) (arg5);					\
-		_sc_21 = (long) (arg6);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5 %6 %7 %8"		\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18), "1"(_sc_19), "r"(_sc_20), "r"(_sc_21) \
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#endif /* __GNUC__ */
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-arm/unistd.h b/include/asm-arm/unistd.h
index 14a87ee..d44c629 100644
--- a/include/asm-arm/unistd.h
+++ b/include/asm-arm/unistd.h
@@ -377,156 +377,6 @@
 #endif
 
 #ifdef __KERNEL__
-#include <linux/err.h>
-#include <linux/linkage.h>
-
-#define __sys2(x) #x
-#define __sys1(x) __sys2(x)
-
-#ifndef __syscall
-#if defined(__thumb__) || defined(__ARM_EABI__)
-#define __SYS_REG(name) register long __sysreg __asm__("r7") = __NR_##name;
-#define __SYS_REG_LIST(regs...) "r" (__sysreg) , ##regs
-#define __syscall(name) "swi\t0"
-#else
-#define __SYS_REG(name)
-#define __SYS_REG_LIST(regs...) regs
-#define __syscall(name) "swi\t" __sys1(__NR_##name) ""
-#endif
-#endif
-
-#define __syscall_return(type, res)					\
-do {									\
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) {	\
-		errno = -(res);						\
-		res = -1;						\
-	}								\
-	return (type) (res);						\
-} while (0)
-
-#define _syscall0(type,name)						\
-type name(void) {							\
-  __SYS_REG(name)							\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST()						\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall1(type,name,type1,arg1) 				\
-type name(type1 arg1) { 						\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0) )					\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1,type2 arg2) {					\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1) )			\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1,type2 arg2,type3 arg3) {				\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2) )		\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {		\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2), "r" (__r3) ) \
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-  
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) {	\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2),		\
-			  "r" (__r3), "r" (__r4) )			\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) {	\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __r5 __asm__("r5") = (long)arg6;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2),		\
-			  "r" (__r3), "r" (__r4), "r" (__r5) )		\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-arm26/unistd.h b/include/asm-arm26/unistd.h
index 25a5eea..4c3b919 100644
--- a/include/asm-arm26/unistd.h
+++ b/include/asm-arm26/unistd.h
@@ -311,139 +311,6 @@
 #define __ARM_NR_usr26			(__ARM_NR_BASE+3)
 
 #ifdef __KERNEL__
-#include <linux/err.h>
-#include <linux/linkage.h>
-
-#define __sys2(x) #x
-#define __sys1(x) __sys2(x)
-
-#ifndef __syscall
-#define __syscall(name) "swi\t" __sys1(__NR_##name) ""
-#endif
-
-#define __syscall_return(type, res)					\
-do {									\
-	if ((unsigned long)(res) >= (unsigned long)-MAX_ERRNO) {	\
-		errno = -(res);						\
-		res = -1;						\
-	}								\
-	return (type) (res);						\
-} while (0)
-
-#define _syscall0(type,name)						\
-type name(void) {							\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	:								\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall1(type,name,type1,arg1) 				\
-type name(type1 arg1) { 						\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0)							\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1,type2 arg2) {					\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1) 					\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1,type2 arg2,type3 arg3) {				\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2)				\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {		\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2),"r" (__r3)			\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-  
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) {	\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2),"r" (__r3),"r" (__r4)	\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) {	\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __r5 __asm__("r5") = (long)arg6;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2),"r" (__r3), "r" (__r4),"r" (__r5)		\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-frv/unistd.h b/include/asm-frv/unistd.h
index 725e854..584c041 100644
--- a/include/asm-frv/unistd.h
+++ b/include/asm-frv/unistd.h
@@ -320,125 +320,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 310
-#include <linux/err.h>
-
-/*
- * process the return value of a syscall, consigning it to one of two possible fates
- * - user-visible error numbers are in the range -1 - -4095: see <asm-frv/errno.h>
- */
-#undef __syscall_return
-#define __syscall_return(type, res)					\
-do {									\
-        unsigned long __sr2 = (res);					\
-	if (__builtin_expect(__sr2 >= (unsigned long)(-MAX_ERRNO), 0)) { \
-		errno = (-__sr2);					\
-		__sr2 = ~0UL;						\
-	}								\
-	return (type) __sr2;						\
-} while (0)
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-
-#undef _syscall0
-#define _syscall0(type,name)						\
-type name(void)								\
-{									\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);	\
-	register unsigned long __sc0 __asm__ ("gr8");			\
-	__asm__ __volatile__ ("tira gr0,#0"				\
-			      : "=r" (__sc0)				\
-			      : "r" (__scnum));				\
-	__syscall_return(type, __sc0);					\
-}
-
-#undef _syscall1
-#define _syscall1(type,name,type1,arg1)						\
-type name(type1 arg1)								\
-{										\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);		\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;	\
-	__asm__ __volatile__ ("tira gr0,#0"					\
-			      : "+r" (__sc0)					\
-			      : "r" (__scnum));					\
-	__syscall_return(type, __sc0);						\
-}
-
-#undef _syscall2
-#define _syscall2(type,name,type1,arg1,type2,arg2)				\
-type name(type1 arg1,type2 arg2)						\
-{										\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);		\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;	\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;	\
-	__asm__ __volatile__ ("tira gr0,#0"					\
-			      : "+r" (__sc0)					\
-			      : "r" (__scnum), "r" (__sc1));			\
-	__syscall_return(type, __sc0);						\
-}
-
-#undef _syscall3
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)			\
-type name(type1 arg1,type2 arg2,type3 arg3)					\
-{										\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);		\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;	\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;	\
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;	\
-	__asm__ __volatile__ ("tira gr0,#0"					\
-			      : "+r" (__sc0)					\
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2));	\
-	__syscall_return(type, __sc0);						\
-}
-
-#undef _syscall4
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)		\
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4)				\
-{											\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);			\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;		\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;		\
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;		\
-	register unsigned long __sc3 __asm__ ("gr11") = (unsigned long) arg4;		\
-	__asm__ __volatile__ ("tira gr0,#0"						\
-			      : "+r" (__sc0)						\
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2), "r" (__sc3));	\
-	__syscall_return(type, __sc0);							\
-}
-
-#undef _syscall5
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5)	\
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5)			\
-{											\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);			\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;		\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;		\
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;		\
-	register unsigned long __sc3 __asm__ ("gr11") = (unsigned long) arg4;		\
-	register unsigned long __sc4 __asm__ ("gr12") = (unsigned long) arg5;		\
-	__asm__ __volatile__ ("tira gr0,#0"						\
-			      : "+r" (__sc0)						\
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2),		\
-			      "r" (__sc3), "r" (__sc4));				\
-	__syscall_return(type, __sc0);							\
-}
-
-#undef _syscall6
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5, type6, arg6) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6)		 \
-{												 \
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);				 \
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;			 \
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;			 \
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;			 \
-	register unsigned long __sc3 __asm__ ("gr11") = (unsigned long) arg4;			 \
-	register unsigned long __sc4 __asm__ ("gr12") = (unsigned long) arg5;			 \
-	register unsigned long __sc5 __asm__ ("gr13") = (unsigned long) arg6;			 \
-	__asm__ __volatile__ ("tira gr0,#0"							 \
-			      : "+r" (__sc0)							 \
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2),			 \
-			      "r" (__sc3), "r" (__sc4), "r" (__sc5));				 \
-	__syscall_return(type, __sc0);								 \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 /* #define __ARCH_WANT_OLD_READDIR */
diff --git a/include/asm-h8300/unistd.h b/include/asm-h8300/unistd.h
index 747788d..7ddd414 100644
--- a/include/asm-h8300/unistd.h
+++ b/include/asm-h8300/unistd.h
@@ -295,172 +295,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 289
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
-   <asm-m68k/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* avoid using res which is declared to be in register d0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type, name)				\
-type name(void)						\
-{							\
-  register long __res __asm__("er0");			\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name)		\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall1(type, name, atype, a)			\
-type name(atype a)					\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  _a = (long)a;						\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a)			\
-			: "cc", "memory");	 	\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall2(type, name, atype, a, btype, b)	\
-type name(atype a, btype b)				\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  _a = (long)a;						\
-  _b = (long)b;						\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b)			\
-			: "cc", "memory");	 	\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall3(type, name, atype, a, btype, b, ctype, c)	\
-type name(atype a, btype b, ctype c)			\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  _a = (long)a;						\
-  _b = (long)b;						\
-  _c = (long)c;						\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall4(type, name, atype, a, btype, b,	\
-                  ctype, c, dtype, d)			\
-type name(atype a, btype b, ctype c, dtype d)		\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  register long _d __asm__("er4");			\
-  _a = (long)a;						\
-  _b = (long)b;						\
-  _c = (long)c;						\
-  _d = (long)d;						\
-  __asm__ __volatile__ ("mov.l	%1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c),			\
-			  "g" (_d)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall5(type, name, atype, a, btype, b,	\
-                  ctype, c, dtype, d, etype, e)		\
-type name(atype a, btype b, ctype c, dtype d, etype e)	\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  register long _d __asm__("er4");			\
-  register long _e __asm__("er5");			\
-  _a = (long)a;                                       	\
-  _b = (long)b;                                       	\
-  _c = (long)c;                                       	\
-  _d = (long)d;                                       	\
-  _e = (long)e;                                       	\
-  __asm__ __volatile__ ("mov.l	%1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c),			\
-			  "g" (_d),			\
-			  "g" (_e)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall6(type, name, atype, a, btype, b,	\
-                  ctype, c, dtype, d, etype, e, ftype, f)	\
-type name(atype a, btype b, ctype c, dtype d, etype e, ftype f)	\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  register long _d __asm__("er4");			\
-  register long _e __asm__("er5");			\
-  register long _f __asm__("er6");			\
-  _a = (long)a;                                       	\
-  _b = (long)b;                                       	\
-  _c = (long)c;                                       	\
-  _d = (long)d;                                       	\
-  _e = (long)e;                                       	\
-  _f = (long)f;                                       	\
-  __asm__ __volatile__ ("mov.l	%1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c),			\
-			  "g" (_d),			\
-			  "g" (_e)			\
-			  "g" (_f)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index beeeaf6..833fa17 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -329,104 +329,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 320
-#include <linux/err.h>
-
-/*
- * user-visible error numbers are in the range -1 - -MAX_ERRNO: see
- * <asm-i386/errno.h>
- */
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-		errno = -(res); \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-__asm__ volatile ("int $0x80" \
-	: "=a" (__res) \
-	: "0" (__NR_##name)); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)) : "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)), \
-		  "d" ((long)(arg3)) : "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"S" ((long)(arg4)) : "memory"); \
-__syscall_return(type,__res); \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; movl %1,%%eax ; " \
-                  "int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "i" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5)) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5,type6,arg6) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \
-{ \
-long __res; \
-  struct { long __a1; long __a6; } __s = { (long)arg1, (long)arg6 }; \
-__asm__ volatile ("push %%ebp ; push %%ebx ; movl 4(%2),%%ebp ; " \
-                  "movl 0(%2),%%ebx ; movl %1,%%eax ; int $0x80 ; " \
-                  "pop %%ebx ;  pop %%ebp" \
-	: "=a" (__res) \
-	: "i" (__NR_##name),"0" ((long)(&__s)),"c" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5)) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-m32r/unistd.h b/include/asm-m32r/unistd.h
index 95aa342..5b66bd3 100644
--- a/include/asm-m32r/unistd.h
+++ b/include/asm-m32r/unistd.h
@@ -296,117 +296,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 285
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
- * <asm-m32r/errno.h>
- */
-
-#include <asm/syscall.h>	/* SYSCALL_* */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* Avoid using "res" which is declared to be in register r0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __res __asm__("r0"); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg3 __asm__ ("r2") = (long)(arg3); \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2), \
-		"r" (__arg3) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name(type1 arg1,type2 arg2,type3 arg3,type4 arg4) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg4 __asm__ ("r3") = (long)(arg4); \
-register long __arg3 __asm__ ("r2") = (long)(arg3); \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2), \
-		"r" (__arg3), "r" (__arg4) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	type5,arg5) \
-type name(type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg5 __asm__ ("r4") = (long)(arg5); \
-register long __arg4 __asm__ ("r3") = (long)(arg4); \
-register long __arg3 __asm__ ("r2") = (long)(arg3); \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2), \
-		"r" (__arg3), "r" (__arg4), "r" (__arg5) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-m68k/unistd.h b/include/asm-m68k/unistd.h
index ad43480..fdbb60e 100644
--- a/include/asm-m68k/unistd.h
+++ b/include/asm-m68k/unistd.h
@@ -317,103 +317,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls		311
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
-   <asm-m68k/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* avoid using res which is declared to be in register d0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-__asm__ __volatile__ ("trap  #0" \
-                      : "+d" (__res) ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,atype,a) \
-type name(atype a) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-		      : "d" (__a)  ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,atype,a,btype,b) \
-type name(atype a,btype b) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-                      : "d" (__a), "d" (__b) \
-		     ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,atype,a,btype,b,ctype,c) \
-type name(atype a,btype b,ctype c) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-register long __c __asm__ ("%d3") = (long)(c); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-                      : "d" (__a), "d" (__b), \
-			"d" (__c) \
-		     ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,atype,a,btype,b,ctype,c,dtype,d) \
-type name (atype a, btype b, ctype c, dtype d) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-register long __c __asm__ ("%d3") = (long)(c); \
-register long __d __asm__ ("%d4") = (long)(d); \
-__asm__ __volatile__ ("trap  #0" \
-                      : "+d" (__res) \
-                      : "d" (__a), "d" (__b), \
-			"d" (__c), "d" (__d)  \
-		     ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e) \
-type name (atype a,btype b,ctype c,dtype d,etype e) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-register long __c __asm__ ("%d3") = (long)(c); \
-register long __d __asm__ ("%d4") = (long)(d); \
-register long __e __asm__ ("%d5") = (long)(e); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-		      : "d" (__a), "d" (__b), \
-			"d" (__c), "d" (__d), "d" (__e)  \
-                     ); \
-__syscall_return(type,__res); \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-m68knommu/unistd.h b/include/asm-m68knommu/unistd.h
index ebaf031..82e0319 100644
--- a/include/asm-m68knommu/unistd.h
+++ b/include/asm-m68knommu/unistd.h
@@ -318,156 +318,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls		311
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
-   <asm-m68k/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* avoid using res which is declared to be in register d0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type, name)							\
-type name(void)									\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name)					\
-			: "cc", "%d0");						\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall1(type, name, atype, a)						\
-type name(atype a)								\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%2, %%d1\n\t"					\
-  			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "g" ((long)a)						\
-			: "cc", "%d0", "%d1");					\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall2(type, name, atype, a, btype, b)				\
-type name(atype a, btype b)							\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "g" ((long)b)						\
-			: "cc", "%d0", "%d1", "%d2");				\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall3(type, name, atype, a, btype, b, ctype, c)			\
-type name(atype a, btype b, ctype c)						\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%4, %%d3\n\t"					\
-			"movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "a" ((long)b),					\
-			  "g" ((long)c)						\
-			: "cc", "%d0", "%d1", "%d2", "%d3");			\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall4(type, name, atype, a, btype, b, ctype, c, dtype, d)		\
-type name(atype a, btype b, ctype c, dtype d)					\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%5, %%d4\n\t"					\
-			"movel	%4, %%d3\n\t"					\
-			"movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "a" ((long)b),					\
-			  "a" ((long)c),					\
-			  "g" ((long)d)						\
-			: "cc", "%d0", "%d1", "%d2", "%d3",			\
-			  "%d4");						\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall5(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e)	\
-type name(atype a, btype b, ctype c, dtype d, etype e)				\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%6, %%d5\n\t"					\
-			"movel	%5, %%d4\n\t"					\
-			"movel	%4, %%d3\n\t"					\
-			"movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "a" ((long)b),					\
-			  "a" ((long)c),					\
-			  "a" ((long)d),					\
-			  "g" ((long)e)						\
-			: "cc", "%d0", "%d1", "%d2", "%d3",			\
-			  "%d4", "%d5");					\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-mips/unistd.h b/include/asm-mips/unistd.h
index ec56aa5..696cff3 100644
--- a/include/asm-mips/unistd.h
+++ b/include/asm-mips/unistd.h
@@ -933,268 +933,6 @@
 
 #ifndef __ASSEMBLY__
 
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %2\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-/*
- * DANGER: This macro isn't usable for the pipe(2) call
- * which has a unusual return convention.
- */
-#define _syscall1(type,name,atype,a) \
-type name(atype a) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %3\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "r" (__a0), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall2(type,name,atype,a,btype,b) \
-type name(atype a, btype b) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %4\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "r" (__a0), "r" (__a1), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall3(type,name,atype,a,btype,b,ctype,c) \
-type name(atype a, btype b, ctype c) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall4(type,name,atype,a,btype,b,ctype,c,dtype,d) \
-type name(atype a, btype b, ctype c, dtype d) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#if (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-/*
- * Using those means your brain needs more than an oil change ;-)
- */
-
-#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e) \
-type name(atype a, btype b, ctype c, dtype d, etype e) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"lw\t$2, %6\n\t" \
-	"subu\t$29, 32\n\t" \
-	"sw\t$2, 16($29)\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	"addiu\t$29, 32\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name), \
-	  "m" ((unsigned long)e) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall6(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e,ftype,f) \
-type name(atype a, btype b, ctype c, dtype d, etype e, ftype f) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"lw\t$2, %6\n\t" \
-	"lw\t$8, %7\n\t" \
-	"subu\t$29, 32\n\t" \
-	"sw\t$2, 16($29)\n\t" \
-	"sw\t$8, 20($29)\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	"addiu\t$29, 32\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name), \
-	  "m" ((unsigned long)e), "m" ((unsigned long)f) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#endif /* (_MIPS_SIM == _MIPS_SIM_ABI32) */
-
-#if (_MIPS_SIM == _MIPS_SIM_NABI32) || (_MIPS_SIM == _MIPS_SIM_ABI64)
-
-#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e) \
-type name (atype a,btype b,ctype c,dtype d,etype e) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	register unsigned long __a4 asm("$8") = (unsigned long) e; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %6\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "r" (__a4), "i" (__NR_##name) \
-	: "$2", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall6(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e,ftype,f) \
-type name (atype a,btype b,ctype c,dtype d,etype e,ftype f) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	register unsigned long __a4 asm("$8") = (unsigned long) e; \
-	register unsigned long __a5 asm("$9") = (unsigned long) f; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %7\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "r" (__a4), "r" (__a5), \
-	  "i" (__NR_##name) \
-	: "$2", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#endif /* (_MIPS_SIM == _MIPS_SIM_NABI32) || (_MIPS_SIM == _MIPS_SIM_ABI64) */
-
-
 #define __ARCH_OMIT_COMPAT_SYS_GETDENTS64
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 04b6c17..0ae954e 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -334,115 +334,6 @@
 
 #ifndef __ASSEMBLY__
 
-/* On powerpc a system call basically clobbers the same registers like a
- * function call, with the exception of LR (which is needed for the
- * "sc; bnslr" sequence) and CR (where only CR0.SO is clobbered to signal
- * an error return status).
- */
-
-#define __syscall_nr(nr, type, name, args...)				\
-	unsigned long __sc_ret, __sc_err;				\
-	{								\
-		register unsigned long __sc_0  __asm__ ("r0");		\
-		register unsigned long __sc_3  __asm__ ("r3");		\
-		register unsigned long __sc_4  __asm__ ("r4");		\
-		register unsigned long __sc_5  __asm__ ("r5");		\
-		register unsigned long __sc_6  __asm__ ("r6");		\
-		register unsigned long __sc_7  __asm__ ("r7");		\
-		register unsigned long __sc_8  __asm__ ("r8");		\
-									\
-		__sc_loadargs_##nr(name, args);				\
-		__asm__ __volatile__					\
-			("sc           \n\t"				\
-			 "mfcr %0      "				\
-			: "=&r" (__sc_0),				\
-			  "=&r" (__sc_3),  "=&r" (__sc_4),		\
-			  "=&r" (__sc_5),  "=&r" (__sc_6),		\
-			  "=&r" (__sc_7),  "=&r" (__sc_8)		\
-			: __sc_asm_input_##nr				\
-			: "cr0", "ctr", "memory",			\
-			  "r9", "r10","r11", "r12");			\
-		__sc_ret = __sc_3;					\
-		__sc_err = __sc_0;					\
-	}								\
-	if (__sc_err & 0x10000000)					\
-	{								\
-		errno = __sc_ret;					\
-		__sc_ret = -1;						\
-	}								\
-	return (type) __sc_ret
-
-#define __sc_loadargs_0(name, dummy...)					\
-	__sc_0 = __NR_##name
-#define __sc_loadargs_1(name, arg1)					\
-	__sc_loadargs_0(name);						\
-	__sc_3 = (unsigned long) (arg1)
-#define __sc_loadargs_2(name, arg1, arg2)				\
-	__sc_loadargs_1(name, arg1);					\
-	__sc_4 = (unsigned long) (arg2)
-#define __sc_loadargs_3(name, arg1, arg2, arg3)				\
-	__sc_loadargs_2(name, arg1, arg2);				\
-	__sc_5 = (unsigned long) (arg3)
-#define __sc_loadargs_4(name, arg1, arg2, arg3, arg4)			\
-	__sc_loadargs_3(name, arg1, arg2, arg3);			\
-	__sc_6 = (unsigned long) (arg4)
-#define __sc_loadargs_5(name, arg1, arg2, arg3, arg4, arg5)		\
-	__sc_loadargs_4(name, arg1, arg2, arg3, arg4);			\
-	__sc_7 = (unsigned long) (arg5)
-#define __sc_loadargs_6(name, arg1, arg2, arg3, arg4, arg5, arg6)	\
-	__sc_loadargs_5(name, arg1, arg2, arg3, arg4, arg5);		\
-	__sc_8 = (unsigned long) (arg6)
-
-#define __sc_asm_input_0 "0" (__sc_0)
-#define __sc_asm_input_1 __sc_asm_input_0, "1" (__sc_3)
-#define __sc_asm_input_2 __sc_asm_input_1, "2" (__sc_4)
-#define __sc_asm_input_3 __sc_asm_input_2, "3" (__sc_5)
-#define __sc_asm_input_4 __sc_asm_input_3, "4" (__sc_6)
-#define __sc_asm_input_5 __sc_asm_input_4, "5" (__sc_7)
-#define __sc_asm_input_6 __sc_asm_input_5, "6" (__sc_8)
-
-#define _syscall0(type,name)						\
-type name(void)								\
-{									\
-	__syscall_nr(0, type, name);					\
-}
-
-#define _syscall1(type,name,type1,arg1)					\
-type name(type1 arg1)							\
-{									\
-	__syscall_nr(1, type, name, arg1);				\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1, type2 arg2)					\
-{									\
-	__syscall_nr(2, type, name, arg1, arg2);			\
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1, type2 arg2, type3 arg3)				\
-{									\
-	__syscall_nr(3, type, name, arg1, arg2, arg3);			\
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4)		\
-{									\
-	__syscall_nr(4, type, name, arg1, arg2, arg3, arg4);		\
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5)	\
-{									\
-	__syscall_nr(5, type, name, arg1, arg2, arg3, arg4, arg5);	\
-}
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6) \
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) \
-{									\
-	__syscall_nr(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6); \
-}
-
-
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/linkage.h>
diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h
index 71d3c21..fb6fef9 100644
--- a/include/asm-s390/unistd.h
+++ b/include/asm-s390/unistd.h
@@ -345,160 +345,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/err.h>
-
-#define __syscall_return(type, res)			     \
-do {							     \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-		errno = -(res);				     \
-		res = -1;				     \
-	}						     \
-	return (type) (res);				     \
-} while (0)
-
-#define _svc_clobber "1", "cc", "memory"
-
-#define _syscall0(type,name)					\
-type name(void) {						\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name)				\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall1(type,name,type1,arg1)				\
-type name(type1 arg1) {						\
-	register type1 __arg1 asm("2") = arg1;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)		\
-type name(type1 arg1, type2 arg2) {				\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2)					\
-		: _svc_clobber );				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)	\
-type name(type1 arg1, type2 arg2, type3 arg3) {			\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register type3 __arg3 asm("4") = arg3;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2),					\
-		  "d" (__arg3)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,	\
-		  type4,name4)					\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {	\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register type3 __arg3 asm("4") = arg3;			\
-	register type4 __arg4 asm("5") = arg4;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2),					\
-		  "d" (__arg3),					\
-		  "d" (__arg4)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,	\
-		  type4,name4,type5,name5)			\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4,	\
-	  type5 arg5) {						\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register type3 __arg3 asm("4") = arg3;			\
-	register type4 __arg4 asm("5") = arg4;			\
-	register type5 __arg5 asm("6") = arg5;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2),					\
-		  "d" (__arg3),					\
-		  "d" (__arg4),					\
-		  "d" (__arg5)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_SYS_ALARM
diff --git a/include/asm-sh/unistd.h b/include/asm-sh/unistd.h
index 0cae1d2..f982073 100644
--- a/include/asm-sh/unistd.h
+++ b/include/asm-sh/unistd.h
@@ -332,143 +332,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO:
- * see <asm-sh/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* Avoid using "res" which is declared to be in register r0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#if defined(__sh2__) || defined(__SH2E__) || defined(__SH2A__)
-#define SYSCALL_ARG0	"trapa #0x20"
-#define SYSCALL_ARG1	"trapa #0x21"
-#define SYSCALL_ARG2	"trapa #0x22"
-#define SYSCALL_ARG3	"trapa #0x23"
-#define SYSCALL_ARG4	"trapa #0x24"
-#define SYSCALL_ARG5	"trapa #0x25"
-#define SYSCALL_ARG6	"trapa #0x26"
-#else
-#define SYSCALL_ARG0	"trapa #0x10"
-#define SYSCALL_ARG1	"trapa #0x11"
-#define SYSCALL_ARG2	"trapa #0x12"
-#define SYSCALL_ARG3	"trapa #0x13"
-#define SYSCALL_ARG4	"trapa #0x14"
-#define SYSCALL_ARG5	"trapa #0x15"
-#define SYSCALL_ARG6	"trapa #0x16"
-#endif
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-__asm__ __volatile__ (SYSCALL_ARG0 \
-	: "=z" (__sc0) \
-	: "0" (__sc0) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-__asm__ __volatile__ (SYSCALL_ARG1 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4) \
-	: "memory"); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-__asm__ __volatile__ (SYSCALL_ARG2 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5) \
-	: "memory"); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-__asm__ __volatile__ (SYSCALL_ARG3 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6) \
-	: "memory"); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-register long __sc7 __asm__ ("r7") = (long) arg4; \
-__asm__ __volatile__ (SYSCALL_ARG4 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6),  \
-	  "r" (__sc7) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
-{ \
-register long __sc3 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-register long __sc7 __asm__ ("r7") = (long) arg4; \
-register long __sc0 __asm__ ("r0") = (long) arg5; \
-__asm__ __volatile__ (SYSCALL_ARG5 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6), "r" (__sc7),  \
-	  "r" (__sc3) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) \
-{ \
-register long __sc3 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-register long __sc7 __asm__ ("r7") = (long) arg4; \
-register long __sc0 __asm__ ("r0") = (long) arg5; \
-register long __sc1 __asm__ ("r1") = (long) arg6; \
-__asm__ __volatile__ (SYSCALL_ARG6 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6), "r" (__sc7),  \
-	  "r" (__sc3), "r" (__sc1) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
diff --git a/include/asm-sh64/unistd.h b/include/asm-sh64/unistd.h
index ee7828b..1f38a7a 100644
--- a/include/asm-sh64/unistd.h
+++ b/include/asm-sh64/unistd.h
@@ -347,148 +347,6 @@
 #ifdef __KERNEL__ 
 
 #define NR_syscalls 321
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO:
- * see <asm-sh64/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	/* Note: when returning from kernel the return value is in r9	    \
-	**       This prevents conflicts between return value and arg1      \
-	**       when dispatching signal handler, in other words makes	    \
-	**       life easier in the system call epilogue (see entry.S)      \
-	*/								    \
-        register unsigned long __sr2 __asm__ ("r2") = res;		    \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) {	    \
-		errno = -(res);						    \
-		__sr2 = -1; 						    \
-	} \
-	return (type) (__sr2); 						    \
-} while (0)
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x10 << 16) | __NR_##name); \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "()"			    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0) ); 						    \
-__syscall_return(type,__sc0); 						    \
-}
-
-	/*
-	 * The apparent spurious "dummy" assembler comment is *needed*,
-	 * as without it, the compiler treats the arg<n> variables
-	 * as no longer live just before the asm. The compiler can
-	 * then optimize the storage into any registers it wishes.
-	 * The additional dummy statement forces the compiler to put
-	 * the arguments into the correct registers before the TRAPA.
-	 */
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x11 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2)"		    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2));					    \
-__asm__ __volatile__ ("!dummy	%0 %1"				   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2));					    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x12 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3)"		    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3) );			    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2"			   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3) );			    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x13 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4)"		    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4) );		    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3"			   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4) );	   	    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x14 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-register unsigned long __sc5 __asm__ ("r5") = (unsigned long) arg4;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4,%5)"	    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5) );\
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3 %4"			   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5) );\
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x15 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-register unsigned long __sc5 __asm__ ("r5") = (unsigned long) arg4;	    \
-register unsigned long __sc6 __asm__ ("r6") = (unsigned long) arg5;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4,%5,%6)"	    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6));							    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3 %4 %5"		   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6));							    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5, type6, arg6) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x16 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-register unsigned long __sc5 __asm__ ("r5") = (unsigned long) arg4;	    \
-register unsigned long __sc6 __asm__ ("r6") = (unsigned long) arg5;	    \
-register unsigned long __sc7 __asm__ ("r7") = (unsigned long) arg6;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4,%5,%6,%7)"	    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6), "r" (__sc7));					    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3 %4 %5 %6"		   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6), "r" (__sc7));					    \
-__syscall_return(type,__sc0); 						    \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-sparc/unistd.h b/include/asm-sparc/unistd.h
index f7827fa..d5b2f80 100644
--- a/include/asm-sparc/unistd.h
+++ b/include/asm-sparc/unistd.h
@@ -329,136 +329,6 @@
  *          find a free slot in the 0-302 range.
  */
 
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res)\
-		      : "r" (__g1) \
-		      : "o0", "cc"); \
-if (__res < -255 || __res >= 0) \
-    return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-register long __o4 __asm__ ("o4") = (long)(arg5); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__o4), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h
index 63669da..4704753 100644
--- a/include/asm-sparc64/unistd.h
+++ b/include/asm-sparc64/unistd.h
@@ -332,124 +332,6 @@
  *          find a free slot in the 0-302 range.
  */
 
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res)\
-		      : "r" (__g1) \
-		      : "o0", "cc"); \
-if (__res >= 0) \
-    return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__g1) \
-		      : "cc"); \
-if (__res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__g1) \
-		      : "cc"); \
-if (__res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1) \
-		      : "cc"); \
-if (__res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__g1) \
-		      : "cc"); \
-if (__res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-register long __o4 __asm__ ("o4") = (long)(arg5); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__o4), "r" (__g1) \
-		      : "cc"); \
-if (__res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
 /* sysconf options, for SunOS compatibility */
 #define   _SC_ARG_MAX             1
 #define   _SC_CHILD_MAX           2
diff --git a/include/asm-v850/unistd.h b/include/asm-v850/unistd.h
index 737401e..2241ed4 100644
--- a/include/asm-v850/unistd.h
+++ b/include/asm-v850/unistd.h
@@ -204,168 +204,8 @@
 #define __NR_gettid		201
 #define __NR_tkill		202
 
-
-/* Syscall protocol:
-   Syscall number in r12, args in r6-r9, r13-r14
-   Return value in r10
-   Trap 0 for `short' syscalls, where all the args can fit in function
-   call argument registers, and trap 1 when there are additional args in
-   r13-r14.  */
-
-#define SYSCALL_NUM	"r12"
-#define SYSCALL_ARG0	"r6"
-#define SYSCALL_ARG1	"r7"
-#define SYSCALL_ARG2	"r8"
-#define SYSCALL_ARG3	"r9"
-#define SYSCALL_ARG4	"r13"
-#define SYSCALL_ARG5	"r14"
-#define SYSCALL_RET	"r10"
-
-#define SYSCALL_SHORT_TRAP	"0"
-#define SYSCALL_LONG_TRAP	"1"
-
-/* Registers clobbered by any syscall.  This _doesn't_ include the syscall
-   number (r12) or the `extended arg' registers (r13, r14), even though
-   they are actually clobbered too (this is because gcc's `asm' statement
-   doesn't allow a clobber to be used as an input or output).  */
-#define SYSCALL_CLOBBERS	"r1", "r5", "r11", "r15", "r16", \
-				"r17", "r18", "r19"
-
-/* Registers clobbered by a `short' syscall.  This includes all clobbers
-   except the syscall number (r12).  */
-#define SYSCALL_SHORT_CLOBBERS	SYSCALL_CLOBBERS, "r13", "r14"
-
 #ifdef __KERNEL__
 
-#include <asm/clinkage.h>
-#include <linux/err.h>
-
-#define __syscall_return(type, res)					      \
-  do {									      \
-	  /* user-visible error numbers are in the range -1 - -MAX_ERRNO:      \
-	     see <asm-v850/errno.h> */					      \
-	  if (__builtin_expect ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO), 0)) { \
-		  errno = -(res);					      \
-		  res = -1;						      \
-	  }								      \
-	  return (type) (res);						      \
-  } while (0)
-
-
-#define _syscall0(type, name)						      \
-type name (void)							      \
-{									      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)	 	      \
-			: "1" (__syscall)				      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall1(type, name, atype, a)					      \
-type name (atype a)							      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall), "r" (__a)			      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall2(type, name, atype, a, btype, b)			      \
-type name (atype a, btype b)						      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall), "r" (__a), "r" (__b)		      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall3(type, name, atype, a, btype, b, ctype, c)		      \
-type name (atype a, btype b, ctype c)					      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall), "r" (__a), "r" (__b), "r" (__c)    \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall4(type, name, atype, a, btype, b, ctype, c, dtype, d)	      \
-type name (atype a, btype b, ctype c, dtype d)				      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register dtype __d __asm__ (SYSCALL_ARG3) = d;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall),				      \
-			"r" (__a), "r" (__b), "r" (__c), "r" (__d)	      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall5(type, name, atype, a, btype, b, ctype, c, dtype, d, etype,e)\
-type name (atype a, btype b, ctype c, dtype d, etype e)			      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register dtype __d __asm__ (SYSCALL_ARG3) = d;			      \
-  register etype __e __asm__ (SYSCALL_ARG4) = e;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall), "=r" (__e)	      \
-			: "1" (__syscall),				      \
-			"r" (__a), "r" (__b), "r" (__c), "r" (__d), "2" (__e) \
-			: SYSCALL_CLOBBERS);				      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f)			      \
-  __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
-			: "=r" (ret), "=r" (syscall),			      \
- 			"=r" (e), "=r" (f)				      \
-			: "1" (syscall),				      \
-			"r" (a), "r" (b), "r" (c), "r" (d),		      \
-			"2" (e), "3" (f)				      \
-			: SYSCALL_CLOBBERS);
-
-#define _syscall6(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e, ftype, f) \
-type name (atype a, btype b, ctype c, dtype d, etype e, ftype f)	      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register dtype __d __asm__ (SYSCALL_ARG3) = d;			      \
-  register etype __e __asm__ (SYSCALL_ARG4) = e;			      \
-  register etype __f __asm__ (SYSCALL_ARG5) = f;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __SYSCALL6_TRAP(__syscall, __ret, __a, __b, __c, __d, __e, __f);	      \
-  __syscall_return (type, __ret);					      \
-}
-		
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 777288e..c5f596e 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -622,25 +622,7 @@ __SYSCALL(__NR_move_pages, sys_move_pages)
 
 #define __NR_syscall_max __NR_move_pages
 
-#ifdef __KERNEL__
-#include <linux/err.h>
-#endif
-
 #ifndef __NO_STUBS
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO */
-
-#define __syscall_clobber "r11","rcx","memory" 
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-		errno = -(res); \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
 #define __ARCH_WANT_SYS_ALARM
@@ -664,87 +646,6 @@ do { \
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_TIME
 
-#define __syscall "syscall"
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name) : __syscall_clobber ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)) : __syscall_clobber ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)) : __syscall_clobber ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-		  "d" ((long)(arg3)) : __syscall_clobber); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-__asm__ volatile ("movq %5,%%r10 ;" __syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"g" ((long)(arg4)) : __syscall_clobber,"r10" ); \
-__syscall_return(type,__res); \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)) : \
-	__syscall_clobber,"r8","r10" ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5,type6,arg6) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \
-{ \
-long __res; \
-__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9 ; " __syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-	  "d" ((long)(arg3)), "g" ((long)(arg4)), "g" ((long)(arg5)), \
-	  "g" ((long)(arg6)) : \
-	__syscall_clobber,"r8","r10","r9" ); \
-__syscall_return(type,__res); \
-}
-
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-xtensa/unistd.h b/include/asm-xtensa/unistd.h
index 411f810..2e1a1b9 100644
--- a/include/asm-xtensa/unistd.h
+++ b/include/asm-xtensa/unistd.h
@@ -218,190 +218,6 @@
 
 #define SYSXTENSA_COUNT		   5	/* count of syscall0 functions*/
 
-#ifdef __KERNEL__
-#include <linux/linkage.h>
-
-#define __syscall_return(type, res) return ((type)(res))
-
-/* Tensilica's xt-xcc compiler is much more agressive at code
- * optimization than gcc.  Multiple __asm__ statements are
- * insufficient for xt-xcc because subsequent optimization passes
- * (beyond the front-end that knows of __asm__ statements and other
- * such GNU Extensions to C) can modify the register selection for
- * containment of C variables.
- *
- * xt-xcc cannot modify the contents of a single __asm__ statement, so
- * we create single-asm versions of the syscall macros that are
- * suitable and optimal for both xt-xcc and gcc.
- *
- * Linux takes system-call arguments in registers.  The following
- * design is optimized for user-land apps (e.g., glibc) which
- * typically have a function wrapper around the "syscall" assembly
- * instruction.  It satisfies the Xtensa ABI while minizing argument
- * shifting.
- *
- * The Xtensa ABI and software conventions require the system-call
- * number in a2.  If an argument exists in a2, we move it to the next
- * available register.  Note that for improved efficiency, we do NOT
- * shift all parameters down one register to maintain the original
- * order.
- *
- * At best case (zero arguments), we just write the syscall number to
- * a2.  At worst case (1 to 6 arguments), we move the argument in a2
- * to the next available register, then write the syscall number to
- * a2.
- *
- * For clarity, the following truth table enumerates all possibilities.
- *
- * arguments	syscall number	arg0, arg1, arg2, arg3, arg4, arg5
- * ---------	--------------	----------------------------------
- *	0	      a2
- *	1	      a2	a3
- *	2	      a2	a4,   a3
- *	3	      a2	a5,   a3,   a4
- *	4	      a2	a6,   a3,   a4,   a5
- *	5	      a2	a7,   a3,   a4,   a5,   a6
- *	6	      a2	a8,   a3,   a4,   a5,   a6,   a7
- */
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name) \
-	: "a2" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type0,arg0) \
-type name(type0 arg0) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a3, %2 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0) \
-	: "a2", "a3" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type0,arg0,type1,arg1) \
-type name(type0 arg0,type1 arg1) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a4, %2 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1) \
-	: "a2", "a3", "a4" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type0,arg0,type1,arg1,type2,arg2) \
-type name(type0 arg0,type1 arg1,type2 arg2) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a5, %2 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2) \
-	: "a2", "a3", "a4", "a5" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type0,arg0,type1,arg1,type2,arg2,type3,arg3) \
-type name(type0 arg0,type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a6, %2 \n" \
-	"  mov   a5, %5 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2), "a" (arg3) \
-	: "a2", "a3", "a4", "a5", "a6" \
-	); \
-__syscall_return(type,__res); \
-}
-
-/* Note that we save and restore the a7 frame pointer.
- * Including a7 in the clobber list doesn't do what you'd expect.
- */
-#define _syscall5(type,name,type0,arg0,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name(type0 arg0,type1 arg1,type2 arg2,type3 arg3,type4 arg4) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a9, a7 \n" \
-	"  mov   a7, %2 \n" \
-	"  mov   a6, %6 \n" \
-	"  mov   a5, %5 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   a7, a9 \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2), \
-                             "a" (arg3), "a" (arg4) \
-	: "a2", "a3", "a4", "a5", "a6", "a9" \
-	); \
-__syscall_return(type,__res); \
-}
-
-/* Note that we save and restore the a7 frame pointer.
- * Including a7 in the clobber list doesn't do what you'd expect.
- */
-#define _syscall6(type,name,type0,arg0,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name(type0 arg0,type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a9, a7 \n" \
-	"  mov   a8, %2 \n" \
-	"  mov   a7, %7 \n" \
-	"  mov   a6, %6 \n" \
-	"  mov   a5, %5 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   a7, a9 \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2), \
-                             "a" (arg3), "a" (arg4), "a" (arg5)  \
-	: "a2", "a3", "a4", "a5", "a6", "a8", "a9" \
-	); \
-__syscall_return(type,__res); \
-}
-
 /*
  * "Conditional" syscalls
  *
-- 
cgit v0.10.2


From 93f210dd9e614ddab7ecef0b4c9ba6ad3720d860 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:37:33 -0800
Subject: [PATCH] protect ext2 ioctl modifying append_only immutable etc with
 i_mutex

Port commit a090d9132c1e53e3517111123680c15afb25c0a4 into ext2:

All modifications of ->i_flags in inodes that might be visible to somebody
else must be under ->i_mutex.  That patch fixes ext2 ioctl() setting S_APPEND.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1dfba77..e3cf8c8 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -44,6 +44,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~EXT2_DIRSYNC_FL;
 
+		mutex_lock(&inode->i_mutex);
 		oldflags = ei->i_flags;
 
 		/*
@@ -53,13 +54,16 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
 				return -EPERM;
+			}
 		}
 
 		flags = flags & EXT2_FL_USER_MODIFIABLE;
 		flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
 		ei->i_flags = flags;
+		mutex_unlock(&inode->i_mutex);
 
 		ext2_set_inode_flags(inode);
 		inode->i_ctime = CURRENT_TIME_SEC;
-- 
cgit v0.10.2


From 04903664325acb3f199dd8a4b8f1aa437e9fd6b2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:37:33 -0800
Subject: [PATCH] remove HASH_HIGHMEM

It has no users and it's doubtful that we'll need it again.

Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 31e9abb..2275f27 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -119,8 +119,7 @@ extern void *alloc_large_system_hash(const char *tablename,
 				     unsigned int *_hash_mask,
 				     unsigned long limit);
 
-#define HASH_HIGHMEM	0x00000001	/* Consider highmem? */
-#define HASH_EARLY	0x00000002	/* Allocating during early boot? */
+#define HASH_EARLY	0x00000001	/* Allocating during early boot? */
 
 /* Only NUMA needs hash distribution.
  * IA64 is known to have sufficient vmalloc space.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d539f83..2273952 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3226,7 +3226,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
 		/* round applicable memory size up to nearest megabyte */
-		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+		numentries = nr_kernel_pages;
 		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
 		numentries >>= 20 - PAGE_SHIFT;
 		numentries <<= 20 - PAGE_SHIFT;
-- 
cgit v0.10.2


From e92a4d595b464c4aae64be39ca61a9ffe9c8b278 Mon Sep 17 00:00:00 2001
From: Andrey Savochkin <saw@sw.ru>
Date: Wed, 6 Dec 2006 20:37:34 -0800
Subject: [PATCH] retries in ext3_prepare_write() violate ordering requirements

In journal=ordered or journal=data mode retry in ext3_prepare_write()
breaks the requirements of journaling of data with respect to metadata.
The fix is to call commit_write to commit allocated zero blocks before
retry.

Signed-off-by: Kirill Korotaev <dev@openvz.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ken Chen <kenneth.w.chen@intel.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 03ba5bc..beaf25f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1148,37 +1148,102 @@ static int do_journal_get_write_access(handle_t *handle,
 	return ext3_journal_get_write_access(handle, bh);
 }
 
+/*
+ * The idea of this helper function is following:
+ * if prepare_write has allocated some blocks, but not all of them, the
+ * transaction must include the content of the newly allocated blocks.
+ * This content is expected to be set to zeroes by block_prepare_write().
+ * 2006/10/14  SAW
+ */
+static int ext3_prepare_failure(struct file *file, struct page *page,
+				unsigned from, unsigned to)
+{
+	struct address_space *mapping;
+	struct buffer_head *bh, *head, *next;
+	unsigned block_start, block_end;
+	unsigned blocksize;
+	int ret;
+	handle_t *handle = ext3_journal_current_handle();
+
+	mapping = page->mapping;
+	if (ext3_should_writeback_data(mapping->host)) {
+		/* optimization: no constraints about data */
+skip:
+		return ext3_journal_stop(handle);
+	}
+
+	head = page_buffers(page);
+	blocksize = head->b_size;
+	for (	bh = head, block_start = 0;
+		bh != head || !block_start;
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from)
+			continue;
+		if (block_start >= to) {
+			block_start = to;
+			break;
+		}
+		if (!buffer_mapped(bh))
+		/* prepare_write failed on this bh */
+			break;
+		if (ext3_should_journal_data(mapping->host)) {
+			ret = do_journal_get_write_access(handle, bh);
+			if (ret) {
+				ext3_journal_stop(handle);
+				return ret;
+			}
+		}
+	/*
+	 * block_start here becomes the first block where the current iteration
+	 * of prepare_write failed.
+	 */
+	}
+	if (block_start <= from)
+		goto skip;
+
+	/* commit allocated and zeroed buffers */
+	return mapping->a_ops->commit_write(file, page, from, block_start);
+}
+
 static int ext3_prepare_write(struct file *file, struct page *page,
 			      unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
-	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret, ret2;
+	int needed_blocks = ext3_writepage_trans_blocks(inode);
 	handle_t *handle;
 	int retries = 0;
 
 retry:
 	handle = ext3_journal_start(inode, needed_blocks);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
-	}
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
 		ret = nobh_prepare_write(page, from, to, ext3_get_block);
 	else
 		ret = block_prepare_write(page, from, to, ext3_get_block);
 	if (ret)
-		goto prepare_write_failed;
+		goto failure;
 
 	if (ext3_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
 				from, to, NULL, do_journal_get_write_access);
+		if (ret)
+			/* fatal error, just put the handle and return */
+			journal_stop(handle);
 	}
-prepare_write_failed:
-	if (ret)
-		ext3_journal_stop(handle);
+	return ret;
+
+failure:
+	ret2 = ext3_prepare_failure(file, page, from, to);
+	if (ret2 < 0)
+		return ret2;
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-out:
+	/* retry number exceeded, or other error like -EDQUOT */
 	return ret;
 }
 
-- 
cgit v0.10.2


From b46be05004abb419e303e66e143eed9f8a6e9f3f Mon Sep 17 00:00:00 2001
From: Andrey Savochkin <saw@sw.ru>
Date: Wed, 6 Dec 2006 20:37:36 -0800
Subject: [PATCH] retries in ext4_prepare_write() violate ordering requirements

In journal=ordered or journal=data mode retry in ext4_prepare_write()
breaks the requirements of journaling of data with respect to metadata.
The fix is to call commit_write to commit allocated zero blocks before
retry.

Signed-off-by: Kirill Korotaev <dev@openvz.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ken Chen <kenneth.w.chen@intel.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0a60ec5..1d85d4e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1147,37 +1147,102 @@ static int do_journal_get_write_access(handle_t *handle,
 	return ext4_journal_get_write_access(handle, bh);
 }
 
+/*
+ * The idea of this helper function is following:
+ * if prepare_write has allocated some blocks, but not all of them, the
+ * transaction must include the content of the newly allocated blocks.
+ * This content is expected to be set to zeroes by block_prepare_write().
+ * 2006/10/14  SAW
+ */
+static int ext4_prepare_failure(struct file *file, struct page *page,
+				unsigned from, unsigned to)
+{
+	struct address_space *mapping;
+	struct buffer_head *bh, *head, *next;
+	unsigned block_start, block_end;
+	unsigned blocksize;
+	int ret;
+	handle_t *handle = ext4_journal_current_handle();
+
+	mapping = page->mapping;
+	if (ext4_should_writeback_data(mapping->host)) {
+		/* optimization: no constraints about data */
+skip:
+		return ext4_journal_stop(handle);
+	}
+
+	head = page_buffers(page);
+	blocksize = head->b_size;
+	for (	bh = head, block_start = 0;
+		bh != head || !block_start;
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from)
+			continue;
+		if (block_start >= to) {
+			block_start = to;
+			break;
+		}
+		if (!buffer_mapped(bh))
+		/* prepare_write failed on this bh */
+			break;
+		if (ext4_should_journal_data(mapping->host)) {
+			ret = do_journal_get_write_access(handle, bh);
+			if (ret) {
+				ext4_journal_stop(handle);
+				return ret;
+			}
+		}
+	/*
+	 * block_start here becomes the first block where the current iteration
+	 * of prepare_write failed.
+	 */
+	}
+	if (block_start <= from)
+		goto skip;
+
+	/* commit allocated and zeroed buffers */
+	return mapping->a_ops->commit_write(file, page, from, block_start);
+}
+
 static int ext4_prepare_write(struct file *file, struct page *page,
 			      unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
-	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	int ret, ret2;
+	int needed_blocks = ext4_writepage_trans_blocks(inode);
 	handle_t *handle;
 	int retries = 0;
 
 retry:
 	handle = ext4_journal_start(inode, needed_blocks);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
-	}
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_prepare_write(page, from, to, ext4_get_block);
 	else
 		ret = block_prepare_write(page, from, to, ext4_get_block);
 	if (ret)
-		goto prepare_write_failed;
+		goto failure;
 
 	if (ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
 				from, to, NULL, do_journal_get_write_access);
+		if (ret)
+			/* fatal error, just put the handle and return */
+			journal_stop(handle);
 	}
-prepare_write_failed:
-	if (ret)
-		ext4_journal_stop(handle);
+	return ret;
+
+failure:
+	ret2 = ext4_prepare_failure(file, page, from, to);
+	if (ret2 < 0)
+		return ret2;
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-out:
+	/* retry number exceeded, or other error like -EDQUOT */
 	return ret;
 }
 
-- 
cgit v0.10.2


From cfd1893477fa94bb0915e39afa2f044ac978b5c6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 6 Dec 2006 20:37:38 -0800
Subject: [PATCH] ktime: Fix signed / unsigned mismatch in ktime_to_ns

The 32 bit implementation of ktime_to_ns returns unsigned value, while the
64 bit version correctly returns an signed value.  There is no current user
affected by this, but it has to be fixed, as ktime values can be negative.

Pointed-out-by: Helmut Duregger <Helmut.Duregger@student.uibk.ac.at>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 84eeecd..611f17f 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -248,9 +248,9 @@ static inline struct timeval ktime_to_timeval(const ktime_t kt)
  *
  * Returns the scalar nanoseconds representation of kt
  */
-static inline u64 ktime_to_ns(const ktime_t kt)
+static inline s64 ktime_to_ns(const ktime_t kt)
 {
-	return (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec;
+	return (s64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec;
 }
 
 #endif
-- 
cgit v0.10.2


From 44ddc4f5673a62c9ecdbb7b502fe7b8206b0f945 Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:37:41 -0800
Subject: [PATCH] qconf: support old QT

Might make qconf compilable with qt-3.1 as well as qt-3.3

Cc: greg chesson <xtp@google.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/scripts/kconfig/qconf.cc b/scripts/kconfig/qconf.cc
index 338bdea..f5628c5 100644
--- a/scripts/kconfig/qconf.cc
+++ b/scripts/kconfig/qconf.cc
@@ -798,7 +798,7 @@ void ConfigList::contextMenuEvent(QContextMenuEvent *e)
 			QAction *action;
 
 			headerPopup = new QPopupMenu(this);
-			action = new QAction("Show Name", 0, this);
+			action = new QAction(NULL, "Show Name", 0, this);
 			  action->setToggleAction(TRUE);
 			  connect(action, SIGNAL(toggled(bool)),
 				  parent(), SLOT(setShowName(bool)));
@@ -806,7 +806,7 @@ void ConfigList::contextMenuEvent(QContextMenuEvent *e)
 				  action, SLOT(setOn(bool)));
 			  action->setOn(showName);
 			  action->addTo(headerPopup);
-			action = new QAction("Show Range", 0, this);
+			action = new QAction(NULL, "Show Range", 0, this);
 			  action->setToggleAction(TRUE);
 			  connect(action, SIGNAL(toggled(bool)),
 				  parent(), SLOT(setShowRange(bool)));
@@ -814,7 +814,7 @@ void ConfigList::contextMenuEvent(QContextMenuEvent *e)
 				  action, SLOT(setOn(bool)));
 			  action->setOn(showRange);
 			  action->addTo(headerPopup);
-			action = new QAction("Show Data", 0, this);
+			action = new QAction(NULL, "Show Data", 0, this);
 			  action->setToggleAction(TRUE);
 			  connect(action, SIGNAL(toggled(bool)),
 				  parent(), SLOT(setShowData(bool)));
@@ -1161,7 +1161,7 @@ void ConfigInfoView::expr_print_help(void *data, struct symbol *sym, const char
 QPopupMenu* ConfigInfoView::createPopupMenu(const QPoint& pos)
 {
 	QPopupMenu* popup = Parent::createPopupMenu(pos);
-	QAction* action = new QAction("Show Debug Info", 0, popup);
+	QAction* action = new QAction(NULL,"Show Debug Info", 0, popup);
 	  action->setToggleAction(TRUE);
 	  connect(action, SIGNAL(toggled(bool)), SLOT(setShowDebug(bool)));
 	  connect(this, SIGNAL(showDebugChanged(bool)), action, SLOT(setOn(bool)));
-- 
cgit v0.10.2


From c36264dfb2d6fa6383082de0a1bba8e12b477da1 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 6 Dec 2006 20:37:42 -0800
Subject: [PATCH] remove the syslog interface when printk is disabled

Attempts to read() from the non-existent dmesg buffer will return zero and
userspace tends to get stuck in a busyloop.

So just remove /dev/kmsg altogether if CONFIG_PRINTK=n.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7431d7b..f6c7762 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,8 +8,9 @@ proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		kmsg.o proc_tty.o proc_misc.o
+		proc_tty.o proc_misc.o
 
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
 proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
+proc-$(CONFIG_PRINTK)	+= kmsg.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 93c43b6..51815ce 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -696,9 +696,11 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
+#ifdef CONFIG_PRINTK
 	entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
 	if (entry)
 		entry->proc_fops = &proc_kmsg_operations;
+#endif
 	create_seq_entry("devices", 0, &proc_devinfo_operations);
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
 #ifdef CONFIG_BLOCK
diff --git a/kernel/printk.c b/kernel/printk.c
index 6642655..ba59c2a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -631,12 +631,7 @@ EXPORT_SYMBOL(vprintk);
 
 asmlinkage long sys_syslog(int type, char __user *buf, int len)
 {
-	return 0;
-}
-
-int do_syslog(int type, char __user *buf, int len)
-{
-	return 0;
+	return -ENOSYS;
 }
 
 static void call_console_drivers(unsigned long start, unsigned long end)
-- 
cgit v0.10.2


From 319e799abb89d9215a203e32c2cad51115d302f4 Mon Sep 17 00:00:00 2001
From: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Date: Wed, 6 Dec 2006 20:37:43 -0800
Subject: [PATCH] ver_linux additions

scripts/ver_linux needed some minor clean-ups, as follows:
1) Add reporting of actual oprofile release
2) Add reporting of actual wireless-tools release
3) Add reporting of actual pcmciautils release

Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/scripts/ver_linux b/scripts/ver_linux
index 84999f6..72876df 100755
--- a/scripts/ver_linux
+++ b/scripts/ver_linux
@@ -48,6 +48,8 @@ fsck.reiser4 -V 2>&1 | grep ^fsck.reiser4 | awk \
 xfs_db -V 2>&1 | grep version | awk \
 'NR==1{print "xfsprogs              ", $3}'
 
+pccardctl -V 2>&1| grep pcmciautils | awk '{print "pcmciautils           ", $2}'
+
 cardmgr -V 2>&1| grep version | awk \
 'NR==1{print "pcmcia-cs             ", $3}'
 
@@ -87,10 +89,16 @@ loadkeys -h 2>&1 | awk \
 loadkeys -V 2>&1 | awk \
 '(NR==1 && ($2 ~ /console-tools/)) {print "Console-tools         ", $3}'
 
+oprofiled --version 2>&1 | awk \
+'(NR==1 && ($2 == "oprofile")) {print "oprofile              ", $3}'
+
 expr --v 2>&1 | awk 'NR==1{print "Sh-utils              ", $NF}'
 
 udevinfo -V 2>&1 | grep version | awk '{print "udev                  ", $3}'
 
+iwconfig --version 2>&1 | awk \
+'(NR==1 && ($3 == "version")) {print "wireless-tools        ",$4}'
+
 if [ -e /proc/modules ]; then
     X=`cat /proc/modules | sed -e "s/ .*$//"`
     echo "Modules Loaded         "$X
-- 
cgit v0.10.2


From 19e5d9c0d2194b4b47189cbec2921cbf72b0bd1c Mon Sep 17 00:00:00 2001
From: Henry Nestler <henry.ne@arcor.de>
Date: Wed, 6 Dec 2006 20:37:45 -0800
Subject: [PATCH] initrd: remove unused false condition for initrd_start

After LOADER_TYPE && INITRD_START are true, the short if-condition
for INITRD_START can never be false.

Remove unused code from the else condition.

Signed-off-by: Henry Nestler <henry.ne@arcor.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c
index a8c61da..1a5eb6c 100644
--- a/arch/frv/kernel/setup.c
+++ b/arch/frv/kernel/setup.c
@@ -947,7 +947,7 @@ static void __init setup_linux_memory(void)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (low_top_pfn << PAGE_SHIFT)) {
 			reserve_bootmem(INITRD_START, INITRD_SIZE);
-			initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start + INITRD_SIZE;
 		}
 		else {
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 141041d..97bb869 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1162,8 +1162,7 @@ void __init setup_bootmem_allocator(void)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
 			reserve_bootmem(INITRD_START, INITRD_SIZE);
-			initrd_start =
-				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start+INITRD_SIZE;
 		}
 		else {
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index 0e7778b..936205f 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -196,9 +196,7 @@ static unsigned long __init setup_memory(void)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
 			reserve_bootmem(INITRD_START, INITRD_SIZE);
-			initrd_start = INITRD_START ?
-				INITRD_START + PAGE_OFFSET : 0;
-
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start + INITRD_SIZE;
 			printk("initrd:start[%08lx],size[%08lx]\n",
 				initrd_start, INITRD_SIZE);
diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c
index abb34cc..c7efdb0 100644
--- a/arch/m32r/mm/discontig.c
+++ b/arch/m32r/mm/discontig.c
@@ -105,9 +105,7 @@ unsigned long __init setup_memory(void)
 		if (INITRD_START + INITRD_SIZE <= PFN_PHYS(max_low_pfn)) {
 			reserve_bootmem_node(NODE_DATA(0), INITRD_START,
 				INITRD_SIZE);
-			initrd_start = INITRD_START ?
-				INITRD_START + PAGE_OFFSET : 0;
-
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start + INITRD_SIZE;
 			printk("initrd:start[%08lx],size[%08lx]\n",
 				initrd_start, INITRD_SIZE);
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 696ca75..f8dd6b7 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -332,8 +332,7 @@ void __init setup_arch(char **cmdline_p)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
 			reserve_bootmem_node(NODE_DATA(0), INITRD_START+__MEMORY_START, INITRD_SIZE);
-			initrd_start =
-				INITRD_START ? INITRD_START + PAGE_OFFSET + __MEMORY_START : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET + __MEMORY_START;
 			initrd_end = initrd_start + INITRD_SIZE;
 		} else {
 			printk("initrd extends beyond end of memory "
diff --git a/arch/sh64/kernel/setup.c b/arch/sh64/kernel/setup.c
index ffb310e..b9e7d54 100644
--- a/arch/sh64/kernel/setup.c
+++ b/arch/sh64/kernel/setup.c
@@ -243,9 +243,7 @@ void __init setup_arch(char **cmdline_p)
 		if (INITRD_START + INITRD_SIZE <= (PFN_PHYS(last_pfn))) {
 		        reserve_bootmem_node(NODE_DATA(0), INITRD_START + __MEMORY_START, INITRD_SIZE);
 
-			initrd_start =
-			  (long) INITRD_START ? INITRD_START + PAGE_OFFSET +  __MEMORY_START : 0;
-
+			initrd_start = (long) INITRD_START + PAGE_OFFSET + __MEMORY_START;
 			initrd_end = initrd_start + INITRD_SIZE;
 		} else {
 			printk("initrd extends beyond end of memory "
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index fc944b5..f12f266 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -471,8 +471,7 @@ void __init setup_arch(char **cmdline_p)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
 			reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
-			initrd_start =
-				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start+INITRD_SIZE;
 		}
 		else {
-- 
cgit v0.10.2


From 651971cb7242e8f6d7ebd153e69bd271cb731223 Mon Sep 17 00:00:00 2001
From: suzuki <suzuki@linux.vnet.ibm.com>
Date: Wed, 6 Dec 2006 20:37:48 -0800
Subject: [PATCH] Fix the size limit of compat space msgsize

Currently we allocate 64k space on the user stack and use it the msgbuf for
sys_{msgrcv,msgsnd} for compat and the results are later copied in user [
by copy_in_user].  This patch introduces helper routines for
sys_{msgrcv,msgsnd} as below:

do_msgsnd() : Accepts the mtype and user space ptr to the buffer along with
the msqid and msgflg.

do_msgrcv() : Accepts a kernel space ptr to mtype and a userspace ptr to
the buffer.  The mtype has to be copied back the user space msgbuf by the
caller.

These changes avoid the need to allocate the msgsize on the userspace (
thus removing the size limt ) and the overhead of an extra copy_in_user().

Signed-off-by: Suzuki K P <suzuki@in.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/msg.h b/include/linux/msg.h
index acc7c17..f1b60740 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -92,6 +92,12 @@ struct msg_queue {
 	struct list_head q_senders;
 };
 
+/* Helper routines for sys_msgsnd and sys_msgrcv */
+extern long do_msgsnd(int msqid, long mtype, void __user *mtext,
+			size_t msgsz, int msgflg);
+extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
+			size_t msgsz, long msgtyp, int msgflg);
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_MSG_H */
diff --git a/ipc/compat.c b/ipc/compat.c
index 4d20cfd..fa18141 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -115,7 +115,6 @@ struct compat_shm_info {
 
 extern int sem_ctls[];
 #define sc_semopm	(sem_ctls[2])
-#define MAXBUF (64*1024)
 
 static inline int compat_ipc_parse_version(int *cmd)
 {
@@ -307,35 +306,30 @@ long compat_sys_semctl(int first, int second, int third, void __user *uptr)
 
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr)
 {
-	struct msgbuf __user *p;
 	struct compat_msgbuf __user *up = uptr;
 	long type;
 
 	if (first < 0)
 		return -EINVAL;
-	if (second < 0 || (second >= MAXBUF - sizeof(struct msgbuf)))
+	if (second < 0)
 		return -EINVAL;
 
-	p = compat_alloc_user_space(second + sizeof(struct msgbuf));
-	if (get_user(type, &up->mtype) ||
-	    put_user(type, &p->mtype) ||
-	    copy_in_user(p->mtext, up->mtext, second))
+	if (get_user(type, &up->mtype))
 		return -EFAULT;
 
-	return sys_msgsnd(first, p, second, third);
+	return do_msgsnd(first, type, up->mtext, second, third);
 }
 
 long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
 			   int version, void __user *uptr)
 {
-	struct msgbuf __user *p;
 	struct compat_msgbuf __user *up;
 	long type;
 	int err;
 
 	if (first < 0)
 		return -EINVAL;
-	if (second < 0 || (second >= MAXBUF - sizeof(struct msgbuf)))
+	if (second < 0)
 		return -EINVAL;
 
 	if (!version) {
@@ -349,14 +343,11 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
 		uptr = compat_ptr(ipck.msgp);
 		msgtyp = ipck.msgtyp;
 	}
-	p = compat_alloc_user_space(second + sizeof(struct msgbuf));
-	err = sys_msgrcv(first, p, second, msgtyp, third);
+	up = uptr;
+	err = do_msgrcv(first, &type, up->mtext, second, msgtyp, third);
 	if (err < 0)
 		goto out;
-	up = uptr;
-	if (get_user(type, &p->mtype) ||
-	    put_user(type, &up->mtype) ||
-	    copy_in_user(up->mtext, p->mtext, err))
+	if (put_user(type, &up->mtype))
 		err = -EFAULT;
 out:
 	return err;
diff --git a/ipc/msg.c b/ipc/msg.c
index 1266b1d..a388824 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -626,12 +626,11 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
 	return 0;
 }
 
-asmlinkage long
-sys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg)
+long do_msgsnd(int msqid, long mtype, void __user *mtext,
+		size_t msgsz, int msgflg)
 {
 	struct msg_queue *msq;
 	struct msg_msg *msg;
-	long mtype;
 	int err;
 	struct ipc_namespace *ns;
 
@@ -639,12 +638,10 @@ sys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg)
 
 	if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0)
 		return -EINVAL;
-	if (get_user(mtype, &msgp->mtype))
-		return -EFAULT;
 	if (mtype < 1)
 		return -EINVAL;
 
-	msg = load_msg(msgp->mtext, msgsz);
+	msg = load_msg(mtext, msgsz);
 	if (IS_ERR(msg))
 		return PTR_ERR(msg);
 
@@ -723,6 +720,16 @@ out_free:
 	return err;
 }
 
+asmlinkage long
+sys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg)
+{
+	long mtype;
+
+	if (get_user(mtype, &msgp->mtype))
+		return -EFAULT;
+	return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg);
+}
+
 static inline int convert_mode(long *msgtyp, int msgflg)
 {
 	/*
@@ -742,8 +749,8 @@ static inline int convert_mode(long *msgtyp, int msgflg)
 	return SEARCH_EQUAL;
 }
 
-asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz,
-			   long msgtyp, int msgflg)
+long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
+		size_t msgsz, long msgtyp, int msgflg)
 {
 	struct msg_queue *msq;
 	struct msg_msg *msg;
@@ -889,15 +896,30 @@ out_unlock:
 		return PTR_ERR(msg);
 
 	msgsz = (msgsz > msg->m_ts) ? msg->m_ts : msgsz;
-	if (put_user (msg->m_type, &msgp->mtype) ||
-	    store_msg(msgp->mtext, msg, msgsz)) {
+	*pmtype = msg->m_type;
+	if (store_msg(mtext, msg, msgsz))
 		msgsz = -EFAULT;
-	}
+
 	free_msg(msg);
 
 	return msgsz;
 }
 
+asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz,
+			   long msgtyp, int msgflg)
+{
+	long err, mtype;
+
+	err =  do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg);
+	if (err < 0)
+		goto out;
+
+	if (put_user(mtype, &msgp->mtype))
+		err = -EFAULT;
+out:
+	return err;
+}
+
 #ifdef CONFIG_PROC_FS
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
 {
-- 
cgit v0.10.2


From 386d9a7edd9f3492c99124b0a659e9ed7abb30f9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Wed, 6 Dec 2006 20:37:53 -0800
Subject: [PATCH] elf: Always define elf_addr_t in linux/elf.h

Define elf_addr_t in linux/elf.h.  The size of the type is determined using
ELF_CLASS.  This allows us to remove the defines that today are spread all
over .c and .h files.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Cc: Daniel Jacobowitz <drow@false.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/ia64/ia32/ia32priv.h b/arch/ia64/ia32/ia32priv.h
index 703a67c..cfa0bc0 100644
--- a/arch/ia64/ia32/ia32priv.h
+++ b/arch/ia64/ia32/ia32priv.h
@@ -330,8 +330,6 @@ struct old_linux32_dirent {
 void ia64_elf32_init(struct pt_regs *regs);
 #define ELF_PLAT_INIT(_r, load_addr)	ia64_elf32_init(_r)
 
-#define elf_addr_t	u32
-
 /* This macro yields a bitmask that programs can use to figure out
    what instruction set this CPU supports.  */
 #define ELF_HWCAP	0
diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c
index 4a9f1ec..9b34238 100644
--- a/arch/mips/kernel/binfmt_elfn32.c
+++ b/arch/mips/kernel/binfmt_elfn32.c
@@ -90,7 +90,6 @@ struct elf_prpsinfo32
 	char	pr_psargs[ELF_PRARGSZ];	/* initial part of arg list */
 };
 
-#define elf_addr_t	u32
 #define elf_caddr_t	u32
 #define init_elf_binfmt init_elfn32_binfmt
 
diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c
index e318137..993f7ec 100644
--- a/arch/mips/kernel/binfmt_elfo32.c
+++ b/arch/mips/kernel/binfmt_elfo32.c
@@ -92,7 +92,6 @@ struct elf_prpsinfo32
 	char	pr_psargs[ELF_PRARGSZ];	/* initial part of arg list */
 };
 
-#define elf_addr_t	u32
 #define elf_caddr_t	u32
 #define init_elf_binfmt init_elf32_binfmt
 
diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c
index ab12c8f..a82fae2 100644
--- a/arch/mips/kernel/irixelf.c
+++ b/arch/mips/kernel/irixelf.c
@@ -52,10 +52,6 @@ static struct linux_binfmt irix_format = {
 	irix_core_dump, PAGE_SIZE
 };
 
-#ifndef elf_addr_t
-#define elf_addr_t unsigned long
-#endif
-
 #ifdef DEBUG
 /* Debugging routines. */
 static char *get_elf_p_type(Elf32_Word p_type)
diff --git a/arch/parisc/kernel/binfmt_elf32.c b/arch/parisc/kernel/binfmt_elf32.c
index 1e64e7b..ecb10a4 100644
--- a/arch/parisc/kernel/binfmt_elf32.c
+++ b/arch/parisc/kernel/binfmt_elf32.c
@@ -75,7 +75,6 @@ struct elf_prpsinfo32
 	char	pr_psargs[ELF_PRARGSZ];	/* initial part of arg list */
 };
 
-#define elf_addr_t	unsigned int
 #define init_elf_binfmt init_elf32_binfmt
 
 #define ELF_PLATFORM  ("PARISC32\0")
diff --git a/arch/s390/kernel/binfmt_elf32.c b/arch/s390/kernel/binfmt_elf32.c
index 9565a2d..5c46054 100644
--- a/arch/s390/kernel/binfmt_elf32.c
+++ b/arch/s390/kernel/binfmt_elf32.c
@@ -176,7 +176,6 @@ struct elf_prpsinfo32
 
 #include <linux/highuid.h>
 
-#define elf_addr_t	u32
 /*
 #define init_elf_binfmt init_elf32_binfmt
 */
diff --git a/arch/sparc64/kernel/binfmt_elf32.c b/arch/sparc64/kernel/binfmt_elf32.c
index a98f3ae..9ad84ff 100644
--- a/arch/sparc64/kernel/binfmt_elf32.c
+++ b/arch/sparc64/kernel/binfmt_elf32.c
@@ -141,7 +141,6 @@ cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value)
 	value->tv_sec = jiffies / HZ;
 }
 
-#define elf_addr_t	u32
 #undef start_thread
 #define start_thread start_thread32
 #define init_elf_binfmt init_elf32_binfmt
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 932a62a..543ef4f 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -305,8 +305,6 @@ MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
 #undef MODULE_DESCRIPTION
 #undef MODULE_AUTHOR
 
-#define elf_addr_t __u32
-
 static void elf32_init(struct pt_regs *);
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 68e20d5..14ea630 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -47,10 +47,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
 static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
 
-#ifndef elf_addr_t
-#define elf_addr_t unsigned long
-#endif
-
 /*
  * If we don't support core dumping, then supply a NULL so we
  * don't even try.
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f86d5c9..ed9a61c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -40,9 +40,6 @@
 #include <asm/pgalloc.h>
 
 typedef char *elf_caddr_t;
-#ifndef elf_addr_t
-#define elf_addr_t unsigned long
-#endif
 
 #if 0
 #define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ )
diff --git a/include/asm-powerpc/elf.h b/include/asm-powerpc/elf.h
index b543664..d36426c 100644
--- a/include/asm-powerpc/elf.h
+++ b/include/asm-powerpc/elf.h
@@ -124,12 +124,10 @@ typedef elf_greg_t32 elf_gregset_t32[ELF_NGREG];
 # define ELF_DATA	ELFDATA2MSB
   typedef elf_greg_t64 elf_greg_t;
   typedef elf_gregset_t64 elf_gregset_t;
-# define elf_addr_t unsigned long
 #else
   /* Assumption: ELF_ARCH == EM_PPC and ELF_CLASS == ELFCLASS32 */
   typedef elf_greg_t32 elf_greg_t;
   typedef elf_gregset_t32 elf_gregset_t;
-# define elf_addr_t __u32
 #endif /* ELF_ARCH */
 
 /* Floating point registers */
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 743d5c8..b403516 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -358,6 +358,7 @@ extern Elf32_Dyn _DYNAMIC [];
 #define elfhdr		elf32_hdr
 #define elf_phdr	elf32_phdr
 #define elf_note	elf32_note
+#define elf_addr_t	Elf32_Off
 
 #else
 
@@ -365,6 +366,7 @@ extern Elf64_Dyn _DYNAMIC [];
 #define elfhdr		elf64_hdr
 #define elf_phdr	elf64_phdr
 #define elf_note	elf64_note
+#define elf_addr_t	Elf64_Off
 
 #endif
 
-- 
cgit v0.10.2


From 584236ac7cdddeec0fdff25d2e475471ef91d028 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Wed, 6 Dec 2006 20:37:56 -0800
Subject: [PATCH] elf: include terminating zero in n_namesz

The ELF32 spec says we should plus we include the zero on other platforms.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Cc: Daniel Jacobowitz <drow@false.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c
index a82fae2..1bbefbf 100644
--- a/arch/mips/kernel/irixelf.c
+++ b/arch/mips/kernel/irixelf.c
@@ -1009,7 +1009,7 @@ static int notesize(struct memelfnote *en)
 	int sz;
 
 	sz = sizeof(struct elf_note);
-	sz += roundup(strlen(en->name), 4);
+	sz += roundup(strlen(en->name) + 1, 4);
 	sz += roundup(en->datasz, 4);
 
 	return sz;
@@ -1028,7 +1028,7 @@ static int writenote(struct memelfnote *men, struct file *file)
 {
 	struct elf_note en;
 
-	en.n_namesz = strlen(men->name);
+	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
-- 
cgit v0.10.2


From 360276042d7a8369ce912acff99c1c4de394b312 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Wed, 6 Dec 2006 20:38:00 -0800
Subject: [PATCH] elf: fix kcore note size calculation

 - Define "CORE" string as CORE_STR in single common place.
 - Include terminating zero in CORE_STR length calculation for elf_buflen.
 - Use roundup(,4) to include alignment in elf_buflen calculation.

[akpm@osdl.org: simplification suggested by Roland]
Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Cc: Daniel Jacobowitz <drow@false.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 1294eda..1be7308 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -22,6 +22,7 @@
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
+#define CORE_STR "CORE"
 
 static int open_kcore(struct inode * inode, struct file * filp)
 {
@@ -82,10 +83,11 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 	}
 	*elf_buflen =	sizeof(struct elfhdr) + 
 			(*nphdr + 2)*sizeof(struct elf_phdr) + 
-			3 * (sizeof(struct elf_note) + 4) +
-			sizeof(struct elf_prstatus) +
-			sizeof(struct elf_prpsinfo) +
-			sizeof(struct task_struct);
+			3 * ((sizeof(struct elf_note)) +
+			     roundup(sizeof(CORE_STR), 4)) +
+			roundup(sizeof(struct elf_prstatus), 4) +
+			roundup(sizeof(struct elf_prpsinfo), 4) +
+			roundup(sizeof(struct task_struct), 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -210,7 +212,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	nhdr->p_offset	= offset;
 
 	/* set up the process status */
-	notes[0].name = "CORE";
+	notes[0].name = CORE_STR;
 	notes[0].type = NT_PRSTATUS;
 	notes[0].datasz = sizeof(struct elf_prstatus);
 	notes[0].data = &prstatus;
@@ -221,7 +223,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	bufp = storenote(&notes[0], bufp);
 
 	/* set up the process info */
-	notes[1].name	= "CORE";
+	notes[1].name	= CORE_STR;
 	notes[1].type	= NT_PRPSINFO;
 	notes[1].datasz	= sizeof(struct elf_prpsinfo);
 	notes[1].data	= &prpsinfo;
@@ -238,7 +240,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	bufp = storenote(&notes[1], bufp);
 
 	/* set up the task structure */
-	notes[2].name	= "CORE";
+	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
 	notes[2].datasz	= sizeof(struct task_struct);
 	notes[2].data	= current;
-- 
cgit v0.10.2


From de21c57b90b3716f6f951e88e039d00ab6729ce9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@openvz.org>
Date: Wed, 6 Dec 2006 20:38:02 -0800
Subject: [PATCH] reiserfs: add missing D-cache flushing

Looks like, reiserfs_prepare_file_region_for_write() doesn't contain
several flush_dcache_page() calls.

Found with help from Dmitriy Monakhov <dmonakhov@openvz.org>

[akpm@osdl.org: small speedup]
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Cc: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index ac14318..6526498 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1045,6 +1045,7 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
 			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
 			memset(kaddr, 0, from);
 			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(prepared_pages[0]);
 		}
 		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
 			char *kaddr =
@@ -1052,6 +1053,7 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
 					KM_USER0);
 			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
 			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(prepared_pages[num_pages - 1]);
 		}
 
 		/* Since all blocks are new - use already calculated value */
@@ -1185,6 +1187,7 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
 					memset(kaddr + block_start, 0,
 					       from - block_start);
 					kunmap_atomic(kaddr, KM_USER0);
+					flush_dcache_page(prepared_pages[0]);
 					set_buffer_uptodate(bh);
 				}
 			}
@@ -1222,6 +1225,7 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
 							KM_USER0);
 					memset(kaddr + to, 0, block_end - to);
 					kunmap_atomic(kaddr, KM_USER0);
+					flush_dcache_page(prepared_pages[num_pages - 1]);
 					set_buffer_uptodate(bh);
 				}
 			}
-- 
cgit v0.10.2


From c67220480e0dc6442851691c81c2ff7aa48f9d8f Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:02 -0800
Subject: [PATCH] The scheduled removal of some OSS options

The scheduled removal of the OSS drivers depending on OSS_OBSOLETE_DRIVER.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/media/radio/Kconfig b/drivers/media/radio/Kconfig
index 6d96b17..920b63f 100644
--- a/drivers/media/radio/Kconfig
+++ b/drivers/media/radio/Kconfig
@@ -173,38 +173,6 @@ config RADIO_MAESTRO
 	  To compile this driver as a module, choose M here: the
 	  module will be called radio-maestro.
 
-config RADIO_MIROPCM20
-	tristate "miroSOUND PCM20 radio"
-	depends on ISA && VIDEO_V4L1 && SOUND_ACI_MIXER
-	---help---
-	  Choose Y here if you have this FM radio card. You also need to say Y
-	  to "ACI mixer (miroSOUND PCM1-pro/PCM12/PCM20 radio)" (in "Sound")
-	  for this to work.
-
-	  In order to control your radio card, you will need to use programs
-	  that are compatible with the Video For Linux API.  Information on
-	  this API and pointers to "v4l" programs may be found at
-	  <file:Documentation/video4linux/API.html>.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called miropcm20.
-
-config RADIO_MIROPCM20_RDS
-	tristate "miroSOUND PCM20 radio RDS user interface (EXPERIMENTAL)"
-	depends on RADIO_MIROPCM20 && EXPERIMENTAL
-	---help---
-	  Choose Y here if you want to see RDS/RBDS information like
-	  RadioText, Programme Service name, Clock Time and date, Programme
-	  Type and Traffic Announcement/Programme identification.
-
-	  It's not possible to read the raw RDS packets from the device, so
-	  the driver cant provide an V4L interface for this.  But the
-	  availability of RDS is reported over V4L by the basic driver
-	  already.  Here RDS can be read from files in /dev/v4l/rds.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called miropcm20-rds.
-
 config RADIO_SF16FMI
 	tristate "SF16FMI Radio"
 	depends on ISA && VIDEO_V4L2
diff --git a/sound/oss/Kconfig b/sound/oss/Kconfig
index cc2b9ab..a0588c2 100644
--- a/sound/oss/Kconfig
+++ b/sound/oss/Kconfig
@@ -5,20 +5,6 @@
 #
 # Prompt user for primary drivers.
 
-config OSS_OBSOLETE_DRIVER
-	bool "Obsolete OSS drivers"
-	depends on SOUND_PRIME
-	help
-	  This option enables support for obsolete OSS drivers that
-	  are scheduled for removal in the near future since there
-	  are ALSA drivers for the same hardware.
-
-	  Please contact Adrian Bunk <bunk@stusta.de> if you had to
-	  say Y here because your soundcard is not properly supported
-	  by ALSA.
-
-	  If unsure, say N.
-
 config SOUND_BT878
 	tristate "BT878 audio dma"
 	depends on SOUND_PRIME && PCI
@@ -35,40 +21,6 @@ config SOUND_BT878
 	  To compile this driver as a module, choose M here: the module will
 	  be called btaudio.
 
-config SOUND_EMU10K1
-	tristate "Creative SBLive! (EMU10K1)"
-	depends on SOUND_PRIME && PCI && OSS_OBSOLETE_DRIVER
-	---help---
-	  Say Y or M if you have a PCI sound card using the EMU10K1 chipset,
-	  such as the Creative SBLive!, SB PCI512 or Emu-APS.
-
-	  For more information on this driver and the degree of support for
-	  the different card models please check:
-
-		<http://sourceforge.net/projects/emu10k1/>
-
-	  It is now possible to load dsp microcode patches into the EMU10K1
-	  chip.  These patches are used to implement real time sound
-	  processing effects which include for example: signal routing,
-	  bass/treble control, AC3 passthrough, ...
-	  Userspace tools to create new patches and load/unload them can be
-	  found in the emu-tools package at the above URL.
-
-config MIDI_EMU10K1
-	bool "Creative SBLive! MIDI (EXPERIMENTAL)"
-	depends on SOUND_EMU10K1 && EXPERIMENTAL && ISA_DMA_API
-	help
-	  Say Y if you want to be able to use the OSS /dev/sequencer
-	  interface.  This code is still experimental.
-
-config SOUND_FUSION
-	tristate "Crystal SoundFusion (CS4280/461x)"
-	depends on SOUND_PRIME && PCI && OSS_OBSOLETE_DRIVER
-	help
-	  This module drives the Crystal SoundFusion devices (CS4280/46xx
-	  series) when wired as native sound drivers with AC97 codecs.  If
-	  this driver does not work try the CS4232 driver.
-
 config SOUND_BCM_CS4297A
 	tristate "Crystal Sound CS4297a (for Swarm)"
 	depends on SOUND_PRIME && SIBYTE_SWARM
@@ -448,47 +400,6 @@ config SOUND_DMAP
 
 	  Say Y unless you have 16MB or more RAM or a PCI sound card.
 
-config SOUND_AD1816
-	tristate "AD1816(A) based cards (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && SOUND_OSS && OSS_OBSOLETE_DRIVER
-	help
-	  Say M here if you have a sound card based on the Analog Devices
-	  AD1816(A) chip.
-
-	  If you compile the driver into the kernel, you have to add
-	  "ad1816=<io>,<irq>,<dma>,<dma2>" to the kernel command line.
-
-config SOUND_AD1889
-	tristate "AD1889 based cards (AD1819 codec) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && SOUND_OSS && PCI && OSS_OBSOLETE_DRIVER
-	help
-	  Say M here if you have a sound card based on the Analog Devices
-	  AD1889 chip.
-
-config SOUND_ADLIB
-	tristate "Adlib Cards"
-	depends on SOUND_OSS && OSS_OBSOLETE_DRIVER
-	help
-	  Includes ASB 64 4D. Information on programming AdLib cards is
-	  available at <http://www.itsnet.com/home/ldragon/Specs/adlib.html>.
-
-config SOUND_ACI_MIXER
-	tristate "ACI mixer (miroSOUND PCM1-pro/PCM12/PCM20)"
-	depends on SOUND_OSS && OSS_OBSOLETE_DRIVER
-	---help---
-	  ACI (Audio Command Interface) is a protocol used to communicate with
-	  the microcontroller on some sound cards produced by miro and
-	  Cardinal Technologies.  The main function of the ACI is to control
-	  the mixer and to get a product identification.
-
-	  This VoxWare ACI driver currently supports the ACI functions on the
-	  miroSOUND PCM1-pro, PCM12 and PCM20 radio. On the PCM20 radio, ACI
-	  also controls the radio tuner. This is supported in the video4linux
-	  miropcm20 driver (say M or Y here and go back to "Multimedia
-	  devices" -> "Radio Adapters").
-
-	  This driver is also available as a module and will be called aci.
-
 config SOUND_CS4232
 	tristate "Crystal CS4232 based (PnP) cards"
 	depends on SOUND_OSS
@@ -594,18 +505,6 @@ config SOUND_MPU401
 	  If you compile the driver into the kernel, you have to add
 	  "mpu401=<io>,<irq>" to the kernel command line.
 
-config SOUND_NM256
-	tristate "NM256AV/NM256ZX audio support"
-	depends on SOUND_OSS && OSS_OBSOLETE_DRIVER
-	help
-	  Say M here to include audio support for the NeoMagic 256AV/256ZX
-	  chipsets. These are the audio chipsets found in the Sony
-	  Z505S/SX/DX, some Sony F-series, and the Dell Latitude CPi and CPt
-	  laptops. It includes support for an AC97-compatible mixer and an
-	  apparently proprietary sound engine.
-
-	  See <file:Documentation/sound/oss/NM256> for further information.
-
 config SOUND_PAS
 	tristate "ProAudioSpectrum 16 support"
 	depends on SOUND_OSS
@@ -714,20 +613,6 @@ config SOUND_YM3812
 
 	  If unsure, say Y.
 
-config SOUND_OPL3SA2
-	tristate "Yamaha OPL3-SA2 and SA3 based PnP cards"
-	depends on SOUND_OSS && OSS_OBSOLETE_DRIVER
-	help
-	  Say Y or M if you have a card based on one of these Yamaha sound
-	  chipsets or the "SAx", which is actually a SA3. Read
-	  <file:Documentation/sound/oss/OPL3-SA2> for more information on
-	  configuring these cards.
-
-	  If you compile the driver into the kernel and do not also
-	  configure in the optional ISA PnP support, you will have to add
-	  "opl3sa2=<io>,<irq>,<dma>,<dma2>,<mssio>,<mpuio>" to the kernel
-	  command line.
-
 config SOUND_UART6850
 	tristate "6850 UART support"
 	depends on SOUND_OSS
-- 
cgit v0.10.2


From 940cc2df935d00e704c7b8366bdaedda379abf85 Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Wed, 6 Dec 2006 20:38:04 -0800
Subject: [PATCH] make 1-bit bitfields unsigned

Keeps sparse happy.

Signed-of-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Acked-by: Karsten Keil <kkeil@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/isdn/hisax/isdnhdlc.h b/drivers/isdn/hisax/isdnhdlc.h
index 2693159..5655b5f 100644
--- a/drivers/isdn/hisax/isdnhdlc.h
+++ b/drivers/isdn/hisax/isdnhdlc.h
@@ -41,10 +41,10 @@ struct isdnhdlc_vars {
 	unsigned char shift_reg;
 	unsigned char ffvalue;
 
-	int data_received:1; 	// set if transferring data
-	int dchannel:1; 	// set if D channel (send idle instead of flags)
-	int do_adapt56:1; 	// set if 56K adaptation
-        int do_closing:1; 	// set if in closing phase (need to send CRC + flag
+	unsigned int data_received:1; 	// set if transferring data
+	unsigned int dchannel:1; 	// set if D channel (send idle instead of flags)
+	unsigned int do_adapt56:1; 	// set if 56K adaptation
+	unsigned int do_closing:1; 	// set if in closing phase (need to send CRC + flag
 };
 
 
diff --git a/drivers/media/dvb/frontends/l64781.c b/drivers/media/dvb/frontends/l64781.c
index f3bc82e..1aeacb1 100644
--- a/drivers/media/dvb/frontends/l64781.c
+++ b/drivers/media/dvb/frontends/l64781.c
@@ -36,7 +36,7 @@ struct l64781_state {
 	struct dvb_frontend frontend;
 
 	/* private demodulator data */
-	int first:1;
+	unsigned int first:1;
 };
 
 #define dprintk(args...) \
diff --git a/sound/arm/sa11xx-uda1341.c b/sound/arm/sa11xx-uda1341.c
index c79a9af..c7e1b26 100644
--- a/sound/arm/sa11xx-uda1341.c
+++ b/sound/arm/sa11xx-uda1341.c
@@ -125,7 +125,7 @@ struct audio_stream {
 #else
 	dma_regs_t *dma_regs;	/* points to our DMA registers */
 #endif
-	int active:1;		/* we are using this stream for transfer now */
+	unsigned int active:1;	/* we are using this stream for transfer now */
 	int period;		/* current transfer period */
 	int periods;		/* current count of periods registerd in the DMA engine */
 	int tx_spin;		/* are we recoding - flag used to do DMA trans. for sync */
-- 
cgit v0.10.2


From c15bb296403f1ce448384d58742e0dc04f49d664 Mon Sep 17 00:00:00 2001
From: Linas Vepstas <linas@austin.ibm.com>
Date: Wed, 6 Dec 2006 20:38:07 -0800
Subject: [PATCH] HVCS char driver janitoring: move block of code

Move a block of code from the bottom of the file to the top, which is needed
to enable the cleanup.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Cc: Ryan S. Arnold <rsa@us.ibm.com>
Cc: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c
index 8728255..d090622 100644
--- a/drivers/char/hvcs.c
+++ b/drivers/char/hvcs.c
@@ -337,11 +337,6 @@ static int hvcs_open(struct tty_struct *tty, struct file *filp);
 static void hvcs_close(struct tty_struct *tty, struct file *filp);
 static void hvcs_hangup(struct tty_struct * tty);
 
-static void hvcs_create_device_attrs(struct hvcs_struct *hvcsd);
-static void hvcs_remove_device_attrs(struct vio_dev *vdev);
-static void hvcs_create_driver_attrs(void);
-static void hvcs_remove_driver_attrs(void);
-
 static int __devinit hvcs_probe(struct vio_dev *dev,
 		const struct vio_device_id *id);
 static int __devexit hvcs_remove(struct vio_dev *dev);
@@ -353,6 +348,172 @@ static void __exit hvcs_module_exit(void);
 #define HVCS_TRY_WRITE	0x00000004
 #define HVCS_READ_MASK	(HVCS_SCHED_READ | HVCS_QUICK_READ)
 
+static inline struct hvcs_struct *from_vio_dev(struct vio_dev *viod)
+{
+	return viod->dev.driver_data;
+}
+/* The sysfs interface for the driver and devices */
+
+static ssize_t hvcs_partner_vtys_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%X\n", hvcsd->p_unit_address);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+static DEVICE_ATTR(partner_vtys, S_IRUGO, hvcs_partner_vtys_show, NULL);
+
+static ssize_t hvcs_partner_clcs_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%s\n", &hvcsd->p_location_code[0]);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+static DEVICE_ATTR(partner_clcs, S_IRUGO, hvcs_partner_clcs_show, NULL);
+
+static ssize_t hvcs_current_vty_store(struct device *dev, struct device_attribute *attr, const char * buf,
+		size_t count)
+{
+	/*
+	 * Don't need this feature at the present time because firmware doesn't
+	 * yet support multiple partners.
+	 */
+	printk(KERN_INFO "HVCS: Denied current_vty change: -EPERM.\n");
+	return -EPERM;
+}
+
+static ssize_t hvcs_current_vty_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%s\n", &hvcsd->p_location_code[0]);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+
+static DEVICE_ATTR(current_vty,
+	S_IRUGO | S_IWUSR, hvcs_current_vty_show, hvcs_current_vty_store);
+
+static ssize_t hvcs_vterm_state_store(struct device *dev, struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+
+	/* writing a '0' to this sysfs entry will result in the disconnect. */
+	if (simple_strtol(buf, NULL, 0) != 0)
+		return -EINVAL;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+
+	if (hvcsd->open_count > 0) {
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+		printk(KERN_INFO "HVCS: vterm state unchanged.  "
+				"The hvcs device node is still in use.\n");
+		return -EPERM;
+	}
+
+	if (hvcsd->connected == 0) {
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+		printk(KERN_INFO "HVCS: vterm state unchanged. The"
+				" vty-server is not connected to a vty.\n");
+		return -EPERM;
+	}
+
+	hvcs_partner_free(hvcsd);
+	printk(KERN_INFO "HVCS: Closed vty-server@%X and"
+			" partner vty@%X:%d connection.\n",
+			hvcsd->vdev->unit_address,
+			hvcsd->p_unit_address,
+			(uint32_t)hvcsd->p_partition_ID);
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return count;
+}
+
+static ssize_t hvcs_vterm_state_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%d\n", hvcsd->connected);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+static DEVICE_ATTR(vterm_state, S_IRUGO | S_IWUSR,
+		hvcs_vterm_state_show, hvcs_vterm_state_store);
+
+static ssize_t hvcs_index_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%d\n", hvcsd->index);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+
+static DEVICE_ATTR(index, S_IRUGO, hvcs_index_show, NULL);
+
+static struct attribute *hvcs_attrs[] = {
+	&dev_attr_partner_vtys.attr,
+	&dev_attr_partner_clcs.attr,
+	&dev_attr_current_vty.attr,
+	&dev_attr_vterm_state.attr,
+	&dev_attr_index.attr,
+	NULL,
+};
+
+static struct attribute_group hvcs_attr_group = {
+	.attrs = hvcs_attrs,
+};
+
+static ssize_t hvcs_rescan_show(struct device_driver *ddp, char *buf)
+{
+	/* A 1 means it is updating, a 0 means it is done updating */
+	return snprintf(buf, PAGE_SIZE, "%d\n", hvcs_rescan_status);
+}
+
+static ssize_t hvcs_rescan_store(struct device_driver *ddp, const char * buf,
+		size_t count)
+{
+	if ((simple_strtol(buf, NULL, 0) != 1)
+		&& (hvcs_rescan_status != 0))
+		return -EINVAL;
+
+	hvcs_rescan_status = 1;
+	printk(KERN_INFO "HVCS: rescanning partner info for all"
+		" vty-servers.\n");
+	hvcs_rescan_devices_list();
+	hvcs_rescan_status = 0;
+	return count;
+}
+
+static DRIVER_ATTR(rescan,
+	S_IRUGO | S_IWUSR, hvcs_rescan_show, hvcs_rescan_store);
+
 static void hvcs_kick(void)
 {
 	hvcs_kicked = 1;
@@ -575,7 +736,7 @@ static void destroy_hvcs_struct(struct kobject *kobj)
 	spin_unlock_irqrestore(&hvcsd->lock, flags);
 	spin_unlock(&hvcs_structs_lock);
 
-	hvcs_remove_device_attrs(vdev);
+	sysfs_remove_group(&vdev->dev.kobj, &hvcs_attr_group);
 
 	kfree(hvcsd);
 }
@@ -608,6 +769,7 @@ static int __devinit hvcs_probe(
 {
 	struct hvcs_struct *hvcsd;
 	int index;
+	int retval;
 
 	if (!dev || !id) {
 		printk(KERN_ERR "HVCS: probed with invalid parameter.\n");
@@ -658,14 +820,16 @@ static int __devinit hvcs_probe(
 	 * the hvcs_struct has been added to the devices list then the user app
 	 * will get -ENODEV.
 	 */
-
 	spin_lock(&hvcs_structs_lock);
-
 	list_add_tail(&(hvcsd->next), &hvcs_structs);
-
 	spin_unlock(&hvcs_structs_lock);
 
-	hvcs_create_device_attrs(hvcsd);
+	retval = sysfs_create_group(&dev->dev.kobj, &hvcs_attr_group);
+	if (retval) {
+		printk(KERN_ERR "HVCS: Can't create sysfs attrs for vty-server@%X\n",
+		       hvcsd->vdev->unit_address);
+		return retval;
+	}
 
 	printk(KERN_INFO "HVCS: vty-server@%X added to the vio bus.\n", dev->unit_address);
 
@@ -1354,8 +1518,10 @@ static int __init hvcs_module_init(void)
 	if (!hvcs_tty_driver)
 		return -ENOMEM;
 
-	if (hvcs_alloc_index_list(num_ttys_to_alloc))
-		return -ENOMEM;
+	if (hvcs_alloc_index_list(num_ttys_to_alloc)) {
+		rc = -ENOMEM;
+		goto index_fail;
+	}
 
 	hvcs_tty_driver->owner = THIS_MODULE;
 
@@ -1385,41 +1551,57 @@ static int __init hvcs_module_init(void)
 	 * dynamically assigned major and minor numbers for our devices.
 	 */
 	if (tty_register_driver(hvcs_tty_driver)) {
-		printk(KERN_ERR "HVCS: registration "
-			" as a tty driver failed.\n");
-		hvcs_free_index_list();
-		put_tty_driver(hvcs_tty_driver);
-		return -EIO;
+		printk(KERN_ERR "HVCS: registration as a tty driver failed.\n");
+		rc = -EIO;
+		goto register_fail;
 	}
 
 	hvcs_pi_buff = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!hvcs_pi_buff) {
-		tty_unregister_driver(hvcs_tty_driver);
-		hvcs_free_index_list();
-		put_tty_driver(hvcs_tty_driver);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto buff_alloc_fail;
 	}
 
 	hvcs_task = kthread_run(khvcsd, NULL, "khvcsd");
 	if (IS_ERR(hvcs_task)) {
 		printk(KERN_ERR "HVCS: khvcsd creation failed.  Driver not loaded.\n");
-		kfree(hvcs_pi_buff);
-		tty_unregister_driver(hvcs_tty_driver);
-		hvcs_free_index_list();
-		put_tty_driver(hvcs_tty_driver);
-		return -EIO;
+		rc = -EIO;
+		goto kthread_fail;
 	}
 
 	rc = vio_register_driver(&hvcs_vio_driver);
+	if (rc) {
+		printk(KERN_ERR "HVCS: can't register vio driver\n");
+		goto vio_fail;
+	}
 
 	/*
 	 * This needs to be done AFTER the vio_register_driver() call or else
 	 * the kobjects won't be initialized properly.
 	 */
-	hvcs_create_driver_attrs();
+	rc = driver_create_file(&(hvcs_vio_driver.driver), &driver_attr_rescan);
+	if (rc) {
+		printk(KERN_ERR "HVCS: sysfs attr create failed\n");
+		goto attr_fail;
+	}
 
 	printk(KERN_INFO "HVCS: driver module inserted.\n");
 
+	return 0;
+
+attr_fail:
+	vio_unregister_driver(&hvcs_vio_driver);
+vio_fail:
+	kthread_stop(hvcs_task);
+kthread_fail:
+	kfree(hvcs_pi_buff);
+buff_alloc_fail:
+	tty_unregister_driver(hvcs_tty_driver);
+register_fail:
+	hvcs_free_index_list();
+index_fail:
+	put_tty_driver(hvcs_tty_driver);
+	hvcs_tty_driver = NULL;
 	return rc;
 }
 
@@ -1441,7 +1623,7 @@ static void __exit hvcs_module_exit(void)
 	hvcs_pi_buff = NULL;
 	spin_unlock(&hvcs_pi_lock);
 
-	hvcs_remove_driver_attrs();
+	driver_remove_file(&hvcs_vio_driver.driver, &driver_attr_rescan);
 
 	vio_unregister_driver(&hvcs_vio_driver);
 
@@ -1456,191 +1638,3 @@ static void __exit hvcs_module_exit(void)
 
 module_init(hvcs_module_init);
 module_exit(hvcs_module_exit);
-
-static inline struct hvcs_struct *from_vio_dev(struct vio_dev *viod)
-{
-	return viod->dev.driver_data;
-}
-/* The sysfs interface for the driver and devices */
-
-static ssize_t hvcs_partner_vtys_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct vio_dev *viod = to_vio_dev(dev);
-	struct hvcs_struct *hvcsd = from_vio_dev(viod);
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&hvcsd->lock, flags);
-	retval = sprintf(buf, "%X\n", hvcsd->p_unit_address);
-	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	return retval;
-}
-static DEVICE_ATTR(partner_vtys, S_IRUGO, hvcs_partner_vtys_show, NULL);
-
-static ssize_t hvcs_partner_clcs_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct vio_dev *viod = to_vio_dev(dev);
-	struct hvcs_struct *hvcsd = from_vio_dev(viod);
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&hvcsd->lock, flags);
-	retval = sprintf(buf, "%s\n", &hvcsd->p_location_code[0]);
-	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	return retval;
-}
-static DEVICE_ATTR(partner_clcs, S_IRUGO, hvcs_partner_clcs_show, NULL);
-
-static ssize_t hvcs_current_vty_store(struct device *dev, struct device_attribute *attr, const char * buf,
-		size_t count)
-{
-	/*
-	 * Don't need this feature at the present time because firmware doesn't
-	 * yet support multiple partners.
-	 */
-	printk(KERN_INFO "HVCS: Denied current_vty change: -EPERM.\n");
-	return -EPERM;
-}
-
-static ssize_t hvcs_current_vty_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct vio_dev *viod = to_vio_dev(dev);
-	struct hvcs_struct *hvcsd = from_vio_dev(viod);
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&hvcsd->lock, flags);
-	retval = sprintf(buf, "%s\n", &hvcsd->p_location_code[0]);
-	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	return retval;
-}
-
-static DEVICE_ATTR(current_vty,
-	S_IRUGO | S_IWUSR, hvcs_current_vty_show, hvcs_current_vty_store);
-
-static ssize_t hvcs_vterm_state_store(struct device *dev, struct device_attribute *attr, const char *buf,
-		size_t count)
-{
-	struct vio_dev *viod = to_vio_dev(dev);
-	struct hvcs_struct *hvcsd = from_vio_dev(viod);
-	unsigned long flags;
-
-	/* writing a '0' to this sysfs entry will result in the disconnect. */
-	if (simple_strtol(buf, NULL, 0) != 0)
-		return -EINVAL;
-
-	spin_lock_irqsave(&hvcsd->lock, flags);
-
-	if (hvcsd->open_count > 0) {
-		spin_unlock_irqrestore(&hvcsd->lock, flags);
-		printk(KERN_INFO "HVCS: vterm state unchanged.  "
-				"The hvcs device node is still in use.\n");
-		return -EPERM;
-	}
-
-	if (hvcsd->connected == 0) {
-		spin_unlock_irqrestore(&hvcsd->lock, flags);
-		printk(KERN_INFO "HVCS: vterm state unchanged. The"
-				" vty-server is not connected to a vty.\n");
-		return -EPERM;
-	}
-
-	hvcs_partner_free(hvcsd);
-	printk(KERN_INFO "HVCS: Closed vty-server@%X and"
-			" partner vty@%X:%d connection.\n",
-			hvcsd->vdev->unit_address,
-			hvcsd->p_unit_address,
-			(uint32_t)hvcsd->p_partition_ID);
-
-	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	return count;
-}
-
-static ssize_t hvcs_vterm_state_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct vio_dev *viod = to_vio_dev(dev);
-	struct hvcs_struct *hvcsd = from_vio_dev(viod);
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&hvcsd->lock, flags);
-	retval = sprintf(buf, "%d\n", hvcsd->connected);
-	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	return retval;
-}
-static DEVICE_ATTR(vterm_state, S_IRUGO | S_IWUSR,
-		hvcs_vterm_state_show, hvcs_vterm_state_store);
-
-static ssize_t hvcs_index_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct vio_dev *viod = to_vio_dev(dev);
-	struct hvcs_struct *hvcsd = from_vio_dev(viod);
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&hvcsd->lock, flags);
-	retval = sprintf(buf, "%d\n", hvcsd->index);
-	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	return retval;
-}
-
-static DEVICE_ATTR(index, S_IRUGO, hvcs_index_show, NULL);
-
-static struct attribute *hvcs_attrs[] = {
-	&dev_attr_partner_vtys.attr,
-	&dev_attr_partner_clcs.attr,
-	&dev_attr_current_vty.attr,
-	&dev_attr_vterm_state.attr,
-	&dev_attr_index.attr,
-	NULL,
-};
-
-static struct attribute_group hvcs_attr_group = {
-	.attrs = hvcs_attrs,
-};
-
-static void hvcs_create_device_attrs(struct hvcs_struct *hvcsd)
-{
-	struct vio_dev *vdev = hvcsd->vdev;
-	sysfs_create_group(&vdev->dev.kobj, &hvcs_attr_group);
-}
-
-static void hvcs_remove_device_attrs(struct vio_dev *vdev)
-{
-	sysfs_remove_group(&vdev->dev.kobj, &hvcs_attr_group);
-}
-
-static ssize_t hvcs_rescan_show(struct device_driver *ddp, char *buf)
-{
-	/* A 1 means it is updating, a 0 means it is done updating */
-	return snprintf(buf, PAGE_SIZE, "%d\n", hvcs_rescan_status);
-}
-
-static ssize_t hvcs_rescan_store(struct device_driver *ddp, const char * buf,
-		size_t count)
-{
-	if ((simple_strtol(buf, NULL, 0) != 1)
-		&& (hvcs_rescan_status != 0))
-		return -EINVAL;
-
-	hvcs_rescan_status = 1;
-	printk(KERN_INFO "HVCS: rescanning partner info for all"
-		" vty-servers.\n");
-	hvcs_rescan_devices_list();
-	hvcs_rescan_status = 0;
-	return count;
-}
-static DRIVER_ATTR(rescan,
-	S_IRUGO | S_IWUSR, hvcs_rescan_show, hvcs_rescan_store);
-
-static void hvcs_create_driver_attrs(void)
-{
-	struct device_driver *driverfs = &(hvcs_vio_driver.driver);
-	driver_create_file(driverfs, &driver_attr_rescan);
-}
-
-static void hvcs_remove_driver_attrs(void)
-{
-	struct device_driver *driverfs = &(hvcs_vio_driver.driver);
-	driver_remove_file(driverfs, &driver_attr_rescan);
-}
-- 
cgit v0.10.2


From b4c6c34a530b4d1c626f4ac0a884e0a9b849378c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 6 Dec 2006 20:38:11 -0800
Subject: [PATCH] kprobes: enable booster on the preemptible kernel

When we are unregistering a kprobe-booster, we can't release its
instruction buffer immediately on the preemptive kernel, because some
processes might be preempted on the buffer.  The freeze_processes() and
thaw_processes() functions can clean most of processes up from the buffer.
There are still some non-frozen threads who have the PF_NOFREEZE flag.  If
those threads are sleeping (not preempted) at the known place outside the
buffer, we can ensure safety of freeing.

However, the processing of this check routine takes a long time.  So, this
patch introduces the garbage collection mechanism of insn_slot.  It also
introduces the "dirty" flag to free_insn_slot because of efficiency.

The "clean" instruction slots (dirty flag is cleared) are released
immediately.  But the "dirty" slots which are used by boosted kprobes, are
marked as garbages.  collect_garbage_slots() will be invoked to release
"dirty" slots if there are more than INSNS_PER_PAGE garbage slots or if
there are no unused slots.

Cc: "Keshavamurthy, Anil S" <anil.s.keshavamurthy@intel.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: "bibo,mao" <bibo.mao@intel.com>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Yumiko Sugita <yumiko.sugita.yf@hitachi.com>
Cc: Satoshi Oshima <soshima@redhat.com>
Cc: Hideo Aoki <haoki@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index fc79e1e..af1d533 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
 	mutex_unlock(&kprobe_mutex);
 }
 
@@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		return 1;
 
 ss_probe:
-#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
 	if (p->ainsn.boostable == 1 && !p->post_handler){
 		/* Boost up -- we can execute copied instructions directly */
 		reset_current_kprobe();
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 51217d6..4d592ee 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -481,7 +481,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 /*
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 7b8d12b..4657563 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -85,7 +85,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 67914fe..576368c 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -200,7 +200,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index ac24156..209c8c0 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -224,7 +224,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index ac4c055..769be39 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -165,7 +165,7 @@ extern void arch_disarm_kprobe(struct kprobe *p);
 extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern kprobe_opcode_t *get_insn_slot(void);
-extern void free_insn_slot(kprobe_opcode_t *slot);
+extern void free_insn_slot(kprobe_opcode_t *slot, int dirty);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 
 /* Get the kprobe at this addr (if any) - called with preemption disabled */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837..17ec4af 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
+#include <linux/freezer.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
 	kprobe_opcode_t *insns;		/* Page of instruction slots */
 	char slot_used[INSNS_PER_PAGE];
 	int nused;
+	int ngarbage;
 };
 
 static struct hlist_head kprobe_insn_pages;
+static int kprobe_garbage_slots;
+static int collect_garbage_slots(void);
+
+static int __kprobes check_safety(void)
+{
+	int ret = 0;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
+	ret = freeze_processes();
+	if (ret == 0) {
+		struct task_struct *p, *q;
+		do_each_thread(p, q) {
+			if (p != current && p->state == TASK_RUNNING &&
+			    p->pid != 0) {
+				printk("Check failed: %s is running\n",p->comm);
+				ret = -1;
+				goto loop_end;
+			}
+		} while_each_thread(p, q);
+	}
+loop_end:
+	thaw_processes();
+#else
+	synchronize_sched();
+#endif
+	return ret;
+}
 
 /**
  * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
 
+      retry:
 	hlist_for_each(pos, &kprobe_insn_pages) {
 		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
 		if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 		}
 	}
 
-	/* All out of space.  Need to allocate a new page. Use slot 0.*/
+	/* If there are any garbage slots, collect it and try again. */
+	if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+		goto retry;
+	}
+	/* All out of space.  Need to allocate a new page. Use slot 0. */
 	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
 	if (!kip) {
 		return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	memset(kip->slot_used, 0, INSNS_PER_PAGE);
 	kip->slot_used[0] = 1;
 	kip->nused = 1;
+	kip->ngarbage = 0;
 	return kip->insns;
 }
 
-void __kprobes free_insn_slot(kprobe_opcode_t *slot)
+/* Return 1 if all garbages are collected, otherwise 0. */
+static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+	kip->slot_used[idx] = 0;
+	kip->nused--;
+	if (kip->nused == 0) {
+		/*
+		 * Page is no longer in use.  Free it unless
+		 * it's the last one.  We keep the last one
+		 * so as not to have to set it up again the
+		 * next time somebody inserts a probe.
+		 */
+		hlist_del(&kip->hlist);
+		if (hlist_empty(&kprobe_insn_pages)) {
+			INIT_HLIST_NODE(&kip->hlist);
+			hlist_add_head(&kip->hlist,
+				       &kprobe_insn_pages);
+		} else {
+			module_free(NULL, kip->insns);
+			kfree(kip);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int __kprobes collect_garbage_slots(void)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos, *next;
+
+	/* Ensure no-one is preepmted on the garbages */
+	if (check_safety() != 0)
+		return -EAGAIN;
+
+	hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+		int i;
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->ngarbage == 0)
+			continue;
+		kip->ngarbage = 0;	/* we will collect all garbages */
+		for (i = 0; i < INSNS_PER_PAGE; i++) {
+			if (kip->slot_used[i] == -1 &&
+			    collect_one_slot(kip, i))
+				break;
+		}
+	}
+	kprobe_garbage_slots = 0;
+	return 0;
+}
+
+void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 		if (kip->insns <= slot &&
 		    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
 			int i = (slot - kip->insns) / MAX_INSN_SIZE;
-			kip->slot_used[i] = 0;
-			kip->nused--;
-			if (kip->nused == 0) {
-				/*
-				 * Page is no longer in use.  Free it unless
-				 * it's the last one.  We keep the last one
-				 * so as not to have to set it up again the
-				 * next time somebody inserts a probe.
-				 */
-				hlist_del(&kip->hlist);
-				if (hlist_empty(&kprobe_insn_pages)) {
-					INIT_HLIST_NODE(&kip->hlist);
-					hlist_add_head(&kip->hlist,
-						&kprobe_insn_pages);
-				} else {
-					module_free(NULL, kip->insns);
-					kfree(kip);
-				}
+			if (dirty) {
+				kip->slot_used[i] = -1;
+				kip->ngarbage++;
+			} else {
+				collect_one_slot(kip, i);
 			}
-			return;
+			break;
 		}
 	}
+	if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+		collect_garbage_slots();
+	}
 }
 #endif
 
-- 
cgit v0.10.2


From 83df8db9e62129975fab6d800fb381faf0dfee74 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:14 -0800
Subject: [PATCH] declare smp_call_function_single in generic code

smp_call_function_single() needs to be visible in non-SMP builds, to fix:

arch/x86_64/kernel/vsyscall.c:283: warning: implicit declaration of function 'smp_call_function_single'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index d6b7c05..f1bdd50 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -118,13 +118,6 @@ static __inline int logical_smp_processor_id(void)
 #define cpu_physical_id(cpu)		x86_cpu_to_apicid[cpu]
 #else
 #define cpu_physical_id(cpu)		boot_cpu_id
-static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
-				void *info, int retry, int wait)
-{
-	/* Disable interrupts here? */
-	func(info);
-	return 0;
-}
 #endif /* !CONFIG_SMP */
 #endif
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 5164998..7ba23ec 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -99,6 +99,13 @@ static inline int up_smp_call_function(void)
 static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
+static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
+				void *info, int retry, int wait)
+{
+	/* Disable interrupts here? */
+	func(info);
+	return 0;
+}
 
 #endif /* !SMP */
 
-- 
cgit v0.10.2


From a38a44c1a93078fc5fadc4ac2df8dea4697069e2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:38:16 -0800
Subject: [PATCH] smp_call_function_single() check that local interrupts are
 enabled

smp_call_function_single() can deadlock if the caller disabled local
interrupts (the target CPU could be spinning on call_lock).  Check for that.

Why on earth do these functions use spin_lock_bh()??

Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Cc: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c65..9827cf9 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -699,6 +699,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		put_cpu();
 		return -EBUSY;
 	}
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
 	spin_lock_bh(&call_lock);
 	__smp_call_function_single(cpu, func, info, nonatomic, wait);
 	spin_unlock_bh(&call_lock);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9f74c88..32f4d7e 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -379,6 +379,10 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 		put_cpu();
 		return 0;
 	}
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
 	spin_lock_bh(&call_lock);
 	__smp_call_function_single(cpu, func, info, nonatomic, wait);
 	spin_unlock_bh(&call_lock);
-- 
cgit v0.10.2


From 02316067852187b8bec781bec07410e91af79627 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:38:17 -0800
Subject: [PATCH] hotplug CPU: clean up hotcpu_notifier() use

There was lots of #ifdef noise in the kernel due to hotcpu_notifier(fn,
prio) not correctly marking 'fn' as used in the !HOTPLUG_CPU case, and thus
generating compiler warnings of unused symbols, hence forcing people to add
#ifdefs.

the compiler can skip truly unused functions just fine:

    text    data     bss     dec     hex filename
 1624412  728710 3674856 6027978  5bfaca vmlinux.before
 1624412  728710 3674856 6027978  5bfaca vmlinux.after

[akpm@osdl.org: topology.c fix]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index bad8b44..065005c 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -116,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
 	return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
 {
 	return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
@@ -153,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier =
 {
 	.notifier_call = thermal_throttle_cpu_callback,
 };
-#endif /* CONFIG_HOTPLUG_CPU */
 
 static __init int thermal_throttle_init_device(void)
 {
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index ab0c327..23b2cc7 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -167,7 +167,6 @@ static int cpuid_device_create(int i)
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -187,7 +186,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
 {
 	.notifier_call = cpuid_class_cpu_callback,
 };
-#endif /* !CONFIG_HOTPLUG_CPU */
 
 static int __init cpuid_init(void)
 {
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 23f5984..9723466 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = {
 	.resume = mc_sysdev_resume,
 };
 
-#ifdef CONFIG_HOTPLUG_CPU
 static __cpuinit int
 mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
@@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 static struct notifier_block mc_cpu_notifier = {
 	.notifier_call = mc_cpu_callback,
 };
-#endif
 
 static int __init microcode_init (void)
 {
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index a773f77..7763c67 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -250,7 +250,6 @@ static int msr_device_create(int i)
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int msr_class_cpu_callback(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
@@ -271,7 +270,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
 {
 	.notifier_call = msr_class_cpu_callback,
 };
-#endif
 
 static int __init msr_init(void)
 {
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index 0b546e2..c4c10a0 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -952,7 +952,6 @@ remove_palinfo_proc_entries(unsigned int hcpu)
 	}
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int palinfo_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
@@ -974,7 +973,6 @@ static struct notifier_block palinfo_cpu_notifier =
 	.notifier_call = palinfo_cpu_callback,
 	.priority = 0,
 };
-#endif
 
 static int __init
 palinfo_init(void)
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index e63b8ca..fd607ca 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -575,7 +575,6 @@ static struct file_operations salinfo_data_fops = {
 	.write   = salinfo_log_write,
 };
 
-#ifdef	CONFIG_HOTPLUG_CPU
 static int __devinit
 salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
@@ -620,7 +619,6 @@ static struct notifier_block salinfo_cpu_notifier =
 	.notifier_call = salinfo_cpu_callback,
 	.priority = 0,
 };
-#endif	/* CONFIG_HOTPLUG_CPU */
 
 static int __init
 salinfo_init(void)
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 67d5cf9..b8c2372 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -561,7 +561,6 @@ appldata_offline_cpu(int cpu)
 	spin_unlock(&appldata_timer_lock);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit
 appldata_cpu_notify(struct notifier_block *self,
 		    unsigned long action, void *hcpu)
@@ -582,7 +581,6 @@ appldata_cpu_notify(struct notifier_block *self,
 static struct notifier_block appldata_nb = {
 	.notifier_call = appldata_cpu_notify,
 };
-#endif
 
 /*
  * appldata_init()
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index c7587fc..bc863c4 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -641,7 +641,6 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static void mce_remove_device(unsigned int cpu)
 {
 	int i;
@@ -674,7 +673,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 static struct notifier_block mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
-#endif
 
 static __init int mce_init_device(void)
 {
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 883fe74..fa09deb 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -551,7 +551,6 @@ out:
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * let's be hotplug friendly.
  * in case of multiple core processors, the first core always takes ownership
@@ -594,12 +593,14 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 
 	sprintf(name, "threshold_bank%i", bank);
 
+#ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
 		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 		return;
 	}
+#endif
 
 	/* remove all sibling symlinks before unregistering */
 	for_each_cpu_mask(i, b->cpus) {
@@ -656,7 +657,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
 static struct notifier_block threshold_cpu_notifier = {
 	.notifier_call = threshold_cpu_callback,
 };
-#endif /* CONFIG_HOTPLUG_CPU */
 
 static __init int threshold_init_device(void)
 {
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 630036c..3785e49 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -275,7 +275,6 @@ static void __cpuinit cpu_vsyscall_init(void *arg)
 	vsyscall_set_cpu(raw_smp_processor_id());
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
@@ -284,7 +283,6 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
 	return NOTIFY_DONE;
 }
-#endif
 
 static void __init map_vsyscall(void)
 {
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index a4ff327..31512cd 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3459,8 +3459,6 @@ static void blk_done_softirq(struct softirq_action *h)
 	}
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
 static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
 			  void *hcpu)
 {
@@ -3486,8 +3484,6 @@ static struct notifier_block __devinitdata blk_cpu_notifier = {
 	.notifier_call	= blk_cpu_notify,
 };
 
-#endif /* CONFIG_HOTPLUG_CPU */
-
 /**
  * blk_complete_request - end I/O on a request
  * @req:      the request being processed
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 3d12b85..067a9e8 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -108,7 +108,6 @@ static int __cpuinit topology_add_dev(unsigned int cpu)
 	return rc;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static void __cpuinit topology_remove_dev(unsigned int cpu)
 {
 	struct sys_device *sys_dev = get_cpu_sysdev(cpu);
@@ -136,7 +135,6 @@ static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
 	}
 	return rc ? NOTIFY_BAD : NOTIFY_OK;
 }
-#endif
 
 static int __cpuinit topology_sysfs_init(void)
 {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 7a7c6e6..47ab42d 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1537,7 +1537,6 @@ int cpufreq_update_policy(unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int cpufreq_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
@@ -1577,7 +1576,6 @@ static struct notifier_block __cpuinitdata cpufreq_cpu_notifier =
 {
     .notifier_call = cpufreq_cpu_callback,
 };
-#endif /* CONFIG_HOTPLUG_CPU */
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
diff --git a/fs/buffer.c b/fs/buffer.c
index a8ca0ac..517860f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2972,7 +2972,6 @@ init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
 	}
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static void buffer_exit_cpu(int cpu)
 {
 	int i;
@@ -2994,7 +2993,6 @@ static int buffer_cpu_notify(struct notifier_block *self,
 		buffer_exit_cpu((unsigned long)hcpu);
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init buffer_init(void)
 {
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f02d71b..71dc6ba 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -89,9 +89,9 @@ int cpu_down(unsigned int cpu);
 #define lock_cpu_hotplug()	do { } while (0)
 #define unlock_cpu_hotplug()	do { } while (0)
 #define lock_cpu_hotplug_interruptible() 0
-#define hotcpu_notifier(fn, pri)	do { } while (0)
-#define register_hotcpu_notifier(nb)	do { } while (0)
-#define unregister_hotcpu_notifier(nb)	do { } while (0)
+#define hotcpu_notifier(fn, pri)	do { (void)(fn); } while (0)
+#define register_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
+#define unregister_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
 
 /* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
 static inline int cpu_is_offline(int cpu) { return 0; }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index bd1e89c4..9b62b4c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2044,7 +2044,6 @@ out:
 	return err;
 }
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2108,9 +2107,7 @@ static void common_cpu_mem_hotplug_unplug(void)
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
 }
-#endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -2127,7 +2124,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
-#endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
diff --git a/kernel/profile.c b/kernel/profile.c
index 04fd84e..0961d93 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -319,7 +319,6 @@ out:
 	put_cpu();
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int __devinit profile_cpu_callback(struct notifier_block *info,
 					unsigned long action, void *__cpu)
 {
@@ -372,10 +371,10 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 #else /* !CONFIG_SMP */
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
+#define profile_cpu_callback		NULL
 
 void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 75a005e..c83f531 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6740,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
 	    sched_smt_power_savings_store);
 #endif
 
-
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
  * and groups cannot be updated in place without racing with the balancing
@@ -6774,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif
 
 void __init sched_init_smp(void)
 {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5484d6e..c525731 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -655,7 +655,6 @@ int current_is_keventd(void)
 
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 /* Take the work from this (downed) CPU. */
 static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 {
@@ -738,7 +737,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif
 
 void init_workqueues(void)
 {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e2cefab..d69ddbe 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -996,7 +996,6 @@ static __init void radix_tree_init_maxindex(void)
 		height_to_maxindex[i] = __maxindex(i);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int radix_tree_callback(struct notifier_block *nfb,
                             unsigned long action,
                             void *hcpu)
@@ -1016,7 +1015,6 @@ static int radix_tree_callback(struct notifier_block *nfb,
        }
        return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init radix_tree_init(void)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2273952..27ec7a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -701,7 +701,6 @@ void drain_node_pages(int nodeid)
 }
 #endif
 
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
 	unsigned long flags;
@@ -723,7 +722,6 @@ static void __drain_pages(unsigned int cpu)
 		}
 	}
 }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM
 
@@ -2907,7 +2905,6 @@ void __init free_area_init(unsigned long *zones_size)
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int page_alloc_cpu_notify(struct notifier_block *self,
 				 unsigned long action, void *hcpu)
 {
@@ -2922,7 +2919,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init page_alloc_init(void)
 {
diff --git a/mm/swap.c b/mm/swap.c
index 017e72c..2ed7be3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -514,5 +514,7 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#ifdef CONFIG_HOTPLUG_CPU
 	hotcpu_notifier(cpu_swap_callback, 0);
+#endif
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6616e8..093f5fe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1513,7 +1513,6 @@ out:
 }
 #endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
@@ -1534,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 /*
  * This kswapd start function will be called by init and node-hot-add.
diff --git a/net/core/dev.c b/net/core/dev.c
index 59d058a..e660cb5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3340,7 +3340,6 @@ void unregister_netdev(struct net_device *dev)
 
 EXPORT_SYMBOL(unregister_netdev);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int dev_cpu_callback(struct notifier_block *nfb,
 			    unsigned long action,
 			    void *ocpu)
@@ -3384,7 +3383,6 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_NET_DMA
 /**
diff --git a/net/core/flow.c b/net/core/flow.c
index 104c25d..d137f97 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -340,7 +340,6 @@ static void __devinit flow_cache_cpu_prepare(int cpu)
 	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
@@ -349,7 +348,6 @@ static int flow_cache_cpu(struct notifier_block *nfb,
 		__flow_cache_shrink((unsigned long)hcpu, 0);
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 static int __init flow_cache_init(void)
 {
-- 
cgit v0.10.2


From 2bd94bd79e5bfa217714f78e5d6d7b6517ca546f Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Wed, 6 Dec 2006 20:38:18 -0800
Subject: [PATCH] ext3: fix reservation extension

Hugh Dickins wrote:
> Not found anything relevant, but I keep noticing these lines
> in ext2_try_to_allocate_with_rsv(), ext3 and ext4 similar:
>
> 		} else if (grp_goal > 0 &&
> 				(my_rsv->rsv_end - grp_goal + 1) < *count)
> 			try_to_extend_reservation(my_rsv, sb,
> 					*count-my_rsv->rsv_end + grp_goal - 1);
>
> They're wrong, a no-op in most groups, aren't they?  rsv_end is an
> absolute block number, whereas grp_goal is group-relative, so the
> calculation ought to bring in group_first_block?  Or I'm confused.
>

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Cc: "linux-ext4@vger.kernel.org" <linux-ext4@vger.kernel.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b41a7d7..f96c40a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1307,10 +1307,14 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			if (!goal_in_my_reservation(&my_rsv->rsv_window,
 							grp_goal, group, sb))
 				grp_goal = -1;
-		} else if (grp_goal > 0 &&
-			  (my_rsv->rsv_end-grp_goal+1) < *count)
-			try_to_extend_reservation(my_rsv, sb,
-					*count-my_rsv->rsv_end + grp_goal - 1);
+		} else if (grp_goal > 0) {
+			int curr = my_rsv->rsv_end -
+					(grp_goal + group_first_block) + 1;
+
+			if (curr < *count)
+				try_to_extend_reservation(my_rsv, sb,
+							*count - curr);
+		}
 
 		if ((my_rsv->rsv_start > group_last_block) ||
 				(my_rsv->rsv_end < group_first_block)) {
-- 
cgit v0.10.2


From 1df1e63b9e9340015c01b85817568fb9afde10bc Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Wed, 6 Dec 2006 20:38:19 -0800
Subject: [PATCH] ext4: fix reservation extension

Hugh Dickins wrote:
> Not found anything relevant, but I keep noticing these lines
> in ext2_try_to_allocate_with_rsv(), ext3 and ext4 similar:
>
> 		} else if (grp_goal > 0 &&
> 				(my_rsv->rsv_end - grp_goal + 1) < *count)
> 			try_to_extend_reservation(my_rsv, sb,
> 					*count-my_rsv->rsv_end + grp_goal - 1);
>
> They're wrong, a no-op in most groups, aren't they?  rsv_end is an
> absolute block number, whereas grp_goal is group-relative, so the
> calculation ought to bring in group_first_block?  Or I'm confused.
>

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Cc: "linux-ext4@vger.kernel.org" <linux-ext4@vger.kernel.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 5d45582..6bd0bd5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1324,10 +1324,14 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			if (!goal_in_my_reservation(&my_rsv->rsv_window,
 							grp_goal, group, sb))
 				grp_goal = -1;
-		} else if (grp_goal > 0 &&
-			  (my_rsv->rsv_end-grp_goal+1) < *count)
-			try_to_extend_reservation(my_rsv, sb,
-					*count-my_rsv->rsv_end + grp_goal - 1);
+		} else if (grp_goal > 0) {
+			int curr = my_rsv->rsv_end -
+					(grp_goal + group_first_block) + 1;
+
+			if (curr < *count)
+				try_to_extend_reservation(my_rsv, sb,
+							*count - curr);
+		}
 
 		if ((my_rsv->rsv_start > group_last_block) ||
 				(my_rsv->rsv_end < group_first_block)) {
-- 
cgit v0.10.2


From fed806f4072badad614699e1d40202e0ffef5c63 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 6 Dec 2006 20:38:20 -0800
Subject: [PATCH] allow hwrandom core to be a module

Despite it being small, there should be the option of making it a
module...

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Michael Buesch <mb@bu3sch.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 9f7635f..5f3acd8 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -3,17 +3,20 @@
 #
 
 config HW_RANDOM
-	bool "Hardware Random Number Generator Core support"
-	default y
+	tristate "Hardware Random Number Generator Core support"
+	default m
 	---help---
 	  Hardware Random Number Generator Core infrastructure.
 
+	  To compile this driver as a module, choose M here: the
+	  module will be called rng-core.
+
 	  If unsure, say Y.
 
 config HW_RANDOM_INTEL
 	tristate "Intel HW Random Number Generator support"
 	depends on HW_RANDOM && (X86 || IA64) && PCI
-	default y
+	default HW_RANDOM
 	---help---
 	  This driver provides kernel-side support for the Random Number
 	  Generator hardware found on Intel i8xx-based motherboards.
@@ -26,7 +29,7 @@ config HW_RANDOM_INTEL
 config HW_RANDOM_AMD
 	tristate "AMD HW Random Number Generator support"
 	depends on HW_RANDOM && X86 && PCI
-	default y
+	default HW_RANDOM
 	---help---
 	  This driver provides kernel-side support for the Random Number
 	  Generator hardware found on AMD 76x-based motherboards.
@@ -39,7 +42,7 @@ config HW_RANDOM_AMD
 config HW_RANDOM_GEODE
 	tristate "AMD Geode HW Random Number Generator support"
 	depends on HW_RANDOM && X86 && PCI
-	default y
+	default HW_RANDOM
 	---help---
 	  This driver provides kernel-side support for the Random Number
 	  Generator hardware found on the AMD Geode LX.
@@ -52,7 +55,7 @@ config HW_RANDOM_GEODE
 config HW_RANDOM_VIA
 	tristate "VIA HW Random Number Generator support"
 	depends on HW_RANDOM && X86_32
-	default y
+	default HW_RANDOM
 	---help---
 	  This driver provides kernel-side support for the Random Number
 	  Generator hardware found on VIA based motherboards.
@@ -65,7 +68,7 @@ config HW_RANDOM_VIA
 config HW_RANDOM_IXP4XX
 	tristate "Intel IXP4xx NPU HW Random Number Generator support"
 	depends on HW_RANDOM && ARCH_IXP4XX
-	default y
+	default HW_RANDOM
 	---help---
 	  This driver provides kernel-side support for the Random
 	  Number Generator hardware found on the Intel IXP4xx NPU.
@@ -78,7 +81,7 @@ config HW_RANDOM_IXP4XX
 config HW_RANDOM_OMAP
 	tristate "OMAP Random Number Generator support"
 	depends on HW_RANDOM && (ARCH_OMAP16XX || ARCH_OMAP24XX)
-	default y
+	default HW_RANDOM
  	---help---
  	  This driver provides kernel-side support for the Random Number
 	  Generator hardware found on OMAP16xx and OMAP24xx multimedia
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
index e263ae9..c41fa19 100644
--- a/drivers/char/hw_random/Makefile
+++ b/drivers/char/hw_random/Makefile
@@ -2,7 +2,8 @@
 # Makefile for HW Random Number Generator (RNG) device drivers.
 #
 
-obj-$(CONFIG_HW_RANDOM) += core.o
+obj-$(CONFIG_HW_RANDOM) += rng-core.o
+rng-core-y := core.o
 obj-$(CONFIG_HW_RANDOM_INTEL) += intel-rng.o
 obj-$(CONFIG_HW_RANDOM_AMD) += amd-rng.o
 obj-$(CONFIG_HW_RANDOM_GEODE) += geode-rng.o
-- 
cgit v0.10.2


From 1f370a23f2a3101886953add4bd7f529e3bba016 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:21 -0800
Subject: [PATCH] make mm/shmem.c:shmem_xattr_security_handler static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index bdaecfd..0076536 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1943,7 +1943,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name,
 	return security_inode_setsecurity(inode, name, value, size, flags);
 }
 
-struct xattr_handler shmem_xattr_security_handler = {
+static struct xattr_handler shmem_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list   = shmem_xattr_security_list,
 	.get    = shmem_xattr_security_get,
-- 
cgit v0.10.2


From ebe7e5fe4b41deeb2731c5b52d8c8e6ac08b1f74 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:21 -0800
Subject: [PATCH] remove kernel/lockdep.c:lockdep_internal

Remove the no longer used lockdep_internal().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index da19aeb..498bfbd 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -193,7 +193,6 @@ extern void lockdep_free_key_range(void *start, unsigned long size);
 
 extern void lockdep_off(void);
 extern void lockdep_on(void);
-extern int lockdep_internal(void);
 
 /*
  * These methods are used by specific locking variants (spinlocks,
@@ -255,11 +254,6 @@ static inline void lockdep_on(void)
 {
 }
 
-static inline int lockdep_internal(void)
-{
-	return 0;
-}
-
 # define lock_acquire(l, s, t, r, c, i)		do { } while (0)
 # define lock_release(l, n, i)			do { } while (0)
 # define lockdep_init()				do { } while (0)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 300d61b..3926c36 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -140,13 +140,6 @@ void lockdep_on(void)
 
 EXPORT_SYMBOL(lockdep_on);
 
-int lockdep_internal(void)
-{
-	return current->lockdep_recursion != 0;
-}
-
-EXPORT_SYMBOL(lockdep_internal);
-
 /*
  * Debugging switches:
  */
-- 
cgit v0.10.2


From d3228a887cae75ef2b8b1211c31c539bef5a5698 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:22 -0800
Subject: [PATCH] make kernel/signal.c:kill_proc_info() static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0a90cef..3a76724 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1305,7 +1305,6 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp);
 extern int kill_pg_info(int, struct siginfo *, pid_t);
-extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern void do_notify_parent(struct task_struct *, int);
 extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
diff --git a/kernel/signal.c b/kernel/signal.c
index bc972d7..ec81def 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1134,8 +1134,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
 	return error;
 }
 
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 {
 	int error;
 	rcu_read_lock();
-- 
cgit v0.10.2


From 9d69b7d3d115d0db3ddd87f09e496db7ecf72650 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:23 -0800
Subject: [PATCH] I2O: handle __copy_from_user

Handle __copy_from_user() return value.

Noticed by inspection, not from build warning.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 7d23e08..685d7a4 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -265,7 +265,11 @@ static int i2o_cfg_swdl(unsigned long arg)
 		return -ENOMEM;
 	}
 
-	__copy_from_user(buffer.virt, kxfer.buf, fragsize);
+	if (__copy_from_user(buffer.virt, kxfer.buf, fragsize)) {
+		i2o_msg_nop(c, msg);
+		i2o_dma_free(&c->pdev->dev, &buffer);
+		return -EFAULT;
+	}
 
 	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
 	msg->u.head[1] =
-- 
cgit v0.10.2


From f13a603786819cc8f2eef6a89bc4a6c88f40ee60 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:24 -0800
Subject: [PATCH] I2O: fix I2O_CONFIG without Adaptec extension

With I2O_CONFIG=y and I2O_EXT_ADAPTEC=n, kernel build gets:

drivers/message/i2o/i2o_config.c:1115: error: 'i2o_cfg_compat_ioctl' undeclared here (not in a function)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 685d7a4..1de30d7 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -520,7 +520,6 @@ static int i2o_cfg_evt_get(unsigned long arg, struct file *fp)
 	return 0;
 }
 
-#ifdef CONFIG_I2O_EXT_ADAPTEC
 #ifdef CONFIG_COMPAT
 static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 			      unsigned long arg)
@@ -763,6 +762,7 @@ static long i2o_cfg_compat_ioctl(struct file *file, unsigned cmd,
 
 #endif
 
+#ifdef CONFIG_I2O_EXT_ADAPTEC
 static int i2o_cfg_passthru(unsigned long arg)
 {
 	struct i2o_cmd_passthru __user *cmd =
-- 
cgit v0.10.2


From 8487f2e4067839ad3d009c240d51b682264320ae Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:25 -0800
Subject: [PATCH] make ecryptfs_version_str_map[] static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 306f8fb..3ede12b 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -691,7 +691,7 @@ static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
 
 static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
 
-struct ecryptfs_version_str_map_elem {
+static struct ecryptfs_version_str_map_elem {
 	u32 flag;
 	char *str;
 } ecryptfs_version_str_map[] = {
-- 
cgit v0.10.2


From d394e122bc1adba0f3eb1ebec1cedb8a8c524741 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:26 -0800
Subject: [PATCH] make fs/jbd/transaction.c:__journal_temp_unlink_buffer()
 static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 4f82bcd6..d38e0d5 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -27,6 +27,8 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 
+static void __journal_temp_unlink_buffer(struct journal_head *jh);
+
 /*
  * get_transaction: obtain a new transaction_t object.
  *
@@ -1499,7 +1501,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
  *
  * Called under j_list_lock.  The journal may not be locked.
  */
-void __journal_temp_unlink_buffer(struct journal_head *jh)
+static void __journal_temp_unlink_buffer(struct journal_head *jh)
 {
 	struct journal_head **list = NULL;
 	transaction_t *transaction;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index dacd566..4527375 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -839,7 +839,6 @@ struct journal_s
  */
 
 /* Filing buffers */
-extern void __journal_temp_unlink_buffer(struct journal_head *jh);
 extern void journal_unfile_buffer(journal_t *, struct journal_head *);
 extern void __journal_unfile_buffer(struct journal_head *);
 extern void __journal_refile_buffer(struct journal_head *);
-- 
cgit v0.10.2


From 7ddae86095794cce4364740edd8463c77654a265 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:27 -0800
Subject: [PATCH] make
 fs/jbd2/transaction.c:__kbd2_journal_temp_unlink_buffer() static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c051a94..3a87001 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -27,6 +27,8 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 
+static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
+
 /*
  * jbd2_get_transaction: obtain a new transaction_t object.
  *
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 98a0ae5..0e0fedd 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -848,7 +848,6 @@ struct journal_s
  */
 
 /* Filing buffers */
-extern void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
 extern void __jbd2_journal_unfile_buffer(struct journal_head *);
 extern void __jbd2_journal_refile_buffer(struct journal_head *);
-- 
cgit v0.10.2


From c585646dd1d98caf0a5f2e85c794c1441df6fac1 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:29 -0800
Subject: [PATCH] fs/lockd/host.c: make 2 functions static

Make the following needlessly global functions static:

 - nlm_lookup_host()
 - nsm_find()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index fb24a97..3d4610c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -36,34 +36,14 @@ static DEFINE_MUTEX(nlm_host_mutex);
 static void			nlm_gc_hosts(void);
 static struct nsm_handle *	__nsm_find(const struct sockaddr_in *,
 					const char *, int, int);
-
-/*
- * Find an NLM server handle in the cache. If there is none, create it.
- */
-struct nlm_host *
-nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
-			const char *hostname, int hostname_len)
-{
-	return nlm_lookup_host(0, sin, proto, version,
-			       hostname, hostname_len);
-}
-
-/*
- * Find an NLM client handle in the cache. If there is none, create it.
- */
-struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp,
-			const char *hostname, int hostname_len)
-{
-	return nlm_lookup_host(1, &rqstp->rq_addr,
-			       rqstp->rq_prot, rqstp->rq_vers,
-			       hostname, hostname_len);
-}
+static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
+					 const char *hostname,
+					 int hostname_len);
 
 /*
  * Common host lookup routine for server & client
  */
-struct nlm_host *
+static struct nlm_host *
 nlm_lookup_host(int server, const struct sockaddr_in *sin,
 					int proto, int version,
 					const char *hostname,
@@ -195,6 +175,29 @@ nlm_destroy_host(struct nlm_host *host)
 }
 
 /*
+ * Find an NLM server handle in the cache. If there is none, create it.
+ */
+struct nlm_host *
+nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
+			const char *hostname, int hostname_len)
+{
+	return nlm_lookup_host(0, sin, proto, version,
+			       hostname, hostname_len);
+}
+
+/*
+ * Find an NLM client handle in the cache. If there is none, create it.
+ */
+struct nlm_host *
+nlmsvc_lookup_host(struct svc_rqst *rqstp,
+			const char *hostname, int hostname_len)
+{
+	return nlm_lookup_host(1, &rqstp->rq_addr,
+			       rqstp->rq_prot, rqstp->rq_vers,
+			       hostname, hostname_len);
+}
+
+/*
  * Create the NLM RPC client for an NLM peer
  */
 struct rpc_clnt *
@@ -495,7 +498,7 @@ out:
 	return nsm;
 }
 
-struct nsm_handle *
+static struct nsm_handle *
 nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
 {
 	return __nsm_find(sin, hostname, hostname_len, 1);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 862d973..8c39654 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -164,14 +164,12 @@ void		  nlmclnt_next_cookie(struct nlm_cookie *);
  */
 struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *, int, int, const char *, int);
 struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *, const char *, int);
-struct nlm_host * nlm_lookup_host(int server, const struct sockaddr_in *, int, int, const char *, int);
 struct rpc_clnt * nlm_bind_host(struct nlm_host *);
 void		  nlm_rebind_host(struct nlm_host *);
 struct nlm_host * nlm_get_host(struct nlm_host *);
 void		  nlm_release_host(struct nlm_host *);
 void		  nlm_shutdown_hosts(void);
 extern void	  nlm_host_rebooted(const struct sockaddr_in *, const char *, int, u32);
-struct nsm_handle *nsm_find(const struct sockaddr_in *, const char *, int);
 void		  nsm_release(struct nsm_handle *);
 
 
-- 
cgit v0.10.2


From 9711ef9945ce33b2b4ecb51a4db3f65f7d784876 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:31 -0800
Subject: [PATCH] make fs/proc/base.c:proc_pid_instantiate() static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 05ace70..b859fc7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1885,8 +1885,9 @@ out:
 	return;
 }
 
-struct dentry *proc_pid_instantiate(struct inode *dir,
-	struct dentry * dentry, struct task_struct *task, void *ptr)
+static struct dentry *proc_pid_instantiate(struct inode *dir,
+					   struct dentry * dentry,
+					   struct task_struct *task, void *ptr)
 {
 	struct dentry *error = ERR_PTR(-ENOENT);
 	struct inode *inode;
-- 
cgit v0.10.2


From 4438982f56b7e2ecb7932612601ba0390972ce3b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:33 -0800
Subject: [PATCH] parport: section mismatches with HOTPLUG=n

When CONFIG_HOTPLUG=n, parport_pc calls some __devinit == __init code that
could be discarded.  These calls are made from parport_irq_probe(), which is
called from parport_pc_probe_port(), which is an exported symbol, so the calls
could (possibly) happen after init time.

WARNING: drivers/parport/parport_pc.o - Section mismatch: reference to .init.text: from .text between 'parport_irq_probe' (at offset 0x31d) and 'parport_pc_probe_port'
WARNING: drivers/parport/parport_pc.o - Section mismatch: reference to .init.text: from .text between 'parport_irq_probe' (at offset 0x346) and 'parport_pc_probe_port'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 5749500f..b61c17b 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1975,7 +1975,7 @@ static int __devinit parport_ECPPS2_supported(struct parport *pb){return 0;}
 /* --- IRQ detection -------------------------------------- */
 
 /* Only if supports ECP mode */
-static int __devinit programmable_irq_support(struct parport *pb)
+static int programmable_irq_support(struct parport *pb)
 {
 	int irq, intrLine;
 	unsigned char oecr = inb (ECONTROL (pb));
@@ -1992,7 +1992,7 @@ static int __devinit programmable_irq_support(struct parport *pb)
 	return irq;
 }
 
-static int __devinit irq_probe_ECP(struct parport *pb)
+static int irq_probe_ECP(struct parport *pb)
 {
 	int i;
 	unsigned long irqs;
@@ -2020,7 +2020,7 @@ static int __devinit irq_probe_ECP(struct parport *pb)
  * This detection seems that only works in National Semiconductors
  * This doesn't work in SMC, LGS, and Winbond 
  */
-static int __devinit irq_probe_EPP(struct parport *pb)
+static int irq_probe_EPP(struct parport *pb)
 {
 #ifndef ADVANCED_DETECT
 	return PARPORT_IRQ_NONE;
@@ -2059,7 +2059,7 @@ static int __devinit irq_probe_EPP(struct parport *pb)
 #endif /* Advanced detection */
 }
 
-static int __devinit irq_probe_SPP(struct parport *pb)
+static int irq_probe_SPP(struct parport *pb)
 {
 	/* Don't even try to do this. */
 	return PARPORT_IRQ_NONE;
-- 
cgit v0.10.2


From da015a6744f3648d34b83d1c4e015e6a798b8c56 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:35 -0800
Subject: [PATCH] agp-amd64: section mismatches with HOTPLUG=n

When CONFIG_HOTPLUG=n, agp_amd64_resume() calls nforce3_agp_init(), which is
__devinit == __init, so has been discarded and is not usable for resume.

WARNING: drivers/char/agp/amd64-agp.o - Section mismatch: reference to .init.text: from .text between 'agp_amd64_resume' (at offset 0x249) and 'amd64_tlbflush'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 00b17ae..2f2c4ef 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -459,7 +459,7 @@ static const struct aper_size_info_32 nforce3_sizes[5] =
 
 /* Handle shadow device of the Nvidia NForce3 */
 /* CHECK-ME original 2.4 version set up some IORRs. Check if that is needed. */
-static int __devinit nforce3_agp_init(struct pci_dev *pdev)
+static int nforce3_agp_init(struct pci_dev *pdev)
 {
 	u32 tmp, apbase, apbar, aplimit;
 	struct pci_dev *dev1;
-- 
cgit v0.10.2


From db68b189f4b8026b2f532e1b1bbdba5fcb36638c Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Wed, 6 Dec 2006 20:38:36 -0800
Subject: [PATCH] add rtc-omap driver

This creates a new RTC-framework driver for the RTC/calendar module found
in various OMAP1 chips.  (OMAP2 and OMAP3 use external RTCs, like those in
TI's multifunction PM companion chips.) It's been in the Linux-OMAP tree
for several months now, and other trees before that, so it's quite stable.
The most notable issue is that the OMAP IRQ code doesn't yet support the
RTC IRQ as a wakeup event.  Once that's fixed, a patch will be needed.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/arm/mach-omap1/devices.c b/arch/arm/mach-omap1/devices.c
index a611c3b..6dcd10a 100644
--- a/arch/arm/mach-omap1/devices.c
+++ b/arch/arm/mach-omap1/devices.c
@@ -55,7 +55,7 @@ static inline void omap_init_irda(void) {}
 
 /*-------------------------------------------------------------------------*/
 
-#if	defined(CONFIG_OMAP_RTC) || defined(CONFIG_OMAP_RTC)
+#if defined(CONFIG_RTC_DRV_OMAP) || defined(CONFIG_RTC_DRV_OMAP_MODULE)
 
 #define	OMAP_RTC_BASE		0xfffb4800
 
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index fc766a7..4d32bad 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -163,6 +163,14 @@ config RTC_DRV_DS1742
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-ds1742.
 
+config RTC_DRV_OMAP
+	tristate "TI OMAP1"
+	depends on RTC_CLASS && ( \
+		ARCH_OMAP15XX || ARCH_OMAP16XX || ARCH_OMAP730 )
+	help
+	  Say "yes" here to support the real time clock on TI OMAP1 chips.
+	  This driver can also be built as a module called rtc-omap.
+
 config RTC_DRV_PCF8563
 	tristate "Philips PCF8563/Epson RTC8564"
 	depends on RTC_CLASS && I2C
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 3ba5ff6..bd4c45d 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_RTC_DRV_TEST)	+= rtc-test.o
 obj-$(CONFIG_RTC_DRV_DS1307)	+= rtc-ds1307.o
 obj-$(CONFIG_RTC_DRV_DS1672)	+= rtc-ds1672.o
 obj-$(CONFIG_RTC_DRV_DS1742)	+= rtc-ds1742.o
+obj-$(CONFIG_RTC_DRV_OMAP)	+= rtc-omap.o
 obj-$(CONFIG_RTC_DRV_PCF8563)	+= rtc-pcf8563.o
 obj-$(CONFIG_RTC_DRV_PCF8583)	+= rtc-pcf8583.o
 obj-$(CONFIG_RTC_DRV_RS5C372)	+= rtc-rs5c372.o
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
new file mode 100644
index 0000000..eac5fb1
--- /dev/null
+++ b/drivers/rtc/rtc-omap.c
@@ -0,0 +1,572 @@
+/*
+ * TI OMAP1 Real Time Clock interface for Linux
+ *
+ * Copyright (C) 2003 MontaVista Software, Inc.
+ * Author: George G. Davis <gdavis@mvista.com> or <source@mvista.com>
+ *
+ * Copyright (C) 2006 David Brownell (new RTC framework)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/rtc.h>
+#include <linux/bcd.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+#include <asm/mach/time.h>
+
+
+/* The OMAP1 RTC is a year/month/day/hours/minutes/seconds BCD clock
+ * with century-range alarm matching, driven by the 32kHz clock.
+ *
+ * The main user-visible ways it differs from PC RTCs are by omitting
+ * "don't care" alarm fields and sub-second periodic IRQs, and having
+ * an autoadjust mechanism to calibrate to the true oscillator rate.
+ *
+ * Board-specific wiring options include using split power mode with
+ * RTC_OFF_NOFF used as the reset signal (so the RTC won't be reset),
+ * and wiring RTC_WAKE_INT (so the RTC alarm can wake the system from
+ * low power modes).  See the BOARD-SPECIFIC CUSTOMIZATION comment.
+ */
+
+#define OMAP_RTC_BASE			0xfffb4800
+
+/* RTC registers */
+#define OMAP_RTC_SECONDS_REG		0x00
+#define OMAP_RTC_MINUTES_REG		0x04
+#define OMAP_RTC_HOURS_REG		0x08
+#define OMAP_RTC_DAYS_REG		0x0C
+#define OMAP_RTC_MONTHS_REG		0x10
+#define OMAP_RTC_YEARS_REG		0x14
+#define OMAP_RTC_WEEKS_REG		0x18
+
+#define OMAP_RTC_ALARM_SECONDS_REG	0x20
+#define OMAP_RTC_ALARM_MINUTES_REG	0x24
+#define OMAP_RTC_ALARM_HOURS_REG	0x28
+#define OMAP_RTC_ALARM_DAYS_REG		0x2c
+#define OMAP_RTC_ALARM_MONTHS_REG	0x30
+#define OMAP_RTC_ALARM_YEARS_REG	0x34
+
+#define OMAP_RTC_CTRL_REG		0x40
+#define OMAP_RTC_STATUS_REG		0x44
+#define OMAP_RTC_INTERRUPTS_REG		0x48
+
+#define OMAP_RTC_COMP_LSB_REG		0x4c
+#define OMAP_RTC_COMP_MSB_REG		0x50
+#define OMAP_RTC_OSC_REG		0x54
+
+/* OMAP_RTC_CTRL_REG bit fields: */
+#define OMAP_RTC_CTRL_SPLIT		(1<<7)
+#define OMAP_RTC_CTRL_DISABLE		(1<<6)
+#define OMAP_RTC_CTRL_SET_32_COUNTER	(1<<5)
+#define OMAP_RTC_CTRL_TEST		(1<<4)
+#define OMAP_RTC_CTRL_MODE_12_24	(1<<3)
+#define OMAP_RTC_CTRL_AUTO_COMP		(1<<2)
+#define OMAP_RTC_CTRL_ROUND_30S		(1<<1)
+#define OMAP_RTC_CTRL_STOP		(1<<0)
+
+/* OMAP_RTC_STATUS_REG bit fields: */
+#define OMAP_RTC_STATUS_POWER_UP        (1<<7)
+#define OMAP_RTC_STATUS_ALARM           (1<<6)
+#define OMAP_RTC_STATUS_1D_EVENT        (1<<5)
+#define OMAP_RTC_STATUS_1H_EVENT        (1<<4)
+#define OMAP_RTC_STATUS_1M_EVENT        (1<<3)
+#define OMAP_RTC_STATUS_1S_EVENT        (1<<2)
+#define OMAP_RTC_STATUS_RUN             (1<<1)
+#define OMAP_RTC_STATUS_BUSY            (1<<0)
+
+/* OMAP_RTC_INTERRUPTS_REG bit fields: */
+#define OMAP_RTC_INTERRUPTS_IT_ALARM    (1<<3)
+#define OMAP_RTC_INTERRUPTS_IT_TIMER    (1<<2)
+
+
+#define rtc_read(addr)		omap_readb(OMAP_RTC_BASE + (addr))
+#define rtc_write(val, addr)	omap_writeb(val, OMAP_RTC_BASE + (addr))
+
+
+/* platform_bus isn't hotpluggable, so for static linkage it'd be safe
+ * to get rid of probe() and remove() code ... too bad the driver struct
+ * remembers probe(), that's about 25% of the runtime footprint!!
+ */
+#ifndef	MODULE
+#undef	__devexit
+#undef	__devexit_p
+#define	__devexit	__exit
+#define	__devexit_p	__exit_p
+#endif
+
+
+/* we rely on the rtc framework to handle locking (rtc->ops_lock),
+ * so the only other requirement is that register accesses which
+ * require BUSY to be clear are made with IRQs locally disabled
+ */
+static void rtc_wait_not_busy(void)
+{
+	int	count = 0;
+	u8	status;
+
+	/* BUSY may stay active for 1/32768 second (~30 usec) */
+	for (count = 0; count < 50; count++) {
+		status = rtc_read(OMAP_RTC_STATUS_REG);
+		if ((status & (u8)OMAP_RTC_STATUS_BUSY) == 0)
+			break;
+		udelay(1);
+	}
+	/* now we have ~15 usec to read/write various registers */
+}
+
+static irqreturn_t rtc_irq(int irq, void *class_dev)
+{
+	unsigned long		events = 0;
+	u8			irq_data;
+
+	irq_data = rtc_read(OMAP_RTC_STATUS_REG);
+
+	/* alarm irq? */
+	if (irq_data & OMAP_RTC_STATUS_ALARM) {
+		rtc_write(OMAP_RTC_STATUS_ALARM, OMAP_RTC_STATUS_REG);
+		events |= RTC_IRQF | RTC_AF;
+	}
+
+	/* 1/sec periodic/update irq? */
+	if (irq_data & OMAP_RTC_STATUS_1S_EVENT)
+		events |= RTC_IRQF | RTC_UF;
+
+	rtc_update_irq(class_dev, 1, events);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef	CONFIG_RTC_INTF_DEV
+
+static int
+omap_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+	u8 reg;
+
+	switch (cmd) {
+	case RTC_AIE_OFF:
+	case RTC_AIE_ON:
+	case RTC_UIE_OFF:
+	case RTC_UIE_ON:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	local_irq_disable();
+	rtc_wait_not_busy();
+	reg = rtc_read(OMAP_RTC_INTERRUPTS_REG);
+	switch (cmd) {
+	/* AIE = Alarm Interrupt Enable */
+	case RTC_AIE_OFF:
+		reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM;
+		break;
+	case RTC_AIE_ON:
+		reg |= OMAP_RTC_INTERRUPTS_IT_ALARM;
+		break;
+	/* UIE = Update Interrupt Enable (1/second) */
+	case RTC_UIE_OFF:
+		reg &= ~OMAP_RTC_INTERRUPTS_IT_TIMER;
+		break;
+	case RTC_UIE_ON:
+		reg |= OMAP_RTC_INTERRUPTS_IT_TIMER;
+		break;
+	}
+	rtc_wait_not_busy();
+	rtc_write(reg, OMAP_RTC_INTERRUPTS_REG);
+	local_irq_enable();
+
+	return 0;
+}
+
+#else
+#define	omap_rtc_ioctl	NULL
+#endif
+
+/* this hardware doesn't support "don't care" alarm fields */
+static int tm2bcd(struct rtc_time *tm)
+{
+	if (rtc_valid_tm(tm) != 0)
+		return -EINVAL;
+
+	tm->tm_sec = BIN2BCD(tm->tm_sec);
+	tm->tm_min = BIN2BCD(tm->tm_min);
+	tm->tm_hour = BIN2BCD(tm->tm_hour);
+	tm->tm_mday = BIN2BCD(tm->tm_mday);
+
+	tm->tm_mon = BIN2BCD(tm->tm_mon + 1);
+
+	/* epoch == 1900 */
+	if (tm->tm_year < 100 || tm->tm_year > 199)
+		return -EINVAL;
+	tm->tm_year = BIN2BCD(tm->tm_year - 100);
+
+	return 0;
+}
+
+static void bcd2tm(struct rtc_time *tm)
+{
+	tm->tm_sec = BCD2BIN(tm->tm_sec);
+	tm->tm_min = BCD2BIN(tm->tm_min);
+	tm->tm_hour = BCD2BIN(tm->tm_hour);
+	tm->tm_mday = BCD2BIN(tm->tm_mday);
+	tm->tm_mon = BCD2BIN(tm->tm_mon) - 1;
+	/* epoch == 1900 */
+	tm->tm_year = BCD2BIN(tm->tm_year) + 100;
+}
+
+
+static int omap_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+	/* we don't report wday/yday/isdst ... */
+	local_irq_disable();
+	rtc_wait_not_busy();
+
+	tm->tm_sec = rtc_read(OMAP_RTC_SECONDS_REG);
+	tm->tm_min = rtc_read(OMAP_RTC_MINUTES_REG);
+	tm->tm_hour = rtc_read(OMAP_RTC_HOURS_REG);
+	tm->tm_mday = rtc_read(OMAP_RTC_DAYS_REG);
+	tm->tm_mon = rtc_read(OMAP_RTC_MONTHS_REG);
+	tm->tm_year = rtc_read(OMAP_RTC_YEARS_REG);
+
+	local_irq_enable();
+
+	bcd2tm(tm);
+	return 0;
+}
+
+static int omap_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+	if (tm2bcd(tm) < 0)
+		return -EINVAL;
+	local_irq_disable();
+	rtc_wait_not_busy();
+
+	rtc_write(tm->tm_year, OMAP_RTC_YEARS_REG);
+	rtc_write(tm->tm_mon, OMAP_RTC_MONTHS_REG);
+	rtc_write(tm->tm_mday, OMAP_RTC_DAYS_REG);
+	rtc_write(tm->tm_hour, OMAP_RTC_HOURS_REG);
+	rtc_write(tm->tm_min, OMAP_RTC_MINUTES_REG);
+	rtc_write(tm->tm_sec, OMAP_RTC_SECONDS_REG);
+
+	local_irq_enable();
+
+	return 0;
+}
+
+static int omap_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+	local_irq_disable();
+	rtc_wait_not_busy();
+
+	alm->time.tm_sec = rtc_read(OMAP_RTC_ALARM_SECONDS_REG);
+	alm->time.tm_min = rtc_read(OMAP_RTC_ALARM_MINUTES_REG);
+	alm->time.tm_hour = rtc_read(OMAP_RTC_ALARM_HOURS_REG);
+	alm->time.tm_mday = rtc_read(OMAP_RTC_ALARM_DAYS_REG);
+	alm->time.tm_mon = rtc_read(OMAP_RTC_ALARM_MONTHS_REG);
+	alm->time.tm_year = rtc_read(OMAP_RTC_ALARM_YEARS_REG);
+
+	local_irq_enable();
+
+	bcd2tm(&alm->time);
+	alm->pending = !!(rtc_read(OMAP_RTC_INTERRUPTS_REG)
+			& OMAP_RTC_INTERRUPTS_IT_ALARM);
+	alm->enabled = alm->pending && device_may_wakeup(dev);
+
+	return 0;
+}
+
+static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+	u8 reg;
+
+	/* Much userspace code uses RTC_ALM_SET, thus "don't care" for
+	 * day/month/year specifies alarms up to 24 hours in the future.
+	 * So we need to handle that ... but let's ignore the "don't care"
+	 * values for hours/minutes/seconds.
+	 */
+	if (alm->time.tm_mday <= 0
+			&& alm->time.tm_mon < 0
+			&& alm->time.tm_year < 0) {
+		struct rtc_time tm;
+		unsigned long now, then;
+
+		omap_rtc_read_time(dev, &tm);
+		rtc_tm_to_time(&tm, &now);
+
+		alm->time.tm_mday = tm.tm_mday;
+		alm->time.tm_mon = tm.tm_mon;
+		alm->time.tm_year = tm.tm_year;
+		rtc_tm_to_time(&alm->time, &then);
+
+		/* sometimes the alarm wraps into tomorrow */
+		if (then < now) {
+			rtc_time_to_tm(now + 24 * 60 * 60, &tm);
+			alm->time.tm_mday = tm.tm_mday;
+			alm->time.tm_mon = tm.tm_mon;
+			alm->time.tm_year = tm.tm_year;
+		}
+	}
+
+	if (tm2bcd(&alm->time) < 0)
+		return -EINVAL;
+
+	local_irq_disable();
+	rtc_wait_not_busy();
+
+	rtc_write(alm->time.tm_year, OMAP_RTC_ALARM_YEARS_REG);
+	rtc_write(alm->time.tm_mon, OMAP_RTC_ALARM_MONTHS_REG);
+	rtc_write(alm->time.tm_mday, OMAP_RTC_ALARM_DAYS_REG);
+	rtc_write(alm->time.tm_hour, OMAP_RTC_ALARM_HOURS_REG);
+	rtc_write(alm->time.tm_min, OMAP_RTC_ALARM_MINUTES_REG);
+	rtc_write(alm->time.tm_sec, OMAP_RTC_ALARM_SECONDS_REG);
+
+	reg = rtc_read(OMAP_RTC_INTERRUPTS_REG);
+	if (alm->enabled)
+		reg |= OMAP_RTC_INTERRUPTS_IT_ALARM;
+	else
+		reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM;
+	rtc_write(reg, OMAP_RTC_INTERRUPTS_REG);
+
+	local_irq_enable();
+
+	return 0;
+}
+
+static struct rtc_class_ops omap_rtc_ops = {
+	.ioctl		= omap_rtc_ioctl,
+	.read_time	= omap_rtc_read_time,
+	.set_time	= omap_rtc_set_time,
+	.read_alarm	= omap_rtc_read_alarm,
+	.set_alarm	= omap_rtc_set_alarm,
+};
+
+static int omap_rtc_alarm;
+static int omap_rtc_timer;
+
+static int __devinit omap_rtc_probe(struct platform_device *pdev)
+{
+	struct resource		*res, *mem;
+	struct rtc_device	*rtc;
+	u8			reg, new_ctrl;
+
+	omap_rtc_timer = platform_get_irq(pdev, 0);
+	if (omap_rtc_timer <= 0) {
+		pr_debug("%s: no update irq?\n", pdev->name);
+		return -ENOENT;
+	}
+
+	omap_rtc_alarm = platform_get_irq(pdev, 1);
+	if (omap_rtc_alarm <= 0) {
+		pr_debug("%s: no alarm irq?\n", pdev->name);
+		return -ENOENT;
+	}
+
+	/* NOTE:  using static mapping for RTC registers */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (res && res->start != OMAP_RTC_BASE) {
+		pr_debug("%s: RTC registers at %08x, expected %08x\n",
+			pdev->name, (unsigned) res->start, OMAP_RTC_BASE);
+		return -ENOENT;
+	}
+
+	if (res)
+		mem = request_mem_region(res->start,
+				res->end - res->start + 1,
+				pdev->name);
+	else
+		mem = NULL;
+	if (!mem) {
+		pr_debug("%s: RTC registers at %08x are not free\n",
+			pdev->name, OMAP_RTC_BASE);
+		return -EBUSY;
+	}
+
+	rtc = rtc_device_register(pdev->name, &pdev->dev,
+			&omap_rtc_ops, THIS_MODULE);
+	if (IS_ERR(rtc)) {
+		pr_debug("%s: can't register RTC device, err %ld\n",
+			pdev->name, PTR_ERR(rtc));
+		goto fail;
+	}
+	platform_set_drvdata(pdev, rtc);
+	class_set_devdata(&rtc->class_dev, mem);
+
+	/* clear pending irqs, and set 1/second periodic,
+	 * which we'll use instead of update irqs
+	 */
+	rtc_write(0, OMAP_RTC_INTERRUPTS_REG);
+
+	/* clear old status */
+	reg = rtc_read(OMAP_RTC_STATUS_REG);
+	if (reg & (u8) OMAP_RTC_STATUS_POWER_UP) {
+		pr_info("%s: RTC power up reset detected\n",
+			pdev->name);
+		rtc_write(OMAP_RTC_STATUS_POWER_UP, OMAP_RTC_STATUS_REG);
+	}
+	if (reg & (u8) OMAP_RTC_STATUS_ALARM)
+		rtc_write(OMAP_RTC_STATUS_ALARM, OMAP_RTC_STATUS_REG);
+
+	/* handle periodic and alarm irqs */
+	if (request_irq(omap_rtc_timer, rtc_irq, SA_INTERRUPT,
+			rtc->class_dev.class_id, &rtc->class_dev)) {
+		pr_debug("%s: RTC timer interrupt IRQ%d already claimed\n",
+			pdev->name, omap_rtc_timer);
+		goto fail0;
+	}
+	if (request_irq(omap_rtc_alarm, rtc_irq, SA_INTERRUPT,
+			rtc->class_dev.class_id, &rtc->class_dev)) {
+		pr_debug("%s: RTC alarm interrupt IRQ%d already claimed\n",
+			pdev->name, omap_rtc_alarm);
+		goto fail1;
+	}
+
+	/* On boards with split power, RTC_ON_NOFF won't reset the RTC */
+	reg = rtc_read(OMAP_RTC_CTRL_REG);
+	if (reg & (u8) OMAP_RTC_CTRL_STOP)
+		pr_info("%s: already running\n", pdev->name);
+
+	/* force to 24 hour mode */
+	new_ctrl = reg & ~(OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP);
+	new_ctrl |= OMAP_RTC_CTRL_STOP;
+
+	/* BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE:
+	 *
+	 *  - Boards wired so that RTC_WAKE_INT does something, and muxed
+	 *    right (W13_1610_RTC_WAKE_INT is the default after chip reset),
+	 *    should initialize the device wakeup flag appropriately.
+	 *
+	 *  - Boards wired so RTC_ON_nOFF is used as the reset signal,
+	 *    rather than nPWRON_RESET, should forcibly enable split
+	 *    power mode.  (Some chip errata report that RTC_CTRL_SPLIT
+	 *    is write-only, and always reads as zero...)
+	 */
+	device_init_wakeup(&pdev->dev, 0);
+
+	if (new_ctrl & (u8) OMAP_RTC_CTRL_SPLIT)
+		pr_info("%s: split power mode\n", pdev->name);
+
+	if (reg != new_ctrl)
+		rtc_write(new_ctrl, OMAP_RTC_CTRL_REG);
+
+	return 0;
+
+fail1:
+	free_irq(omap_rtc_timer, NULL);
+fail0:
+	rtc_device_unregister(rtc);
+fail:
+	release_resource(mem);
+	return -EIO;
+}
+
+static int __devexit omap_rtc_remove(struct platform_device *pdev)
+{
+	struct rtc_device	*rtc = platform_get_drvdata(pdev);;
+
+	device_init_wakeup(&pdev->dev, 0);
+
+	/* leave rtc running, but disable irqs */
+	rtc_write(0, OMAP_RTC_INTERRUPTS_REG);
+
+	free_irq(omap_rtc_timer, rtc);
+	free_irq(omap_rtc_alarm, rtc);
+
+	release_resource(class_get_devdata(&rtc->class_dev));
+	rtc_device_unregister(rtc);
+	return 0;
+}
+
+#ifdef CONFIG_PM
+
+static struct timespec rtc_delta;
+static u8 irqstat;
+
+static int omap_rtc_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct rtc_time rtc_tm;
+	struct timespec time;
+
+	time.tv_nsec = 0;
+	omap_rtc_read_time(NULL, &rtc_tm);
+	rtc_tm_to_time(&rtc_tm, &time.tv_sec);
+
+	save_time_delta(&rtc_delta, &time);
+	irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG);
+
+	/* FIXME the RTC alarm is not currently acting as a wakeup event
+	 * source, and in fact this enable() call is just saving a flag
+	 * that's never used...
+	 */
+	if (device_may_wakeup(&pdev->dev))
+		enable_irq_wake(omap_rtc_alarm);
+	else
+		rtc_write(0, OMAP_RTC_INTERRUPTS_REG);
+
+	return 0;
+}
+
+static int omap_rtc_resume(struct platform_device *pdev)
+{
+	struct rtc_time rtc_tm;
+	struct timespec time;
+
+	time.tv_nsec = 0;
+	omap_rtc_read_time(NULL, &rtc_tm);
+	rtc_tm_to_time(&rtc_tm, &time.tv_sec);
+
+	restore_time_delta(&rtc_delta, &time);
+	if (device_may_wakeup(&pdev->dev))
+		disable_irq_wake(omap_rtc_alarm);
+	else
+		rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG);
+	return 0;
+}
+
+#else
+#define omap_rtc_suspend NULL
+#define omap_rtc_resume  NULL
+#endif
+
+static void omap_rtc_shutdown(struct platform_device *pdev)
+{
+	rtc_write(0, OMAP_RTC_INTERRUPTS_REG);
+}
+
+MODULE_ALIAS("omap_rtc");
+static struct platform_driver omap_rtc_driver = {
+	.probe		= omap_rtc_probe,
+	.remove		= __devexit_p(omap_rtc_remove),
+	.suspend	= omap_rtc_suspend,
+	.resume		= omap_rtc_resume,
+	.shutdown	= omap_rtc_shutdown,
+	.driver		= {
+		.name	= "omap_rtc",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init rtc_init(void)
+{
+	return platform_driver_register(&omap_rtc_driver);
+}
+module_init(rtc_init);
+
+static void __exit rtc_exit(void)
+{
+	platform_driver_unregister(&omap_rtc_driver);
+}
+module_exit(rtc_exit);
+
+MODULE_AUTHOR("George G. Davis (and others)");
+MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From 403aac965eba17a360a93c3b679f57b21030d4cd Mon Sep 17 00:00:00 2001
From: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Date: Wed, 6 Dec 2006 20:38:38 -0800
Subject: [PATCH] add return value checking of get_user() in
 set_vesa_blanking()

[akpm@osdl.org: bugfix]
Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Cc: James Simmons <jsimmons@infradead.org>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 75ff028..a8239da 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -152,7 +152,7 @@ static void gotoxy(struct vc_data *vc, int new_x, int new_y);
 static void save_cur(struct vc_data *vc);
 static void reset_terminal(struct vc_data *vc, int do_clear);
 static void con_flush_chars(struct tty_struct *tty);
-static void set_vesa_blanking(char __user *p);
+static int set_vesa_blanking(char __user *p);
 static void set_cursor(struct vc_data *vc);
 static void hide_cursor(struct vc_data *vc);
 static void console_callback(struct work_struct *ignored);
@@ -2369,7 +2369,7 @@ int tioclinux(struct tty_struct *tty, unsigned long arg)
 			ret = __put_user(data, p);
 			break;
 		case TIOCL_SETVESABLANK:
-			set_vesa_blanking(p);
+			ret = set_vesa_blanking(p);
 			break;
 		case TIOCL_GETKMSGREDIRECT:
 			data = kmsg_redirect;
@@ -3313,11 +3313,15 @@ postcore_initcall(vtconsole_class_init);
  *	Screen blanking
  */
 
-static void set_vesa_blanking(char __user *p)
+static int set_vesa_blanking(char __user *p)
 {
-    unsigned int mode;
-    get_user(mode, p + 1);
-    vesa_blank_mode = (mode < 4) ? mode : 0;
+	unsigned int mode;
+
+	if (get_user(mode, p + 1))
+		return -EFAULT;
+
+	vesa_blank_mode = (mode < 4) ? mode : 0;
+	return 0;
 }
 
 void do_blank_screen(int entering_gfx)
-- 
cgit v0.10.2


From 3e29fe837a512229f4757cd93fdd7163d027d103 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:40 -0800
Subject: [PATCH] CISS: require same SCSI module support

Building CCISS SCSI tape support in-kernel when SCSI=m causes build errors,
so require SCSI support to be =y or same as CCISS SCSI tape support.

  drivers/built-in.o: In function `cciss_remove_one':
  cciss.c:(.text+0x79d4c): undefined reference to `scsi_remove_host'
  cciss.c:(.text+0x79d55): undefined reference to `scsi_host_put'
  drivers/built-in.o: In function `cciss_update_non_disk_devices':
  cciss.c:(.text+0x7bb54): undefined reference to `scsi_device_type'
  cciss.c:(.text+0x7bcc8): undefined reference to `scsi_device_type'
  cciss.c:(.text+0x7be81): undefined reference to `scsi_device_type'
  cciss.c:(.text+0x7bf81): undefined reference to `scsi_device_type'
  drivers/built-in.o: In function `cciss_proc_write':
  cciss.c:(.text+0x7c175): undefined reference to `scsi_host_alloc'
  cciss.c:(.text+0x7c1ed): undefined reference to `scsi_add_host'
  cciss.c:(.text+0x7c1f9): undefined reference to `scsi_scan_host'
  cciss.c:(.text+0x7c206): undefined reference to `scsi_host_put'
  make: *** [.tmp_vmlinux1] Error 1

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Mike Miller <mike.miller@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 17dc222..432c17e 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -168,7 +168,8 @@ config BLK_CPQ_CISS_DA
 
 config CISS_SCSI_TAPE
 	bool "SCSI tape drive support for Smart Array 5xxx"
-	depends on BLK_CPQ_CISS_DA && SCSI && PROC_FS
+	depends on BLK_CPQ_CISS_DA && PROC_FS
+	depends on SCSI=y || SCSI=BLK_CPQ_CISS_DA
 	help
 	  When enabled (Y), this option allows SCSI tape drives and SCSI medium
 	  changers (tape robots) to be accessed via a Compaq 5xxx array 
-- 
cgit v0.10.2


From 0bf93226073db0c957664592ba2f1aaeef57bb82 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:41 -0800
Subject: [PATCH] export toshiba SMM support for neofb module

When CONFIG_TOSHIBA=y and CONFIG_FB_NEOMAGIC=m, tosh_smm() needs
to be exported for neofb to use it.

  WARNING: "tosh_smm" [drivers/video/neofb.ko] undefined!
  make[1]: *** [__modpost] Error 1
  make: *** [modules] Error 2

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/toshiba.c b/drivers/char/toshiba.c
index dd36fd0..07067c3 100644
--- a/drivers/char/toshiba.c
+++ b/drivers/char/toshiba.c
@@ -249,6 +249,7 @@ int tosh_smm(SMMRegisters *regs)
 
 	return eax;
 }
+EXPORT_SYMBOL(tosh_smm);
 
 
 static int tosh_ioctl(struct inode *ip, struct file *fp, unsigned int cmd,
-- 
cgit v0.10.2


From da39aa8fbc031029b1f06f1abf2a933e1042fe99 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:42 -0800
Subject: [PATCH] kernel-doc: add fusion and i2o to kernel-api book

Add Fusion and I2O message-based device interfaces to kernel-api book.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index a166675..fc77c99 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -423,6 +423,37 @@ X!Edrivers/pnp/system.c
 !Edrivers/media/video/videodev.c
   </chapter>
 
+  <chapter id="message_devices">
+	<title>Message-based devices</title>
+     <sect1><title>Fusion message devices</title>
+!Edrivers/message/fusion/mptbase.c
+!Idrivers/message/fusion/mptbase.c
+!Edrivers/message/fusion/mptscsih.c
+!Idrivers/message/fusion/mptscsih.c
+!Idrivers/message/fusion/mptctl.c
+!Idrivers/message/fusion/mptspi.c
+!Idrivers/message/fusion/mptfc.c
+!Idrivers/message/fusion/mptlan.c
+     </sect1>
+     <sect1><title>I2O message devices</title>
+!Iinclude/linux/i2o.h
+!Idrivers/message/i2o/core.h
+!Edrivers/message/i2o/iop.c
+!Idrivers/message/i2o/iop.c
+!Idrivers/message/i2o/config-osm.c
+!Edrivers/message/i2o/exec-osm.c
+!Idrivers/message/i2o/exec-osm.c
+!Idrivers/message/i2o/bus-osm.c
+!Edrivers/message/i2o/device.c
+!Idrivers/message/i2o/device.c
+!Idrivers/message/i2o/driver.c
+!Idrivers/message/i2o/pci.c
+!Idrivers/message/i2o/i2o_block.c
+!Idrivers/message/i2o/i2o_scsi.c
+!Idrivers/message/i2o/i2o_proc.c
+     </sect1>
+  </chapter>
+
   <chapter id="snddev">
      <title>Sound Devices</title>
 !Iinclude/sound/core.h
-- 
cgit v0.10.2


From d9489fb60614794cbca4b6b173c60ed9388974c6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:43 -0800
Subject: [PATCH] kernel-doc: fix fusion and i2o docs

Correct lots of typos, kernel-doc warnings, & kernel-doc usage in fusion and
i2o drivers.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index 051b7c5..6e068cf 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -347,7 +347,7 @@ mpt_reply(MPT_ADAPTER *ioc, u32 pa)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_interrupt - MPT adapter (IOC) specific interrupt handler.
  *	@irq: irq number (not used)
  *	@bus_id: bus identifier cookie == pointer to MPT_ADAPTER structure
@@ -387,14 +387,16 @@ mpt_interrupt(int irq, void *bus_id)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	mpt_base_reply - MPT base driver's callback routine; all base driver
- *	"internal" request/reply processing is routed here.
- *	Currently used for EventNotification and EventAck handling.
+/**
+ *	mpt_base_reply - MPT base driver's callback routine
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@mf: Pointer to original MPT request frame
  *	@reply: Pointer to MPT reply frame (NULL if TurboReply)
  *
+ *	MPT base driver's callback routine; all base driver
+ *	"internal" request/reply processing is routed here.
+ *	Currently used for EventNotification and EventAck handling.
+ *
  *	Returns 1 indicating original alloc'd request frame ptr
  *	should be freed, or 0 if it shouldn't.
  */
@@ -530,7 +532,7 @@ mpt_base_reply(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *reply)
  *	@dclass: Protocol driver's class (%MPT_DRIVER_CLASS enum value)
  *
  *	This routine is called by a protocol-specific driver (SCSI host,
- *	LAN, SCSI target) to register it's reply callback routine.  Each
+ *	LAN, SCSI target) to register its reply callback routine.  Each
  *	protocol-specific driver must do this before it will be able to
  *	use any IOC resources, such as obtaining request frames.
  *
@@ -572,7 +574,7 @@ mpt_register(MPT_CALLBACK cbfunc, MPT_DRIVER_CLASS dclass)
  *	mpt_deregister - Deregister a protocol drivers resources.
  *	@cb_idx: previously registered callback handle
  *
- *	Each protocol-specific driver should call this routine when it's
+ *	Each protocol-specific driver should call this routine when its
  *	module is unloaded.
  */
 void
@@ -617,7 +619,7 @@ mpt_event_register(int cb_idx, MPT_EVHANDLER ev_cbfunc)
  *
  *	Each protocol-specific driver should call this routine
  *	when it does not (or can no longer) handle events,
- *	or when it's module is unloaded.
+ *	or when its module is unloaded.
  */
 void
 mpt_event_deregister(int cb_idx)
@@ -656,7 +658,7 @@ mpt_reset_register(int cb_idx, MPT_RESETHANDLER reset_func)
  *
  *	Each protocol-specific driver should call this routine
  *	when it does not (or can no longer) handle IOC reset handling,
- *	or when it's module is unloaded.
+ *	or when its module is unloaded.
  */
 void
 mpt_reset_deregister(int cb_idx)
@@ -670,6 +672,8 @@ mpt_reset_deregister(int cb_idx)
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
  *	mpt_device_driver_register - Register device driver hooks
+ *	@dd_cbfunc: driver callbacks struct
+ *	@cb_idx: MPT protocol driver index
  */
 int
 mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, int cb_idx)
@@ -696,6 +700,7 @@ mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, int cb_idx)
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
  *	mpt_device_driver_deregister - DeRegister device driver hooks
+ *	@cb_idx: MPT protocol driver index
  */
 void
 mpt_device_driver_deregister(int cb_idx)
@@ -887,8 +892,7 @@ mpt_add_sge(char *pAddr, u32 flagslength, dma_addr_t dma_addr)
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mpt_send_handshake_request - Send MPT request via doorbell
- *	handshake method.
+ *	mpt_send_handshake_request - Send MPT request via doorbell handshake method.
  *	@handle: Handle of registered MPT protocol driver
  *	@ioc: Pointer to MPT adapter structure
  *	@reqBytes: Size of the request in bytes
@@ -981,10 +985,13 @@ mpt_send_handshake_request(int handle, MPT_ADAPTER *ioc, int reqBytes, u32 *req,
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- * mpt_host_page_access_control - provides mechanism for the host
- * driver to control the IOC's Host Page Buffer access.
+ * mpt_host_page_access_control - control the IOC's Host Page Buffer access
  * @ioc: Pointer to MPT adapter structure
  * @access_control_value: define bits below
+ * @sleepFlag: Specifies whether the process can sleep
+ *
+ * Provides mechanism for the host driver to control the IOC's
+ * Host Page Buffer access.
  *
  * Access Control Value - bits[15:12]
  * 0h Reserved
@@ -1022,10 +1029,10 @@ mpt_host_page_access_control(MPT_ADAPTER *ioc, u8 access_control_value, int slee
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
  *	mpt_host_page_alloc - allocate system memory for the fw
- *	If we already allocated memory in past, then resend the same pointer.
- *	ioc@: Pointer to pointer to IOC adapter
- *	ioc_init@: Pointer to ioc init config page
+ *	@ioc: Pointer to pointer to IOC adapter
+ *	@ioc_init: Pointer to ioc init config page
  *
+ *	If we already allocated memory in past, then resend the same pointer.
  *	Returns 0 for success, non-zero for failure.
  */
 static int
@@ -1091,12 +1098,15 @@ return 0;
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mpt_verify_adapter - Given a unique IOC identifier, set pointer to
- *	the associated MPT adapter structure.
+ *	mpt_verify_adapter - Given IOC identifier, set pointer to its adapter structure.
  *	@iocid: IOC unique identifier (integer)
  *	@iocpp: Pointer to pointer to IOC adapter
  *
- *	Returns iocid and sets iocpp.
+ *	Given a unique IOC identifier, set pointer to the associated MPT
+ *	adapter structure.
+ *
+ *	Returns iocid and sets iocpp if iocid is found.
+ *	Returns -1 if iocid is not found.
  */
 int
 mpt_verify_adapter(int iocid, MPT_ADAPTER **iocpp)
@@ -1115,9 +1125,10 @@ mpt_verify_adapter(int iocid, MPT_ADAPTER **iocpp)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_attach - Install a PCI intelligent MPT adapter.
  *	@pdev: Pointer to pci_dev structure
+ *	@id: PCI device ID information
  *
  *	This routine performs all the steps necessary to bring the IOC of
  *	a MPT adapter to a OPERATIONAL state.  This includes registering
@@ -1417,10 +1428,9 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_detach - Remove a PCI intelligent MPT adapter.
  *	@pdev: Pointer to pci_dev structure
- *
  */
 
 void
@@ -1466,10 +1476,10 @@ mpt_detach(struct pci_dev *pdev)
  */
 #ifdef CONFIG_PM
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_suspend - Fusion MPT base driver suspend routine.
- *
- *
+ *	@pdev: Pointer to pci_dev structure
+ *	@state: new state to enter
  */
 int
 mpt_suspend(struct pci_dev *pdev, pm_message_t state)
@@ -1505,10 +1515,9 @@ mpt_suspend(struct pci_dev *pdev, pm_message_t state)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_resume - Fusion MPT base driver resume routine.
- *
- *
+ *	@pdev: Pointer to pci_dev structure
  */
 int
 mpt_resume(struct pci_dev *pdev)
@@ -1566,7 +1575,7 @@ mpt_signal_reset(int index, MPT_ADAPTER *ioc, int reset_phase)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_do_ioc_recovery - Initialize or recover MPT adapter.
  *	@ioc: Pointer to MPT adapter structure
  *	@reason: Event word / reason
@@ -1892,13 +1901,15 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	mpt_detect_bound_ports - Search for PCI bus/dev_function
- *	which matches PCI bus/dev_function (+/-1) for newly discovered 929,
- *	929X, 1030 or 1035.
+/**
+ *	mpt_detect_bound_ports - Search for matching PCI bus/dev_function
  *	@ioc: Pointer to MPT adapter structure
  *	@pdev: Pointer to (struct pci_dev) structure
  *
+ *	Search for PCI bus/dev_function which matches
+ *	PCI bus/dev_function (+/-1) for newly discovered 929,
+ *	929X, 1030 or 1035.
+ *
  *	If match on PCI dev_function +/-1 is found, bind the two MPT adapters
  *	using alt_ioc pointer fields in their %MPT_ADAPTER structures.
  */
@@ -1945,9 +1956,9 @@ mpt_detect_bound_ports(MPT_ADAPTER *ioc, struct pci_dev *pdev)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_adapter_disable - Disable misbehaving MPT adapter.
- *	@this: Pointer to MPT adapter structure
+ *	@ioc: Pointer to MPT adapter structure
  */
 static void
 mpt_adapter_disable(MPT_ADAPTER *ioc)
@@ -2046,9 +2057,8 @@ mpt_adapter_disable(MPT_ADAPTER *ioc)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	mpt_adapter_dispose - Free all resources associated with a MPT
- *	adapter.
+/**
+ *	mpt_adapter_dispose - Free all resources associated with an MPT adapter
  *	@ioc: Pointer to MPT adapter structure
  *
  *	This routine unregisters h/w resources and frees all alloc'd memory
@@ -2099,8 +2109,8 @@ mpt_adapter_dispose(MPT_ADAPTER *ioc)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	MptDisplayIocCapabilities - Disply IOC's capacilities.
+/**
+ *	MptDisplayIocCapabilities - Disply IOC's capabilities.
  *	@ioc: Pointer to MPT adapter structure
  */
 static void
@@ -2142,7 +2152,7 @@ MptDisplayIocCapabilities(MPT_ADAPTER *ioc)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	MakeIocReady - Get IOC to a READY state, using KickStart if needed.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@force: Force hard KickStart of IOC
@@ -2279,7 +2289,7 @@ MakeIocReady(MPT_ADAPTER *ioc, int force, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_GetIocState - Get the current state of a MPT adapter.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@cooked: Request raw or cooked IOC state
@@ -2304,7 +2314,7 @@ mpt_GetIocState(MPT_ADAPTER *ioc, int cooked)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	GetIocFacts - Send IOCFacts request to MPT adapter.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@sleepFlag: Specifies whether the process can sleep
@@ -2478,7 +2488,7 @@ GetIocFacts(MPT_ADAPTER *ioc, int sleepFlag, int reason)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	GetPortFacts - Send PortFacts request to MPT adapter.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@portnum: Port number
@@ -2545,7 +2555,7 @@ GetPortFacts(MPT_ADAPTER *ioc, int portnum, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	SendIocInit - Send IOCInit request to MPT adapter.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@sleepFlag: Specifies whether the process can sleep
@@ -2630,7 +2640,7 @@ SendIocInit(MPT_ADAPTER *ioc, int sleepFlag)
 	}
 
 	/* No need to byte swap the multibyte fields in the reply
-	 * since we don't even look at it's contents.
+	 * since we don't even look at its contents.
 	 */
 
 	dhsprintk((MYIOC_s_INFO_FMT "Sending PortEnable (req @ %p)\n",
@@ -2672,7 +2682,7 @@ SendIocInit(MPT_ADAPTER *ioc, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	SendPortEnable - Send PortEnable request to MPT adapter port.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@portnum: Port number to enable
@@ -2723,9 +2733,13 @@ SendPortEnable(MPT_ADAPTER *ioc, int portnum, int sleepFlag)
 	return rc;
 }
 
-/*
- *	ioc: Pointer to MPT_ADAPTER structure
- *      size - total FW bytes
+/**
+ *	mpt_alloc_fw_memory - allocate firmware memory
+ *	@ioc: Pointer to MPT_ADAPTER structure
+ *      @size: total FW bytes
+ *
+ *	If memory has already been allocated, the same (cached) value
+ *	is returned.
  */
 void
 mpt_alloc_fw_memory(MPT_ADAPTER *ioc, int size)
@@ -2742,9 +2756,12 @@ mpt_alloc_fw_memory(MPT_ADAPTER *ioc, int size)
 			ioc->alloc_total += size;
 	}
 }
-/*
- * If alt_img is NULL, delete from ioc structure.
- * Else, delete a secondary image in same format.
+/**
+ *	mpt_free_fw_memory - free firmware memory
+ *	@ioc: Pointer to MPT_ADAPTER structure
+ *
+ *	If alt_img is NULL, delete from ioc structure.
+ *	Else, delete a secondary image in same format.
  */
 void
 mpt_free_fw_memory(MPT_ADAPTER *ioc)
@@ -2763,7 +2780,7 @@ mpt_free_fw_memory(MPT_ADAPTER *ioc)
 
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_do_upload - Construct and Send FWUpload request to MPT adapter port.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@sleepFlag: Specifies whether the process can sleep
@@ -2865,10 +2882,10 @@ mpt_do_upload(MPT_ADAPTER *ioc, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_downloadboot - DownloadBoot code
  *	@ioc: Pointer to MPT_ADAPTER structure
- *	@flag: Specify which part of IOC memory is to be uploaded.
+ *	@pFwHeader: Pointer to firmware header info
  *	@sleepFlag: Specifies whether the process can sleep
  *
  *	FwDownloadBoot requires Programmed IO access.
@@ -3071,7 +3088,7 @@ mpt_downloadboot(MPT_ADAPTER *ioc, MpiFwHeader_t *pFwHeader, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	KickStart - Perform hard reset of MPT adapter.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@force: Force hard reset
@@ -3145,12 +3162,12 @@ KickStart(MPT_ADAPTER *ioc, int force, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_diag_reset - Perform hard reset of the adapter.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@ignore: Set if to honor and clear to ignore
  *		the reset history bit
- *	@sleepflag: CAN_SLEEP if called in a non-interrupt thread,
+ *	@sleepFlag: CAN_SLEEP if called in a non-interrupt thread,
  *		else set to NO_SLEEP (use mdelay instead)
  *
  *	This routine places the adapter in diagnostic mode via the
@@ -3436,11 +3453,12 @@ mpt_diag_reset(MPT_ADAPTER *ioc, int ignore, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	SendIocReset - Send IOCReset request to MPT adapter.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@reset_type: reset type, expected values are
  *	%MPI_FUNCTION_IOC_MESSAGE_UNIT_RESET or %MPI_FUNCTION_IO_UNIT_RESET
+ *	@sleepFlag: Specifies whether the process can sleep
  *
  *	Send IOCReset request to the MPT adapter.
  *
@@ -3494,11 +3512,12 @@ SendIocReset(MPT_ADAPTER *ioc, u8 reset_type, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	initChainBuffers - Allocate memory for and initialize
- *	chain buffers, chain buffer control arrays and spinlock.
- *	@hd: Pointer to MPT_SCSI_HOST structure
- *	@init: If set, initialize the spin lock.
+/**
+ *	initChainBuffers - Allocate memory for and initialize chain buffers
+ *	@ioc: Pointer to MPT_ADAPTER structure
+ *
+ *	Allocates memory for and initializes chain buffers,
+ *	chain buffer control arrays and spinlock.
  */
 static int
 initChainBuffers(MPT_ADAPTER *ioc)
@@ -3594,7 +3613,7 @@ initChainBuffers(MPT_ADAPTER *ioc)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	PrimeIocFifos - Initialize IOC request and reply FIFOs.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *
@@ -3891,15 +3910,15 @@ mpt_handshake_req_reply_wait(MPT_ADAPTER *ioc, int reqBytes, u32 *req,
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	WaitForDoorbellAck - Wait for IOC to clear the IOP_DOORBELL_STATUS bit
- *	in it's IntStatus register.
+/**
+ *	WaitForDoorbellAck - Wait for IOC doorbell handshake acknowledge
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@howlong: How long to wait (in seconds)
  *	@sleepFlag: Specifies whether the process can sleep
  *
  *	This routine waits (up to ~2 seconds max) for IOC doorbell
- *	handshake ACKnowledge.
+ *	handshake ACKnowledge, indicated by the IOP_DOORBELL_STATUS
+ *	bit in its IntStatus register being clear.
  *
  *	Returns a negative value on failure, else wait loop count.
  */
@@ -3942,14 +3961,14 @@ WaitForDoorbellAck(MPT_ADAPTER *ioc, int howlong, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	WaitForDoorbellInt - Wait for IOC to set the HIS_DOORBELL_INTERRUPT bit
- *	in it's IntStatus register.
+/**
+ *	WaitForDoorbellInt - Wait for IOC to set its doorbell interrupt bit
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@howlong: How long to wait (in seconds)
  *	@sleepFlag: Specifies whether the process can sleep
  *
- *	This routine waits (up to ~2 seconds max) for IOC doorbell interrupt.
+ *	This routine waits (up to ~2 seconds max) for IOC doorbell interrupt
+ *	(MPI_HIS_DOORBELL_INTERRUPT) to be set in the IntStatus register.
  *
  *	Returns a negative value on failure, else wait loop count.
  */
@@ -3991,8 +4010,8 @@ WaitForDoorbellInt(MPT_ADAPTER *ioc, int howlong, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	WaitForDoorbellReply - Wait for and capture a IOC handshake reply.
+/**
+ *	WaitForDoorbellReply - Wait for and capture an IOC handshake reply.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@howlong: How long to wait (in seconds)
  *	@sleepFlag: Specifies whether the process can sleep
@@ -4077,7 +4096,7 @@ WaitForDoorbellReply(MPT_ADAPTER *ioc, int howlong, int sleepFlag)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	GetLanConfigPages - Fetch LANConfig pages.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *
@@ -4188,12 +4207,9 @@ GetLanConfigPages(MPT_ADAPTER *ioc)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	mptbase_sas_persist_operation - Perform operation on SAS Persitent Table
+/**
+ *	mptbase_sas_persist_operation - Perform operation on SAS Persistent Table
  *	@ioc: Pointer to MPT_ADAPTER structure
- *	@sas_address: 64bit SAS Address for operation.
- *	@target_id: specified target for operation
- *	@bus: specified bus for operation
  *	@persist_opcode: see below
  *
  *	MPI_SAS_OP_CLEAR_NOT_PRESENT - Free all persist TargetID mappings for
@@ -4202,7 +4218,7 @@ GetLanConfigPages(MPT_ADAPTER *ioc)
  *
  *	NOTE: Don't use not this function during interrupt time.
  *
- *	Returns: 0 for success, non-zero error
+ *	Returns 0 for success, non-zero error
  */
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
@@ -4399,7 +4415,7 @@ mptbase_raid_process_event_data(MPT_ADAPTER *ioc,
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	GetIoUnitPage2 - Retrieve BIOS version and boot order information.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *
@@ -4457,7 +4473,8 @@ GetIoUnitPage2(MPT_ADAPTER *ioc)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*	mpt_GetScsiPortSettings - read SCSI Port Page 0 and 2
+/**
+ *	mpt_GetScsiPortSettings - read SCSI Port Page 0 and 2
  *	@ioc: Pointer to a Adapter Strucutre
  *	@portnum: IOC port number
  *
@@ -4644,7 +4661,8 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*	mpt_readScsiDevicePageHeaders - save version and length of SDP1
+/**
+ *	mpt_readScsiDevicePageHeaders - save version and length of SDP1
  *	@ioc: Pointer to a Adapter Strucutre
  *	@portnum: IOC port number
  *
@@ -4996,9 +5014,8 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	SendEventNotification - Send EventNotification (on or off) request
- *	to MPT adapter.
+/**
+ *	SendEventNotification - Send EventNotification (on or off) request to adapter
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@EvSwitch: Event switch flags
  */
@@ -5062,8 +5079,8 @@ SendEventAck(MPT_ADAPTER *ioc, EventNotificationReply_t *evnp)
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
  *	mpt_config - Generic function to issue config message
- *	@ioc - Pointer to an adapter structure
- *	@cfg - Pointer to a configuration structure. Struct contains
+ *	@ioc:   Pointer to an adapter structure
+ *	@pCfg:  Pointer to a configuration structure. Struct contains
  *		action, page address, direction, physical address
  *		and pointer to a configuration page header
  *		Page header is updated.
@@ -5188,8 +5205,8 @@ mpt_config(MPT_ADAPTER *ioc, CONFIGPARMS *pCfg)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	mpt_timer_expired - Call back for timer process.
+/**
+ *	mpt_timer_expired - Callback for timer process.
  *	Used only internal config functionality.
  *	@data: Pointer to MPT_SCSI_HOST recast as an unsigned long
  */
@@ -5214,12 +5231,12 @@ mpt_timer_expired(unsigned long data)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_ioc_reset - Base cleanup for hard reset
  *	@ioc: Pointer to the adapter structure
  *	@reset_phase: Indicates pre- or post-reset functionality
  *
- *	Remark: Free's resources with internally generated commands.
+ *	Remark: Frees resources with internally generated commands.
  */
 static int
 mpt_ioc_reset(MPT_ADAPTER *ioc, int reset_phase)
@@ -5271,7 +5288,7 @@ mpt_ioc_reset(MPT_ADAPTER *ioc, int reset_phase)
  *	procfs (%MPT_PROCFS_MPTBASEDIR/...) support stuff...
  */
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	procmpt_create - Create %MPT_PROCFS_MPTBASEDIR entries.
  *
  *	Returns 0 for success, non-zero for failure.
@@ -5297,7 +5314,7 @@ procmpt_create(void)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	procmpt_destroy - Tear down %MPT_PROCFS_MPTBASEDIR entries.
  *
  *	Returns 0 for success, non-zero for failure.
@@ -5311,16 +5328,16 @@ procmpt_destroy(void)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	procmpt_summary_read - Handle read request from /proc/mpt/summary
- *	or from /proc/mpt/iocN/summary.
+/**
+ *	procmpt_summary_read - Handle read request of a summary file
  *	@buf: Pointer to area to write information
  *	@start: Pointer to start pointer
  *	@offset: Offset to start writing
- *	@request:
+ *	@request: Amount of read data requested
  *	@eof: Pointer to EOF integer
  *	@data: Pointer
  *
+ *	Handles read request from /proc/mpt/summary or /proc/mpt/iocN/summary.
  *	Returns number of characters written to process performing the read.
  */
 static int
@@ -5355,12 +5372,12 @@ procmpt_summary_read(char *buf, char **start, off_t offset, int request, int *eo
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	procmpt_version_read - Handle read request from /proc/mpt/version.
  *	@buf: Pointer to area to write information
  *	@start: Pointer to start pointer
  *	@offset: Offset to start writing
- *	@request:
+ *	@request: Amount of read data requested
  *	@eof: Pointer to EOF integer
  *	@data: Pointer
  *
@@ -5411,12 +5428,12 @@ procmpt_version_read(char *buf, char **start, off_t offset, int request, int *eo
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	procmpt_iocinfo_read - Handle read request from /proc/mpt/iocN/info.
  *	@buf: Pointer to area to write information
  *	@start: Pointer to start pointer
  *	@offset: Offset to start writing
- *	@request:
+ *	@request: Amount of read data requested
  *	@eof: Pointer to EOF integer
  *	@data: Pointer
  *
@@ -5577,16 +5594,17 @@ mpt_print_ioc_summary(MPT_ADAPTER *ioc, char *buffer, int *size, int len, int sh
  */
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mpt_HardResetHandler - Generic reset handler, issue SCSI Task
- *	Management call based on input arg values.  If TaskMgmt fails,
- *	return associated SCSI request.
+ *	mpt_HardResetHandler - Generic reset handler
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@sleepFlag: Indicates if sleep or schedule must be called.
  *
+ *	Issues SCSI Task Management call based on input arg values.
+ *	If TaskMgmt fails, returns associated SCSI request.
+ *
  *	Remark: _HardResetHandler can be invoked from an interrupt thread (timer)
  *	or a non-interrupt thread.  In the former, must not call schedule().
  *
- *	Remark: A return of -1 is a FATAL error case, as it means a
+ *	Note: A return of -1 is a FATAL error case, as it means a
  *	FW reload/initialization failed.
  *
  *	Returns 0 for SUCCESS or -1 if FAILED.
@@ -5935,13 +5953,14 @@ EventDescriptionStr(u8 event, u32 evData0, char *evStr)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
- *	ProcessEventNotification - Route a received EventNotificationReply to
- *	all currently regeistered event handlers.
+/**
+ *	ProcessEventNotification - Route EventNotificationReply to all event handlers
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@pEventReply: Pointer to EventNotification reply frame
  *	@evHandlers: Pointer to integer, number of event handlers
  *
+ *	Routes a received EventNotificationReply to all currently registered
+ *	event handlers.
  *	Returns sum of event handlers return values.
  */
 static int
@@ -6056,7 +6075,7 @@ ProcessEventNotification(MPT_ADAPTER *ioc, EventNotificationReply_t *pEventReply
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_fc_log_info - Log information returned from Fibre Channel IOC.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@log_info: U32 LogInfo reply word from the IOC
@@ -6077,7 +6096,7 @@ mpt_fc_log_info(MPT_ADAPTER *ioc, u32 log_info)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_spi_log_info - Log information returned from SCSI Parallel IOC.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@mr: Pointer to MPT reply frame
@@ -6200,7 +6219,7 @@ mpt_spi_log_info(MPT_ADAPTER *ioc, u32 log_info)
 	};
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_sas_log_info - Log information returned from SAS IOC.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@log_info: U32 LogInfo reply word from the IOC
@@ -6255,7 +6274,7 @@ union loginfo_type {
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	mpt_sp_ioc_info - IOC information returned from SCSI Parallel IOC.
  *	@ioc: Pointer to MPT_ADAPTER structure
  *	@ioc_status: U32 IOCStatus word from IOC
@@ -6416,7 +6435,7 @@ EXPORT_SYMBOL(mpt_free_fw_memory);
 EXPORT_SYMBOL(mptbase_sas_persist_operation);
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	fusion_init - Fusion MPT base driver initialization routine.
  *
  *	Returns 0 for success, non-zero for failure.
@@ -6456,7 +6475,7 @@ fusion_init(void)
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-/*
+/**
  *	fusion_exit - Perform driver unload cleanup.
  *
  *	This routine frees all resources associated with each MPT adapter
diff --git a/drivers/message/fusion/mptfc.c b/drivers/message/fusion/mptfc.c
index ef2b55e..ca2f910 100644
--- a/drivers/message/fusion/mptfc.c
+++ b/drivers/message/fusion/mptfc.c
@@ -1395,8 +1395,7 @@ mptfc_ioc_reset(MPT_ADAPTER *ioc, int reset_phase)
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mptfc_init - Register MPT adapter(s) as SCSI host(s) with
- *	linux scsi mid-layer.
+ *	mptfc_init - Register MPT adapter(s) as SCSI host(s) with SCSI mid-layer.
  *
  *	Returns 0 for success, non-zero for failure.
  */
@@ -1440,7 +1439,7 @@ mptfc_init(void)
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mptfc_remove - Removed fc infrastructure for devices
+ *	mptfc_remove - Remove fc infrastructure for devices
  *	@pdev: Pointer to pci_dev structure
  *
  */
diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c
index 30524dc..2c72c36 100644
--- a/drivers/message/fusion/mptscsih.c
+++ b/drivers/message/fusion/mptscsih.c
@@ -1230,15 +1230,15 @@ mptscsih_host_info(MPT_ADAPTER *ioc, char *pbuf, off_t offset, int len)
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
  *	mptscsih_proc_info - Return information about MPT adapter
+ * 	@host:   scsi host struct
+ * 	@buffer: if write, user data; if read, buffer for user
+ *	@start: returns the buffer address
+ * 	@offset: if write, 0; if read, the current offset into the buffer from
+ * 		 the previous read.
+ * 	@length: if write, return length;
+ *	@func:   write = 1; read = 0
  *
  *	(linux scsi_host_template.info routine)
- *
- * 	buffer: if write, user data; if read, buffer for user
- * 	length: if write, return length;
- * 	offset: if write, 0; if read, the current offset into the buffer from
- * 		the previous read.
- * 	hostno: scsi host number
- *	func:   if write = 1; if read = 0
  */
 int
 mptscsih_proc_info(struct Scsi_Host *host, char *buffer, char **start, off_t offset,
@@ -1902,8 +1902,7 @@ mptscsih_bus_reset(struct scsi_cmnd * SCpnt)
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mptscsih_host_reset - Perform a SCSI host adapter RESET!
- *	new_eh variant
+ *	mptscsih_host_reset - Perform a SCSI host adapter RESET (new_eh variant)
  *	@SCpnt: Pointer to scsi_cmnd structure, IO which reset is due to
  *
  *	(linux scsi_host_template.eh_host_reset_handler routine)
@@ -1949,8 +1948,7 @@ mptscsih_host_reset(struct scsi_cmnd *SCpnt)
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mptscsih_tm_pending_wait - wait for pending task management request to
- *		complete.
+ *	mptscsih_tm_pending_wait - wait for pending task management request to complete
  *	@hd: Pointer to MPT host structure.
  *
  *	Returns {SUCCESS,FAILED}.
@@ -1982,6 +1980,7 @@ mptscsih_tm_pending_wait(MPT_SCSI_HOST * hd)
 /**
  *	mptscsih_tm_wait_for_completion - wait for completion of TM task
  *	@hd: Pointer to MPT host structure.
+ *	@timeout: timeout in seconds
  *
  *	Returns {SUCCESS,FAILED}.
  */
@@ -3429,8 +3428,7 @@ mptscsih_do_cmd(MPT_SCSI_HOST *hd, INTERNAL_CMD *io)
 /**
  *	mptscsih_synchronize_cache - Send SYNCHRONIZE_CACHE to all disks.
  *	@hd: Pointer to a SCSI HOST structure
- *	@vtarget: per device private data
- *	@lun: lun
+ *	@vdevice: virtual target device
  *
  *	Uses the ISR, but with special processing.
  *	MUST be single-threaded.
diff --git a/drivers/message/fusion/mptspi.c b/drivers/message/fusion/mptspi.c
index f422c0d..36641da 100644
--- a/drivers/message/fusion/mptspi.c
+++ b/drivers/message/fusion/mptspi.c
@@ -1100,8 +1100,7 @@ static struct pci_driver mptspi_driver = {
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
- *	mptspi_init - Register MPT adapter(s) as SCSI host(s) with
- *	linux scsi mid-layer.
+ *	mptspi_init - Register MPT adapter(s) as SCSI host(s) with SCSI mid-layer.
  *
  *	Returns 0 for success, non-zero for failure.
  */
@@ -1135,7 +1134,6 @@ mptspi_init(void)
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /**
  *	mptspi_exit - Unregisters MPT adapter(s)
- *
  */
 static void __exit
 mptspi_exit(void)
diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
index d96c687..c463dc2 100644
--- a/drivers/message/i2o/bus-osm.c
+++ b/drivers/message/i2o/bus-osm.c
@@ -56,6 +56,9 @@ static int i2o_bus_scan(struct i2o_device *dev)
 /**
  *	i2o_bus_store_scan - Scan the I2O Bus Adapter
  *	@d: device which should be scanned
+ *	@attr: device_attribute
+ *	@buf: output buffer
+ *	@count: buffer size
  *
  *	Returns count.
  */
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 55757af..b9df143 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -54,8 +54,8 @@ static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
  *	@dev: I2O device to claim
  *	@drv: I2O driver which wants to claim the device
  *
- *	Do the leg work to assign a device to a given OSM. If the claim succeed
- *	the owner of the rimary. If the attempt fails a negative errno code
+ *	Do the leg work to assign a device to a given OSM. If the claim succeeds,
+ *	the owner is the primary. If the attempt fails a negative errno code
  *	is returned. On success zero is returned.
  */
 int i2o_device_claim(struct i2o_device *dev)
@@ -208,7 +208,7 @@ static struct i2o_device *i2o_device_alloc(void)
 
 /**
  *	i2o_device_add - allocate a new I2O device and add it to the IOP
- *	@iop: I2O controller where the device is on
+ *	@c: I2O controller that the device is on
  *	@entry: LCT entry of the I2O device
  *
  *	Allocate a new I2O device and initialize it with the LCT entry. The
@@ -280,7 +280,7 @@ err:
 
 /**
  *	i2o_device_remove - remove an I2O device from the I2O core
- *	@dev: I2O device which should be released
+ *	@i2o_dev: I2O device which should be released
  *
  *	Is used on I2O controller removal or LCT modification, when the device
  *	is removed from the system. Note that the device could still hang
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 7fc7399..9104b65 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -34,9 +34,7 @@ static spinlock_t i2o_drivers_lock;
 static struct i2o_driver **i2o_drivers;
 
 /**
- *	i2o_bus_match - Tell if a I2O device class id match the class ids of
- *			the I2O driver (OSM)
- *
+ *	i2o_bus_match - Tell if I2O device class id matches the class ids of the I2O driver (OSM)
  *	@dev: device which should be verified
  *	@drv: the driver to match against
  *
@@ -248,7 +246,7 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 
 /**
  *	i2o_driver_notify_controller_add_all - Send notify of added controller
- *					       to all I2O drivers
+ *	@c: newly added controller
  *
  *	Send notifications to all registered drivers that a new controller was
  *	added.
@@ -267,8 +265,8 @@ void i2o_driver_notify_controller_add_all(struct i2o_controller *c)
 }
 
 /**
- *	i2o_driver_notify_controller_remove_all - Send notify of removed
- *						  controller to all I2O drivers
+ *	i2o_driver_notify_controller_remove_all - Send notify of removed controller
+ *	@c: controller that is being removed
  *
  *	Send notifications to all registered drivers that a controller was
  *	removed.
@@ -287,8 +285,8 @@ void i2o_driver_notify_controller_remove_all(struct i2o_controller *c)
 }
 
 /**
- *	i2o_driver_notify_device_add_all - Send notify of added device to all
- *					   I2O drivers
+ *	i2o_driver_notify_device_add_all - Send notify of added device
+ *	@i2o_dev: newly added I2O device
  *
  *	Send notifications to all registered drivers that a device was added.
  */
@@ -306,8 +304,8 @@ void i2o_driver_notify_device_add_all(struct i2o_device *i2o_dev)
 }
 
 /**
- *	i2o_driver_notify_device_remove_all - Send notify of removed device to
- *					      all I2O drivers
+ *	i2o_driver_notify_device_remove_all - Send notify of removed device
+ *	@i2o_dev: device that is being removed
  *
  *	Send notifications to all registered drivers that a device was removed.
  */
@@ -362,7 +360,7 @@ int __init i2o_driver_init(void)
 /**
  *	i2o_driver_exit - clean up I2O drivers (OSMs)
  *
- *	Unregisters the I2O bus and free driver array.
+ *	Unregisters the I2O bus and frees driver array.
  */
 void __exit i2o_driver_exit(void)
 {
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 9e529d8..902753b 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -94,8 +94,8 @@ static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
 };
 
 /**
- *	i2o_exec_wait_free - Free a i2o_exec_wait struct
- *	@i2o_exec_wait: I2O wait data which should be cleaned up
+ *	i2o_exec_wait_free - Free an i2o_exec_wait struct
+ *	@wait: I2O wait data which should be cleaned up
  */
 static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
 {
@@ -105,7 +105,7 @@ static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
 /**
  * 	i2o_msg_post_wait_mem - Post and wait a message with DMA buffers
  *	@c: controller
- *	@m: message to post
+ *	@msg: message to post
  *	@timeout: time in seconds to wait
  *	@dma: i2o_dma struct of the DMA buffer to free on failure
  *
@@ -269,6 +269,7 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 /**
  *	i2o_exec_show_vendor_id - Displays Vendor ID of controller
  *	@d: device of which the Vendor ID should be displayed
+ *	@attr: device_attribute to display
  *	@buf: buffer into which the Vendor ID should be printed
  *
  *	Returns number of bytes printed into buffer.
@@ -290,6 +291,7 @@ static ssize_t i2o_exec_show_vendor_id(struct device *d,
 /**
  *	i2o_exec_show_product_id - Displays Product ID of controller
  *	@d: device of which the Product ID should be displayed
+ *	@attr: device_attribute to display
  *	@buf: buffer into which the Product ID should be printed
  *
  *	Returns number of bytes printed into buffer.
@@ -365,7 +367,7 @@ static int i2o_exec_remove(struct device *dev)
 
 /**
  *	i2o_exec_lct_modified - Called on LCT NOTIFY reply
- *	@c: I2O controller on which the LCT has modified
+ *	@work: work struct for a specific controller
  *
  *	This function handles asynchronus LCT NOTIFY replies. It parses the
  *	new LCT and if the buffer for the LCT was to small sends a LCT NOTIFY
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 70ae002..da9859f 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -259,7 +259,7 @@ static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id)
 /**
  *	i2o_block_device_power - Power management for device dev
  *	@dev: I2O device which should receive the power management request
- *	@operation: Operation which should be send
+ *	@op: Operation to send
  *
  *	Send a power management request to the device dev.
  *
@@ -315,7 +315,7 @@ static inline struct i2o_block_request *i2o_block_request_alloc(void)
  *	i2o_block_request_free - Frees a I2O block request
  *	@ireq: I2O block request which should be freed
  *
- *	Fres the allocated memory (give it back to the request mempool).
+ *	Frees the allocated memory (give it back to the request mempool).
  */
 static inline void i2o_block_request_free(struct i2o_block_request *ireq)
 {
@@ -326,6 +326,7 @@ static inline void i2o_block_request_free(struct i2o_block_request *ireq)
  *	i2o_block_sglist_alloc - Allocate the SG list and map it
  *	@c: I2O controller to which the request belongs
  *	@ireq: I2O block request
+ *	@mptr: message body pointer
  *
  *	Builds the SG list and map it to be accessable by the controller.
  *
@@ -490,7 +491,7 @@ static void i2o_block_end_request(struct request *req, int uptodate,
  *	i2o_block_reply - Block OSM reply handler.
  *	@c: I2O controller from which the message arrives
  *	@m: message id of reply
- *	qmsg: the actuall I2O message reply
+ *	@msg: the actual I2O message reply
  *
  *	This function gets all the message replies.
  *
@@ -602,6 +603,8 @@ static void i2o_block_biosparam(unsigned long capacity, unsigned short *cyls,
 
 /**
  *	i2o_block_open - Open the block device
+ *	@inode: inode for block device being opened
+ *	@file: file to open
  *
  *	Power up the device, mount and lock the media. This function is called,
  *	if the block device is opened for access.
@@ -629,6 +632,8 @@ static int i2o_block_open(struct inode *inode, struct file *file)
 
 /**
  *	i2o_block_release - Release the I2O block device
+ *	@inode: inode for block device being released
+ *	@file: file to close
  *
  *	Unlock and unmount the media, and power down the device. Gets called if
  *	the block device is closed.
@@ -675,6 +680,8 @@ static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 
 /**
  *	i2o_block_ioctl - Issue device specific ioctl calls.
+ *	@inode: inode for block device ioctl
+ *	@file: file for ioctl
  *	@cmd: ioctl command
  *	@arg: arg
  *
@@ -902,7 +909,7 @@ static int i2o_block_transfer(struct request *req)
 
 /**
  *	i2o_block_request_fn - request queue handling function
- *	q: request queue from which the request could be fetched
+ *	@q: request queue from which the request could be fetched
  *
  *	Takes the next request from the queue, transfers it and if no error
  *	occurs dequeue it from the queue. On arrival of the reply the message
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index 3d2e76e..a61cb17 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -163,7 +163,7 @@ static int print_serial_number(struct seq_file *seq, u8 * serialno, int max_len)
  *	i2o_get_class_name - 	do i2o class name lookup
  *	@class: class number
  *
- *	Return a descriptive string for an i2o class
+ *	Return a descriptive string for an i2o class.
  */
 static const char *i2o_get_class_name(int class)
 {
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index d5f93b1..1045c8a 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -411,8 +411,7 @@ static void i2o_scsi_notify_device_add(struct i2o_device *i2o_dev)
 };
 
 /**
- *	i2o_scsi_notify_device_remove - Retrieve notifications of removed
- *				        devices
+ *	i2o_scsi_notify_device_remove - Retrieve notifications of removed devices
  *	@i2o_dev: the I2O device which was removed
  *
  *	If a I2O device is removed, we catch the notification to remove the
@@ -432,8 +431,7 @@ static void i2o_scsi_notify_device_remove(struct i2o_device *i2o_dev)
 };
 
 /**
- *	i2o_scsi_notify_controller_add - Retrieve notifications of added
- *					 controllers
+ *	i2o_scsi_notify_controller_add - Retrieve notifications of added controllers
  *	@c: the controller which was added
  *
  *	If a I2O controller is added, we catch the notification to add a
@@ -463,8 +461,7 @@ static void i2o_scsi_notify_controller_add(struct i2o_controller *c)
 };
 
 /**
- *	i2o_scsi_notify_controller_remove - Retrieve notifications of removed
- *					    controllers
+ *	i2o_scsi_notify_controller_remove - Retrieve notifications of removed controllers
  *	@c: the controller which was removed
  *
  *	If a I2O controller is removed, we catch the notification to remove the
@@ -751,7 +748,7 @@ static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
  *	@capacity: size in sectors
  *	@ip: geometry array
  *
- *	This is anyones guess quite frankly. We use the same rules everyone
+ *	This is anyone's guess quite frankly. We use the same rules everyone
  *	else appears to and hope. It seems to work.
  */
 
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index 8287f95..3661e6e 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -259,6 +259,7 @@ static irqreturn_t i2o_pci_interrupt(int irq, void *dev_id)
 
 /**
  *	i2o_pci_irq_enable - Allocate interrupt for I2O controller
+ *	@c: i2o_controller that the request is for
  *
  *	Allocate an interrupt for the I2O controller, and activate interrupts
  *	on the I2O controller.
@@ -305,7 +306,7 @@ static void i2o_pci_irq_disable(struct i2o_controller *c)
 
 /**
  *	i2o_pci_probe - Probe the PCI device for an I2O controller
- *	@dev: PCI device to test
+ *	@pdev: PCI device to test
  *	@id: id which matched with the PCI device id table
  *
  *	Probe the PCI device for any device which is a memory of the
@@ -447,7 +448,7 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 
 /**
  *	i2o_pci_remove - Removes a I2O controller from the system
- *	pdev: I2O controller which should be removed
+ *	@pdev: I2O controller which should be removed
  *
  *	Reset the I2O controller, disable interrupts and remove all allocated
  *	resources.
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 2514f4e..52f53e2 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -986,7 +986,8 @@ extern void i2o_driver_unregister(struct i2o_driver *);
 
 /**
  *	i2o_driver_notify_controller_add - Send notification of added controller
- *					   to a single I2O driver
+ *	@drv: I2O driver
+ *	@c: I2O controller
  *
  *	Send notification of added controller to a single registered driver.
  */
@@ -998,8 +999,9 @@ static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
 };
 
 /**
- *	i2o_driver_notify_controller_remove - Send notification of removed
- *					      controller to a single I2O driver
+ *	i2o_driver_notify_controller_remove - Send notification of removed controller
+ *	@drv: I2O driver
+ *	@c: I2O controller
  *
  *	Send notification of removed controller to a single registered driver.
  */
@@ -1011,8 +1013,9 @@ static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
 };
 
 /**
- *	i2o_driver_notify_device_add - Send notification of added device to a
- *				       single I2O driver
+ *	i2o_driver_notify_device_add - Send notification of added device
+ *	@drv: I2O driver
+ *	@i2o_dev: the added i2o_device
  *
  *	Send notification of added device to a single registered driver.
  */
@@ -1025,7 +1028,8 @@ static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
 
 /**
  *	i2o_driver_notify_device_remove - Send notification of removed device
- *					  to a single I2O driver
+ *	@drv: I2O driver
+ *	@i2o_dev: the added i2o_device
  *
  *	Send notification of removed device to a single registered driver.
  */
@@ -1148,7 +1152,7 @@ static inline void i2o_msg_post(struct i2o_controller *c,
 /**
  * 	i2o_msg_post_wait - Post and wait a message and wait until return
  *	@c: controller
- *	@m: message to post
+ *	@msg: message to post
  *	@timeout: time in seconds to wait
  *
  * 	This API allows an OSM to post a message and then be told whether or
-- 
cgit v0.10.2


From 9a0efbb8795e8c5acb279d984b53d6af6ecdd00f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:44 -0800
Subject: [PATCH] kernel-api book: remove videodev chapter

Remove the videodev chapter from the kernel-api book.  It's done much better
in the videobook kernel-doc.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index fc77c99..ca09491 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -418,11 +418,6 @@ X!Edrivers/pnp/system.c
 !Idrivers/parport/daisy.c
   </chapter>
 
-  <chapter id="viddev">
-     <title>Video4Linux</title>
-!Edrivers/media/video/videodev.c
-  </chapter>
-
   <chapter id="message_devices">
 	<title>Message-based devices</title>
      <sect1><title>Fusion message devices</title>
-- 
cgit v0.10.2


From 1c69d921ed9cc6593ad4f60c0f9951cb0d62b0b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 6 Dec 2006 20:38:44 -0800
Subject: [PATCH] rcu: add a prefetch() in rcu_do_batch()

On some workloads, (for example when lot of close() syscalls are done), RCU
qlen can be quite large, and RCU heads are no longer in cpu cache when
rcu_do_batch() is called.

This patch adds a prefetch() in rcu_do_batch() to give CPU a hint to bring
back cache lines containing 'struct rcu_head's.

Most list manipulations macros include prefetch(), but not open coded ones
(at least with current C compilers :) )

I got a nice speedup on a trivial benchmark (3.48 us per iteration instead
of 3.95 us on a 1.6 GHz Pentium-M)

while (1) { pipe(p); close(fd[0]); close(fd[1]);}

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ff..3554b76 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
 
 	list = rdp->donelist;
 	while (list) {
-		next = rdp->donelist = list->next;
+		next = list->next;
+		prefetch(next);
 		list->func(list);
 		list = next;
 		if (++count >= rdp->blimit)
 			break;
 	}
+	rdp->donelist = list;
 
 	local_irq_disable();
 	rdp->qlen -= count;
-- 
cgit v0.10.2


From d18de5a2721f84ffd6a5d637915746ed47becc1c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 6 Dec 2006 20:38:45 -0800
Subject: [PATCH] don't insert pipe dentries into dentry_hashtable.

We currently insert pipe dentries into the global dentry hashtable.  This
is suboptimal because there is currently no way these entries can be used
for a lookup().  (/proc/xxx/fd/xxx uses a different mechanism).  Inserting
them in dentry hashtable slows dcache lookups.

To let __dpath() still work correctly (ie not adding a " (deleted)") after
dentry name, we do :

 - Right after d_alloc(), pretend they are hashed by clearing the
   DCACHE_UNHASHED bit.

 - Call d_instantiate() instead of d_add() : dentry is not inserted in
   hash table.

__dpath() & friends work as intended during dentry lifetime.

 - At dismantle time, once dput() must clear the dentry, setting again
   DCACHE_UNHASHED bit inside the custom d_delete() function provided by
   pipe code, so that dput() can just kill_it.

This patch, combined with (avoid RCU for never hashed dentries) reduced
time of { pipe(p); close(p[0]); close(p[1]);} on my UP machine (1.6GHz
Pentium-M) from 3.23 us to 2.86 us (But this patch does not depend on other
patches, only bench results)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/pipe.c b/fs/pipe.c
index b1626f2..ae36b89 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -830,7 +830,14 @@ void free_pipe_info(struct inode *inode)
 static struct vfsmount *pipe_mnt __read_mostly;
 static int pipefs_delete_dentry(struct dentry *dentry)
 {
-	return 1;
+	/*
+	 * At creation time, we pretended this dentry was hashed
+	 * (by clearing DCACHE_UNHASHED bit in d_flags)
+	 * At delete time, we restore the truth : not hashed.
+	 * (so that dput() can proceed correctly)
+	 */
+	dentry->d_flags |= DCACHE_UNHASHED;
+	return 0;
 }
 
 static struct dentry_operations pipefs_dentry_operations = {
@@ -891,17 +898,22 @@ struct file *create_write_pipe(void)
 	if (!inode)
 		goto err_file;
 
-	sprintf(name, "[%lu]", inode->i_ino);
+	this.len = sprintf(name, "[%lu]", inode->i_ino);
 	this.name = name;
-	this.len = strlen(name);
-	this.hash = inode->i_ino; /* will go */
+	this.hash = 0;
 	err = -ENOMEM;
 	dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
 	if (!dentry)
 		goto err_inode;
 
 	dentry->d_op = &pipefs_dentry_operations;
-	d_add(dentry, inode);
+	/*
+	 * We dont want to publish this dentry into global dentry hash table.
+	 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
+	 * This permits a working /proc/$pid/fd/XXX on pipes
+	 */
+	dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(dentry, inode);
 	f->f_vfsmnt = mntget(pipe_mnt);
 	f->f_dentry = dentry;
 	f->f_mapping = inode->i_mapping;
-- 
cgit v0.10.2


From b3423415fbc2e5461605826317da1c8dbbf21f97 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 6 Dec 2006 20:38:48 -0800
Subject: [PATCH] dcache: avoid RCU for never-hashed dentries

Some dentries don't need to be globally visible in dentry hashtable.
(pipes & sockets)

Such dentries dont need to wait for a RCU grace period at delete time.
Being able to free them permits a better CPU cache use (hot cache)

This patch combined with (dont insert pipe dentries into dentry_hashtable)
reduced time of { pipe(p); close(p[0]); close(p[1]);} on my UP machine (1.6
GHz Pentium-M) from 3.23 us to 2.86 us (But this patch does not depend on
other patches, only bench results)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Acked-by: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/dcache.c b/fs/dcache.c
index a7c67ce..d68631f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -68,15 +68,19 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static void d_callback(struct rcu_head *head)
+static void __d_free(struct dentry *dentry)
 {
-	struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
-
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
+static void d_callback(struct rcu_head *head)
+{
+	struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
+	__d_free(dentry);
+}
+
 /*
  * no dcache_lock, please.  The caller must decrement dentry_stat.nr_dentry
  * inside dcache_lock.
@@ -85,7 +89,11 @@ static void d_free(struct dentry *dentry)
 {
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
- 	call_rcu(&dentry->d_u.d_rcu, d_callback);
+	/* if dentry was never inserted into hash, immediate free is OK */
+	if (dentry->d_hash.pprev == NULL)
+		__d_free(dentry);
+	else
+		call_rcu(&dentry->d_u.d_rcu, d_callback);
 }
 
 /*
-- 
cgit v0.10.2


From 304e61e6fbadec586dfe002b535f169a04248e49 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 6 Dec 2006 20:38:49 -0800
Subject: [PATCH] net: don't insert socket dentries into dentry_hashtable

We currently insert socket dentries into the global dentry hashtable.  This
is suboptimal because there is currently no way these entries can be used
for a lookup().  (/proc/xxx/fd/xxx uses a different mechanism).  Inserting
them in dentry hashtable slows dcache lookups.

To let __dpath() still work correctly (ie not adding a " (deleted)") after
dentry name, we do :

- Right after d_alloc(), pretend they are hashed by clearing the
  DCACHE_UNHASHED bit.

- Call d_instantiate() instead of d_add() : dentry is not inserted in
  hash table.

  __dpath() & friends work as intended during dentry lifetime.

- At dismantle time, once dput() must clear the dentry, setting again
  DCACHE_UNHASHED bit inside the custom d_delete() function provided by
  socket code, so that dput() can just kill_it.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/net/socket.c b/net/socket.c
index 43eff48..29ea1de 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -305,7 +305,14 @@ static struct file_system_type sock_fs_type = {
 
 static int sockfs_delete_dentry(struct dentry *dentry)
 {
-	return 1;
+	/*
+	 * At creation time, we pretended this dentry was hashed
+	 * (by clearing DCACHE_UNHASHED bit in d_flags)
+	 * At delete time, we restore the truth : not hashed.
+	 * (so that dput() can proceed correctly)
+	 */
+	dentry->d_flags |= DCACHE_UNHASHED;
+	return 0;
 }
 static struct dentry_operations sockfs_dentry_operations = {
 	.d_delete = sockfs_delete_dentry,
@@ -353,14 +360,20 @@ static int sock_attach_fd(struct socket *sock, struct file *file)
 
 	this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
 	this.name = name;
-	this.hash = SOCK_INODE(sock)->i_ino;
+	this.hash = 0;
 
 	file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
 	if (unlikely(!file->f_dentry))
 		return -ENOMEM;
 
 	file->f_dentry->d_op = &sockfs_dentry_operations;
-	d_add(file->f_dentry, SOCK_INODE(sock));
+	/*
+	 * We dont want to push this dentry into global dentry hash table.
+	 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
+	 * This permits a working /proc/$pid/fd/XXX on sockets
+	 */
+	file->f_dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(file->f_dentry, SOCK_INODE(sock));
 	file->f_vfsmnt = mntget(sock_mnt);
 	file->f_mapping = file->f_dentry->d_inode->i_mapping;
 
-- 
cgit v0.10.2


From 4668edc334ee90cf50c382c3e423cfc510b5a126 Mon Sep 17 00:00:00 2001
From: Burman Yan <yan_952@hotmail.com>
Date: Wed, 6 Dec 2006 20:38:51 -0800
Subject: [PATCH] kernel core: replace kmalloc+memset with kzalloc

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/ipc/sem.c b/ipc/sem.c
index 21b3289..d3e12ef 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1070,14 +1070,13 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid)
 	ipc_rcu_getref(sma);
 	sem_unlock(sma);
 
-	new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
 	if (!new) {
 		ipc_lock_by_ptr(&sma->sem_perm);
 		ipc_rcu_putref(sma);
 		sem_unlock(sma);
 		return ERR_PTR(-ENOMEM);
 	}
-	memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*nsems);
 	new->semadj = (short *) &new[1];
 	new->semid = semid;
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d92..2e896f8 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 	struct audit_rule *rule;
 	int i;
 
-	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
 	if (unlikely(!rule))
 		return NULL;
-	memset(rule, 0, sizeof(*rule));
 
 	rule->flags = krule->flags | krule->listnr;
 	rule->action = krule->action;
diff --git a/kernel/futex.c b/kernel/futex.c
index af7b81c..7c0d0d4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
 	if (likely(current->pi_state_cache))
 		return 0;
 
-	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 
 	if (!pi_state)
 		return -ENOMEM;
 
-	memset(pi_state, 0, sizeof(*pi_state));
 	INIT_LIST_HEAD(&pi_state->list);
 	/* pi_mutex gets initialized later */
 	pi_state->owner = NULL;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2..d43692c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -108,11 +108,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 
 	/* Allocate a controlling structure */
 	result = -ENOMEM;
-	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	image = kzalloc(sizeof(*image), GFP_KERNEL);
 	if (!image)
 		goto out;
 
-	memset(image, 0, sizeof(*image));
 	image->head = 0;
 	image->entry = &image->head;
 	image->last_entry = &image->head;
diff --git a/lib/kobject.c b/lib/kobject.c
index 744a4b1..7ce6dc1 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -111,10 +111,9 @@ char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
 	len = get_kobj_path_length(kobj);
 	if (len == 0)
 		return NULL;
-	path = kmalloc(len, gfp_mask);
+	path = kzalloc(len, gfp_mask);
 	if (!path)
 		return NULL;
-	memset(path, 0x00, len);
 	fill_kobj_path(kobj, path, len);
 
 	return path;
diff --git a/mm/nommu.c b/mm/nommu.c
index 6a2a8aa..af87456 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -808,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
 
 	/* we're going to need to record the mapping if it works */
-	vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+	vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
 	if (!vml)
 		goto error_getting_vml;
-	memset(vml, 0, sizeof(*vml));
 
 	down_write(&nommu_vma_sem);
 
@@ -887,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file,
 	}
 
 	/* we're going to need a VMA struct as well */
-	vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+	vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
 	if (!vma)
 		goto error_getting_vma;
 
-	memset(vma, 0, sizeof(*vma));
 	INIT_LIST_HEAD(&vma->anon_vma_node);
 	atomic_set(&vma->vm_usage, 1);
 	if (file)
-- 
cgit v0.10.2


From ea82c74093f48b28e632d03eeff22faf99727a8c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:38:52 -0800
Subject: [PATCH] kernel-doc: stricter function pointer recognition

Be more careful about function pointer args:
look for "(...*" instead of just "(".

This line in include/linux/input.h fools the current kernel-doc script
into deciding that this is a function pointer:

	unsigned long ffbit[NBITS(FF_MAX)];

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 187f5de..df3b272 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1430,7 +1430,7 @@ sub create_parameterlist($$$) {
 	    # corresponding data structures "correctly". Catch it later in
 	    # output_* subs.
 	    push_parameter($arg, "", $file);
-	} elsif ($arg =~ m/\(/) {
+	} elsif ($arg =~ m/\(.*\*/) {
 	    # pointer-to-function
 	    $arg =~ tr/#/,/;
 	    $arg =~ m/[^\(]+\(\*([^\)]+)\)/;
-- 
cgit v0.10.2


From 83b7b44e1c1e9e493ccd4146558481ab5af0116a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 6 Dec 2006 20:38:53 -0800
Subject: [PATCH] fs: reorder some 'struct inode' fields to speedup i_size
 manipulations

On 32bits SMP platforms, 64bits i_size is protected by a seqcount
(i_size_seqcount).

When i_size is read or written, i_size_seqcount is read/written as well, so
it make sense to group these two fields together in the same cache line.

This patch moves i_size_seqcount next to i_size, and also moves i_version
to let offsetof(struct inode, i_size) being 0x40 instead of 0x3c (for
32bits platforms).

For 64 bits platforms, i_size_seqcount doesnt exist, and the move of a
'long i_version' should not introduce a new hole because of padding.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d791bae..3a1927e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -548,12 +548,15 @@ struct inode {
 	uid_t			i_uid;
 	gid_t			i_gid;
 	dev_t			i_rdev;
+	unsigned long		i_version;
 	loff_t			i_size;
+#ifdef __NEED_I_SIZE_ORDERED
+	seqcount_t		i_size_seqcount;
+#endif
 	struct timespec		i_atime;
 	struct timespec		i_mtime;
 	struct timespec		i_ctime;
 	unsigned int		i_blkbits;
-	unsigned long		i_version;
 	blkcnt_t		i_blocks;
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
@@ -598,9 +601,6 @@ struct inode {
 	void			*i_security;
 #endif
 	void			*i_private; /* fs or device private pointer */
-#ifdef __NEED_I_SIZE_ORDERED
-	seqcount_t		i_size_seqcount;
-#endif
 };
 
 /*
-- 
cgit v0.10.2


From f67637ee4b5d90d41160d755b9a8cca18c394586 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 6 Dec 2006 20:38:54 -0800
Subject: [PATCH] Add struct dev pointer to dma_is_consistent()

dma_is_consistent() is ill-designed in that it does not have a struct
device pointer argument which makes proper support for systems that consist
of a mix of coherent and non-coherent DMA devices hard.  Change
dma_is_consistent to take a struct device pointer as first argument and fix
the sole caller to pass it.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 8621a06..3dc1f91 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -431,10 +431,10 @@ be identical to those passed in (and returned by
 dma_alloc_noncoherent()).
 
 int
-dma_is_consistent(dma_addr_t dma_handle)
+dma_is_consistent(struct device *dev, dma_addr_t dma_handle)
 
-returns true if the memory pointed to by the dma_handle is actually
-consistent.
+returns true if the device dev is performing consistent DMA on the memory
+area pointed to by the dma_handle.
 
 int
 dma_get_cache_alignment(void)
diff --git a/arch/mips/mm/dma-coherent.c b/arch/mips/mm/dma-coherent.c
index 7fa5fd1..18bc83e 100644
--- a/arch/mips/mm/dma-coherent.c
+++ b/arch/mips/mm/dma-coherent.c
@@ -190,7 +190,7 @@ int dma_supported(struct device *dev, u64 mask)
 
 EXPORT_SYMBOL(dma_supported);
 
-int dma_is_consistent(dma_addr_t dma_addr)
+int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return 1;
 }
diff --git a/arch/mips/mm/dma-ip27.c b/arch/mips/mm/dma-ip27.c
index 8da19fd..8e9a5a8 100644
--- a/arch/mips/mm/dma-ip27.c
+++ b/arch/mips/mm/dma-ip27.c
@@ -197,7 +197,7 @@ int dma_supported(struct device *dev, u64 mask)
 
 EXPORT_SYMBOL(dma_supported);
 
-int dma_is_consistent(dma_addr_t dma_addr)
+int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return 1;
 }
diff --git a/arch/mips/mm/dma-ip32.c b/arch/mips/mm/dma-ip32.c
index ec54ed0..08720a4 100644
--- a/arch/mips/mm/dma-ip32.c
+++ b/arch/mips/mm/dma-ip32.c
@@ -363,7 +363,7 @@ int dma_supported(struct device *dev, u64 mask)
 
 EXPORT_SYMBOL(dma_supported);
 
-int dma_is_consistent(dma_addr_t dma_addr)
+int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return 1;
 }
diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c
index 2eeffe5..4a3efc6 100644
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@@ -299,7 +299,7 @@ int dma_supported(struct device *dev, u64 mask)
 
 EXPORT_SYMBOL(dma_supported);
 
-int dma_is_consistent(dma_addr_t dma_addr)
+int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return 1;
 }
diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c
index 335a255..acee062 100644
--- a/drivers/scsi/53c700.c
+++ b/drivers/scsi/53c700.c
@@ -313,7 +313,7 @@ NCR_700_detect(struct scsi_host_template *tpnt,
 	hostdata->status = memory + STATUS_OFFSET;
 	/* all of these offsets are L1_CACHE_BYTES separated.  It is fatal
 	 * if this isn't sufficient separation to avoid dma flushing issues */
-	BUG_ON(!dma_is_consistent(pScript) && L1_CACHE_BYTES < dma_get_cache_alignment());
+	BUG_ON(!dma_is_consistent(hostdata->dev, pScript) && L1_CACHE_BYTES < dma_get_cache_alignment());
 	hostdata->slots = (struct NCR_700_command_slot *)(memory + SLOTS_OFFSET);
 	hostdata->dev = dev;
 
diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h
index b9ff4d8..b274bf631 100644
--- a/include/asm-alpha/dma-mapping.h
+++ b/include/asm-alpha/dma-mapping.h
@@ -51,7 +51,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 
 #define dma_alloc_noncoherent(d, s, h, f)	dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h)	dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(dev)			(1)
+#define dma_is_consistent(d, h)			(1)
 
 int dma_set_mask(struct device *dev, u64 mask);
 
diff --git a/include/asm-arm/dma-mapping.h b/include/asm-arm/dma-mapping.h
index 6666177..9bc46b4 100644
--- a/include/asm-arm/dma-mapping.h
+++ b/include/asm-arm/dma-mapping.h
@@ -48,7 +48,7 @@ static inline int dma_get_cache_alignment(void)
 	return 32;
 }
 
-static inline int dma_is_consistent(dma_addr_t handle)
+static inline int dma_is_consistent(struct device *dev, dma_addr_t handle)
 {
 	return !!arch_is_coherent();
 }
diff --git a/include/asm-avr32/dma-mapping.h b/include/asm-avr32/dma-mapping.h
index 4c40cb4..44630be 100644
--- a/include/asm-avr32/dma-mapping.h
+++ b/include/asm-avr32/dma-mapping.h
@@ -307,7 +307,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
-static inline int dma_is_consistent(dma_addr_t dma_addr)
+static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return 1;
 }
diff --git a/include/asm-cris/dma-mapping.h b/include/asm-cris/dma-mapping.h
index cbf1a98..af704fd 100644
--- a/include/asm-cris/dma-mapping.h
+++ b/include/asm-cris/dma-mapping.h
@@ -156,7 +156,7 @@ dma_get_cache_alignment(void)
 	return (1 << INTERNODE_CACHE_SHIFT);
 }
 
-#define dma_is_consistent(d)	(1)
+#define dma_is_consistent(d, h)	(1)
 
 static inline void
 dma_cache_sync(void *vaddr, size_t size,
diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h
index e9fc1d4..7b97fc7 100644
--- a/include/asm-frv/dma-mapping.h
+++ b/include/asm-frv/dma-mapping.h
@@ -172,7 +172,7 @@ int dma_get_cache_alignment(void)
 	return 1 << L1_CACHE_SHIFT;
 }
 
-#define dma_is_consistent(d)	(1)
+#define dma_is_consistent(d, h)	(1)
 
 static inline
 void dma_cache_sync(void *vaddr, size_t size,
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index b541e48..b9be3fc 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -266,7 +266,7 @@ dma_error(dma_addr_t dma_addr)
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(d)	(1)
+#define dma_is_consistent(d, h)	(1)
 
 static inline int
 dma_get_cache_alignment(void)
diff --git a/include/asm-i386/dma-mapping.h b/include/asm-i386/dma-mapping.h
index 81999a3..7da64c9 100644
--- a/include/asm-i386/dma-mapping.h
+++ b/include/asm-i386/dma-mapping.h
@@ -156,7 +156,7 @@ dma_get_cache_alignment(void)
 	return (1 << INTERNODE_CACHE_SHIFT);
 }
 
-#define dma_is_consistent(d)	(1)
+#define dma_is_consistent(d, h)	(1)
 
 static inline void
 dma_cache_sync(void *vaddr, size_t size,
diff --git a/include/asm-ia64/dma-mapping.h b/include/asm-ia64/dma-mapping.h
index 99a8f8e..4b075bc 100644
--- a/include/asm-ia64/dma-mapping.h
+++ b/include/asm-ia64/dma-mapping.h
@@ -59,6 +59,6 @@ dma_cache_sync (void *vaddr, size_t size, enum dma_data_direction dir)
 	mb();
 }
 
-#define dma_is_consistent(dma_handle)	(1)	/* all we do is coherent memory... */
+#define dma_is_consistent(d, h)	(1)	/* all we do is coherent memory... */
 
 #endif /* _ASM_IA64_DMA_MAPPING_H */
diff --git a/include/asm-m68k/dma-mapping.h b/include/asm-m68k/dma-mapping.h
index d90d841..efc89c1 100644
--- a/include/asm-m68k/dma-mapping.h
+++ b/include/asm-m68k/dma-mapping.h
@@ -21,7 +21,7 @@ static inline int dma_get_cache_alignment(void)
 	return 1 << L1_CACHE_SHIFT;
 }
 
-static inline int dma_is_consistent(dma_addr_t dma_addr)
+static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/asm-mips/dma-mapping.h b/include/asm-mips/dma-mapping.h
index 4328863..e17f70d 100644
--- a/include/asm-mips/dma-mapping.h
+++ b/include/asm-mips/dma-mapping.h
@@ -63,7 +63,7 @@ dma_get_cache_alignment(void)
 	return 128;
 }
 
-extern int dma_is_consistent(dma_addr_t dma_addr);
+extern int dma_is_consistent(struct device *dev, dma_addr_t dma_addr);
 
 extern void dma_cache_sync(void *vaddr, size_t size,
 	       enum dma_data_direction direction);
diff --git a/include/asm-parisc/dma-mapping.h b/include/asm-parisc/dma-mapping.h
index 1e387e1..c40d48a 100644
--- a/include/asm-parisc/dma-mapping.h
+++ b/include/asm-parisc/dma-mapping.h
@@ -191,7 +191,7 @@ dma_get_cache_alignment(void)
 }
 
 static inline int
-dma_is_consistent(dma_addr_t dma_addr)
+dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return (hppa_dma_ops->dma_sync_single_for_cpu == NULL);
 }
diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h
index 7e38b5f..3cf635b 100644
--- a/include/asm-powerpc/dma-mapping.h
+++ b/include/asm-powerpc/dma-mapping.h
@@ -342,9 +342,9 @@ static inline int dma_mapping_error(dma_addr_t dma_addr)
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 #ifdef CONFIG_NOT_COHERENT_CACHE
-#define dma_is_consistent(d)	(0)
+#define dma_is_consistent(d, h)	(0)
 #else
-#define dma_is_consistent(d)	(1)
+#define dma_is_consistent(d, h)	(1)
 #endif
 
 static inline int dma_get_cache_alignment(void)
diff --git a/include/asm-sparc64/dma-mapping.h b/include/asm-sparc64/dma-mapping.h
index 27c46fb..5fe0072 100644
--- a/include/asm-sparc64/dma-mapping.h
+++ b/include/asm-sparc64/dma-mapping.h
@@ -181,7 +181,7 @@ dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t siz
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(d)	(1)
+#define dma_is_consistent(d, h)	(1)
 
 static inline int
 dma_get_cache_alignment(void)
diff --git a/include/asm-um/dma-mapping.h b/include/asm-um/dma-mapping.h
index babd298..defb5b8 100644
--- a/include/asm-um/dma-mapping.h
+++ b/include/asm-um/dma-mapping.h
@@ -94,7 +94,7 @@ dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(d) (1)
+#define dma_is_consistent(d, h) (1)
 
 static inline int
 dma_get_cache_alignment(void)
diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index 10174b1..c8cc488 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -180,7 +180,7 @@ static inline int dma_get_cache_alignment(void)
 	return boot_cpu_data.x86_clflush_size;
 }
 
-#define dma_is_consistent(h) 1
+#define dma_is_consistent(d, h) 1
 
 extern int dma_set_mask(struct device *dev, u64 mask);
 
diff --git a/include/asm-xtensa/dma-mapping.h b/include/asm-xtensa/dma-mapping.h
index c39c91d..827d1df 100644
--- a/include/asm-xtensa/dma-mapping.h
+++ b/include/asm-xtensa/dma-mapping.h
@@ -170,7 +170,7 @@ dma_get_cache_alignment(void)
 	return L1_CACHE_BYTES;
 }
 
-#define dma_is_consistent(d)	(1)
+#define dma_is_consistent(d, h)	(1)
 
 static inline void
 dma_cache_sync(void *vaddr, size_t size,
-- 
cgit v0.10.2


From d3fa72e4556ec1f04e46a0d561d9e785ecaa173d Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 6 Dec 2006 20:38:56 -0800
Subject: [PATCH] Pass struct dev pointer to dma_cache_sync()

Pass struct dev pointer to dma_cache_sync()

dma_cache_sync() is ill-designed in that it does not have a struct device
pointer argument which makes proper support for systems that consist of a
mix of coherent and non-coherent DMA devices hard.  Change dma_cache_sync
to take a struct device pointer as first argument and fix all its callers
to pass it.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 3dc1f91..805db4b 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -459,7 +459,7 @@ anything like this.  You must also be extra careful about accessing
 memory you intend to sync partially.
 
 void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 
 Do a partial sync of memory that was allocated by
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index 44ab8a7..b68d669 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -11,7 +11,7 @@
 #include <asm/addrspace.h>
 #include <asm/cacheflush.h>
 
-void dma_cache_sync(void *vaddr, size_t size, int direction)
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size, int direction)
 {
 	/*
 	 * No need to sync an uncached area
diff --git a/arch/mips/mm/dma-coherent.c b/arch/mips/mm/dma-coherent.c
index 18bc83e..5697c6e 100644
--- a/arch/mips/mm/dma-coherent.c
+++ b/arch/mips/mm/dma-coherent.c
@@ -197,7 +197,7 @@ int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 
 EXPORT_SYMBOL(dma_is_consistent);
 
-void dma_cache_sync(void *vaddr, size_t size,
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	BUG_ON(direction == DMA_NONE);
diff --git a/arch/mips/mm/dma-ip27.c b/arch/mips/mm/dma-ip27.c
index 8e9a5a8..f088344 100644
--- a/arch/mips/mm/dma-ip27.c
+++ b/arch/mips/mm/dma-ip27.c
@@ -204,7 +204,7 @@ int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 
 EXPORT_SYMBOL(dma_is_consistent);
 
-void dma_cache_sync(void *vaddr, size_t size,
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	BUG_ON(direction == DMA_NONE);
diff --git a/arch/mips/mm/dma-ip32.c b/arch/mips/mm/dma-ip32.c
index 08720a4..b42b6f7 100644
--- a/arch/mips/mm/dma-ip32.c
+++ b/arch/mips/mm/dma-ip32.c
@@ -370,7 +370,8 @@ int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 
 EXPORT_SYMBOL(dma_is_consistent);
 
-void dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction direction)
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+	enum dma_data_direction direction)
 {
 	if (direction == DMA_NONE)
 		return;
diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c
index 4a3efc6..8cecef0 100644
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@@ -306,7 +306,8 @@ int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 
 EXPORT_SYMBOL(dma_is_consistent);
 
-void dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction direction)
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+	enum dma_data_direction direction)
 {
 	if (direction == DMA_NONE)
 		return;
diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c
index f4d815b..ea392f2 100644
--- a/drivers/net/lasi_82596.c
+++ b/drivers/net/lasi_82596.c
@@ -119,14 +119,14 @@
 #define DEB(x,y)	if (i596_debug & (x)) { y; }
 
 
-#define  CHECK_WBACK(addr,len) \
-	do { dma_cache_sync((void *)addr, len, DMA_TO_DEVICE); } while (0)
+#define  CHECK_WBACK(priv, addr,len) \
+	do { dma_cache_sync((priv)->dev, (void *)addr, len, DMA_TO_DEVICE); } while (0)
 
-#define  CHECK_INV(addr,len) \
-	do { dma_cache_sync((void *)addr, len, DMA_FROM_DEVICE); } while(0)
+#define  CHECK_INV(priv, addr,len) \
+	do { dma_cache_sync((priv)->dev, (void *)addr, len, DMA_FROM_DEVICE); } while(0)
 
-#define  CHECK_WBACK_INV(addr,len) \
-	do { dma_cache_sync((void *)addr, len, DMA_BIDIRECTIONAL); } while (0)
+#define  CHECK_WBACK_INV(priv, addr,len) \
+	do { dma_cache_sync((priv)->dev, (void *)addr, len, DMA_BIDIRECTIONAL); } while (0)
 
 
 #define PA_I82596_RESET		0	/* Offsets relative to LASI-LAN-Addr.*/
@@ -449,10 +449,10 @@ static inline void MPU_PORT(struct net_device *dev, int c, dma_addr_t x)
 
 static inline int wait_istat(struct net_device *dev, struct i596_private *lp, int delcnt, char *str)
 {
-	CHECK_INV(&(lp->iscp), sizeof(struct i596_iscp));
+	CHECK_INV(lp, &(lp->iscp), sizeof(struct i596_iscp));
 	while (--delcnt && lp->iscp.stat) {
 		udelay(10);
-		CHECK_INV(&(lp->iscp), sizeof(struct i596_iscp));
+		CHECK_INV(lp, &(lp->iscp), sizeof(struct i596_iscp));
 	}
 	if (!delcnt) {
 		printk("%s: %s, iscp.stat %04x, didn't clear\n",
@@ -466,10 +466,10 @@ static inline int wait_istat(struct net_device *dev, struct i596_private *lp, in
 
 static inline int wait_cmd(struct net_device *dev, struct i596_private *lp, int delcnt, char *str)
 {
-	CHECK_INV(&(lp->scb), sizeof(struct i596_scb));
+	CHECK_INV(lp, &(lp->scb), sizeof(struct i596_scb));
 	while (--delcnt && lp->scb.command) {
 		udelay(10);
-		CHECK_INV(&(lp->scb), sizeof(struct i596_scb));
+		CHECK_INV(lp, &(lp->scb), sizeof(struct i596_scb));
 	}
 	if (!delcnt) {
 		printk("%s: %s, status %4.4x, cmd %4.4x.\n",
@@ -522,7 +522,7 @@ static void i596_display_data(struct net_device *dev)
 			rbd, rbd->count, rbd->b_next, rbd->b_data, rbd->size);
 		rbd = rbd->v_next;
 	} while (rbd != lp->rbd_head);
-	CHECK_INV(lp, sizeof(struct i596_private));
+	CHECK_INV(lp, lp, sizeof(struct i596_private));
 }
 
 
@@ -592,7 +592,7 @@ static inline void init_rx_bufs(struct net_device *dev)
 	rfd->b_next = WSWAPrfd(virt_to_dma(lp,lp->rfds));
 	rfd->cmd = CMD_EOL|CMD_FLEX;
 
-	CHECK_WBACK_INV(lp, sizeof(struct i596_private));
+	CHECK_WBACK_INV(lp, lp, sizeof(struct i596_private));
 }
 
 static inline void remove_rx_bufs(struct net_device *dev)
@@ -629,7 +629,7 @@ static void rebuild_rx_bufs(struct net_device *dev)
 	lp->rbd_head = lp->rbds;
 	lp->rfds[0].rbd = WSWAPrbd(virt_to_dma(lp,lp->rbds));
 
-	CHECK_WBACK_INV(lp, sizeof(struct i596_private));
+	CHECK_WBACK_INV(lp, lp, sizeof(struct i596_private));
 }
 
 
@@ -663,8 +663,8 @@ static int init_i596_mem(struct net_device *dev)
 
 	DEB(DEB_INIT, printk("%s: starting i82596.\n", dev->name));
 
-	CHECK_WBACK(&(lp->scp), sizeof(struct i596_scp));
-	CHECK_WBACK(&(lp->iscp), sizeof(struct i596_iscp));
+	CHECK_WBACK(lp, &(lp->scp), sizeof(struct i596_scp));
+	CHECK_WBACK(lp, &(lp->iscp), sizeof(struct i596_iscp));
 
 	MPU_PORT(dev, PORT_ALTSCP, virt_to_dma(lp,&lp->scp));
 
@@ -678,25 +678,25 @@ static int init_i596_mem(struct net_device *dev)
 	rebuild_rx_bufs(dev);
 
 	lp->scb.command = 0;
-	CHECK_WBACK(&(lp->scb), sizeof(struct i596_scb));
+	CHECK_WBACK(lp, &(lp->scb), sizeof(struct i596_scb));
 
 	enable_irq(dev->irq);	/* enable IRQs from LAN */
 
 	DEB(DEB_INIT, printk("%s: queuing CmdConfigure\n", dev->name));
 	memcpy(lp->cf_cmd.i596_config, init_setup, 14);
 	lp->cf_cmd.cmd.command = CmdConfigure;
-	CHECK_WBACK(&(lp->cf_cmd), sizeof(struct cf_cmd));
+	CHECK_WBACK(lp, &(lp->cf_cmd), sizeof(struct cf_cmd));
 	i596_add_cmd(dev, &lp->cf_cmd.cmd);
 
 	DEB(DEB_INIT, printk("%s: queuing CmdSASetup\n", dev->name));
 	memcpy(lp->sa_cmd.eth_addr, dev->dev_addr, 6);
 	lp->sa_cmd.cmd.command = CmdSASetup;
-	CHECK_WBACK(&(lp->sa_cmd), sizeof(struct sa_cmd));
+	CHECK_WBACK(lp, &(lp->sa_cmd), sizeof(struct sa_cmd));
 	i596_add_cmd(dev, &lp->sa_cmd.cmd);
 
 	DEB(DEB_INIT, printk("%s: queuing CmdTDR\n", dev->name));
 	lp->tdr_cmd.cmd.command = CmdTDR;
-	CHECK_WBACK(&(lp->tdr_cmd), sizeof(struct tdr_cmd));
+	CHECK_WBACK(lp, &(lp->tdr_cmd), sizeof(struct tdr_cmd));
 	i596_add_cmd(dev, &lp->tdr_cmd.cmd);
 
 	spin_lock_irqsave (&lp->lock, flags);
@@ -708,7 +708,7 @@ static int init_i596_mem(struct net_device *dev)
 	DEB(DEB_INIT, printk("%s: Issuing RX_START\n", dev->name));
 	lp->scb.command = RX_START;
 	lp->scb.rfd = WSWAPrfd(virt_to_dma(lp,lp->rfds));
-	CHECK_WBACK(&(lp->scb), sizeof(struct i596_scb));
+	CHECK_WBACK(lp, &(lp->scb), sizeof(struct i596_scb));
 
 	CA(dev);
 
@@ -740,13 +740,13 @@ static inline int i596_rx(struct net_device *dev)
 
 	rfd = lp->rfd_head;		/* Ref next frame to check */
 
-	CHECK_INV(rfd, sizeof(struct i596_rfd));
+	CHECK_INV(lp, rfd, sizeof(struct i596_rfd));
 	while ((rfd->stat) & STAT_C) {	/* Loop while complete frames */
 		if (rfd->rbd == I596_NULL)
 			rbd = NULL;
 		else if (rfd->rbd == lp->rbd_head->b_addr) {
 			rbd = lp->rbd_head;
-			CHECK_INV(rbd, sizeof(struct i596_rbd));
+			CHECK_INV(lp, rbd, sizeof(struct i596_rbd));
 		}
 		else {
 			printk("%s: rbd chain broken!\n", dev->name);
@@ -790,7 +790,7 @@ static inline int i596_rx(struct net_device *dev)
 				dma_addr = dma_map_single(lp->dev, newskb->data, PKT_BUF_SZ, DMA_FROM_DEVICE);
 				rbd->v_data = newskb->data;
 				rbd->b_data = WSWAPchar(dma_addr);
-				CHECK_WBACK_INV(rbd, sizeof(struct i596_rbd));
+				CHECK_WBACK_INV(lp, rbd, sizeof(struct i596_rbd));
 			}
 			else
 				skb = dev_alloc_skb(pkt_len + 2);
@@ -842,7 +842,7 @@ memory_squeeze:
 		if (rbd != NULL && (rbd->count & 0x4000)) {
 			rbd->count = 0;
 			lp->rbd_head = rbd->v_next;
-			CHECK_WBACK_INV(rbd, sizeof(struct i596_rbd));
+			CHECK_WBACK_INV(lp, rbd, sizeof(struct i596_rbd));
 		}
 
 		/* Tidy the frame descriptor, marking it as end of list */
@@ -860,10 +860,10 @@ memory_squeeze:
 
 		lp->scb.rfd = rfd->b_next;
 		lp->rfd_head = rfd->v_next;
-		CHECK_WBACK_INV(rfd->v_prev, sizeof(struct i596_rfd));
-		CHECK_WBACK_INV(rfd, sizeof(struct i596_rfd));
+		CHECK_WBACK_INV(lp, rfd->v_prev, sizeof(struct i596_rfd));
+		CHECK_WBACK_INV(lp, rfd, sizeof(struct i596_rfd));
 		rfd = lp->rfd_head;
-		CHECK_INV(rfd, sizeof(struct i596_rfd));
+		CHECK_INV(lp, rfd, sizeof(struct i596_rfd));
 	}
 
 	DEB(DEB_RXFRAME, printk("frames %d\n", frames));
@@ -902,12 +902,12 @@ static inline void i596_cleanup_cmd(struct net_device *dev, struct i596_private
 			ptr->v_next = NULL;
 			ptr->b_next = I596_NULL;
 		}
-		CHECK_WBACK_INV(ptr, sizeof(struct i596_cmd));
+		CHECK_WBACK_INV(lp, ptr, sizeof(struct i596_cmd));
 	}
 
 	wait_cmd(dev, lp, 100, "i596_cleanup_cmd timed out");
 	lp->scb.cmd = I596_NULL;
-	CHECK_WBACK(&(lp->scb), sizeof(struct i596_scb));
+	CHECK_WBACK(lp, &(lp->scb), sizeof(struct i596_scb));
 }
 
 
@@ -925,7 +925,7 @@ static inline void i596_reset(struct net_device *dev, struct i596_private *lp)
 
 	/* FIXME: this command might cause an lpmc */
 	lp->scb.command = CUC_ABORT | RX_ABORT;
-	CHECK_WBACK(&(lp->scb), sizeof(struct i596_scb));
+	CHECK_WBACK(lp, &(lp->scb), sizeof(struct i596_scb));
 	CA(dev);
 
 	/* wait for shutdown */
@@ -951,20 +951,20 @@ static void i596_add_cmd(struct net_device *dev, struct i596_cmd *cmd)
 	cmd->command |= (CMD_EOL | CMD_INTR);
 	cmd->v_next = NULL;
 	cmd->b_next = I596_NULL;
-	CHECK_WBACK(cmd, sizeof(struct i596_cmd));
+	CHECK_WBACK(lp, cmd, sizeof(struct i596_cmd));
 
 	spin_lock_irqsave (&lp->lock, flags);
 
 	if (lp->cmd_head != NULL) {
 		lp->cmd_tail->v_next = cmd;
 		lp->cmd_tail->b_next = WSWAPcmd(virt_to_dma(lp,&cmd->status));
-		CHECK_WBACK(lp->cmd_tail, sizeof(struct i596_cmd));
+		CHECK_WBACK(lp, lp->cmd_tail, sizeof(struct i596_cmd));
 	} else {
 		lp->cmd_head = cmd;
 		wait_cmd(dev, lp, 100, "i596_add_cmd timed out");
 		lp->scb.cmd = WSWAPcmd(virt_to_dma(lp,&cmd->status));
 		lp->scb.command = CUC_START;
-		CHECK_WBACK(&(lp->scb), sizeof(struct i596_scb));
+		CHECK_WBACK(lp, &(lp->scb), sizeof(struct i596_scb));
 		CA(dev);
 	}
 	lp->cmd_tail = cmd;
@@ -998,12 +998,12 @@ static int i596_test(struct net_device *dev)
 	data = virt_to_dma(lp,tint);
 
 	tint[1] = -1;
-	CHECK_WBACK(tint,PAGE_SIZE);
+	CHECK_WBACK(lp, tint, PAGE_SIZE);
 
 	MPU_PORT(dev, 1, data);
 
 	for(data = 1000000; data; data--) {
-		CHECK_INV(tint,PAGE_SIZE);
+		CHECK_INV(lp, tint, PAGE_SIZE);
 		if(tint[1] != -1)
 			break;
 
@@ -1061,7 +1061,7 @@ static void i596_tx_timeout (struct net_device *dev)
 		/* Issue a channel attention signal */
 		DEB(DEB_ERRORS, printk("Kicking board.\n"));
 		lp->scb.command = CUC_START | RX_START;
-		CHECK_WBACK_INV(&(lp->scb), sizeof(struct i596_scb));
+		CHECK_WBACK_INV(lp, &(lp->scb), sizeof(struct i596_scb));
 		CA (dev);
 		lp->last_restart = lp->stats.tx_packets;
 	}
@@ -1118,8 +1118,8 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		tbd->data = WSWAPchar(tx_cmd->dma_addr);
 
 		DEB(DEB_TXADDR,print_eth(skb->data, "tx-queued"));
-		CHECK_WBACK_INV(tx_cmd, sizeof(struct tx_cmd));
-		CHECK_WBACK_INV(tbd, sizeof(struct i596_tbd));
+		CHECK_WBACK_INV(lp, tx_cmd, sizeof(struct tx_cmd));
+		CHECK_WBACK_INV(lp, tbd, sizeof(struct i596_tbd));
 		i596_add_cmd(dev, &tx_cmd->cmd);
 
 		lp->stats.tx_packets++;
@@ -1228,7 +1228,7 @@ static int __devinit i82596_probe(struct net_device *dev,
 	lp->dma_addr = dma_addr;
 	lp->dev = gen_dev;
 
-	CHECK_WBACK_INV(dev->mem_start, sizeof(struct i596_private));
+	CHECK_WBACK_INV(lp, dev->mem_start, sizeof(struct i596_private));
 
 	i = register_netdev(dev);
 	if (i) {
@@ -1295,7 +1295,7 @@ static irqreturn_t i596_interrupt(int irq, void *dev_id)
 			DEB(DEB_INTS, printk("%s: i596 interrupt command unit inactive %x.\n", dev->name, status & 0x0700));
 
 		while (lp->cmd_head != NULL) {
-			CHECK_INV(lp->cmd_head, sizeof(struct i596_cmd));
+			CHECK_INV(lp, lp->cmd_head, sizeof(struct i596_cmd));
 			if (!(lp->cmd_head->status & STAT_C))
 				break;
 
@@ -1358,7 +1358,7 @@ static irqreturn_t i596_interrupt(int irq, void *dev_id)
 			}
 			ptr->v_next = NULL;
 		        ptr->b_next = I596_NULL;
-			CHECK_WBACK(ptr, sizeof(struct i596_cmd));
+			CHECK_WBACK(lp, ptr, sizeof(struct i596_cmd));
 			lp->last_cmd = jiffies;
 		}
 
@@ -1372,13 +1372,13 @@ static irqreturn_t i596_interrupt(int irq, void *dev_id)
 
 			ptr->command &= 0x1fff;
 			ptr = ptr->v_next;
-			CHECK_WBACK_INV(prev, sizeof(struct i596_cmd));
+			CHECK_WBACK_INV(lp, prev, sizeof(struct i596_cmd));
 		}
 
 		if ((lp->cmd_head != NULL))
 			ack_cmd |= CUC_START;
 		lp->scb.cmd = WSWAPcmd(virt_to_dma(lp,&lp->cmd_head->status));
-		CHECK_WBACK_INV(&lp->scb, sizeof(struct i596_scb));
+		CHECK_WBACK_INV(lp, &lp->scb, sizeof(struct i596_scb));
 	}
 	if ((status & 0x1000) || (status & 0x4000)) {
 		if ((status & 0x4000))
@@ -1397,7 +1397,7 @@ static irqreturn_t i596_interrupt(int irq, void *dev_id)
 	}
 	wait_cmd(dev, lp, 100, "i596 interrupt, timeout");
 	lp->scb.command = ack_cmd;
-	CHECK_WBACK(&lp->scb, sizeof(struct i596_scb));
+	CHECK_WBACK(lp, &lp->scb, sizeof(struct i596_scb));
 
 	/* DANGER: I suspect that some kind of interrupt
 	 acknowledgement aside from acking the 82596 might be needed
@@ -1426,7 +1426,7 @@ static int i596_close(struct net_device *dev)
 
 	wait_cmd(dev, lp, 100, "close1 timed out");
 	lp->scb.command = CUC_ABORT | RX_ABORT;
-	CHECK_WBACK(&lp->scb, sizeof(struct i596_scb));
+	CHECK_WBACK(lp, &lp->scb, sizeof(struct i596_scb));
 
 	CA(dev);
 
@@ -1486,7 +1486,7 @@ static void set_multicast_list(struct net_device *dev)
 			       dev->name);
 		else {
 			lp->cf_cmd.cmd.command = CmdConfigure;
-			CHECK_WBACK_INV(&lp->cf_cmd, sizeof(struct cf_cmd));
+			CHECK_WBACK_INV(lp, &lp->cf_cmd, sizeof(struct cf_cmd));
 			i596_add_cmd(dev, &lp->cf_cmd.cmd);
 		}
 	}
@@ -1514,7 +1514,7 @@ static void set_multicast_list(struct net_device *dev)
 				DEB(DEB_MULTI, printk("%s: Adding address %02x:%02x:%02x:%02x:%02x:%02x\n",
 						dev->name, cp[0],cp[1],cp[2],cp[3],cp[4],cp[5]));
 		}
-		CHECK_WBACK_INV(&lp->mc_cmd, sizeof(struct mc_cmd));
+		CHECK_WBACK_INV(lp, &lp->mc_cmd, sizeof(struct mc_cmd));
 		i596_add_cmd(dev, &cmd->cmd);
 	}
 }
diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c
index acee062..68103e5 100644
--- a/drivers/scsi/53c700.c
+++ b/drivers/scsi/53c700.c
@@ -362,11 +362,11 @@ NCR_700_detect(struct scsi_host_template *tpnt,
 	for (j = 0; j < PATCHES; j++)
 		script[LABELPATCHES[j]] = bS_to_host(pScript + SCRIPT[LABELPATCHES[j]]);
 	/* now patch up fixed addresses. */
-	script_patch_32(script, MessageLocation,
+	script_patch_32(hostdata->dev, script, MessageLocation,
 			pScript + MSGOUT_OFFSET);
-	script_patch_32(script, StatusAddress,
+	script_patch_32(hostdata->dev, script, StatusAddress,
 			pScript + STATUS_OFFSET);
-	script_patch_32(script, ReceiveMsgAddress,
+	script_patch_32(hostdata->dev, script, ReceiveMsgAddress,
 			pScript + MSGIN_OFFSET);
 
 	hostdata->script = script;
@@ -821,8 +821,9 @@ process_extended_message(struct Scsi_Host *host,
 			shost_printk(KERN_WARNING, host,
 				"Unexpected SDTR msg\n");
 			hostdata->msgout[0] = A_REJECT_MSG;
-			dma_cache_sync(hostdata->msgout, 1, DMA_TO_DEVICE);
-			script_patch_16(hostdata->script, MessageCount, 1);
+			dma_cache_sync(hostdata->dev, hostdata->msgout, 1, DMA_TO_DEVICE);
+			script_patch_16(hostdata->dev, hostdata->script,
+			                MessageCount, 1);
 			/* SendMsgOut returns, so set up the return
 			 * address */
 			resume_offset = hostdata->pScript + Ent_SendMessageWithATN;
@@ -833,8 +834,9 @@ process_extended_message(struct Scsi_Host *host,
 		printk(KERN_INFO "scsi%d: (%d:%d), Unsolicited WDTR after CMD, Rejecting\n",
 		       host->host_no, pun, lun);
 		hostdata->msgout[0] = A_REJECT_MSG;
-		dma_cache_sync(hostdata->msgout, 1, DMA_TO_DEVICE);
-		script_patch_16(hostdata->script, MessageCount, 1);
+		dma_cache_sync(hostdata->dev, hostdata->msgout, 1, DMA_TO_DEVICE);
+		script_patch_16(hostdata->dev, hostdata->script, MessageCount,
+		                1);
 		resume_offset = hostdata->pScript + Ent_SendMessageWithATN;
 
 		break;
@@ -847,8 +849,9 @@ process_extended_message(struct Scsi_Host *host,
 		printk("\n");
 		/* just reject it */
 		hostdata->msgout[0] = A_REJECT_MSG;
-		dma_cache_sync(hostdata->msgout, 1, DMA_TO_DEVICE);
-		script_patch_16(hostdata->script, MessageCount, 1);
+		dma_cache_sync(hostdata->dev, hostdata->msgout, 1, DMA_TO_DEVICE);
+		script_patch_16(hostdata->dev, hostdata->script, MessageCount,
+		                1);
 		/* SendMsgOut returns, so set up the return
 		 * address */
 		resume_offset = hostdata->pScript + Ent_SendMessageWithATN;
@@ -929,8 +932,9 @@ process_message(struct Scsi_Host *host,	struct NCR_700_Host_Parameters *hostdata
 		printk("\n");
 		/* just reject it */
 		hostdata->msgout[0] = A_REJECT_MSG;
-		dma_cache_sync(hostdata->msgout, 1, DMA_TO_DEVICE);
-		script_patch_16(hostdata->script, MessageCount, 1);
+		dma_cache_sync(hostdata->dev, hostdata->msgout, 1, DMA_TO_DEVICE);
+		script_patch_16(hostdata->dev, hostdata->script, MessageCount,
+		                1);
 		/* SendMsgOut returns, so set up the return
 		 * address */
 		resume_offset = hostdata->pScript + Ent_SendMessageWithATN;
@@ -939,7 +943,7 @@ process_message(struct Scsi_Host *host,	struct NCR_700_Host_Parameters *hostdata
 	}
 	NCR_700_writel(temp, host, TEMP_REG);
 	/* set us up to receive another message */
-	dma_cache_sync(hostdata->msgin, MSG_ARRAY_SIZE, DMA_FROM_DEVICE);
+	dma_cache_sync(hostdata->dev, hostdata->msgin, MSG_ARRAY_SIZE, DMA_FROM_DEVICE);
 	return resume_offset;
 }
 
@@ -1019,9 +1023,9 @@ process_script_interrupt(__u32 dsps, __u32 dsp, struct scsi_cmnd *SCp,
 				slot->SG[1].ins = bS_to_host(SCRIPT_RETURN);
 				slot->SG[1].pAddr = 0;
 				slot->resume_offset = hostdata->pScript;
-				dma_cache_sync(slot->SG, sizeof(slot->SG[0])*2, DMA_TO_DEVICE);
-				dma_cache_sync(SCp->sense_buffer, sizeof(SCp->sense_buffer), DMA_FROM_DEVICE);
-				
+				dma_cache_sync(hostdata->dev, slot->SG, sizeof(slot->SG[0])*2, DMA_TO_DEVICE);
+				dma_cache_sync(hostdata->dev, SCp->sense_buffer, sizeof(SCp->sense_buffer), DMA_FROM_DEVICE);
+
 				/* queue the command for reissue */
 				slot->state = NCR_700_SLOT_QUEUED;
 				slot->flags = NCR_700_FLAG_AUTOSENSE;
@@ -1136,11 +1140,12 @@ process_script_interrupt(__u32 dsps, __u32 dsp, struct scsi_cmnd *SCp,
 			hostdata->cmd = slot->cmnd;
 
 			/* re-patch for this command */
-			script_patch_32_abs(hostdata->script, CommandAddress, 
-					    slot->pCmd);
-			script_patch_16(hostdata->script,
+			script_patch_32_abs(hostdata->dev, hostdata->script,
+			                    CommandAddress, slot->pCmd);
+			script_patch_16(hostdata->dev, hostdata->script,
 					CommandCount, slot->cmnd->cmd_len);
-			script_patch_32_abs(hostdata->script, SGScriptStartAddress,
+			script_patch_32_abs(hostdata->dev, hostdata->script,
+			                    SGScriptStartAddress,
 					    to32bit(&slot->pSG[0].ins));
 
 			/* Note: setting SXFER only works if we're
@@ -1150,13 +1155,13 @@ process_script_interrupt(__u32 dsps, __u32 dsp, struct scsi_cmnd *SCp,
 			 * should therefore always clear ACK */
 			NCR_700_writeb(NCR_700_get_SXFER(hostdata->cmd->device),
 				       host, SXFER_REG);
-			dma_cache_sync(hostdata->msgin,
+			dma_cache_sync(hostdata->dev, hostdata->msgin,
 				       MSG_ARRAY_SIZE, DMA_FROM_DEVICE);
-			dma_cache_sync(hostdata->msgout,
+			dma_cache_sync(hostdata->dev, hostdata->msgout,
 				       MSG_ARRAY_SIZE, DMA_TO_DEVICE);
 			/* I'm just being paranoid here, the command should
 			 * already have been flushed from the cache */
-			dma_cache_sync(slot->cmnd->cmnd,
+			dma_cache_sync(hostdata->dev, slot->cmnd->cmnd,
 				       slot->cmnd->cmd_len, DMA_TO_DEVICE);
 
 
@@ -1220,7 +1225,7 @@ process_script_interrupt(__u32 dsps, __u32 dsp, struct scsi_cmnd *SCp,
 		hostdata->reselection_id = reselection_id;
 		/* just in case we have a stale simple tag message, clear it */
 		hostdata->msgin[1] = 0;
-		dma_cache_sync(hostdata->msgin,
+		dma_cache_sync(hostdata->dev, hostdata->msgin,
 			       MSG_ARRAY_SIZE, DMA_BIDIRECTIONAL);
 		if(hostdata->tag_negotiated & (1<<reselection_id)) {
 			resume_offset = hostdata->pScript + Ent_GetReselectionWithTag;
@@ -1336,7 +1341,7 @@ process_selection(struct Scsi_Host *host, __u32 dsp)
 	hostdata->cmd = NULL;
 	/* clear any stale simple tag message */
 	hostdata->msgin[1] = 0;
-	dma_cache_sync(hostdata->msgin, MSG_ARRAY_SIZE,
+	dma_cache_sync(hostdata->dev, hostdata->msgin, MSG_ARRAY_SIZE,
 		       DMA_BIDIRECTIONAL);
 
 	if(id == 0xff) {
@@ -1433,29 +1438,30 @@ NCR_700_start_command(struct scsi_cmnd *SCp)
 		NCR_700_set_flag(SCp->device, NCR_700_DEV_BEGIN_SYNC_NEGOTIATION);
 	}
 
-	script_patch_16(hostdata->script, MessageCount, count);
+	script_patch_16(hostdata->dev, hostdata->script, MessageCount, count);
 
 
-	script_patch_ID(hostdata->script,
+	script_patch_ID(hostdata->dev, hostdata->script,
 			Device_ID, 1<<scmd_id(SCp));
 
-	script_patch_32_abs(hostdata->script, CommandAddress, 
+	script_patch_32_abs(hostdata->dev, hostdata->script, CommandAddress,
 			    slot->pCmd);
-	script_patch_16(hostdata->script, CommandCount, SCp->cmd_len);
+	script_patch_16(hostdata->dev, hostdata->script, CommandCount,
+	                SCp->cmd_len);
 	/* finally plumb the beginning of the SG list into the script
 	 * */
-	script_patch_32_abs(hostdata->script, SGScriptStartAddress,
-			    to32bit(&slot->pSG[0].ins));
+	script_patch_32_abs(hostdata->dev, hostdata->script,
+	                    SGScriptStartAddress, to32bit(&slot->pSG[0].ins));
 	NCR_700_clear_fifo(SCp->device->host);
 
 	if(slot->resume_offset == 0)
 		slot->resume_offset = hostdata->pScript;
 	/* now perform all the writebacks and invalidates */
-	dma_cache_sync(hostdata->msgout, count, DMA_TO_DEVICE);
-	dma_cache_sync(hostdata->msgin, MSG_ARRAY_SIZE,
+	dma_cache_sync(hostdata->dev, hostdata->msgout, count, DMA_TO_DEVICE);
+	dma_cache_sync(hostdata->dev, hostdata->msgin, MSG_ARRAY_SIZE,
 		       DMA_FROM_DEVICE);
-	dma_cache_sync(SCp->cmnd, SCp->cmd_len, DMA_TO_DEVICE);
-	dma_cache_sync(hostdata->status, 1, DMA_FROM_DEVICE);
+	dma_cache_sync(hostdata->dev, SCp->cmnd, SCp->cmd_len, DMA_TO_DEVICE);
+	dma_cache_sync(hostdata->dev, hostdata->status, 1, DMA_FROM_DEVICE);
 
 	/* set the synchronous period/offset */
 	NCR_700_writeb(NCR_700_get_SXFER(SCp->device),
@@ -1631,7 +1637,7 @@ NCR_700_intr(int irq, void *dev_id)
 					slot->SG[i].ins = bS_to_host(SCRIPT_NOP);
 					slot->SG[i].pAddr = 0;
 				}
-				dma_cache_sync(slot->SG, sizeof(slot->SG), DMA_TO_DEVICE);
+				dma_cache_sync(hostdata->dev, slot->SG, sizeof(slot->SG), DMA_TO_DEVICE);
 				/* and pretend we disconnected after
 				 * the command phase */
 				resume_offset = hostdata->pScript + Ent_MsgInDuringData;
@@ -1897,9 +1903,9 @@ NCR_700_queuecommand(struct scsi_cmnd *SCp, void (*done)(struct scsi_cmnd *))
 		}
 		slot->SG[i].ins = bS_to_host(SCRIPT_RETURN);
 		slot->SG[i].pAddr = 0;
-		dma_cache_sync(slot->SG, sizeof(slot->SG), DMA_TO_DEVICE);
+		dma_cache_sync(hostdata->dev, slot->SG, sizeof(slot->SG), DMA_TO_DEVICE);
 		DEBUG((" SETTING %08lx to %x\n",
-		       (&slot->pSG[i].ins), 
+		       (&slot->pSG[i].ins),
 		       slot->SG[i].ins));
 	}
 	slot->resume_offset = 0;
diff --git a/drivers/scsi/53c700.h b/drivers/scsi/53c700.h
index f5c3caf..f38822d 100644
--- a/drivers/scsi/53c700.h
+++ b/drivers/scsi/53c700.h
@@ -415,31 +415,31 @@ struct NCR_700_Host_Parameters {
 #define NCR_710_MIN_XFERP	0
 #define NCR_700_MIN_PERIOD	25 /* for SDTR message, 100ns */
 
-#define script_patch_32(script, symbol, value) \
+#define script_patch_32(dev, script, symbol, value) \
 { \
 	int i; \
 	for(i=0; i< (sizeof(A_##symbol##_used) / sizeof(__u32)); i++) { \
 		__u32 val = bS_to_cpu((script)[A_##symbol##_used[i]]) + value; \
 		(script)[A_##symbol##_used[i]] = bS_to_host(val); \
-		dma_cache_sync(&(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
+		dma_cache_sync((dev), &(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
 		DEBUG((" script, patching %s at %d to 0x%lx\n", \
 		       #symbol, A_##symbol##_used[i], (value))); \
 	} \
 }
 
-#define script_patch_32_abs(script, symbol, value) \
+#define script_patch_32_abs(dev, script, symbol, value) \
 { \
 	int i; \
 	for(i=0; i< (sizeof(A_##symbol##_used) / sizeof(__u32)); i++) { \
 		(script)[A_##symbol##_used[i]] = bS_to_host(value); \
-		dma_cache_sync(&(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
+		dma_cache_sync((dev), &(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
 		DEBUG((" script, patching %s at %d to 0x%lx\n", \
 		       #symbol, A_##symbol##_used[i], (value))); \
 	} \
 }
 
 /* Used for patching the SCSI ID in the SELECT instruction */
-#define script_patch_ID(script, symbol, value) \
+#define script_patch_ID(dev, script, symbol, value) \
 { \
 	int i; \
 	for(i=0; i< (sizeof(A_##symbol##_used) / sizeof(__u32)); i++) { \
@@ -447,13 +447,13 @@ struct NCR_700_Host_Parameters {
 		val &= 0xff00ffff; \
 		val |= ((value) & 0xff) << 16; \
 		(script)[A_##symbol##_used[i]] = bS_to_host(val); \
-		dma_cache_sync(&(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
+		dma_cache_sync((dev), &(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
 		DEBUG((" script, patching ID field %s at %d to 0x%x\n", \
 		       #symbol, A_##symbol##_used[i], val)); \
 	} \
 }
 
-#define script_patch_16(script, symbol, value) \
+#define script_patch_16(dev, script, symbol, value) \
 { \
 	int i; \
 	for(i=0; i< (sizeof(A_##symbol##_used) / sizeof(__u32)); i++) { \
@@ -461,7 +461,7 @@ struct NCR_700_Host_Parameters {
 		val &= 0xffff0000; \
 		val |= ((value) & 0xffff); \
 		(script)[A_##symbol##_used[i]] = bS_to_host(val); \
-		dma_cache_sync(&(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
+		dma_cache_sync((dev), &(script)[A_##symbol##_used[i]], 4, DMA_TO_DEVICE); \
 		DEBUG((" script, patching short field %s at %d to 0x%x\n", \
 		       #symbol, A_##symbol##_used[i], val)); \
 	} \
diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 8eea69f..29823bd 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -555,7 +555,7 @@ mpsc_sdma_start_tx(struct mpsc_port_info *pi)
 	if (!mpsc_sdma_tx_active(pi)) {
 		txre = (struct mpsc_tx_desc *)(pi->txr +
 			(pi->txr_tail * MPSC_TXRE_SIZE));
-		dma_cache_sync((void *) txre, MPSC_TXRE_SIZE, DMA_FROM_DEVICE);
+		dma_cache_sync(pi->port.dev, (void *) txre, MPSC_TXRE_SIZE, DMA_FROM_DEVICE);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 			invalidate_dcache_range((ulong)txre,
@@ -931,7 +931,7 @@ mpsc_init_rings(struct mpsc_port_info *pi)
 	}
 	txre->link = cpu_to_be32(pi->txr_p);	/* Wrap last back to first */
 
-	dma_cache_sync((void *) pi->dma_region, MPSC_DMA_ALLOC_SIZE,
+	dma_cache_sync(pi->port.dev, (void *) pi->dma_region, MPSC_DMA_ALLOC_SIZE,
 		DMA_BIDIRECTIONAL);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
@@ -1005,7 +1005,7 @@ mpsc_rx_intr(struct mpsc_port_info *pi)
 
 	rxre = (struct mpsc_rx_desc *)(pi->rxr + (pi->rxr_posn*MPSC_RXRE_SIZE));
 
-	dma_cache_sync((void *)rxre, MPSC_RXRE_SIZE, DMA_FROM_DEVICE);
+	dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE, DMA_FROM_DEVICE);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 	if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 		invalidate_dcache_range((ulong)rxre,
@@ -1029,7 +1029,7 @@ mpsc_rx_intr(struct mpsc_port_info *pi)
 		}
 
 		bp = pi->rxb + (pi->rxr_posn * MPSC_RXBE_SIZE);
-		dma_cache_sync((void *) bp, MPSC_RXBE_SIZE, DMA_FROM_DEVICE);
+		dma_cache_sync(pi->port.dev, (void *) bp, MPSC_RXBE_SIZE, DMA_FROM_DEVICE);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 			invalidate_dcache_range((ulong)bp,
@@ -1098,7 +1098,7 @@ next_frame:
 					    SDMA_DESC_CMDSTAT_F |
 					    SDMA_DESC_CMDSTAT_L);
 		wmb();
-		dma_cache_sync((void *)rxre, MPSC_RXRE_SIZE, DMA_BIDIRECTIONAL);
+		dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE, DMA_BIDIRECTIONAL);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 			flush_dcache_range((ulong)rxre,
@@ -1109,7 +1109,7 @@ next_frame:
 		pi->rxr_posn = (pi->rxr_posn + 1) & (MPSC_RXR_ENTRIES - 1);
 		rxre = (struct mpsc_rx_desc *)(pi->rxr +
 			(pi->rxr_posn * MPSC_RXRE_SIZE));
-		dma_cache_sync((void *)rxre, MPSC_RXRE_SIZE, DMA_FROM_DEVICE);
+		dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE, DMA_FROM_DEVICE);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 			invalidate_dcache_range((ulong)rxre,
@@ -1143,7 +1143,7 @@ mpsc_setup_tx_desc(struct mpsc_port_info *pi, u32 count, u32 intr)
 							   SDMA_DESC_CMDSTAT_EI
 							   : 0));
 	wmb();
-	dma_cache_sync((void *) txre, MPSC_TXRE_SIZE, DMA_BIDIRECTIONAL);
+	dma_cache_sync(pi->port.dev, (void *) txre, MPSC_TXRE_SIZE, DMA_BIDIRECTIONAL);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 	if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 		flush_dcache_range((ulong)txre,
@@ -1192,7 +1192,7 @@ mpsc_copy_tx_data(struct mpsc_port_info *pi)
 		else /* All tx data copied into ring bufs */
 			return;
 
-		dma_cache_sync((void *) bp, MPSC_TXBE_SIZE, DMA_BIDIRECTIONAL);
+		dma_cache_sync(pi->port.dev, (void *) bp, MPSC_TXBE_SIZE, DMA_BIDIRECTIONAL);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 			flush_dcache_range((ulong)bp,
@@ -1217,7 +1217,7 @@ mpsc_tx_intr(struct mpsc_port_info *pi)
 		txre = (struct mpsc_tx_desc *)(pi->txr +
 			(pi->txr_tail * MPSC_TXRE_SIZE));
 
-		dma_cache_sync((void *) txre, MPSC_TXRE_SIZE, DMA_FROM_DEVICE);
+		dma_cache_sync(pi->port.dev, (void *) txre, MPSC_TXRE_SIZE, DMA_FROM_DEVICE);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 			invalidate_dcache_range((ulong)txre,
@@ -1235,7 +1235,7 @@ mpsc_tx_intr(struct mpsc_port_info *pi)
 
 			txre = (struct mpsc_tx_desc *)(pi->txr +
 				(pi->txr_tail * MPSC_TXRE_SIZE));
-			dma_cache_sync((void *) txre, MPSC_TXRE_SIZE,
+			dma_cache_sync(pi->port.dev, (void *) txre, MPSC_TXRE_SIZE,
 				DMA_FROM_DEVICE);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 			if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
@@ -1652,7 +1652,7 @@ mpsc_console_write(struct console *co, const char *s, uint count)
 			count--;
 		}
 
-		dma_cache_sync((void *) bp, MPSC_TXBE_SIZE, DMA_BIDIRECTIONAL);
+		dma_cache_sync(pi->port.dev, (void *) bp, MPSC_TXBE_SIZE, DMA_BIDIRECTIONAL);
 #if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE)
 		if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */
 			flush_dcache_range((ulong)bp,
diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h
index b274bf631..57e09f5 100644
--- a/include/asm-alpha/dma-mapping.h
+++ b/include/asm-alpha/dma-mapping.h
@@ -60,7 +60,7 @@ int dma_set_mask(struct device *dev, u64 mask);
 #define dma_sync_single_range(dev, addr, off, size, dir)  do { } while (0)
 #define dma_sync_sg_for_cpu(dev, sg, nents, dir)	  do { } while (0)
 #define dma_sync_sg_for_device(dev, sg, nents, dir)	  do { } while (0)
-#define dma_cache_sync(va, size, dir)			  do { } while (0)
+#define dma_cache_sync(dev, va, size, dir)		  do { } while (0)
 
 #define dma_get_cache_alignment()			  L1_CACHE_BYTES
 
diff --git a/include/asm-avr32/dma-mapping.h b/include/asm-avr32/dma-mapping.h
index 44630be..0580b5d 100644
--- a/include/asm-avr32/dma-mapping.h
+++ b/include/asm-avr32/dma-mapping.h
@@ -8,7 +8,8 @@
 #include <asm/cacheflush.h>
 #include <asm/io.h>
 
-extern void dma_cache_sync(void *vaddr, size_t size, int direction);
+extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+	int direction);
 
 /*
  * Return whether the given device DMA address mask can be supported
diff --git a/include/asm-cris/dma-mapping.h b/include/asm-cris/dma-mapping.h
index af704fd..662cea7 100644
--- a/include/asm-cris/dma-mapping.h
+++ b/include/asm-cris/dma-mapping.h
@@ -159,7 +159,7 @@ dma_get_cache_alignment(void)
 #define dma_is_consistent(d, h)	(1)
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 }
diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h
index 7b97fc7..bcb2df6 100644
--- a/include/asm-frv/dma-mapping.h
+++ b/include/asm-frv/dma-mapping.h
@@ -175,7 +175,7 @@ int dma_get_cache_alignment(void)
 #define dma_is_consistent(d, h)	(1)
 
 static inline
-void dma_cache_sync(void *vaddr, size_t size,
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		    enum dma_data_direction direction)
 {
 	flush_write_buffers();
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index b9be3fc..783ab99 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -295,7 +295,7 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 }
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	/* could define this in terms of the dma_cache ... operations,
diff --git a/include/asm-i386/dma-mapping.h b/include/asm-i386/dma-mapping.h
index 7da64c9..183eebe 100644
--- a/include/asm-i386/dma-mapping.h
+++ b/include/asm-i386/dma-mapping.h
@@ -159,7 +159,7 @@ dma_get_cache_alignment(void)
 #define dma_is_consistent(d, h)	(1)
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	flush_write_buffers();
diff --git a/include/asm-ia64/dma-mapping.h b/include/asm-ia64/dma-mapping.h
index 4b075bc..ebd5887 100644
--- a/include/asm-ia64/dma-mapping.h
+++ b/include/asm-ia64/dma-mapping.h
@@ -50,7 +50,8 @@ dma_set_mask (struct device *dev, u64 mask)
 extern int dma_get_cache_alignment(void);
 
 static inline void
-dma_cache_sync (void *vaddr, size_t size, enum dma_data_direction dir)
+dma_cache_sync (struct device *dev, void *vaddr, size_t size,
+	enum dma_data_direction dir)
 {
 	/*
 	 * IA-64 is cache-coherent, so this is mostly a no-op.  However, we do need to
diff --git a/include/asm-m68k/dma-mapping.h b/include/asm-m68k/dma-mapping.h
index efc89c1..00259ed 100644
--- a/include/asm-m68k/dma-mapping.h
+++ b/include/asm-m68k/dma-mapping.h
@@ -41,7 +41,7 @@ static inline void dma_free_noncoherent(struct device *dev, size_t size,
 {
 	dma_free_coherent(dev, size, addr, handle);
 }
-static inline void dma_cache_sync(void *vaddr, size_t size,
+static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 				  enum dma_data_direction dir)
 {
 	/* we use coherent allocation, so not much to do here. */
diff --git a/include/asm-mips/dma-mapping.h b/include/asm-mips/dma-mapping.h
index e17f70d..236d1a4 100644
--- a/include/asm-mips/dma-mapping.h
+++ b/include/asm-mips/dma-mapping.h
@@ -65,7 +65,7 @@ dma_get_cache_alignment(void)
 
 extern int dma_is_consistent(struct device *dev, dma_addr_t dma_addr);
 
-extern void dma_cache_sync(void *vaddr, size_t size,
+extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction);
 
 #define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
diff --git a/include/asm-parisc/dma-mapping.h b/include/asm-parisc/dma-mapping.h
index c40d48a..66f0b40 100644
--- a/include/asm-parisc/dma-mapping.h
+++ b/include/asm-parisc/dma-mapping.h
@@ -197,7 +197,7 @@ dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 }
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	if(hppa_dma_ops->dma_sync_single_for_cpu)
diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h
index 3cf635b..7c7de87 100644
--- a/include/asm-powerpc/dma-mapping.h
+++ b/include/asm-powerpc/dma-mapping.h
@@ -378,7 +378,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
 	dma_sync_single_for_device(dev, dma_handle, offset + size, direction);
 }
 
-static inline void dma_cache_sync(void *vaddr, size_t size,
+static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		enum dma_data_direction direction)
 {
 	BUG_ON(direction == DMA_NONE);
diff --git a/include/asm-sh/dma-mapping.h b/include/asm-sh/dma-mapping.h
index 56cd4b9..37ab0c1 100644
--- a/include/asm-sh/dma-mapping.h
+++ b/include/asm-sh/dma-mapping.h
@@ -53,7 +53,7 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
 	consistent_free(vaddr, size);
 }
 
-static inline void dma_cache_sync(void *vaddr, size_t size,
+static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 				  enum dma_data_direction dir)
 {
 	consistent_sync(vaddr, size, (int)dir);
diff --git a/include/asm-sh64/dma-mapping.h b/include/asm-sh64/dma-mapping.h
index 68e27a8..5efe906 100644
--- a/include/asm-sh64/dma-mapping.h
+++ b/include/asm-sh64/dma-mapping.h
@@ -35,7 +35,7 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
 	consistent_free(NULL, size, vaddr, dma_handle);
 }
 
-static inline void dma_cache_sync(void *vaddr, size_t size,
+static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 				  enum dma_data_direction dir)
 {
 	dma_cache_wback_inv((unsigned long)vaddr, size);
diff --git a/include/asm-sparc64/dma-mapping.h b/include/asm-sparc64/dma-mapping.h
index 5fe0072..2f858a2 100644
--- a/include/asm-sparc64/dma-mapping.h
+++ b/include/asm-sparc64/dma-mapping.h
@@ -210,7 +210,7 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 }
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	/* could define this in terms of the dma_cache ... operations,
diff --git a/include/asm-um/dma-mapping.h b/include/asm-um/dma-mapping.h
index defb5b8..f0ee4fb 100644
--- a/include/asm-um/dma-mapping.h
+++ b/include/asm-um/dma-mapping.h
@@ -112,7 +112,7 @@ dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
 }
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	BUG();
diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index c8cc488..be9ec68 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -185,7 +185,8 @@ static inline int dma_get_cache_alignment(void)
 extern int dma_set_mask(struct device *dev, u64 mask);
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+	enum dma_data_direction dir)
 {
 	flush_write_buffers();
 }
diff --git a/include/asm-xtensa/dma-mapping.h b/include/asm-xtensa/dma-mapping.h
index 827d1df..82b03b3 100644
--- a/include/asm-xtensa/dma-mapping.h
+++ b/include/asm-xtensa/dma-mapping.h
@@ -173,7 +173,7 @@ dma_get_cache_alignment(void)
 #define dma_is_consistent(d, h)	(1)
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
 {
 	consistent_sync(vaddr, size, direction);
-- 
cgit v0.10.2


From f7dff2b12654149c9cac8d8c79b6588759edd5a9 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 6 Dec 2006 20:38:58 -0800
Subject: [PATCH] Handle per-subsystem mutexes for CONFIG_HOTPLUG_CPU not set

Provide a common interface for all the subsystems to lock and unlock their
per-subsystem hotcpu mutexes.

When CONFIG_HOTPLUG_CPU is not set, these operations would be no-ops.

[akpm@osdl.org: macros -> inlines]
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 71dc6ba..bf00ce6 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -24,6 +24,7 @@
 #include <linux/compiler.h>
 #include <linux/cpumask.h>
 #include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 struct cpu {
 	int node_id;		/* The node which contains the CPU */
@@ -74,6 +75,17 @@ extern struct sysdev_class cpu_sysdev_class;
 
 #ifdef CONFIG_HOTPLUG_CPU
 /* Stop CPUs going up and down. */
+
+static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex)
+{
+	mutex_lock(cpu_hp_mutex);
+}
+
+static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
+{
+	mutex_unlock(cpu_hp_mutex);
+}
+
 extern void lock_cpu_hotplug(void);
 extern void unlock_cpu_hotplug(void);
 #define hotcpu_notifier(fn, pri) {				\
@@ -85,7 +97,14 @@ extern void unlock_cpu_hotplug(void);
 #define unregister_hotcpu_notifier(nb)	unregister_cpu_notifier(nb)
 int cpu_down(unsigned int cpu);
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
-#else
+
+#else		/* CONFIG_HOTPLUG_CPU */
+
+static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex)
+{ }
+static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
+{ }
+
 #define lock_cpu_hotplug()	do { } while (0)
 #define unlock_cpu_hotplug()	do { } while (0)
 #define lock_cpu_hotplug_interruptible() 0
@@ -95,7 +114,7 @@ int cpu_down(unsigned int cpu);
 
 /* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
 static inline int cpu_is_offline(int cpu) { return 0; }
-#endif
+#endif		/* CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_SUSPEND_SMP
 extern int disable_nonboot_cpus(void);
-- 
cgit v0.10.2


From 9399575dd30edcb84e821583daf81d4ba774a95b Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 6 Dec 2006 20:38:59 -0800
Subject: [PATCH] dz: Fixes to make it work

This a set of fixes mostly to make the driver actually work:

1. Actually select the line for setting parameters and receiver
   disable/enable.
2. Select the line for receive and transmit interrupt handling correctly.
3. Report the transmitter empty state correctly.
4. Set the I/O type of ports correctly.
5. Perform polled transmission correctly.
6. Don't fix the console line at ttyS3.
7. Magic SysRq support.
8. Various small bits here and there.

Tested with a DECstation 2100 (thanks Flo for making this possible).

[akpm@osdl.org: fix typo]
Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/decserial.c b/drivers/char/decserial.c
index 85f404e..8ea2bea 100644
--- a/drivers/char/decserial.c
+++ b/drivers/char/decserial.c
@@ -23,20 +23,12 @@
 extern int zs_init(void);
 #endif
 
-#ifdef CONFIG_DZ
-extern int dz_init(void);
-#endif
-
 #ifdef CONFIG_SERIAL_CONSOLE
 
 #ifdef CONFIG_ZS
 extern void zs_serial_console_init(void);
 #endif
 
-#ifdef CONFIG_DZ
-extern void dz_serial_console_init(void);
-#endif
-
 #endif
 
 /* rs_init - starts up the serial interface -
@@ -46,23 +38,11 @@ extern void dz_serial_console_init(void);
 
 int __init rs_init(void)
 {
-
-#if defined(CONFIG_ZS) && defined(CONFIG_DZ)
+#ifdef CONFIG_ZS
     if (IOASIC)
 	return zs_init();
-    else
-	return dz_init();
-#else
-
-#ifdef CONFIG_ZS
-    return zs_init();
-#endif
-
-#ifdef CONFIG_DZ
-    return dz_init();
-#endif
-
 #endif
+    return -ENXIO;
 }
 
 __initcall(rs_init);
@@ -76,21 +56,9 @@ __initcall(rs_init);
  */
 static int __init decserial_console_init(void)
 {
-#if defined(CONFIG_ZS) && defined(CONFIG_DZ)
+#ifdef CONFIG_ZS
     if (IOASIC)
 	zs_serial_console_init();
-    else
-	dz_serial_console_init();
-#else
-
-#ifdef CONFIG_ZS
-    zs_serial_console_init();
-#endif
-
-#ifdef CONFIG_DZ
-    dz_serial_console_init();
-#endif
-
 #endif
     return 0;
 }
diff --git a/drivers/serial/dz.c b/drivers/serial/dz.c
index 53662b3..af1544f 100644
--- a/drivers/serial/dz.c
+++ b/drivers/serial/dz.c
@@ -1,11 +1,13 @@
 /*
- * dz.c: Serial port driver for DECStations equiped
+ * dz.c: Serial port driver for DECstations equipped
  *       with the DZ chipset.
  *
  * Copyright (C) 1998 Olivier A. D. Lebaillif
  *
  * Email: olivier.lebaillif@ifrsys.com
  *
+ * Copyright (C) 2004, 2006  Maciej W. Rozycki
+ *
  * [31-AUG-98] triemer
  * Changed IRQ to use Harald's dec internals interrupts.h
  * removed base_addr code - moving address assignment to setup.c
@@ -26,10 +28,16 @@
 
 #undef DEBUG_DZ
 
+#if defined(CONFIG_SERIAL_DZ_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/console.h>
+#include <linux/sysrq.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/serial_core.h>
@@ -45,14 +53,10 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
-#define CONSOLE_LINE (3)	/* for definition of struct console */
-
 #include "dz.h"
 
-#define DZ_INTR_DEBUG 1
-
 static char *dz_name = "DECstation DZ serial driver version ";
-static char *dz_version = "1.02";
+static char *dz_version = "1.03";
 
 struct dz_port {
 	struct uart_port	port;
@@ -61,22 +65,6 @@ struct dz_port {
 
 static struct dz_port dz_ports[DZ_NB_PORT];
 
-#ifdef DEBUG_DZ
-/*
- * debugging code to send out chars via prom
- */
-static void debug_console(const char *s, int count)
-{
-	unsigned i;
-
-	for (i = 0; i < count; i++) {
-		if (*s == 10)
-			prom_printf("%c", 13);
-		prom_printf("%c", *s++);
-	}
-}
-#endif
-
 /*
  * ------------------------------------------------------------
  * dz_in () and dz_out ()
@@ -90,6 +78,7 @@ static inline unsigned short dz_in(struct dz_port *dport, unsigned offset)
 {
 	volatile unsigned short *addr =
 		(volatile unsigned short *) (dport->port.membase + offset);
+
 	return *addr;
 }
 
@@ -98,6 +87,7 @@ static inline void dz_out(struct dz_port *dport, unsigned offset,
 {
 	volatile unsigned short *addr =
 		(volatile unsigned short *) (dport->port.membase + offset);
+
 	*addr = value;
 }
 
@@ -144,7 +134,7 @@ static void dz_stop_rx(struct uart_port *uport)
 
 	spin_lock_irqsave(&dport->port.lock, flags);
 	dport->cflag &= ~DZ_CREAD;
-	dz_out(dport, DZ_LPR, dport->cflag);
+	dz_out(dport, DZ_LPR, dport->cflag | dport->port.line);
 	spin_unlock_irqrestore(&dport->port.lock, flags);
 }
 
@@ -155,14 +145,14 @@ static void dz_enable_ms(struct uart_port *port)
 
 /*
  * ------------------------------------------------------------
- * Here starts the interrupt handling routines.  All of the
- * following subroutines are declared as inline and are folded
- * into dz_interrupt.  They were separated out for readability's
- * sake.
  *
- * Note: rs_interrupt() is a "fast" interrupt, which means that it
+ * Here start the interrupt handling routines.  All of the following
+ * subroutines are declared as inline and are folded into
+ * dz_interrupt.  They were separated out for readability's sake.
+ *
+ * Note: dz_interrupt() is a "fast" interrupt, which means that it
  * runs with interrupts turned off.  People who may want to modify
- * rs_interrupt() should try to keep the interrupt handler as fast as
+ * dz_interrupt() should try to keep the interrupt handler as fast as
  * possible.  After you are done making modifications, it is not a bad
  * idea to do:
  *
@@ -180,92 +170,74 @@ static void dz_enable_ms(struct uart_port *port)
  * This routine deals with inputs from any lines.
  * ------------------------------------------------------------
  */
-static inline void dz_receive_chars(struct dz_port *dport)
+static inline void dz_receive_chars(struct dz_port *dport_in,
+				    struct pt_regs *regs)
 {
+	struct dz_port *dport;
 	struct tty_struct *tty = NULL;
 	struct uart_icount *icount;
-	int ignore = 0;
-	unsigned short status, tmp;
+	int lines_rx[DZ_NB_PORT] = { [0 ... DZ_NB_PORT - 1] = 0 };
+	unsigned short status;
 	unsigned char ch, flag;
+	int i;
 
-	/* this code is going to be a problem...
-	   the call to tty_flip_buffer is going to need
-	   to be rethought...
-	 */
-	do {
-		status = dz_in(dport, DZ_RBUF);
-
-		/* punt so we don't get duplicate characters */
-		if (!(status & DZ_DVAL))
-			goto ignore_char;
-
-
-		ch = UCHAR(status);	/* grab the char */
-		flag = TTY_NORMAL;
+	while ((status = dz_in(dport_in, DZ_RBUF)) & DZ_DVAL) {
+		dport = &dz_ports[LINE(status)];
+		tty = dport->port.info->tty;	/* point to the proper dev */
 
-#if 0
-		if (info->is_console) {
-			if (ch == 0)
-				return;		/* it's a break ... */
-		}
-#endif
+		ch = UCHAR(status);		/* grab the char */
 
-		tty = dport->port.info->tty;/* now tty points to the proper dev */
 		icount = &dport->port.icount;
-
-		if (!tty)
-			break;
-
 		icount->rx++;
 
-		/* keep track of the statistics */
-		if (status & (DZ_OERR | DZ_FERR | DZ_PERR)) {
-			if (status & DZ_PERR)	/* parity error */
-				icount->parity++;
-			else if (status & DZ_FERR)	/* frame error */
-				icount->frame++;
-			if (status & DZ_OERR)	/* overrun error */
-				icount->overrun++;
-
-			/*  check to see if we should ignore the character
-			   and mask off conditions that should be ignored
+		flag = TTY_NORMAL;
+		if (status & DZ_FERR) {		/* frame error */
+			/*
+			 * There is no separate BREAK status bit, so
+			 * treat framing errors as BREAKs for Magic SysRq
+			 * and SAK; normally, otherwise.
 			 */
-
-			if (status & dport->port.ignore_status_mask) {
-				if (++ignore > 100)
-					break;
-				goto ignore_char;
-			}
-			/* mask off the error conditions we want to ignore */
-			tmp = status & dport->port.read_status_mask;
-
-			if (tmp & DZ_PERR) {
-				flag = TTY_PARITY;
-#ifdef DEBUG_DZ
-				debug_console("PERR\n", 5);
-#endif
-			} else if (tmp & DZ_FERR) {
+			if (uart_handle_break(&dport->port))
+				continue;
+			if (dport->port.flags & UPF_SAK)
+				flag = TTY_BREAK;
+			else
 				flag = TTY_FRAME;
-#ifdef DEBUG_DZ
-				debug_console("FERR\n", 5);
-#endif
-			}
-			if (tmp & DZ_OERR) {
-#ifdef DEBUG_DZ
-				debug_console("OERR\n", 5);
-#endif
-				tty_insert_flip_char(tty, ch, flag);
-				ch = 0;
-				flag = TTY_OVERRUN;
-			}
+		} else if (status & DZ_OERR)	/* overrun error */
+			flag = TTY_OVERRUN;
+		else if (status & DZ_PERR)	/* parity error */
+			flag = TTY_PARITY;
+
+		/* keep track of the statistics */
+		switch (flag) {
+		case TTY_FRAME:
+			icount->frame++;
+			break;
+		case TTY_PARITY:
+			icount->parity++;
+			break;
+		case TTY_OVERRUN:
+			icount->overrun++;
+			break;
+		case TTY_BREAK:
+			icount->brk++;
+			break;
+		default:
+			break;
 		}
-		tty_insert_flip_char(tty, ch, flag);
-	      ignore_char:
-			;
-	} while (status & DZ_DVAL);
 
-	if (tty)
-		tty_flip_buffer_push(tty);
+		if (uart_handle_sysrq_char(&dport->port, ch, regs))
+			continue;
+
+		if ((status & dport->port.ignore_status_mask) == 0) {
+			uart_insert_char(&dport->port,
+					 status, DZ_OERR, ch, flag);
+			lines_rx[LINE(status)] = 1;
+		}
+	}
+	for (i = 0; i < DZ_NB_PORT; i++)
+		if (lines_rx[i])
+			tty_flip_buffer_push(dz_ports[i].port.info->tty);
 }
 
 /*
@@ -275,26 +247,32 @@ static inline void dz_receive_chars(struct dz_port *dport)
  * This routine deals with outputs to any lines.
  * ------------------------------------------------------------
  */
-static inline void dz_transmit_chars(struct dz_port *dport)
+static inline void dz_transmit_chars(struct dz_port *dport_in)
 {
-	struct circ_buf *xmit = &dport->port.info->xmit;
+	struct dz_port *dport;
+	struct circ_buf *xmit;
+	unsigned short status;
 	unsigned char tmp;
 
-	if (dport->port.x_char) {	/* XON/XOFF chars */
+	status = dz_in(dport_in, DZ_CSR);
+	dport = &dz_ports[LINE(status)];
+	xmit = &dport->port.info->xmit;
+
+	if (dport->port.x_char) {		/* XON/XOFF chars */
 		dz_out(dport, DZ_TDR, dport->port.x_char);
 		dport->port.icount.tx++;
 		dport->port.x_char = 0;
 		return;
 	}
-	/* if nothing to do or stopped or hardware stopped */
+	/* If nothing to do or stopped or hardware stopped. */
 	if (uart_circ_empty(xmit) || uart_tx_stopped(&dport->port)) {
 		dz_stop_tx(&dport->port);
 		return;
 	}
 
 	/*
-	 * if something to do ... (rember the dz has no output fifo so we go
-	 * one char at a time :-<
+	 * If something to do... (remember the dz has no output fifo,
+	 * so we go one char at a time) :-<
 	 */
 	tmp = xmit->buf[xmit->tail];
 	xmit->tail = (xmit->tail + 1) & (DZ_XMIT_SIZE - 1);
@@ -304,23 +282,29 @@ static inline void dz_transmit_chars(struct dz_port *dport)
 	if (uart_circ_chars_pending(xmit) < DZ_WAKEUP_CHARS)
 		uart_write_wakeup(&dport->port);
 
-	/* Are we done */
+	/* Are we are done. */
 	if (uart_circ_empty(xmit))
 		dz_stop_tx(&dport->port);
 }
 
 /*
  * ------------------------------------------------------------
- * check_modem_status ()
+ * check_modem_status()
  *
- * Only valid for the MODEM line duh !
+ * DS 3100 & 5100: Only valid for the MODEM line, duh!
+ * DS 5000/200: Valid for the MODEM and PRINTER line.
  * ------------------------------------------------------------
  */
 static inline void check_modem_status(struct dz_port *dport)
 {
+	/*
+	 * FIXME:
+	 * 1. No status change interrupt; use a timer.
+	 * 2. Handle the 3100/5000 as appropriate. --macro
+	 */
 	unsigned short status;
 
-	/* if not ne modem line just return */
+	/* If not the modem line just return.  */
 	if (dport->port.line != DZ_MODEM)
 		return;
 
@@ -341,21 +325,18 @@ static inline void check_modem_status(struct dz_port *dport)
  */
 static irqreturn_t dz_interrupt(int irq, void *dev)
 {
-	struct dz_port *dport;
+	struct dz_port *dport = (struct dz_port *)dev;
 	unsigned short status;
 
 	/* get the reason why we just got an irq */
-	status = dz_in((struct dz_port *)dev, DZ_CSR);
-	dport = &dz_ports[LINE(status)];
+	status = dz_in(dport, DZ_CSR);
 
-	if (status & DZ_RDONE)
-		dz_receive_chars(dport);
+	if ((status & (DZ_RDONE | DZ_RIE)) == (DZ_RDONE | DZ_RIE))
+		dz_receive_chars(dport, regs);
 
-	if (status & DZ_TRDY)
+	if ((status & (DZ_TRDY | DZ_TIE)) == (DZ_TRDY | DZ_TIE))
 		dz_transmit_chars(dport);
 
-	/* FIXME: what about check modem status??? --rmk */
-
 	return IRQ_HANDLED;
 }
 
@@ -367,13 +348,13 @@ static irqreturn_t dz_interrupt(int irq, void *dev)
 
 static unsigned int dz_get_mctrl(struct uart_port *uport)
 {
+	/*
+	 * FIXME: Handle the 3100/5000 as appropriate. --macro
+	 */
 	struct dz_port *dport = (struct dz_port *)uport;
 	unsigned int mctrl = TIOCM_CAR | TIOCM_DSR | TIOCM_CTS;
 
 	if (dport->port.line == DZ_MODEM) {
-		/*
-		 * CHECKME: This is a guess from the other code... --rmk
-		 */
 		if (dz_in(dport, DZ_MSR) & DZ_MODEM_DSR)
 			mctrl &= ~TIOCM_DSR;
 	}
@@ -383,6 +364,9 @@ static unsigned int dz_get_mctrl(struct uart_port *uport)
 
 static void dz_set_mctrl(struct uart_port *uport, unsigned int mctrl)
 {
+	/*
+	 * FIXME: Handle the 3100/5000 as appropriate. --macro
+	 */
 	struct dz_port *dport = (struct dz_port *)uport;
 	unsigned short tmp;
 
@@ -409,13 +393,6 @@ static int dz_startup(struct uart_port *uport)
 	unsigned long flags;
 	unsigned short tmp;
 
-	/* The dz lines for the mouse/keyboard must be
-	 * opened using their respective drivers.
-	 */
-	if ((dport->port.line == DZ_KEYBOARD) ||
-	    (dport->port.line == DZ_MOUSE))
-		return -ENODEV;
-
 	spin_lock_irqsave(&dport->port.lock, flags);
 
 	/* enable the interrupt and the scanning */
@@ -442,7 +419,8 @@ static void dz_shutdown(struct uart_port *uport)
 }
 
 /*
- * get_lsr_info - get line status register info
+ * -------------------------------------------------------------------
+ * dz_tx_empty() -- get the transmitter empty status
  *
  * Purpose: Let user call ioctl() to get info when the UART physically
  *          is emptied.  On bus types like RS485, the transmitter must
@@ -450,21 +428,28 @@ static void dz_shutdown(struct uart_port *uport)
  *          the transmit shift register is empty, not be done when the
  *          transmit holding register is empty.  This functionality
  *          allows an RS485 driver to be written in user space.
+ * -------------------------------------------------------------------
  */
 static unsigned int dz_tx_empty(struct uart_port *uport)
 {
 	struct dz_port *dport = (struct dz_port *)uport;
-	unsigned short status = dz_in(dport, DZ_LPR);
+	unsigned short tmp, mask = 1 << dport->port.line;
 
-	/* FIXME: this appears to be obviously broken --rmk. */
-	return status ? TIOCSER_TEMT : 0;
+	tmp = dz_in(dport, DZ_TCR);
+	tmp &= mask;
+
+	return tmp ? 0 : TIOCSER_TEMT;
 }
 
 static void dz_break_ctl(struct uart_port *uport, int break_state)
 {
+	/*
+	 * FIXME: Can't access BREAK bits in TDR easily;
+	 * reuse the code for polled TX. --macro
+	 */
 	struct dz_port *dport = (struct dz_port *)uport;
 	unsigned long flags;
-	unsigned short tmp, mask = 1 << uport->line;
+	unsigned short tmp, mask = 1 << dport->port.line;
 
 	spin_lock_irqsave(&uport->lock, flags);
 	tmp = dz_in(dport, DZ_TCR);
@@ -561,7 +546,7 @@ static void dz_set_termios(struct uart_port *uport, struct termios *termios,
 
 	spin_lock_irqsave(&dport->port.lock, flags);
 
-	dz_out(dport, DZ_LPR, cflag);
+	dz_out(dport, DZ_LPR, cflag | dport->port.line);
 	dport->cflag = cflag;
 
 	/* setup accept flag */
@@ -650,7 +635,7 @@ static void __init dz_init_ports(void)
 	for (i = 0, dport = dz_ports; i < DZ_NB_PORT; i++, dport++) {
 		spin_lock_init(&dport->port.lock);
 		dport->port.membase	= (char *) base;
-		dport->port.iotype	= UPIO_PORT;
+		dport->port.iotype	= UPIO_MEM;
 		dport->port.irq		= dec_interrupt[DEC_IRQ_DZ11];
 		dport->port.line	= i;
 		dport->port.fifosize	= 1;
@@ -662,10 +647,7 @@ static void __init dz_init_ports(void)
 static void dz_reset(struct dz_port *dport)
 {
 	dz_out(dport, DZ_CSR, DZ_CLR);
-
 	while (dz_in(dport, DZ_CSR) & DZ_CLR);
-		/* FIXME: cpu_relax? */
-
 	iob();
 
 	/* enable scanning */
@@ -673,26 +655,55 @@ static void dz_reset(struct dz_port *dport)
 }
 
 #ifdef CONFIG_SERIAL_DZ_CONSOLE
+/*
+ * -------------------------------------------------------------------
+ * dz_console_putchar() -- transmit a character
+ *
+ * Polled transmission.  This is tricky.  We need to mask transmit
+ * interrupts so that they do not interfere, enable the transmitter
+ * for the line requested and then wait till the transmit scanner
+ * requests data for this line.  But it may request data for another
+ * line first, in which case we have to disable its transmitter and
+ * repeat waiting till our line pops up.  Only then the character may
+ * be transmitted.  Finally, the state of the transmitter mask is
+ * restored.  Welcome to the world of PDP-11!
+ * -------------------------------------------------------------------
+ */
 static void dz_console_putchar(struct uart_port *uport, int ch)
 {
 	struct dz_port *dport = (struct dz_port *)uport;
 	unsigned long flags;
-	int loops = 2500;
-	unsigned short tmp = (unsigned char)ch;
-	/* this code sends stuff out to serial device - spinning its
-	   wheels and waiting. */
+	unsigned short csr, tcr, trdy, mask;
+	int loops = 10000;
 
 	spin_lock_irqsave(&dport->port.lock, flags);
+	csr = dz_in(dport, DZ_CSR);
+	dz_out(dport, DZ_CSR, csr & ~DZ_TIE);
+	tcr = dz_in(dport, DZ_TCR);
+	tcr |= 1 << dport->port.line;
+	mask = tcr;
+	dz_out(dport, DZ_TCR, mask);
+	iob();
+	spin_unlock_irqrestore(&dport->port.lock, flags);
 
-	/* spin our wheels */
-	while (((dz_in(dport, DZ_CSR) & DZ_TRDY) != DZ_TRDY) && loops--)
-		/* FIXME: cpu_relax, udelay? --rmk */
-		;
+	while (loops--) {
+		trdy = dz_in(dport, DZ_CSR);
+		if (!(trdy & DZ_TRDY))
+			continue;
+		trdy = (trdy & DZ_TLINE) >> 8;
+		if (trdy == dport->port.line)
+			break;
+		mask &= ~(1 << trdy);
+		dz_out(dport, DZ_TCR, mask);
+		iob();
+		udelay(2);
+	}
 
-	/* Actually transmit the character. */
-	dz_out(dport, DZ_TDR, tmp);
+	if (loops)				/* Cannot send otherwise. */
+		dz_out(dport, DZ_TDR, ch);
 
-	spin_unlock_irqrestore(&dport->port.lock, flags);
+	dz_out(dport, DZ_TCR, tcr);
+	dz_out(dport, DZ_CSR, csr);
 }
 
 /*
@@ -703,11 +714,11 @@ static void dz_console_putchar(struct uart_port *uport, int ch)
  * The console must be locked when we get here.
  * -------------------------------------------------------------------
  */
-static void dz_console_print(struct console *cons,
+static void dz_console_print(struct console *co,
 			     const char *str,
 			     unsigned int count)
 {
-	struct dz_port *dport = &dz_ports[CONSOLE_LINE];
+	struct dz_port *dport = &dz_ports[co->index];
 #ifdef DEBUG_DZ
 	prom_printf((char *) str);
 #endif
@@ -716,49 +727,43 @@ static void dz_console_print(struct console *cons,
 
 static int __init dz_console_setup(struct console *co, char *options)
 {
-	struct dz_port *dport = &dz_ports[CONSOLE_LINE];
+	struct dz_port *dport = &dz_ports[co->index];
 	int baud = 9600;
 	int bits = 8;
 	int parity = 'n';
 	int flow = 'n';
-	int ret;
-	unsigned short mask, tmp;
 
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
 
 	dz_reset(dport);
 
-	ret = uart_set_options(&dport->port, co, baud, parity, bits, flow);
-	if (ret == 0) {
-		mask = 1 << dport->port.line;
-		tmp = dz_in(dport, DZ_TCR);	/* read the TX flag */
-		if (!(tmp & mask)) {
-			tmp |= mask;		/* set the TX flag */
-			dz_out(dport, DZ_TCR, tmp);
-		}
-	}
-
-	return ret;
+	return uart_set_options(&dport->port, co, baud, parity, bits, flow);
 }
 
-static struct console dz_sercons =
-{
+static struct uart_driver dz_reg;
+static struct console dz_sercons = {
 	.name	= "ttyS",
 	.write	= dz_console_print,
 	.device	= uart_console_device,
 	.setup	= dz_console_setup,
-	.flags	= CON_CONSDEV | CON_PRINTBUFFER,
-	.index	= CONSOLE_LINE,
+	.flags	= CON_PRINTBUFFER,
+	.index	= -1,
+	.data	= &dz_reg,
 };
 
-void __init dz_serial_console_init(void)
+static int __init dz_serial_console_init(void)
 {
-	dz_init_ports();
-
-	register_console(&dz_sercons);
+	if (!IOASIC) {
+		dz_init_ports();
+		register_console(&dz_sercons);
+		return 0;
+	} else
+		return -ENXIO;
 }
 
+console_initcall(dz_serial_console_init);
+
 #define SERIAL_DZ_CONSOLE	&dz_sercons
 #else
 #define SERIAL_DZ_CONSOLE	NULL
@@ -767,35 +772,29 @@ void __init dz_serial_console_init(void)
 static struct uart_driver dz_reg = {
 	.owner			= THIS_MODULE,
 	.driver_name		= "serial",
-	.dev_name		= "ttyS%d",
+	.dev_name		= "ttyS",
 	.major			= TTY_MAJOR,
 	.minor			= 64,
 	.nr			= DZ_NB_PORT,
 	.cons			= SERIAL_DZ_CONSOLE,
 };
 
-int __init dz_init(void)
+static int __init dz_init(void)
 {
-	unsigned long flags;
 	int ret, i;
 
+	if (IOASIC)
+		return -ENXIO;
+
 	printk("%s%s\n", dz_name, dz_version);
 
 	dz_init_ports();
 
-	save_flags(flags);
-	cli();
-
 #ifndef CONFIG_SERIAL_DZ_CONSOLE
 	/* reset the chip */
 	dz_reset(&dz_ports[0]);
 #endif
 
-	/* order matters here... the trick is that flags
-	   is updated... in request_irq - to immediatedly obliterate
-	   it is unwise. */
-	restore_flags(flags);
-
 	if (request_irq(dz_ports[0].port.irq, dz_interrupt,
 			IRQF_DISABLED, "DZ", &dz_ports[0]))
 		panic("Unable to register DZ interrupt");
@@ -810,5 +809,7 @@ int __init dz_init(void)
 	return ret;
 }
 
+module_init(dz_init);
+
 MODULE_DESCRIPTION("DECstation DZ serial driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/serial/dz.h b/drivers/serial/dz.h
index 86ef417..9674d4e 100644
--- a/drivers/serial/dz.h
+++ b/drivers/serial/dz.h
@@ -1,20 +1,22 @@
 /*
- * dz.h: Serial port driver for DECStations equiped 
+ * dz.h: Serial port driver for DECstations equipped
  *       with the DZ chipset.
  *
  * Copyright (C) 1998 Olivier A. D. Lebaillif 
  *             
  * Email: olivier.lebaillif@ifrsys.com
  *
+ * Copyright (C) 2004, 2006  Maciej W. Rozycki
  */
 #ifndef DZ_SERIAL_H
 #define DZ_SERIAL_H
 
 /*
- * Definitions for the Control and Status Received.
+ * Definitions for the Control and Status Register.
  */
 #define DZ_TRDY        0x8000                 /* Transmitter empty */
-#define DZ_TIE         0x4000                 /* Transmitter Interrupt Enable */
+#define DZ_TIE         0x4000                 /* Transmitter Interrupt Enbl */
+#define DZ_TLINE       0x0300                 /* Transmitter Line Number */
 #define DZ_RDONE       0x0080                 /* Receiver data ready */
 #define DZ_RIE         0x0040                 /* Receive Interrupt Enable */
 #define DZ_MSE         0x0020                 /* Master Scan Enable */
@@ -22,32 +24,44 @@
 #define DZ_MAINT       0x0008                 /* Loop Back Mode */
 
 /*
- * Definitions for the Received buffer. 
+ * Definitions for the Receiver Buffer Register.
  */
-#define DZ_RBUF_MASK   0x00FF                 /* Data Mask in the Receive Buffer */
-#define DZ_LINE_MASK   0x0300                 /* Line Mask in the Receive Buffer */
+#define DZ_RBUF_MASK   0x00FF                 /* Data Mask */
+#define DZ_LINE_MASK   0x0300                 /* Line Mask */
 #define DZ_DVAL        0x8000                 /* Valid Data indicator */
 #define DZ_OERR        0x4000                 /* Overrun error indicator */
 #define DZ_FERR        0x2000                 /* Frame error indicator */
 #define DZ_PERR        0x1000                 /* Parity error indicator */
 
-#define LINE(x) (x & DZ_LINE_MASK) >> 8       /* Get the line number from the input buffer */
-#define UCHAR(x) (unsigned char)(x & DZ_RBUF_MASK)
+#define LINE(x) ((x & DZ_LINE_MASK) >> 8)     /* Get the line number
+                                                 from the input buffer */
+#define UCHAR(x) ((unsigned char)(x & DZ_RBUF_MASK))
 
 /*
- * Definitions for the Transmit Register.
+ * Definitions for the Transmit Control Register.
  */
 #define DZ_LINE_KEYBOARD 0x0001
 #define DZ_LINE_MOUSE    0x0002
 #define DZ_LINE_MODEM    0x0004
 #define DZ_LINE_PRINTER  0x0008
 
+#define DZ_MODEM_RTS     0x0800               /* RTS for the modem line (2) */
 #define DZ_MODEM_DTR     0x0400               /* DTR for the modem line (2) */
+#define DZ_PRINT_RTS     0x0200               /* RTS for the prntr line (3) */
+#define DZ_PRINT_DTR     0x0100               /* DTR for the prntr line (3) */
+#define DZ_LNENB         0x000f               /* Transmitter Line Enable */
 
 /*
  * Definitions for the Modem Status Register.
  */
+#define DZ_MODEM_RI      0x0800               /* RI for the modem line (2) */
+#define DZ_MODEM_CD      0x0400               /* CD for the modem line (2) */
 #define DZ_MODEM_DSR     0x0200               /* DSR for the modem line (2) */
+#define DZ_MODEM_CTS     0x0100               /* CTS for the modem line (2) */
+#define DZ_PRINT_RI      0x0008               /* RI for the printer line (3) */
+#define DZ_PRINT_CD      0x0004               /* CD for the printer line (3) */
+#define DZ_PRINT_DSR     0x0002               /* DSR for the prntr line (3) */
+#define DZ_PRINT_CTS     0x0001               /* CTS for the prntr line (3) */
 
 /*
  * Definitions for the Transmit Data Register.
-- 
cgit v0.10.2


From 01afb2134ed079fa4551b4d26f62423df6790c09 Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Wed, 6 Dec 2006 20:39:01 -0800
Subject: [PATCH] reiser: replace kmalloc+memset with kzalloc

Replace kmalloc+memset with kzalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6526498..970ecd9 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -317,12 +317,11 @@ static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handl
 			/* area filled with zeroes, to supply as list of zero blocknumbers
 			   We allocate it outside of loop just in case loop would spin for
 			   several iterations. */
-			char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
+			char *zeros = kzalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
 			if (!zeros) {
 				res = -ENOMEM;
 				goto error_exit_free_blocks;
 			}
-			memset(zeros, 0, to_paste * UNFM_P_SIZE);
 			do {
 				to_paste =
 				    min_t(__u64, hole_size,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a625688..254239e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -929,15 +929,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			if (blocks_needed == 1) {
 				un = &unf_single;
 			} else {
-				un = kmalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
+				un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
 				if (!un) {
 					un = &unf_single;
 					blocks_needed = 1;
 					max_to_insert = 0;
-				} else
-					memset(un, 0,
-					       UNFM_P_SIZE * min(blocks_needed,
-								 max_to_insert));
+				}
 			}
 			if (blocks_needed <= max_to_insert) {
 				/* we are going to add target block to the file. Use allocated
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 745bc71..7fb5fb0 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1549,13 +1549,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	struct reiserfs_sb_info *sbi;
 	int errval = -EINVAL;
 
-	sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
 	if (!sbi) {
 		errval = -ENOMEM;
 		goto error;
 	}
 	s->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(struct reiserfs_sb_info));
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
-- 
cgit v0.10.2


From 95362fa90312ff2d52c0b4d42412cd7ceeb3b89b Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 6 Dec 2006 20:39:03 -0800
Subject: [PATCH] futex: init error check

Check register_filesystem() and kern_mount() return values.

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/futex.c b/kernel/futex.c
index 7c0d0d4..d60b7f7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1857,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
 
 static int __init init(void)
 {
-	unsigned int i;
+	int i = register_filesystem(&futex_fs_type);
+
+	if (i)
+		return i;
 
-	register_filesystem(&futex_fs_type);
 	futex_mnt = kern_mount(&futex_fs_type);
+	if (IS_ERR(futex_mnt)) {
+		unregister_filesystem(&futex_fs_type);
+		return PTR_ERR(futex_mnt);
+	}
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
 		INIT_LIST_HEAD(&futex_queues[i].chain);
-- 
cgit v0.10.2


From 7b92ff01c2cd21567c3342c1c77d5e4f1a2bd699 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 6 Dec 2006 20:39:05 -0800
Subject: [PATCH] spi: check platform_device_register_simple() error

Check the return value of platform_device_register_simple().

Cc: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/spi/spi_butterfly.c b/drivers/spi/spi_butterfly.c
index c2f601f..312987a 100644
--- a/drivers/spi/spi_butterfly.c
+++ b/drivers/spi/spi_butterfly.c
@@ -251,6 +251,8 @@ static void butterfly_attach(struct parport *p)
 	 * setting up a platform device like this is an ugly kluge...
 	 */
 	pdev = platform_device_register_simple("butterfly", -1, NULL, 0);
+	if (IS_ERR(pdev))
+		return;
 
 	master = spi_alloc_master(&pdev->dev, sizeof *pp);
 	if (!master) {
-- 
cgit v0.10.2


From 36499dc2bc8025bc931a0fb22bbe0ac0e46ffb14 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 6 Dec 2006 20:39:06 -0800
Subject: [PATCH] synclink_gt: fix init error handling

Initialization synclink_gt forgot to unregister pci driver on error path.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 07f34d4..83b5d37 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -3522,6 +3522,7 @@ static int __init slgt_init(void)
 
 	if (!slgt_device_list) {
 		printk("%s no devices found\n",driver_name);
+		pci_unregister_driver(&pci_driver);
 		return -ENODEV;
 	}
 
-- 
cgit v0.10.2


From bd9b0bac6f601655044fc35978e26231dffee03e Mon Sep 17 00:00:00 2001
From: "BP, Praveen" <praveenbp@ti.com>
Date: Wed, 6 Dec 2006 20:39:09 -0800
Subject: [PATCH] sysctl: string length calculated is wrong if it contains
 negative numbers

In the functions do_proc_dointvec() and do_proc_doulongvec_minmax(),
there seems to be a bug in string length calculation if string contains
negative integer.

The console log given below explains the bug. Setting negative values
may not be a right thing to do for "console log level" but then the test
(given below) can be used to demonstrate the bug in the code.

# echo "-1 -1 -1 -123456" > /proc/sys/kernel/printk
# cat /proc/sys/kernel/printk
-1      -1      -1      -1234
#
# echo "-1 -1 -1 123456" > /proc/sys/kernel/printk
# cat /proc/sys/kernel/printk
-1      -1      -1      1234
#

(akpm: the bug is that 123456 gets truncated)

It works as expected if string contains all +ve integers

# echo "1 2 3 4" > /proc/sys/kernel/printk
# cat /proc/sys/kernel/printk
1       2       3       4
#

The patch given below fixes the issue.

Signed-off-by: Praveen BP <praveenbp@ti.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7abe970..6d7147c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1875,7 +1875,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
 			p = buf;
 			if (*p == '-' && left > 1) {
 				neg = 1;
-				left--, p++;
+				p++;
 			}
 			if (*p < '0' || *p > '9')
 				break;
@@ -2126,7 +2126,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
 			p = buf;
 			if (*p == '-' && left > 1) {
 				neg = 1;
-				left--, p++;
+				p++;
 			}
 			if (*p < '0' || *p > '9')
 				break;
-- 
cgit v0.10.2


From 301827acbe49d0ba7ec9770803970893ac9ded97 Mon Sep 17 00:00:00 2001
From: Chris Caputo <ccaputo@alt.net>
Date: Wed, 6 Dec 2006 20:39:11 -0800
Subject: [PATCH] sched: correct output of show_state()

At present show_state prints a header the does not match the output of
show_task, as follows:

-
                                               sibling
  task             PC      pid father child younger older
init          S 00000000     0     1      0     2               (NOTLB)
-

This patch corrects the output of show_state so that the header is
aligned with the data, ala:

-
                         free                        sibling
  task             PC    stack   pid father child younger older
init          S 00000000     0     1      0     2               (NOTLB)
-

Signed-off-by: Chris Caputo <ccaputo@alt.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/sched.c b/kernel/sched.c
index c83f531..b43cef0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4822,12 +4822,12 @@ void show_state_filter(unsigned long state_filter)
 
 #if (BITS_PER_LONG == 32)
 	printk("\n"
-	       "                                               sibling\n");
-	printk("  task             PC      pid father child younger older\n");
+	       "                         free                        sibling\n");
+	printk("  task             PC    stack   pid father child younger older\n");
 #else
 	printk("\n"
-	       "                                                       sibling\n");
-	printk("  task                 PC          pid father child younger older\n");
+	       "                                 free                        sibling\n");
+	printk("  task                 PC        stack   pid father child younger older\n");
 #endif
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-- 
cgit v0.10.2


From c55747682e938c57a9a859d3b26f2c4c83cea011 Mon Sep 17 00:00:00 2001
From: "Vladimir V. Saveliev" <vs@namesys.com>
Date: Wed, 6 Dec 2006 20:39:12 -0800
Subject: [PATCH] reiserfs: do not add save links for O_DIRECT writes

We add a save link for O_DIRECT writes to protect the i_size against the
crashes before we actually finish the I/O.  If we hit an -ENOSPC in
aops->prepare_write(), we would do a truncate() to release the blocks which
might have got initialized.  Now the truncate would add another save link
for the same inode causing a reiserfs panic for having multiple save links
for the same inode.

Signed-off-by: Vladimir V. Saveliev <vs@namesys.com>
Signed-off-by: Amit Arora <amitarora@in.ibm.com>
Signed-off-by: Suzuki K P <suzuki@in.ibm.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 970ecd9..373d862 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -406,6 +406,8 @@ static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handl
 				   we restart it. This will also free the path. */
 				if (journal_transaction_should_end
 				    (th, th->t_blocks_allocated)) {
+					inode->i_size = cpu_key_k_offset(&key) +
+						(to_paste << inode->i_blkbits);
 					res =
 					    restart_transaction(th, inode,
 								&path);
@@ -1310,56 +1312,8 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 			count = MAX_NON_LFS - (unsigned long)*ppos;
 	}
 
-	if (file->f_flags & O_DIRECT) {	// Direct IO needs treatment
-		ssize_t result, after_file_end = 0;
-		if ((*ppos + count >= inode->i_size)
-		    || (file->f_flags & O_APPEND)) {
-			/* If we are appending a file, we need to put this savelink in here.
-			   If we will crash while doing direct io, finish_unfinished will
-			   cut the garbage from the file end. */
-			reiserfs_write_lock(inode->i_sb);
-			err =
-			    journal_begin(&th, inode->i_sb,
-					  JOURNAL_PER_BALANCE_CNT);
-			if (err) {
-				reiserfs_write_unlock(inode->i_sb);
-				return err;
-			}
-			reiserfs_update_inode_transaction(inode);
-			add_save_link(&th, inode, 1 /* Truncate */ );
-			after_file_end = 1;
-			err =
-			    journal_end(&th, inode->i_sb,
-					JOURNAL_PER_BALANCE_CNT);
-			reiserfs_write_unlock(inode->i_sb);
-			if (err)
-				return err;
-		}
-		result = do_sync_write(file, buf, count, ppos);
-
-		if (after_file_end) {	/* Now update i_size and remove the savelink */
-			struct reiserfs_transaction_handle th;
-			reiserfs_write_lock(inode->i_sb);
-			err = journal_begin(&th, inode->i_sb, 1);
-			if (err) {
-				reiserfs_write_unlock(inode->i_sb);
-				return err;
-			}
-			reiserfs_update_inode_transaction(inode);
-			mark_inode_dirty(inode);
-			err = journal_end(&th, inode->i_sb, 1);
-			if (err) {
-				reiserfs_write_unlock(inode->i_sb);
-				return err;
-			}
-			err = remove_save_link(inode, 1 /* truncate */ );
-			reiserfs_write_unlock(inode->i_sb);
-			if (err)
-				return err;
-		}
-
-		return result;
-	}
+	if (file->f_flags & O_DIRECT)
+		return do_sync_write(file, buf, count, ppos);
 
 	if (unlikely((ssize_t) count < 0))
 		return -EINVAL;
-- 
cgit v0.10.2


From c6f24f99cd70a383699bdb34ddd9e6e51c83304c Mon Sep 17 00:00:00 2001
From: Riku Voipio <riku.voipio@movial.fi>
Date: Wed, 6 Dec 2006 20:39:13 -0800
Subject: [PATCH] rtc-rs5c372: change register reading method

According to the datasheet rs5c372 supports three different methods for
reading register values.  Change from method #1 to method #3, since method #3
is the only one that works on Thecus N2100 board with this RTC.

Signed-off-by: Riku Voipio <riku.voipio@movial.fi>
Cc: Alessandro Zummo <alessandro.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c
index 9e1bb3a..e2c7698f 100644
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -13,7 +13,7 @@
 #include <linux/rtc.h>
 #include <linux/bcd.h>
 
-#define DRV_VERSION "0.2"
+#define DRV_VERSION "0.3"
 
 /* Addresses to scan */
 static unsigned short normal_i2c[] = { /* 0x32,*/ I2C_CLIENT_END };
@@ -39,6 +39,14 @@ static int rs5c372_attach(struct i2c_adapter *adapter);
 static int rs5c372_detach(struct i2c_client *client);
 static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind);
 
+struct rs5c372 {
+	u8 reg_addr;
+	u8 regs[17];
+	struct i2c_msg msg[1];
+	struct i2c_client client;
+	struct rtc_device *rtc;
+};
+
 static struct i2c_driver rs5c372_driver = {
 	.driver		= {
 		.name	= "rs5c372",
@@ -49,18 +57,16 @@ static struct i2c_driver rs5c372_driver = {
 
 static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm)
 {
-	unsigned char buf[7] = { RS5C372_REG_BASE };
 
-	/* this implements the 1st reading method, according
-	 * to the datasheet. buf[0] is initialized with
-	 * address ptr and transmission format register.
+	struct rs5c372 *rs5c372 = i2c_get_clientdata(client);
+	u8 *buf = &(rs5c372->regs[1]);
+
+	/* this implements the 3rd reading method, according
+	 * to the datasheet. rs5c372 defaults to internal
+	 * address 0xF, so 0x0 is in regs[1]
 	 */
-	struct i2c_msg msgs[] = {
-		{ client->addr, 0, 1, buf },
-		{ client->addr, I2C_M_RD, 7, buf },
-	};
 
-	if ((i2c_transfer(client->adapter, msgs, 2)) != 2) {
+	if ((i2c_transfer(client->adapter, rs5c372->msg, 1)) != 1) {
 		dev_err(&client->dev, "%s: read error\n", __FUNCTION__);
 		return -EIO;
 	}
@@ -114,23 +120,14 @@ static int rs5c372_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 
 static int rs5c372_get_trim(struct i2c_client *client, int *osc, int *trim)
 {
-	unsigned char buf = RS5C372_REG_TRIM;
-
-	struct i2c_msg msgs[] = {
-		{ client->addr, 0, 1, &buf },
-		{ client->addr, I2C_M_RD, 1, &buf },
-	};
-
-	if ((i2c_transfer(client->adapter, msgs, 2)) != 2) {
-		dev_err(&client->dev, "%s: read error\n", __FUNCTION__);
-		return -EIO;
-	}
+	struct rs5c372 *rs5c372 = i2c_get_clientdata(client);
+	u8 tmp = rs5c372->regs[RS5C372_REG_TRIM + 1];
 
 	if (osc)
-		*osc = (buf & RS5C372_TRIM_XSL) ? 32000 : 32768;
+		*osc = (tmp & RS5C372_TRIM_XSL) ? 32000 : 32768;
 
 	if (trim) {
-		*trim = buf & RS5C372_TRIM_MASK;
+		*trim = tmp & RS5C372_TRIM_MASK;
 		dev_dbg(&client->dev, "%s: raw trim=%x\n", __FUNCTION__, *trim);
 	}
 
@@ -201,7 +198,7 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
 {
 	int err = 0;
 	struct i2c_client *client;
-	struct rtc_device *rtc;
+	struct rs5c372 *rs5c372;
 
 	dev_dbg(&adapter->dev, "%s\n", __FUNCTION__);
 
@@ -210,10 +207,11 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
 		goto exit;
 	}
 
-	if (!(client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL))) {
+	if (!(rs5c372 = kzalloc(sizeof(struct rs5c372), GFP_KERNEL))) {
 		err = -ENOMEM;
 		goto exit;
 	}
+	client = &rs5c372->client;
 
 	/* I2C client */
 	client->addr = address;
@@ -222,26 +220,33 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
 
 	strlcpy(client->name, rs5c372_driver.driver.name, I2C_NAME_SIZE);
 
+	i2c_set_clientdata(client, rs5c372);
+
+	rs5c372->msg[0].addr = address;
+	rs5c372->msg[0].flags = I2C_M_RD;
+	rs5c372->msg[0].len = sizeof(rs5c372->regs);
+	rs5c372->msg[0].buf = rs5c372->regs;
+
 	/* Inform the i2c layer */
 	if ((err = i2c_attach_client(client)))
 		goto exit_kfree;
 
 	dev_info(&client->dev, "chip found, driver version " DRV_VERSION "\n");
 
-	rtc = rtc_device_register(rs5c372_driver.driver.name, &client->dev,
-				&rs5c372_rtc_ops, THIS_MODULE);
+	rs5c372->rtc = rtc_device_register(rs5c372_driver.driver.name,
+				&client->dev, &rs5c372_rtc_ops, THIS_MODULE);
 
-	if (IS_ERR(rtc)) {
-		err = PTR_ERR(rtc);
+	if (IS_ERR(rs5c372->rtc)) {
+		err = PTR_ERR(rs5c372->rtc);
 		goto exit_detach;
 	}
 
-	i2c_set_clientdata(client, rtc);
-
 	err = device_create_file(&client->dev, &dev_attr_trim);
-	if (err) goto exit_devreg;
+	if (err)
+		goto exit_devreg;
 	err = device_create_file(&client->dev, &dev_attr_osc);
-	if (err) goto exit_trim;
+	if (err)
+		goto exit_trim;
 
 	return 0;
 
@@ -249,13 +254,13 @@ exit_trim:
 	device_remove_file(&client->dev, &dev_attr_trim);
 
 exit_devreg:
-	rtc_device_unregister(rtc);
+	rtc_device_unregister(rs5c372->rtc);
 
 exit_detach:
 	i2c_detach_client(client);
 
 exit_kfree:
-	kfree(client);
+	kfree(rs5c372);
 
 exit:
 	return err;
@@ -264,16 +269,15 @@ exit:
 static int rs5c372_detach(struct i2c_client *client)
 {
 	int err;
-	struct rtc_device *rtc = i2c_get_clientdata(client);
+	struct rs5c372 *rs5c372 = i2c_get_clientdata(client);
 
-	if (rtc)
-		rtc_device_unregister(rtc);
+	if (rs5c372->rtc)
+		rtc_device_unregister(rs5c372->rtc);
 
 	if ((err = i2c_detach_client(client)))
 		return err;
 
-	kfree(client);
-
+	kfree(rs5c372);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 4e229beff7ee43d3d5e387ec91188b20f6267c00 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:39:14 -0800
Subject: [PATCH] REPORTING-BUGS: request .config file

Add kernel .config file to REPORTING-BUGS.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/REPORTING-BUGS b/REPORTING-BUGS
index f9da827..ac02e42 100644
--- a/REPORTING-BUGS
+++ b/REPORTING-BUGS
@@ -40,7 +40,9 @@ summary from [1.]>" for easy identification by the developers.
 [1.] One line summary of the problem:
 [2.] Full description of the problem/report:
 [3.] Keywords (i.e., modules, networking, kernel):
-[4.] Kernel version (from /proc/version):
+[4.] Kernel information
+[4.1.] Kernel version (from /proc/version):
+[4.2.] Kernel .config file:
 [5.] Most recent kernel version which did not have the bug:
 [6.] Output of Oops.. message (if applicable) with symbolic information
      resolved (see Documentation/oops-tracing.txt)
-- 
cgit v0.10.2


From 0b71c8e76d20d0329bf7e54f172389f3c343dc41 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Wed, 6 Dec 2006 20:39:15 -0800
Subject: [PATCH] remove useless carta_random32.h

Remove the carta_random32.h header file.  The carta_random32() function was
was put in and removed in favor of random32().  In the removal process, the
header file was forgotten.

Signed-off-by: Stephane Eranian <eranian@hpl.hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/carta_random32.h b/include/linux/carta_random32.h
deleted file mode 100644
index f6f3bd9..0000000
--- a/include/linux/carta_random32.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Fast, simple, yet decent quality random number generator based on
- * a paper by David G. Carta ("Two Fast Implementations of the
- * `Minimal Standard' Random Number Generator," Communications of the
- * ACM, January, 1990).
- *
- * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
- *	Contributed by Stephane Eranian <eranian@hpl.hp.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- * 02111-1307 USA
- */
-#ifndef _LINUX_CARTA_RANDOM32_H_
-#define _LINUX_CARTA_RANDOM32_H_
-
-u64 carta_random32(u64 seed);
-
-#endif /* _LINUX_CARTA_RANDOM32_H_ */
-- 
cgit v0.10.2


From 702a28b1e64be3dc313f5f0ceb6dc95edfbc5e18 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:39:16 -0800
Subject: [PATCH] lib functions: always build hweight for loadable modules

Always build hweight8/16/32/64() functions into the kernel so that loadable
modules may use them.

I didn't remove GENERIC_HWEIGHT since ALPHA_EV67, ia64, and some variants
of UltraSparc(64) provide their own hweight functions.

Fixes config/build problems with NTFS=m and JOYSTICK_ANALOG=m.

  Kernel: arch/x86_64/boot/bzImage is ready  (#19)
    Building modules, stage 2.
    MODPOST 94 modules
  WARNING: "hweight32" [fs/ntfs/ntfs.ko] undefined!
  WARNING: "hweight16" [drivers/input/joystick/analog.ko] undefined!
  WARNING: "hweight8" [drivers/input/joystick/analog.ko] undefined!
  make[1]: *** [__modpost] Error 1
  make: *** [modules] Error 2

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/lib/Makefile b/lib/Makefile
index cf98fab..fea8f90 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
-lib-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_PLIST) += plist.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
-- 
cgit v0.10.2


From 126039256cf164f69d8cfa20f1952d00fa61f52f Mon Sep 17 00:00:00 2001
From: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Date: Wed, 6 Dec 2006 20:39:17 -0800
Subject: [PATCH] jbd2: wait for already submitted t_sync_datalist buffer to
 complete

In the current jbd code, if a buffer on BJ_SyncData list is dirty and not
locked, the buffer is refiled to BJ_Locked list, submitted to the IO and
waited for IO completion.

But the fsstress test showed the case that when a buffer was already
submitted to the IO just before the buffer_dirty(bh) check, the buffer was
not waited for IO completion.

Following patch solves this problem.  If it is assumed that a buffer is
submitted to the IO before the buffer_dirty(bh) check and still being
written to disk, this buffer is refiled to BJ_Locked list.

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Jan Kara <jack@ucw.cz>
Cc: "Stephen C. Tweedie" <sct@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 70b2ae1..6bd8005 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -248,8 +248,12 @@ write_out_data:
 				bufs = 0;
 				goto write_out_data;
 			}
-		}
-		else {
+		} else if (!locked && buffer_locked(bh)) {
+			__jbd2_journal_file_buffer(jh, commit_transaction,
+						BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			put_bh(bh);
+		} else {
 			BUFFER_TRACE(bh, "writeout complete: unfile");
 			__jbd2_journal_unfile_buffer(jh);
 			jbd_unlock_bh_state(bh);
-- 
cgit v0.10.2


From cd16c8f72aaaf2af06969d1441051b90010f7041 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:39:18 -0800
Subject: [PATCH] ext4 balloc: reset windowsz when full

ext4_new_blocks should reset the reservation window size to 0 when squeezing
the last blocks out of an almost full filesystem, so the retry doesn't skip
any groups with less than half that free, reporting ENOSPC too soon.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6bd0bd5..af3acf3 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1566,6 +1566,7 @@ retry_alloc:
 	 */
 	if (my_rsv) {
 		my_rsv = NULL;
+		windowsz = 0;
 		group_no = goal_group;
 		goto retry_alloc;
 	}
-- 
cgit v0.10.2


From e7dc95db2695dc92b223cdc49227ac57e63406d2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:39:19 -0800
Subject: [PATCH] ext4 balloc: fix off-by-one against grp_goal

grp_goal 0 is a genuine goal (unlike -1), so ext4_try_to_allocate_with_rsv
should treat it as such.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index af3acf3..2bcca52 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1288,7 +1288,7 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	}
 	/*
 	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 < grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
+	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
 	 * first block is a filesystem wide block number
 	 * first block is the block number of the first block in this group
 	 */
@@ -1324,7 +1324,7 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			if (!goal_in_my_reservation(&my_rsv->rsv_window,
 							grp_goal, group, sb))
 				grp_goal = -1;
-		} else if (grp_goal > 0) {
+		} else if (grp_goal >= 0) {
 			int curr = my_rsv->rsv_end -
 					(grp_goal + group_first_block) + 1;
 
-- 
cgit v0.10.2


From b2f2c76d17b68869914a1ec3ab04c7674668f60d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:39:20 -0800
Subject: [PATCH] ext4 balloc: fix off-by-one against rsv_end

rsv_end is the last block within the reservation, so alloc_new_reservation
should accept start_block == rsv_end as success.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 2bcca52..c5589b3 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1165,7 +1165,7 @@ retry:
 	 * check if the first free block is within the
 	 * free space we just reserved
 	 */
-	if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
+	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
 		return 0;		/* success */
 	/*
 	 * if the first free bit we found is out of the reservable space
-- 
cgit v0.10.2


From b78a657f0a64134b3813bbdf4e1853d1420eb8d4 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:39:21 -0800
Subject: [PATCH] ext4 balloc: say rb_entry not list_entry

The reservations tree is an rb_tree not a list, so it's less confusing to use
rb_entry() than list_entry() - though they're both just container_of().

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c5589b3..8e7249e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -165,7 +165,7 @@ restart:
 
 	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
 	while (n) {
-		rsv = list_entry(n, struct ext4_reserve_window_node, rsv_node);
+		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
 		if (verbose)
 			printk("reservation window 0x%p "
 			       "start:  %llu, end:  %llu\n",
@@ -966,7 +966,7 @@ static int find_next_reservable_window(
 
 		prev = rsv;
 		next = rb_next(&rsv->rsv_node);
-		rsv = list_entry(next,struct ext4_reserve_window_node,rsv_node);
+		rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
 
 		/*
 		 * Reached the last reservation, we can just append to the
@@ -1210,7 +1210,7 @@ static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
 	if (!next)
 		my_rsv->rsv_end += size;
 	else {
-		next_rsv = list_entry(next, struct ext4_reserve_window_node, rsv_node);
+		next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
 
 		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
 			my_rsv->rsv_end += size;
-- 
cgit v0.10.2


From 341cee438593ff9c4520fb73c561460074f29a9a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:39:24 -0800
Subject: [PATCH] ext4 balloc: use io_error label

ext4_new_blocks has a nice io_error label for setting -EIO, so goto that in
the one place that doesn't already use it.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8e7249e..1c8ef0e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1529,10 +1529,8 @@ retry_alloc:
 		if (group_no >= ngroups)
 			group_no = 0;
 		gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-		if (!gdp) {
-			*errp = -EIO;
-			goto out;
-		}
+		if (!gdp)
+			goto io_error;
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
 		 * skip this group if the number of
-- 
cgit v0.10.2


From ec0837f230e57afde65db72539e748d2a75abed0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:39:26 -0800
Subject: [PATCH] ext4 balloc: fix _with_rsv freeze

Port fix to the off-by-one in find_next_usable_block's memscan from ext2 to
ext4; but it didn't cause a serious problem for ext4 because the additional
ext4_test_allocatable check rescued it from the error.

[akpm@osdl.org: build fix]
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c8ef0e..c4dd110 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -747,7 +747,7 @@ find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
 		here = 0;
 
 	p = ((char *)bh->b_data) + (here >> 3);
-	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
+	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
 	next = (r - ((char *)bh->b_data)) << 3;
 
 	if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
-- 
cgit v0.10.2


From e9d55f9dbcf048a882478b437fa3f87becf8a770 Mon Sep 17 00:00:00 2001
From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Wed, 6 Dec 2006 20:39:27 -0800
Subject: [PATCH] better CONFIG_W1_SLAVE_DS2433_CRC handling

CONFIG_W1_SLAVE_DS2433_CRC can be used directly, there's no reason for the
indirection of defining a different variable in the Makefile.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/w1/Makefile b/drivers/w1/Makefile
index 93845a2..6bb0b54 100644
--- a/drivers/w1/Makefile
+++ b/drivers/w1/Makefile
@@ -2,10 +2,6 @@
 # Makefile for the Dallas's 1-wire bus.
 #
 
-ifeq ($(CONFIG_W1_DS2433_CRC), y)
-EXTRA_CFLAGS	+= -DCONFIG_W1_F23_CRC
-endif
-
 obj-$(CONFIG_W1)	+= wire.o
 wire-objs		:= w1.o w1_int.o w1_family.o w1_netlink.o w1_io.o
 
diff --git a/drivers/w1/slaves/Makefile b/drivers/w1/slaves/Makefile
index 70e21e2..725dcfd 100644
--- a/drivers/w1/slaves/Makefile
+++ b/drivers/w1/slaves/Makefile
@@ -2,10 +2,6 @@
 # Makefile for the Dallas's 1-wire slaves.
 #
 
-ifeq ($(CONFIG_W1_SLAVE_DS2433_CRC), y)
-EXTRA_CFLAGS += -DCONFIG_W1_F23_CRC
-endif
-
 obj-$(CONFIG_W1_SLAVE_THERM)	+= w1_therm.o
 obj-$(CONFIG_W1_SLAVE_SMEM)	+= w1_smem.o
 obj-$(CONFIG_W1_SLAVE_DS2433)	+= w1_ds2433.o
diff --git a/drivers/w1/slaves/w1_ds2433.c b/drivers/w1/slaves/w1_ds2433.c
index 2ac238f..8ea17a5 100644
--- a/drivers/w1/slaves/w1_ds2433.c
+++ b/drivers/w1/slaves/w1_ds2433.c
@@ -13,7 +13,7 @@
 #include <linux/device.h>
 #include <linux/types.h>
 #include <linux/delay.h>
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 #include <linux/crc16.h>
 
 #define CRC16_INIT		0
@@ -62,7 +62,7 @@ static inline size_t w1_f23_fix_count(loff_t off, size_t count, size_t size)
 	return count;
 }
 
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 static int w1_f23_refresh_block(struct w1_slave *sl, struct w1_f23_data *data,
 				int block)
 {
@@ -89,13 +89,13 @@ static int w1_f23_refresh_block(struct w1_slave *sl, struct w1_f23_data *data,
 
 	return 0;
 }
-#endif	/* CONFIG_W1_F23_CRC */
+#endif	/* CONFIG_W1_SLAVE_DS2433_CRC */
 
 static ssize_t w1_f23_read_bin(struct kobject *kobj, char *buf, loff_t off,
 			       size_t count)
 {
 	struct w1_slave *sl = kobj_to_w1_slave(kobj);
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 	struct w1_f23_data *data = sl->family_data;
 	int i, min_page, max_page;
 #else
@@ -107,7 +107,7 @@ static ssize_t w1_f23_read_bin(struct kobject *kobj, char *buf, loff_t off,
 
 	mutex_lock(&sl->master->mutex);
 
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 
 	min_page = (off >> W1_PAGE_BITS);
 	max_page = (off + count - 1) >> W1_PAGE_BITS;
@@ -119,7 +119,7 @@ static ssize_t w1_f23_read_bin(struct kobject *kobj, char *buf, loff_t off,
 	}
 	memcpy(buf, &data->memory[off], count);
 
-#else 	/* CONFIG_W1_F23_CRC */
+#else 	/* CONFIG_W1_SLAVE_DS2433_CRC */
 
 	/* read directly from the EEPROM */
 	if (w1_reset_select_slave(sl)) {
@@ -133,7 +133,7 @@ static ssize_t w1_f23_read_bin(struct kobject *kobj, char *buf, loff_t off,
 	w1_write_block(sl->master, wrbuf, 3);
 	w1_read_block(sl->master, buf, count);
 
-#endif	/* CONFIG_W1_F23_CRC */
+#endif	/* CONFIG_W1_SLAVE_DS2433_CRC */
 
 out_up:
 	mutex_unlock(&sl->master->mutex);
@@ -208,7 +208,7 @@ static ssize_t w1_f23_write_bin(struct kobject *kobj, char *buf, loff_t off,
 	if ((count = w1_f23_fix_count(off, count, W1_EEPROM_SIZE)) == 0)
 		return 0;
 
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 	/* can only write full blocks in cached mode */
 	if ((off & W1_PAGE_MASK) || (count & W1_PAGE_MASK)) {
 		dev_err(&sl->dev, "invalid offset/count off=%d cnt=%zd\n",
@@ -223,7 +223,7 @@ static ssize_t w1_f23_write_bin(struct kobject *kobj, char *buf, loff_t off,
 			return -EINVAL;
 		}
 	}
-#endif	/* CONFIG_W1_F23_CRC */
+#endif	/* CONFIG_W1_SLAVE_DS2433_CRC */
 
 	mutex_lock(&sl->master->mutex);
 
@@ -262,7 +262,7 @@ static struct bin_attribute w1_f23_bin_attr = {
 static int w1_f23_add_slave(struct w1_slave *sl)
 {
 	int err;
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 	struct w1_f23_data *data;
 
 	data = kmalloc(sizeof(struct w1_f23_data), GFP_KERNEL);
@@ -271,24 +271,24 @@ static int w1_f23_add_slave(struct w1_slave *sl)
 	memset(data, 0, sizeof(struct w1_f23_data));
 	sl->family_data = data;
 
-#endif	/* CONFIG_W1_F23_CRC */
+#endif	/* CONFIG_W1_SLAVE_DS2433_CRC */
 
 	err = sysfs_create_bin_file(&sl->dev.kobj, &w1_f23_bin_attr);
 
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 	if (err)
 		kfree(data);
-#endif	/* CONFIG_W1_F23_CRC */
+#endif	/* CONFIG_W1_SLAVE_DS2433_CRC */
 
 	return err;
 }
 
 static void w1_f23_remove_slave(struct w1_slave *sl)
 {
-#ifdef CONFIG_W1_F23_CRC
+#ifdef CONFIG_W1_SLAVE_DS2433_CRC
 	kfree(sl->family_data);
 	sl->family_data = NULL;
-#endif	/* CONFIG_W1_F23_CRC */
+#endif	/* CONFIG_W1_SLAVE_DS2433_CRC */
 	sysfs_remove_bin_file(&sl->dev.kobj, &w1_f23_bin_attr);
 }
 
-- 
cgit v0.10.2


From 50cc670aebf4fc64afaf533fb9fa1c8570f09d74 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:39:30 -0800
Subject: [PATCH] lockdep: more chains

Some have reported a chain-table overflow - double its size.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c..8ce09bc 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
 #define MAX_LOCKDEP_KEYS_BITS	11
 #define MAX_LOCKDEP_KEYS	(1UL << MAX_LOCKDEP_KEYS_BITS)
 
-#define MAX_LOCKDEP_CHAINS_BITS	13
+#define MAX_LOCKDEP_CHAINS_BITS	14
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
 /*
-- 
cgit v0.10.2


From 2ee91f197c0bc654b24eed5831fd12aa0d566a7d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:39:32 -0800
Subject: [PATCH] lockdep: show more details about self-test failures

Make the locking self-test failures (of 'FAILURE' type) easier to debug by
printing more information.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 952bee7..a1c10b0 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -24,7 +24,7 @@ extern int debug_locks_off(void);
 	int __ret = 0;							\
 									\
 	if (unlikely(c)) {						\
-		if (debug_locks_off())					\
+		if (debug_locks_silent || debug_locks_off())		\
 			WARN_ON(1);					\
 		__ret = 1;						\
 	}								\
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1865164..841539d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 void debug_mutex_unlock(struct mutex *lock)
 {
+	if (unlikely(!debug_locks))
+		return;
+
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 7945787..280332c 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -963,7 +963,9 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
 			printk("failed|");
 		} else {
 			unexpected_testcase_failures++;
+
 			printk("FAILED|");
+			dump_stack();
 		}
 	} else {
 		testcase_successes++;
-- 
cgit v0.10.2


From aaeab80bdbc0d10a98adc6fa76c29ca2f1816553 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 6 Dec 2006 20:39:33 -0800
Subject: [PATCH] ide_scsi: allow it to be used for non CD only

Some people want to use ide_cd for CD-ROM but still dynamically load
ide-scsi for things like tape drives.  If you compile in the CD driver this
works out but if you want them modular you need an option to ensure that
whoever loads first the right things happen.

This replaces the original draft patch which leaked a scsi host reference

[akpm@osdl.org: add MODULE_PARM_DESC]
Signed-off-by: Alan Cox <alan@redhat.com>
Cc: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 1427a41..8f6b5bf 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -110,6 +110,7 @@ typedef struct ide_scsi_obj {
 } idescsi_scsi_t;
 
 static DEFINE_MUTEX(idescsi_ref_mutex);
+static int idescsi_nocd;			/* Set by module param to skip cd */
 
 #define ide_scsi_g(disk) \
 	container_of((disk)->private_data, struct ide_scsi_obj, driver)
@@ -1127,6 +1128,9 @@ static int ide_scsi_probe(ide_drive_t *drive)
 		warned = 1;
 	}
 
+	if (idescsi_nocd && drive->media == ide_cdrom)
+		return -ENODEV;
+
 	if (!strstr("ide-scsi", drive->driver_req) ||
 	    !drive->present ||
 	    drive->media == ide_disk ||
@@ -1187,6 +1191,8 @@ static void __exit exit_idescsi_module(void)
 	driver_unregister(&idescsi_driver.gen_driver);
 }
 
+module_param(idescsi_nocd, int, 0600);
+MODULE_PARM_DESC(idescsi_nocd, "Disable handling of CD-ROMs so they may be driven by ide-cd");
 module_init(init_idescsi_module);
 module_exit(exit_idescsi_module);
 MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From f29219f17ae46cc182123bc7d2089b69869935ae Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 6 Dec 2006 20:39:35 -0800
Subject: [PATCH] make 8250_pnp serial driver work after suspend to ram

Add suspend/resume methods to drivers/serial/8250_pnp.c.  Tested on a
P4/HT 16550A box, ttyS0 login survives across suspend to ram.

[akpm@osdl.org: cleanups]
Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Adam Belay <ambx1@neo.rr.com>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/serial/8250_pnp.c b/drivers/serial/8250_pnp.c
index 71d907c..d3d6b82 100644
--- a/drivers/serial/8250_pnp.c
+++ b/drivers/serial/8250_pnp.c
@@ -464,11 +464,38 @@ static void __devexit serial_pnp_remove(struct pnp_dev *dev)
 		serial8250_unregister_port(line - 1);
 }
 
+#ifdef CONFIG_PM
+static int serial_pnp_suspend(struct pnp_dev *dev, pm_message_t state)
+{
+	long line = (long)pnp_get_drvdata(dev);
+
+	if (!line)
+		return -ENODEV;
+	serial8250_suspend_port(line - 1);
+	return 0;
+}
+
+static int serial_pnp_resume(struct pnp_dev *dev)
+{
+	long line = (long)pnp_get_drvdata(dev);
+
+	if (!line)
+		return -ENODEV;
+	serial8250_resume_port(line - 1);
+	return 0;
+}
+#else
+#define serial_pnp_suspend NULL
+#define serial_pnp_resume NULL
+#endif /* CONFIG_PM */
+
 static struct pnp_driver serial_pnp_driver = {
 	.name		= "serial",
-	.id_table	= pnp_dev_table,
 	.probe		= serial_pnp_probe,
 	.remove		= __devexit_p(serial_pnp_remove),
+	.suspend	= serial_pnp_suspend,
+	.resume		= serial_pnp_resume,
+	.id_table	= pnp_dev_table,
 };
 
 static int __init serial8250_pnp_init(void)
-- 
cgit v0.10.2


From 595142e0490918af5ffae5a833a2b2acd7097c02 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 6 Dec 2006 20:39:36 -0800
Subject: [PATCH] MAINTAINERS: Update the i2c and hwmon subsystems info

The i2c and hwmon trees have moved to a new location.

The lm-sensors project moved to a new home as well.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index fa1bba8..c7d895a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1214,7 +1214,8 @@ HARDWARE MONITORING
 P:	Jean Delvare
 M:	khali@linux-fr.org
 L:	lm-sensors@lm-sensors.org
-W:	http://www.lm-sensors.nu/
+W:	http://www.lm-sensors.org/
+T:	quilt http://khali.linux-fr.org/devel/linux-2.6/jdelvare-hwmon/
 S:	Maintained
 
 HARDWARE RANDOM NUMBER GENERATOR CORE
@@ -1340,8 +1341,7 @@ I2C SUBSYSTEM
 P:	Jean Delvare
 M:	khali@linux-fr.org
 L:	i2c@lm-sensors.org
-W:	http://www.lm-sensors.nu/
-T:	quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+T:	quilt http://khali.linux-fr.org/devel/linux-2.6/jdelvare-i2c/
 S:	Maintained
 
 I2O
-- 
cgit v0.10.2


From c949d4eb40ce021b1abc3b63af23aa535662cb17 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Wed, 6 Dec 2006 20:39:38 -0800
Subject: [PATCH] autofs: fix error code path in autofs_fill_sb()

When kernel is compiled with old version of autofs (CONFIG_AUTOFS_FS), and
new (observed at least with 5.x.x) automount deamon is started, kernel
correctly reports incompatible version of kernel and userland daemon, but
then screws things up instead of correct handling of the error:

 autofs: kernel does not match daemon version
 =====================================
 [ BUG: bad unlock balance detected! ]
 -------------------------------------
 automount/4199 is trying to release lock (&type->s_umount_key) at:
 [<c0163b9e>] get_sb_nodev+0x76/0xa4
 but there are no more locks to release!

 other info that might help us debug this:
 no locks held by automount/4199.

 stack backtrace:
  [<c0103b15>] dump_trace+0x68/0x1b2
  [<c0103c77>] show_trace_log_lvl+0x18/0x2c
  [<c01041db>] show_trace+0xf/0x11
  [<c010424d>] dump_stack+0x12/0x14
  [<c012e02c>] print_unlock_inbalance_bug+0xe7/0xf3
  [<c012fd4f>] lock_release+0x8d/0x164
  [<c012b452>] up_write+0x14/0x27
  [<c0163b9e>] get_sb_nodev+0x76/0xa4
  [<c0163689>] vfs_kern_mount+0x83/0xf6
  [<c016373e>] do_kern_mount+0x2d/0x3e
  [<c017513f>] do_mount+0x607/0x67a
  [<c0175224>] sys_mount+0x72/0xa4
  [<c0102b96>] sysenter_past_esp+0x5f/0x99
 DWARF2 unwinder stuck at sysenter_past_esp+0x5f/0x99
 Leftover inexact backtrace:
  =======================

and then deadlock comes.

The problem: autofs_fill_super() returns EINVAL to get_sb_nodev(), but
before that, it calls kill_anon_super() to destroy the superblock which
won't be needed.  This is however way too soon to call kill_anon_super(),
because get_sb_nodev() has to perform its own cleanup of the superblock
first (deactivate_super(), etc.).  The correct time to call
kill_anon_super() is in the autofs_kill_sb() callback, which is called by
deactivate_super() at proper time, when the superblock is ready to be
killed.

I can see the same faulty codepath also in autofs4.  This patch solves
issues in both filesystems in a same way - it postpones the
kill_anon_super() until the proper time is signalized by deactivate_super()
calling the kill_sb() callback.

[raven@themaw.net: update comment]
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Acked-by: Ian Kent <raven@themaw.net>
Cc: <stable@kernel.org>
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 38ede5c..f968d13 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -28,10 +28,11 @@ void autofs_kill_sb(struct super_block *sb)
 	/*
 	 * In the event of a failure in get_sb_nodev the superblock
 	 * info is not present so nothing else has been setup, so
-	 * just exit when we are called from deactivate_super.
+	 * just call kill_anon_super when we are called from
+	 * deactivate_super.
 	 */
 	if (!sbi)
-		return;
+		goto out_kill_sb;
 
 	if ( !sbi->catatonic )
 		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
@@ -44,6 +45,7 @@ void autofs_kill_sb(struct super_block *sb)
 
 	kfree(sb->s_fs_info);
 
+out_kill_sb:
 	DPRINTK(("autofs: shutting down\n"));
 	kill_anon_super(sb);
 }
@@ -209,7 +211,6 @@ fail_iput:
 fail_free:
 	kfree(sbi);
 	s->s_fs_info = NULL;
-	kill_anon_super(s);
 fail_unlock:
 	return -EINVAL;
 }
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ce7c0f1..9c48250 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -152,10 +152,11 @@ void autofs4_kill_sb(struct super_block *sb)
 	/*
 	 * In the event of a failure in get_sb_nodev the superblock
 	 * info is not present so nothing else has been setup, so
-	 * just exit when we are called from deactivate_super.
+	 * just call kill_anon_super when we are called from
+	 * deactivate_super.
 	 */
 	if (!sbi)
-		return;
+		goto out_kill_sb;
 
 	sb->s_fs_info = NULL;
 
@@ -167,6 +168,7 @@ void autofs4_kill_sb(struct super_block *sb)
 
 	kfree(sbi);
 
+out_kill_sb:
 	DPRINTK("shutting down");
 	kill_anon_super(sb);
 }
@@ -426,7 +428,6 @@ fail_ino:
 fail_free:
 	kfree(sbi);
 	s->s_fs_info = NULL;
-	kill_anon_super(s);
 fail_unlock:
 	return -EINVAL;
 }
-- 
cgit v0.10.2


From 3908fd2ed920af818aa596672da68ba26173ff27 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Wed, 6 Dec 2006 20:39:39 -0800
Subject: [PATCH] softirq: remove BUG_ONs which can incorrectly trigger

It is possible to have tasklets get scheduled before softirqd has had a chance
to spawn on all CPUs.  This is totally harmless; after success during action
CPU_UP_PREPARE, action CPU_ONLINE will be called, which immediately wakes
softirqd on the appropriate CPU to process the already pending tasklets.  So
there is no danger of having a missed wakeup for any tasklets that were
already pending.

In particular, i386 is affected by this during startup, and is visible when
using a very large initrd; during the time it takes for the initrd to be
decompressed, a timer IRQ can come in and schedule RCU callbacks.  It is also
possible that resending of a hardware IRQ via a softirq triggers the same bug.

Because of different timing conditions, this shows up in all emulators and
virtual machines tested, including Xen, VMware, Virtual PC, and Qemu.  It is
also possible to trigger on native hardware with a large enough initrd,
although I don't have a reliable case demonstrating that.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: <caglar@pardus.org.tr>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015..918e52d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
-		BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
 		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
 		if (IS_ERR(p)) {
 			printk("ksoftirqd for %i failed\n", hotcpu);
-- 
cgit v0.10.2


From f9231a0ca1afd05543d9f83539d6ecd5c018e8cf Mon Sep 17 00:00:00 2001
From: Torsten Ertbjerg Rasmussen <tr@newtec.dk>
Date: Wed, 6 Dec 2006 20:39:41 -0800
Subject: [PATCH] rtc: ds1743 support

The real time clocks ds1742 and ds1743 differs only in the size of the
nvram.  This patch changes the existing ds1742 driver to support also
ds1743.  The main change is that the nvram size is determined from the
resource attached to the device.

The patch have benefitted from suggestions from Atsushi Nemeto, who is the
author of the ds1742 driver.

Signed-off-by: Torsten Rasmussen Rasmussen <tr@newtec.dk>
Acked-by: Alessandro Zummo <a.zummo@towertech.it>
Acked-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 4d32bad..2a63ab2 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -154,11 +154,11 @@ config RTC_DRV_DS1672
 	  will be called rtc-ds1672.
 
 config RTC_DRV_DS1742
-	tristate "Dallas DS1742"
+	tristate "Dallas DS1742/1743"
 	depends on RTC_CLASS
 	help
 	  If you say yes here you get support for the
-	  Dallas DS1742 timekeeping chip.
+	  Dallas DS1742/1743 timekeeping chip.
 
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-ds1742.
diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c
index 6273a3d..17633bf 100644
--- a/drivers/rtc/rtc-ds1742.c
+++ b/drivers/rtc/rtc-ds1742.c
@@ -6,6 +6,10 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2006 Torsten Ertbjerg Rasmussen <tr@newtec.dk>
+ *  - nvram size determined from resource
+ *  - this ds1742 driver now supports ds1743.
  */
 
 #include <linux/bcd.h>
@@ -17,20 +21,19 @@
 #include <linux/platform_device.h>
 #include <linux/io.h>
 
-#define DRV_VERSION "0.2"
+#define DRV_VERSION "0.3"
 
-#define RTC_REG_SIZE		0x800
-#define RTC_OFFSET		0x7f8
+#define RTC_SIZE		8
 
-#define RTC_CONTROL		(RTC_OFFSET + 0)
-#define RTC_CENTURY		(RTC_OFFSET + 0)
-#define RTC_SECONDS		(RTC_OFFSET + 1)
-#define RTC_MINUTES		(RTC_OFFSET + 2)
-#define RTC_HOURS		(RTC_OFFSET + 3)
-#define RTC_DAY			(RTC_OFFSET + 4)
-#define RTC_DATE		(RTC_OFFSET + 5)
-#define RTC_MONTH		(RTC_OFFSET + 6)
-#define RTC_YEAR		(RTC_OFFSET + 7)
+#define RTC_CONTROL		0
+#define RTC_CENTURY		0
+#define RTC_SECONDS		1
+#define RTC_MINUTES		2
+#define RTC_HOURS		3
+#define RTC_DAY			4
+#define RTC_DATE		5
+#define RTC_MONTH		6
+#define RTC_YEAR		7
 
 #define RTC_CENTURY_MASK	0x3f
 #define RTC_SECONDS_MASK	0x7f
@@ -48,7 +51,10 @@
 
 struct rtc_plat_data {
 	struct rtc_device *rtc;
-	void __iomem *ioaddr;
+	void __iomem *ioaddr_nvram;
+	void __iomem *ioaddr_rtc;
+	size_t size_nvram;
+	size_t size;
 	unsigned long baseaddr;
 	unsigned long last_jiffies;
 };
@@ -57,7 +63,7 @@ static int ds1742_rtc_set_time(struct device *dev, struct rtc_time *tm)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-	void __iomem *ioaddr = pdata->ioaddr;
+	void __iomem *ioaddr = pdata->ioaddr_rtc;
 	u8 century;
 
 	century = BIN2BCD((tm->tm_year + 1900) / 100);
@@ -82,7 +88,7 @@ static int ds1742_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-	void __iomem *ioaddr = pdata->ioaddr;
+	void __iomem *ioaddr = pdata->ioaddr_rtc;
 	unsigned int year, month, day, hour, minute, second, week;
 	unsigned int century;
 
@@ -127,10 +133,10 @@ static ssize_t ds1742_nvram_read(struct kobject *kobj, char *buf,
 	struct platform_device *pdev =
 		to_platform_device(container_of(kobj, struct device, kobj));
 	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-	void __iomem *ioaddr = pdata->ioaddr;
+	void __iomem *ioaddr = pdata->ioaddr_nvram;
 	ssize_t count;
 
-	for (count = 0; size > 0 && pos < RTC_OFFSET; count++, size--)
+	for (count = 0; size > 0 && pos < pdata->size_nvram; count++, size--)
 		*buf++ = readb(ioaddr + pos++);
 	return count;
 }
@@ -141,10 +147,10 @@ static ssize_t ds1742_nvram_write(struct kobject *kobj, char *buf,
 	struct platform_device *pdev =
 		to_platform_device(container_of(kobj, struct device, kobj));
 	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-	void __iomem *ioaddr = pdata->ioaddr;
+	void __iomem *ioaddr = pdata->ioaddr_nvram;
 	ssize_t count;
 
-	for (count = 0; size > 0 && pos < RTC_OFFSET; count++, size--)
+	for (count = 0; size > 0 && pos < pdata->size_nvram; count++, size--)
 		writeb(*buf++, ioaddr + pos++);
 	return count;
 }
@@ -155,7 +161,6 @@ static struct bin_attribute ds1742_nvram_attr = {
 		.mode = S_IRUGO | S_IWUGO,
 		.owner = THIS_MODULE,
 	},
-	.size = RTC_OFFSET,
 	.read = ds1742_nvram_read,
 	.write = ds1742_nvram_write,
 };
@@ -175,19 +180,23 @@ static int __init ds1742_rtc_probe(struct platform_device *pdev)
 	pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
 	if (!pdata)
 		return -ENOMEM;
-	if (!request_mem_region(res->start, RTC_REG_SIZE, pdev->name)) {
+	pdata->size = res->end - res->start + 1;
+	if (!request_mem_region(res->start, pdata->size, pdev->name)) {
 		ret = -EBUSY;
 		goto out;
 	}
 	pdata->baseaddr = res->start;
-	ioaddr = ioremap(pdata->baseaddr, RTC_REG_SIZE);
+	ioaddr = ioremap(pdata->baseaddr, pdata->size);
 	if (!ioaddr) {
 		ret = -ENOMEM;
 		goto out;
 	}
-	pdata->ioaddr = ioaddr;
+	pdata->ioaddr_nvram = ioaddr;
+	pdata->size_nvram = pdata->size - RTC_SIZE;
+	pdata->ioaddr_rtc = ioaddr + pdata->size_nvram;
 
 	/* turn RTC on if it was not on */
+	ioaddr = pdata->ioaddr_rtc;
 	sec = readb(ioaddr + RTC_SECONDS);
 	if (sec & RTC_STOP) {
 		sec &= RTC_SECONDS_MASK;
@@ -208,6 +217,8 @@ static int __init ds1742_rtc_probe(struct platform_device *pdev)
 	pdata->rtc = rtc;
 	pdata->last_jiffies = jiffies;
 	platform_set_drvdata(pdev, pdata);
+	ds1742_nvram_attr.size = max(ds1742_nvram_attr.size,
+				     pdata->size_nvram);
 	ret = sysfs_create_bin_file(&pdev->dev.kobj, &ds1742_nvram_attr);
 	if (ret)
 		goto out;
@@ -215,10 +226,10 @@ static int __init ds1742_rtc_probe(struct platform_device *pdev)
  out:
 	if (pdata->rtc)
 		rtc_device_unregister(pdata->rtc);
-	if (ioaddr)
-		iounmap(ioaddr);
+	if (pdata->ioaddr_nvram)
+		iounmap(pdata->ioaddr_nvram);
 	if (pdata->baseaddr)
-		release_mem_region(pdata->baseaddr, RTC_REG_SIZE);
+		release_mem_region(pdata->baseaddr, pdata->size);
 	kfree(pdata);
 	return ret;
 }
@@ -229,8 +240,8 @@ static int __devexit ds1742_rtc_remove(struct platform_device *pdev)
 
 	sysfs_remove_bin_file(&pdev->dev.kobj, &ds1742_nvram_attr);
 	rtc_device_unregister(pdata->rtc);
-	iounmap(pdata->ioaddr);
-	release_mem_region(pdata->baseaddr, RTC_REG_SIZE);
+	iounmap(pdata->ioaddr_nvram);
+	release_mem_region(pdata->baseaddr, pdata->size);
 	kfree(pdata);
 	return 0;
 }
-- 
cgit v0.10.2


From 652d3a3ed7610bb09530127ca297b538fe7d245a Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Wed, 6 Dec 2006 20:39:43 -0800
Subject: [PATCH] char: ip2 remove broken macro

This macro is broken and unused so why not remove it.

Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ip2/i2cmd.h b/drivers/char/ip2/i2cmd.h
index baa4e72..29277ec 100644
--- a/drivers/char/ip2/i2cmd.h
+++ b/drivers/char/ip2/i2cmd.h
@@ -367,11 +367,6 @@ static UCHAR cc02[];
 #define CSE_NULL  3  // Replace with a null
 #define CSE_MARK  4  // Replace with a 3-character sequence (as Unix)
 
-#define  CMD_SET_REPLACEMENT(arg,ch)   \
-			(((cmdSyntaxPtr)(ct36a))->cmd[1] = (arg), \
-			(((cmdSyntaxPtr)(ct36a))->cmd[2] = (ch),  \
-			(cmdSyntaxPtr)(ct36a))
-
 #define CSE_REPLACE  0x8	// Replace the errored character with the
 							// replacement character defined here
 
-- 
cgit v0.10.2


From 12d40e43d251de4fa1f982567fc8b4ee5e858367 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Wed, 6 Dec 2006 20:39:53 -0800
Subject: [PATCH] Save some bytes in struct inode

[acme@newtoy net-2.6.20]$ pahole --cacheline 64 fs/inode.o inode
/* /pub/scm/linux/kernel/git/acme/net-2.6.20/include/linux/dcache.h:86 */
struct inode {
        struct hlist_node          i_hash;               /*     0     8 */
        struct list_head           i_list;               /*     8     8 */
        struct list_head           i_sb_list;            /*    16     8 */
        struct list_head           i_dentry;             /*    24     8 */
        long unsigned int          i_ino;                /*    32     4 */
        atomic_t                   i_count;              /*    36     4 */
        umode_t                    i_mode;               /*    40     2 */

        /* XXX 2 bytes hole, try to pack */

        unsigned int               i_nlink;              /*    44     4 */
        uid_t                      i_uid;                /*    48     4 */
        gid_t                      i_gid;                /*    52     4 */
        dev_t                      i_rdev;               /*    56     4 */
        loff_t                     i_size;               /*    60     8 */
        struct timespec            i_atime;              /*    68     8 */
        struct timespec            i_mtime;              /*    76     8 */
        struct timespec            i_ctime;              /*    84     8 */
        unsigned int               i_blkbits;            /*    92     4 */
        long unsigned int          i_version;            /*    96     4 */
        blkcnt_t                   i_blocks;             /*   100     4 */
        short unsigned int         i_bytes;              /*   104     2 */

        /* XXX 2 bytes hole, try to pack */

        spinlock_t                 i_lock;               /*   108    40 */
        struct mutex               i_mutex;              /*   148    76 */
        struct rw_semaphore        i_alloc_sem;          /*   224    64 */
        struct inode_operations *  i_op;                 /*   288     4 */
        const struct file_operations  * i_fop;           /*   292     4 */
        struct super_block *       i_sb;                 /*   296     4 */
        struct file_lock *         i_flock;              /*   300     4 */
        struct address_space *     i_mapping;            /*   304     4 */
        struct address_space       i_data;               /*   308   188 */
        struct list_head           i_devices;            /*   496     8 */
        union                      ;                     /*   504     4 */
        int                        i_cindex;             /*   508     4 */
        __u32                      i_generation;         /*   512     4 */
        /* ---------- cacheline 8 boundary ---------- */
        long unsigned int          i_dnotify_mask;       /*   516     4 */
        struct dnotify_struct *    i_dnotify;            /*   520     4 */
        struct list_head           inotify_watches;      /*   524     8 */
        struct mutex               inotify_mutex;        /*   532    76 */
        long unsigned int          i_state;              /*   608     4 */
        long unsigned int          dirtied_when;         /*   612     4 */
        unsigned int               i_flags;              /*   616     4 */
        atomic_t                   i_writecount;         /*   620     4 */
        void *                     i_security;           /*   624     4 */
        void *                     i_private;            /*   628     4 */
}; /* size: 632, sum members: 628, holes: 2, sum holes: 4 */

[acme@newtoy net-2.6.20]$

So just moving i_mode to after i_bytes we save 4 bytes by nuking both holes:

[acme@newtoy net-2.6.20]$ codiff -V /tmp/inode.o.before fs/inode.o
/pub/scm/linux/kernel/git/acme/net-2.6.20/fs/inode.c:
  struct inode |   -4
    i_mode;
     from: umode_t               /*    40(0)     2(0) */
     to:   umode_t               /*   102(0)     2(0) */
 1 struct changed
[acme@newtoy net-2.6.20]$

I've prunned all the other offset changes, only this one is of interest here.

So now we have:

[acme@newtoy net-2.6.20]$ pahole --cacheline 64 ../OUTPUT/qemu/net-2.6.20/fs/inode.o inode
/* /pub/scm/linux/kernel/git/acme/net-2.6.20/include/linux/dcache.h:86 */
struct inode {
        struct hlist_node          i_hash;               /*     0     8 */
        struct list_head           i_list;               /*     8     8 */
        struct list_head           i_sb_list;            /*    16     8 */
        struct list_head           i_dentry;             /*    24     8 */
        long unsigned int          i_ino;                /*    32     4 */
        atomic_t                   i_count;              /*    36     4 */
        unsigned int               i_nlink;              /*    40     4 */
        uid_t                      i_uid;                /*    44     4 */
        gid_t                      i_gid;                /*    48     4 */
        dev_t                      i_rdev;               /*    52     4 */
        loff_t                     i_size;               /*    56     8 */
        /* ---------- cacheline 1 boundary ---------- */
        struct timespec            i_atime;              /*    64     8 */
        struct timespec            i_mtime;              /*    72     8 */
        struct timespec            i_ctime;              /*    80     8 */
        unsigned int               i_blkbits;            /*    88     4 */
        long unsigned int          i_version;            /*    92     4 */
        blkcnt_t                   i_blocks;             /*    96     4 */
        short unsigned int         i_bytes;              /*   100     2 */
        umode_t                    i_mode;               /*   102     2 */
        spinlock_t                 i_lock;               /*   104    40 */
        struct mutex               i_mutex;              /*   144    76 */
        struct rw_semaphore        i_alloc_sem;          /*   220    64 */
        struct inode_operations *  i_op;                 /*   284     4 */
        const struct file_operations  * i_fop;           /*   288     4 */
        struct super_block *       i_sb;                 /*   292     4 */
        struct file_lock *         i_flock;              /*   296     4 */
        struct address_space *     i_mapping;            /*   300     4 */
        struct address_space       i_data;               /*   304   188 */
        struct list_head           i_devices;            /*   492     8 */
        union                      ;                     /*   500     4 */
        int                        i_cindex;             /*   504     4 */
        __u32                      i_generation;         /*   508     4 */
        /* ---------- cacheline 8 boundary ---------- */
        long unsigned int          i_dnotify_mask;       /*   512     4 */
        struct dnotify_struct *    i_dnotify;            /*   516     4 */
        struct list_head           inotify_watches;      /*   520     8 */
        struct mutex               inotify_mutex;        /*   528    76 */
        long unsigned int          i_state;              /*   604     4 */
        long unsigned int          dirtied_when;         /*   608     4 */
        unsigned int               i_flags;              /*   612     4 */
        atomic_t                   i_writecount;         /*   616     4 */
        void *                     i_security;           /*   620     4 */
        void *                     i_private;            /*   624     4 */
}; /* size: 628 */

[acme@newtoy net-2.6.20]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3a1927e..70b99fb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -543,7 +543,6 @@ struct inode {
 	struct list_head	i_dentry;
 	unsigned long		i_ino;
 	atomic_t		i_count;
-	umode_t			i_mode;
 	unsigned int		i_nlink;
 	uid_t			i_uid;
 	gid_t			i_gid;
@@ -559,6 +558,7 @@ struct inode {
 	unsigned int		i_blkbits;
 	blkcnt_t		i_blocks;
 	unsigned short          i_bytes;
+	umode_t			i_mode;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct mutex		i_mutex;
 	struct rw_semaphore	i_alloc_sem;
-- 
cgit v0.10.2


From 9468f687d95d1825fd2f2c2f74e1a59429ef25d3 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Wed, 6 Dec 2006 20:39:55 -0800
Subject: [PATCH] winbond IDE depends on IDEDMA

winbond ide depends on idedma.
Move the option into the IDEDMA section.

  drivers/built-in.o: In function `.sl82c105_ide_dma_timeout':
  sl82c105.c:(.text+0x624d0): undefined reference to `.__ide_dma_timeout'
  drivers/built-in.o: In function `.sl82c105_ide_dma_off_quietly':
  sl82c105.c:(.text+0x6274c): undefined reference to `.__ide_dma_off_quietly'
  drivers/built-in.o: In function `.sl82c105_ide_dma_on':
  sl82c105.c:(.text+0x6284c): undefined reference to `.__ide_dma_on'
  drivers/built-in.o: In function `.sl82c105_check_drive':
  sl82c105.c:(.text+0x628ec): undefined reference to `.__ide_dma_bad_drive'
  sl82c105.c:(.text+0x62934): undefined reference to `.__ide_dma_good_drive'
  drivers/built-in.o: In function `.sl82c105_ide_dma_start':
  sl82c105.c:(.text+0x62c24): undefined reference to `.ide_dma_start'
  make[1]: *** [.tmp_vmlinux1] Error 1

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Acked-by: "Bartlomiej Zolnierkiewicz" <bzolnier@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 0c68d0f..e23bc0d 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -389,14 +389,6 @@ config BLK_DEV_RZ1000
 	  Linux. This may slow disk throughput by a few percent, but at least
 	  things will operate 100% reliably.
 
-config BLK_DEV_SL82C105
-	tristate "Winbond SL82c105 support"
-	depends on PCI && (PPC || ARM) && BLK_DEV_IDEPCI
-	help
-	  If you have a Winbond SL82c105 IDE controller, say Y here to enable
-	  special configuration for this chip. This is common on various CHRP
-	  motherboards, but could be used elsewhere. If in doubt, say Y.
-
 config BLK_DEV_IDEDMA_PCI
 	bool "Generic PCI bus-master DMA support"
 	depends on PCI && BLK_DEV_IDEPCI
@@ -712,6 +704,14 @@ config BLK_DEV_SIS5513
 
 	  Please read the comments at the top of <file:drivers/ide/pci/sis5513.c>.
 
+config BLK_DEV_SL82C105
+	tristate "Winbond SL82c105 support"
+	depends on (PPC || ARM)
+	help
+	  If you have a Winbond SL82c105 IDE controller, say Y here to enable
+	  special configuration for this chip. This is common on various CHRP
+	  motherboards, but could be used elsewhere. If in doubt, say Y.
+
 config BLK_DEV_SLC90E66
 	tristate "SLC90E66 chipset support"
 	help
-- 
cgit v0.10.2


From a4ed06ad83acc3b7dafc018bc0b27469e787e27d Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Wed, 6 Dec 2006 20:39:57 -0800
Subject: [PATCH] amba-pl010: clear error flags on rx error

The pl010 primecell documentation specifies that an error indicated via RSR
should be cleared by a write to ECR.  We didn't do this, which was causing
errors to be re-reported on every call to pl010_rx_chars().

Doing a write to ECR once we detect an error appears to prevent the ep93xx
console UART driver from going into a mode where it reports "ttyAM0: X
input overrun(s)" every couple of keystrokes.

Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index 4213fab..4d3626e 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -129,6 +129,8 @@ static void pl010_rx_chars(struct uart_port *port)
 		 */
 		rsr = readb(port->membase + UART01x_RSR) | UART_DUMMY_RSR_RX;
 		if (unlikely(rsr & UART01x_RSR_ANY)) {
+			writel(0, port->membase + UART01x_ECR);
+
 			if (rsr & UART01x_RSR_BE) {
 				rsr &= ~(UART01x_RSR_FE | UART01x_RSR_PE);
 				port->icount.brk++;
-- 
cgit v0.10.2


From f1a60dbf68061e5a5364cbc723786b355637ffd3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:39:59 -0800
Subject: [PATCH] gcc-4.1.0 is bust

Keith says

Compiling 2.6.19-rc6 with gcc version 4.1.0 (SUSE Linux), wait_hpet_tick is
optimized away to a never ending loop and the kernel hangs on boot in timer
setup.

0000001a <wait_hpet_tick>:
  1a:   55                      push   %ebp
  1b:   89 e5                   mov    %esp,%ebp
  1d:   eb fe                   jmp    1d <wait_hpet_tick+0x3>

This is not a problem with gcc 3.3.5.  Adding barrier() calls to
wait_hpet_tick does not help, making the variables volatile does.

And the consensus is that gcc-4.1.0 is busted.  Suse went and shipped
gcc-4.1.0 so we cannot ban it.  Add a warning.

Cc: Keith Owens <kaos@ocs.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/init/main.c b/init/main.c
index 36f608a..4650830 100644
--- a/init/main.c
+++ b/init/main.c
@@ -73,6 +73,10 @@
 #error Sorry, your GCC is too old. It builds incorrect kernels.
 #endif
 
+#if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0
+#warning gcc-4.1.0 is known to miscompile the kernel.  A different compiler version is recommended.
+#endif
+
 static int init(void *);
 
 extern void init_IRQ(void);
-- 
cgit v0.10.2


From 3982cd99c30285ebc71d405cd8530a3dfb7265de Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:01 -0800
Subject: [PATCH] fs/sysv/: doc cleanup

Remove two different changelog files from fs/sysv/ and merges the INTRO
file into Documentation/filesystems/sysv-fs.txt

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/filesystems/sysv-fs.txt b/Documentation/filesystems/sysv-fs.txt
index d817224..253b50d 100644
--- a/Documentation/filesystems/sysv-fs.txt
+++ b/Documentation/filesystems/sysv-fs.txt
@@ -1,11 +1,8 @@
-This is the implementation of the SystemV/Coherent filesystem for Linux.
 It implements all of
   - Xenix FS,
   - SystemV/386 FS,
   - Coherent FS.
 
-This is version beta 4.
-
 To install:
 * Answer the 'System V and Coherent filesystem support' question with 'y'
   when configuring the kernel.
@@ -28,11 +25,173 @@ Bugs in the present implementation:
   for this FS on hard disk yet.
 
 
-Please report any bugs and suggestions to
-  Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>
-  Pascal Haible <haible@izfm.uni-stuttgart.de>
-  Krzysztof G. Baranowski <kgb@manjak.knm.org.pl>
+These filesystems are rather similar. Here is a comparison with Minix FS:
+
+* Linux fdisk reports on partitions
+  - Minix FS     0x81 Linux/Minix
+  - Xenix FS     ??
+  - SystemV FS   ??
+  - Coherent FS  0x08 AIX bootable
+
+* Size of a block or zone (data allocation unit on disk)
+  - Minix FS     1024
+  - Xenix FS     1024 (also 512 ??)
+  - SystemV FS   1024 (also 512 and 2048)
+  - Coherent FS   512
+
+* General layout: all have one boot block, one super block and
+  separate areas for inodes and for directories/data.
+  On SystemV Release 2 FS (e.g. Microport) the first track is reserved and
+  all the block numbers (including the super block) are offset by one track.
+
+* Byte ordering of "short" (16 bit entities) on disk:
+  - Minix FS     little endian  0 1
+  - Xenix FS     little endian  0 1
+  - SystemV FS   little endian  0 1
+  - Coherent FS  little endian  0 1
+  Of course, this affects only the file system, not the data of files on it!
+
+* Byte ordering of "long" (32 bit entities) on disk:
+  - Minix FS     little endian  0 1 2 3
+  - Xenix FS     little endian  0 1 2 3
+  - SystemV FS   little endian  0 1 2 3
+  - Coherent FS  PDP-11         2 3 0 1
+  Of course, this affects only the file system, not the data of files on it!
+
+* Inode on disk: "short", 0 means non-existent, the root dir ino is:
+  - Minix FS                            1
+  - Xenix FS, SystemV FS, Coherent FS   2
+
+* Maximum number of hard links to a file:
+  - Minix FS     250
+  - Xenix FS     ??
+  - SystemV FS   ??
+  - Coherent FS  >=10000
+
+* Free inode management:
+  - Minix FS                             a bitmap
+  - Xenix FS, SystemV FS, Coherent FS
+      There is a cache of a certain number of free inodes in the super-block.
+      When it is exhausted, new free inodes are found using a linear search.
+
+* Free block management:
+  - Minix FS                             a bitmap
+  - Xenix FS, SystemV FS, Coherent FS
+      Free blocks are organized in a "free list". Maybe a misleading term,
+      since it is not true that every free block contains a pointer to
+      the next free block. Rather, the free blocks are organized in chunks
+      of limited size, and every now and then a free block contains pointers
+      to the free blocks pertaining to the next chunk; the first of these
+      contains pointers and so on. The list terminates with a "block number"
+      0 on Xenix FS and SystemV FS, with a block zeroed out on Coherent FS.
+
+* Super-block location:
+  - Minix FS     block 1 = bytes 1024..2047
+  - Xenix FS     block 1 = bytes 1024..2047
+  - SystemV FS   bytes 512..1023
+  - Coherent FS  block 1 = bytes 512..1023
+
+* Super-block layout:
+  - Minix FS
+                    unsigned short s_ninodes;
+                    unsigned short s_nzones;
+                    unsigned short s_imap_blocks;
+                    unsigned short s_zmap_blocks;
+                    unsigned short s_firstdatazone;
+                    unsigned short s_log_zone_size;
+                    unsigned long s_max_size;
+                    unsigned short s_magic;
+  - Xenix FS, SystemV FS, Coherent FS
+                    unsigned short s_firstdatazone;
+                    unsigned long  s_nzones;
+                    unsigned short s_fzone_count;
+                    unsigned long  s_fzones[NICFREE];
+                    unsigned short s_finode_count;
+                    unsigned short s_finodes[NICINOD];
+                    char           s_flock;
+                    char           s_ilock;
+                    char           s_modified;
+                    char           s_rdonly;
+                    unsigned long  s_time;
+                    short          s_dinfo[4]; -- SystemV FS only
+                    unsigned long  s_free_zones;
+                    unsigned short s_free_inodes;
+                    short          s_dinfo[4]; -- Xenix FS only
+                    unsigned short s_interleave_m,s_interleave_n; -- Coherent FS only
+                    char           s_fname[6];
+                    char           s_fpack[6];
+    then they differ considerably:
+        Xenix FS
+                    char           s_clean;
+                    char           s_fill[371];
+                    long           s_magic;
+                    long           s_type;
+        SystemV FS
+                    long           s_fill[12 or 14];
+                    long           s_state;
+                    long           s_magic;
+                    long           s_type;
+        Coherent FS
+                    unsigned long  s_unique;
+    Note that Coherent FS has no magic.
+
+* Inode layout:
+  - Minix FS
+                    unsigned short i_mode;
+                    unsigned short i_uid;
+                    unsigned long  i_size;
+                    unsigned long  i_time;
+                    unsigned char  i_gid;
+                    unsigned char  i_nlinks;
+                    unsigned short i_zone[7+1+1];
+  - Xenix FS, SystemV FS, Coherent FS
+                    unsigned short i_mode;
+                    unsigned short i_nlink;
+                    unsigned short i_uid;
+                    unsigned short i_gid;
+                    unsigned long  i_size;
+                    unsigned char  i_zone[3*(10+1+1+1)];
+                    unsigned long  i_atime;
+                    unsigned long  i_mtime;
+                    unsigned long  i_ctime;
+
+* Regular file data blocks are organized as
+  - Minix FS
+               7 direct blocks
+               1 indirect block (pointers to blocks)
+               1 double-indirect block (pointer to pointers to blocks)
+  - Xenix FS, SystemV FS, Coherent FS
+              10 direct blocks
+               1 indirect block (pointers to blocks)
+               1 double-indirect block (pointer to pointers to blocks)
+               1 triple-indirect block (pointer to pointers to pointers to blocks)
+
+* Inode size, inodes per block
+  - Minix FS        32   32
+  - Xenix FS        64   16
+  - SystemV FS      64   16
+  - Coherent FS     64    8
+
+* Directory entry on disk
+  - Minix FS
+                    unsigned short inode;
+                    char name[14/30];
+  - Xenix FS, SystemV FS, Coherent FS
+                    unsigned short inode;
+                    char name[14];
+
+* Dir entry size, dir entries per block
+  - Minix FS     16/32    64/32
+  - Xenix FS     16       64
+  - SystemV FS   16       64
+  - Coherent FS  16       32
+
+* How to implement symbolic links such that the host fsck doesn't scream:
+  - Minix FS     normal
+  - Xenix FS     kludge: as regular files with  chmod 1000
+  - SystemV FS   ??
+  - Coherent FS  kludge: as regular files with  chmod 1000
 
-Bruno Haible
-<haible@ma2s2.mathematik.uni-karlsruhe.de>
 
+Notation: We often speak of a "block" but mean a zone (the allocation unit)
+and not the disk driver's notion of "block".
diff --git a/fs/sysv/CHANGES b/fs/sysv/CHANGES
deleted file mode 100644
index 66ea6e9..0000000
--- a/fs/sysv/CHANGES
+++ /dev/null
@@ -1,60 +0,0 @@
-Mon, 15 Dec 1997	  Krzysztof G. Baranowski <kgb@manjak.knm.org.pl>
-	*    namei.c: struct sysv_dir_inode_operations updated to use dentries.
-
-Fri, 23 Jan 1998   Krzysztof G. Baranowski <kgb@manjak.knm.org.pl>
-	*    inode.c: corrected 1 track offset setting (in sb->sv_block_base).
-		      Originally it was overridden (by setting to zero)
-		      in detected_[xenix,sysv4,sysv2,coherent]. Thanks
-		      to Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl>
-		      for identifying the problem.
-
-Tue, 27 Jan 1998   Krzysztof G. Baranowski <kgb@manjak.knm.org.pl>
-        *    inode.c: added 2048-byte block support to SystemV FS.
-		      Merged detected_bs[512,1024,2048]() into one function:
-		      void detected_bs (u_char type, struct super_block *sb).
-		      Thanks to Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl>
-		      for the patch.
-
-Wed, 4 Feb 1998   Krzysztof G. Baranowski <kgb@manjak.knm.org.pl>
-	*    namei.c: removed static subdir(); is_subdir() from dcache.c
-		      is used instead. Cosmetic changes.
-
-Thu, 3 Dec 1998   Al Viro (viro@parcelfarce.linux.theplanet.co.uk)
-	*    namei.c (sysv_rmdir):
-		      Bugectomy: old check for victim being busy
-		      (inode->i_count) wasn't replaced (with checking
-		      dentry->d_count) and escaped Linus in the last round
-		      of changes. Shot and buried.
-
-Wed, 9 Dec 1998   AV
-	*    namei.c (do_sysv_rename):
-		       Fixed incorrect check for other owners + race.
-		       Removed checks that went to VFS.
-	*    namei.c (sysv_unlink):
-		       Removed checks that went to VFS.
-
-Thu, 10 Dec 1998   AV
-	*    namei.c (do_mknod):
-			Removed dead code - mknod is never asked to
-			create a symlink or directory. Incidentially,
-			it wouldn't do it right if it would be called.
-
-Sat, 26 Dec 1998   KGB
-	*    inode.c (detect_sysv4):
-			Added detection of expanded s_type field (0x10,
-			0x20 and 0x30).  Forced read-only access in this case.
-
-Sun, 21 Mar 1999   AV
-	*    namei.c (sysv_link):
-			Fixed i_count usage that resulted in dcache corruption.
-	*    inode.c:
-			Filled ->delete_inode() method with sysv_delete_inode().
-			sysv_put_inode() is gone, as it tried to do ->delete_
-			_inode()'s job.
-	*    ialloc.c: (sysv_free_inode):
-			Fixed race.
-
-Sun, 30 Apr 1999   AV
-	*    namei.c (sysv_mknod):
-			Removed dead code (S_IFREG case is now passed to
-			->create() by VFS).
diff --git a/fs/sysv/ChangeLog b/fs/sysv/ChangeLog
deleted file mode 100644
index f403f8b..0000000
--- a/fs/sysv/ChangeLog
+++ /dev/null
@@ -1,106 +0,0 @@
-Thu Feb 14 2002  Andrew Morton  <akpm@zip.com.au>
-
-	* dir_commit_chunk(): call writeout_one_page() as well as
-	  waitfor_one_page() for IS_SYNC directories, so that we
-	  actually do sync the directory. (forward-port from 2.4).
-
-Thu Feb  7 2002  Alexander Viro  <viro@parcelfarce.linux.theplanet.co.uk>
-
-	* super.c: switched to ->get_sb()
-	* ChangeLog: fixed dates ;-)
-
-2002-01-24  David S. Miller  <davem@redhat.com>
-
-	* inode.c: Include linux/init.h
-
-Mon Jan 21 2002  Alexander Viro  <viro@parcelfarce.linux.theplanet.co.uk>
-	* ialloc.c (sysv_new_inode): zero SYSV_I(inode)->i_data out.
-	* i_vnode renamed to vfs_inode.  Sorry, but let's keep that
-	  consistent.
-
-Sat Jan 19 2002  Christoph Hellwig  <hch@infradead.org>
-
-	* include/linux/sysv_fs.h (SYSV_I): Get fs-private inode data using
-		list_entry() instead of inode->u.
-	* include/linux/sysv_fs_i.h: Add 'struct inode  i_vnode' field to
-		sysv_inode_info structure.
-	* inode.c: Include <linux/slab.h>, implement alloc_inode/destroy_inode
-		sop methods, add infrastructure for per-fs inode slab cache.
-	* super.c (init_sysv_fs): Initialize inode cache, recover properly
-		in the case of failed register_filesystem for V7.
-	(exit_sysv_fs): Destroy inode cache.
-
-Sat Jan 19 2002  Christoph Hellwig  <hch@infradead.org>
-
-	* include/linux/sysv_fs.h: Include <linux/sysv_fs_i.h>, declare SYSV_I().
-	* dir.c (sysv_find_entry): Use SYSV_I() instead of ->u.sysv_i to
-		access fs-private inode data.
-	* ialloc.c (sysv_new_inode): Likewise.
-	* inode.c (sysv_read_inode): Likewise.
-	(sysv_update_inode): Likewise.
-	* itree.c (get_branch): Likewise.
-	(sysv_truncate): Likewise.
-	* symlink.c (sysv_readlink): Likewise.
-	(sysv_follow_link): Likewise.
-
-Fri Jan  4 2002  Alexander Viro  <viro@parcelfarce.linux.theplanet.co.uk>
-
-	* ialloc.c (sysv_free_inode): Use sb->s_id instead of bdevname().
-	* inode.c (sysv_read_inode): Likewise.
-	  (sysv_update_inode): Likewise.
-	  (sysv_sync_inode): Likewise.
-	* super.c (detect_sysv): Likewise.
-	  (complete_read_super): Likewise.
-	  (sysv_read_super): Likewise.
-	  (v7_read_super): Likewise.
-
-Sun Dec 30 2001  Manfred Spraul  <manfred@colorfullife.com>
-
-	* dir.c (dir_commit_chunk): Do not set dir->i_version.
-	(sysv_readdir): Likewise.
-
-Thu Dec 27 2001  Alexander Viro  <viro@parcelfarce.linux.theplanet.co.uk>
-
-	* itree.c (get_block): Use map_bh() to fill out bh_result.
-
-Tue Dec 25 2001  Alexander Viro  <viro@parcelfarce.linux.theplanet.co.uk>
-
-	* super.c (sysv_read_super): Use sb_set_blocksize() to set blocksize.
-	  (v7_read_super): Likewise.
-
-Tue Nov 27 2001  Alexander Viro  <viro@parcelfarce.linux.theplanet.co.uk>
-
-	* itree.c (get_block): Change type for iblock argument to sector_t.
-	* super.c (sysv_read_super): Set s_blocksize early.
-	  (v7_read_super): Likewise.
-	* balloc.c (sysv_new_block): Use sb_bread(). instead of bread().
-	  (sysv_count_free_blocks): Likewise.
-	* ialloc.c (sysv_raw_inode): Likewise.
-	* itree.c (get_branch): Likewise.
-	  (free_branches): Likewise.
-	* super.c (sysv_read_super): Likewise.
-	  (v7_read_super): Likewise.
-
-Sat Dec 15 2001  Christoph Hellwig  <hch@infradead.org>
-
-	* inode.c (sysv_read_inode): Mark inode as bad in case of failure.
-	* super.c (complete_read_super): Check for bad root inode.
-
-Wed Nov 21 2001  Andrew Morton  <andrewm@uow.edu.au>
-
-	* file.c (sysv_sync_file): Call fsync_inode_data_buffers.
-
-Fri Oct 26 2001  Christoph Hellwig  <hch@infradead.org>
-
-	* dir.c, ialloc.c, namei.c, include/linux/sysv_fs_i.h:
-	Implement per-Inode lookup offset cache.
-	Modelled after Ted's ext2 patch.
-
-Fri Oct 26 2001  Christoph Hellwig  <hch@infradead.org>
-
-	* inode.c, super.c, include/linux/sysv_fs.h,
-	  include/linux/sysv_fs_sb.h:
-	Remove symlink faking.	Noone really wants to use these as
-	linux filesystems and native OSes don't support it anyway.
-
-
diff --git a/fs/sysv/INTRO b/fs/sysv/INTRO
deleted file mode 100644
index de4e4d1..0000000
--- a/fs/sysv/INTRO
+++ /dev/null
@@ -1,182 +0,0 @@
-This is the implementation of the SystemV/Coherent filesystem for Linux.
-It grew out of separate filesystem implementations
-
-    Xenix FS      Doug Evans <dje@cygnus.com>  June 1992
-    SystemV FS    Paul B. Monday <pmonday@eecs.wsu.edu> March-June 1993
-    Coherent FS   B. Haible <haible@ma2s2.mathematik.uni-karlsruhe.de> June 1993
-
-and was merged together in July 1993.
-
-These filesystems are rather similar. Here is a comparison with Minix FS:
-
-* Linux fdisk reports on partitions
-  - Minix FS     0x81 Linux/Minix
-  - Xenix FS     ??
-  - SystemV FS   ??
-  - Coherent FS  0x08 AIX bootable
-
-* Size of a block or zone (data allocation unit on disk)
-  - Minix FS     1024
-  - Xenix FS     1024 (also 512 ??)
-  - SystemV FS   1024 (also 512 and 2048)
-  - Coherent FS   512
-
-* General layout: all have one boot block, one super block and
-  separate areas for inodes and for directories/data.
-  On SystemV Release 2 FS (e.g. Microport) the first track is reserved and
-  all the block numbers (including the super block) are offset by one track.
-
-* Byte ordering of "short" (16 bit entities) on disk:
-  - Minix FS     little endian  0 1
-  - Xenix FS     little endian  0 1
-  - SystemV FS   little endian  0 1
-  - Coherent FS  little endian  0 1
-  Of course, this affects only the file system, not the data of files on it!
-
-* Byte ordering of "long" (32 bit entities) on disk:
-  - Minix FS     little endian  0 1 2 3
-  - Xenix FS     little endian  0 1 2 3
-  - SystemV FS   little endian  0 1 2 3
-  - Coherent FS  PDP-11         2 3 0 1
-  Of course, this affects only the file system, not the data of files on it!
-
-* Inode on disk: "short", 0 means non-existent, the root dir ino is:
-  - Minix FS                            1
-  - Xenix FS, SystemV FS, Coherent FS   2
-
-* Maximum number of hard links to a file:
-  - Minix FS     250
-  - Xenix FS     ??
-  - SystemV FS   ??
-  - Coherent FS  >=10000
-
-* Free inode management:
-  - Minix FS                             a bitmap
-  - Xenix FS, SystemV FS, Coherent FS
-      There is a cache of a certain number of free inodes in the super-block.
-      When it is exhausted, new free inodes are found using a linear search.
-
-* Free block management:
-  - Minix FS                             a bitmap
-  - Xenix FS, SystemV FS, Coherent FS
-      Free blocks are organized in a "free list". Maybe a misleading term,
-      since it is not true that every free block contains a pointer to
-      the next free block. Rather, the free blocks are organized in chunks
-      of limited size, and every now and then a free block contains pointers
-      to the free blocks pertaining to the next chunk; the first of these
-      contains pointers and so on. The list terminates with a "block number"
-      0 on Xenix FS and SystemV FS, with a block zeroed out on Coherent FS.
-
-* Super-block location:
-  - Minix FS     block 1 = bytes 1024..2047
-  - Xenix FS     block 1 = bytes 1024..2047
-  - SystemV FS   bytes 512..1023
-  - Coherent FS  block 1 = bytes 512..1023
-
-* Super-block layout:
-  - Minix FS
-                    unsigned short s_ninodes;
-                    unsigned short s_nzones;
-                    unsigned short s_imap_blocks;
-                    unsigned short s_zmap_blocks;
-                    unsigned short s_firstdatazone;
-                    unsigned short s_log_zone_size;
-                    unsigned long s_max_size;
-                    unsigned short s_magic;
-  - Xenix FS, SystemV FS, Coherent FS
-                    unsigned short s_firstdatazone;
-                    unsigned long  s_nzones;
-                    unsigned short s_fzone_count;
-                    unsigned long  s_fzones[NICFREE];
-                    unsigned short s_finode_count;
-                    unsigned short s_finodes[NICINOD];
-                    char           s_flock;
-                    char           s_ilock;
-                    char           s_modified;
-                    char           s_rdonly;
-                    unsigned long  s_time;
-                    short          s_dinfo[4]; -- SystemV FS only
-                    unsigned long  s_free_zones;
-                    unsigned short s_free_inodes;
-                    short          s_dinfo[4]; -- Xenix FS only
-                    unsigned short s_interleave_m,s_interleave_n; -- Coherent FS only
-                    char           s_fname[6];
-                    char           s_fpack[6];
-    then they differ considerably:
-        Xenix FS
-                    char           s_clean;
-                    char           s_fill[371];
-                    long           s_magic;
-                    long           s_type;
-        SystemV FS
-                    long           s_fill[12 or 14];
-                    long           s_state;
-                    long           s_magic;
-                    long           s_type;
-        Coherent FS
-                    unsigned long  s_unique;
-    Note that Coherent FS has no magic.
-
-* Inode layout:
-  - Minix FS
-                    unsigned short i_mode;
-                    unsigned short i_uid;
-                    unsigned long  i_size;
-                    unsigned long  i_time;
-                    unsigned char  i_gid;
-                    unsigned char  i_nlinks;
-                    unsigned short i_zone[7+1+1];
-  - Xenix FS, SystemV FS, Coherent FS
-                    unsigned short i_mode;
-                    unsigned short i_nlink;
-                    unsigned short i_uid;
-                    unsigned short i_gid;
-                    unsigned long  i_size;
-                    unsigned char  i_zone[3*(10+1+1+1)];
-                    unsigned long  i_atime;
-                    unsigned long  i_mtime;
-                    unsigned long  i_ctime;
-
-* Regular file data blocks are organized as
-  - Minix FS
-               7 direct blocks
-               1 indirect block (pointers to blocks)
-               1 double-indirect block (pointer to pointers to blocks)
-  - Xenix FS, SystemV FS, Coherent FS
-              10 direct blocks
-               1 indirect block (pointers to blocks)
-               1 double-indirect block (pointer to pointers to blocks)
-               1 triple-indirect block (pointer to pointers to pointers to blocks)
-
-* Inode size, inodes per block
-  - Minix FS        32   32
-  - Xenix FS        64   16
-  - SystemV FS      64   16
-  - Coherent FS     64    8
-
-* Directory entry on disk
-  - Minix FS
-                    unsigned short inode;
-                    char name[14/30];
-  - Xenix FS, SystemV FS, Coherent FS
-                    unsigned short inode;
-                    char name[14];
-
-* Dir entry size, dir entries per block
-  - Minix FS     16/32    64/32
-  - Xenix FS     16       64
-  - SystemV FS   16       64
-  - Coherent FS  16       32
-
-* How to implement symbolic links such that the host fsck doesn't scream:
-  - Minix FS     normal
-  - Xenix FS     kludge: as regular files with  chmod 1000
-  - SystemV FS   ??
-  - Coherent FS  kludge: as regular files with  chmod 1000
-
-
-Notation: We often speak of a "block" but mean a zone (the allocation unit)
-and not the disk driver's notion of "block".
-
-
-Bruno Haible  <haible@ma2s2.mathematik.uni-karlsruhe.de>
-- 
cgit v0.10.2


From 0da1480ec33d4bac8c32051c1d33202be6dc439f Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:03 -0800
Subject: [PATCH] proper prototype for remove_inode_dquot_ref()

Add a proper prototype for remove_inode_dquot_ref() in
include/linux/quotaops.h

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/inode.c b/fs/inode.c
index 699aa4f..9ecccab 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1242,9 +1242,6 @@ EXPORT_SYMBOL(inode_needs_sync);
  */
 #ifdef CONFIG_QUOTA
 
-/* Function back in dquot.c */
-int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
-
 void remove_dquot_ref(struct super_block *sb, int type,
 			struct list_head *tofree_head)
 {
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 5110201..90c23f6 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -37,6 +37,9 @@ extern int dquot_release(struct dquot *dquot);
 extern int dquot_commit_info(struct super_block *sb, int type);
 extern int dquot_mark_dquot_dirty(struct dquot *dquot);
 
+int remove_inode_dquot_ref(struct inode *inode, int type,
+			   struct list_head *tofree_head);
+
 extern int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path);
 extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
 		int format_id, int type);
-- 
cgit v0.10.2


From d0ce7d0346132808d7170b56a01fa582324a7f4d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:05 -0800
Subject: [PATCH] remove drivers/char/riscom8.c:baud_table[]

Commit c7bce3097c0f9bbed76ee6fd03742f2624031a45 removed all usages of
baud_table[] but not the array itself.

Spotted by the GNU C compiler.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c
index 722dd3e..0a77bfc 100644
--- a/drivers/char/riscom8.c
+++ b/drivers/char/riscom8.c
@@ -82,11 +82,6 @@
 static struct riscom_board * IRQ_to_board[16];
 static struct tty_driver *riscom_driver;
 
-static unsigned long baud_table[] =  {
-	0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800,
-	9600, 19200, 38400, 57600, 76800, 0, 
-};
-
 static struct riscom_board rc_board[RC_NBOARD] =  {
 	{
 		.base	= RC_IOBASE1,
-- 
cgit v0.10.2


From cd6ed52568e161ce924593ebc798050a2d23cca0 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:06 -0800
Subject: [PATCH] arch/i386/kernel/reboot.c should #include <linux/reboot.h>

Every file should #include the headers containing the prototypes for
its global functions.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 84278e0..3514b41 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -12,6 +12,7 @@
 #include <linux/dmi.h>
 #include <linux/ctype.h>
 #include <linux/pm.h>
+#include <linux/reboot.h>
 #include <asm/uaccess.h>
 #include <asm/apic.h>
 #include <asm/desc.h>
-- 
cgit v0.10.2


From 65867beb0de4d055637476327b533e5ffbec2b97 Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jordan.crouse@amd.com>
Date: Wed, 6 Dec 2006 20:40:07 -0800
Subject: [PATCH] Trivial cleanup in the PCI IDs for the CS5535

Rename a poorly worded PCI ID for the Geode GX and CS5535 companion chips.
The graphics processor and host bridge actually live in the northbridge on
the integrated processor, not in the companion chip.

Signed-off-by: Jordan Crouse <jordan.crouse@amd.com>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/video/geode/gxfb_core.c b/drivers/video/geode/gxfb_core.c
index 0d3643f..a454dcb 100644
--- a/drivers/video/geode/gxfb_core.c
+++ b/drivers/video/geode/gxfb_core.c
@@ -380,7 +380,7 @@ static void gxfb_remove(struct pci_dev *pdev)
 }
 
 static struct pci_device_id gxfb_id_table[] = {
-	{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_CS5535_VIDEO,
+	{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_GX_VIDEO,
 	  PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY << 16,
 	  0xff0000, 0 },
 	{ 0, }
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index dcdb90f..ff2dcb4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -390,7 +390,7 @@
 #define PCI_DEVICE_ID_NS_CS5535_IDE	0x002d
 #define PCI_DEVICE_ID_NS_CS5535_AUDIO	0x002e
 #define PCI_DEVICE_ID_NS_CS5535_USB	0x002f
-#define PCI_DEVICE_ID_NS_CS5535_VIDEO	0x0030
+#define PCI_DEVICE_ID_NS_GX_VIDEO	0x0030
 #define PCI_DEVICE_ID_NS_SATURN		0x0035
 #define PCI_DEVICE_ID_NS_SCx200_BRIDGE	0x0500
 #define PCI_DEVICE_ID_NS_SCx200_SMI	0x0501
@@ -403,8 +403,7 @@
 #define PCI_DEVICE_ID_NS_SC1100_XBUS	0x0515
 #define PCI_DEVICE_ID_NS_87410		0xd001
 
-#define PCI_DEVICE_ID_NS_CS5535_HOST_BRIDGE  0x0028
-#define PCI_DEVICE_ID_NS_CS5535_ISA_BRIDGE   0x002b
+#define PCI_DEVICE_ID_NS_GX_HOST_BRIDGE  0x0028
 
 #define PCI_VENDOR_ID_TSENG		0x100c
 #define PCI_DEVICE_ID_TSENG_W32P_2	0x3202
-- 
cgit v0.10.2


From 71a3d1b4f7633835048f96a4ba79c8c87f03df4b Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Wed, 6 Dec 2006 20:40:09 -0800
Subject: [PATCH] fs: ufs add missing bracket

Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Cc: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 28fce6c..7dd12bb 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -299,7 +299,7 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
 
 #define ubh_get_addr16(ubh,begin) \
 	(((__fs16*)((ubh)->bh[(begin) >> (uspi->s_fshift-1)]->b_data)) + \
-	((begin) & (uspi->fsize>>1) - 1)))
+	((begin) & ((uspi->fsize>>1) - 1)))
 
 #define ubh_get_addr32(ubh,begin) \
 	(((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \
-- 
cgit v0.10.2


From 5296c7bec8c85aa0a6964eca9ff4a1a8847fba8a Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Wed, 6 Dec 2006 20:40:10 -0800
Subject: [PATCH] fs: reiserfs add missing brackets

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 7bc6bfb..d0e4dce 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -739,7 +739,7 @@ struct block_head {
 #define PUT_B_FREE_SPACE(p_s_bh,val)  do { set_blkh_free_space(B_BLK_HEAD(p_s_bh),val); } while (0)
 
 /* Get right delimiting key. -- little endian */
-#define B_PRIGHT_DELIM_KEY(p_s_bh)   (&(blk_right_delim_key(B_BLK_HEAD(p_s_bh))
+#define B_PRIGHT_DELIM_KEY(p_s_bh)   (&(blk_right_delim_key(B_BLK_HEAD(p_s_bh))))
 
 /* Does the buffer contain a disk leaf. */
 #define B_IS_ITEMS_LEVEL(p_s_bh)     (B_LEVEL(p_s_bh) == DISK_LEAF_NODE_LEVEL)
-- 
cgit v0.10.2


From a0e7688df1484fbf4d6d61c31f7d61a5d8cacf3c Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Wed, 6 Dec 2006 20:40:12 -0800
Subject: [PATCH] Kbuild: add 3 more header files to get properly "unifdef"ed

Add 3 more files to get "unifdef"ed when creating sanitized headers with
"make headers_install".

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Acked-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: "John W. Linville" <linville@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index ff43312..e618b25 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -221,6 +221,7 @@ unifdef-y += if_bridge.h
 unifdef-y += if_ec.h
 unifdef-y += if_eql.h
 unifdef-y += if_ether.h
+unifdef-y += if_fddi.h
 unifdef-y += if_frad.h
 unifdef-y += if_ltalk.h
 unifdef-y += if_pppox.h
@@ -282,6 +283,7 @@ unifdef-y += nvram.h
 unifdef-y += parport.h
 unifdef-y += patchkey.h
 unifdef-y += pci.h
+unifdef-y += personality.h
 unifdef-y += pktcdvd.h
 unifdef-y += pmu.h
 unifdef-y += poll.h
@@ -337,6 +339,7 @@ unifdef-y += videodev.h
 unifdef-y += wait.h
 unifdef-y += wanrouter.h
 unifdef-y += watchdog.h
+unifdef-y += wireless.h
 unifdef-y += xfrm.h
 
 objhdr-y += version.h
-- 
cgit v0.10.2


From a8f48a95619cbce8f85423480e7d0a1bf971a62b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 6 Dec 2006 20:40:13 -0800
Subject: [PATCH] ext3/4: don't do orphan processing on readonly devices

If you do something like:

  # touch foo
  # tail -f foo &
  # rm foo
  # <take snapshot>
  # <mount snapshot>

you'll panic, because ext3/4 tries to do orphan list processing on the
readonly snapshot device, and:

  kernel: journal commit I/O error
  kernel: Assertion failure in journal_flush_Rsmp_e2f189ce() at journal.c:1356: "!journal->j_checkpoint_transactions"
  kernel: Kernel panic: Fatal exception

for a truly readonly underlying device, it's reasonable and necessary
to just skip orphan list processing.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8ab1981..580b8a6 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1264,6 +1264,12 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		return;
 	}
 
+	if (bdev_read_only(sb->s_bdev)) {
+		printk(KERN_ERR "EXT3-fs: write access "
+			"unavailable, skipping orphan cleanup.\n");
+		return;
+	}
+
 	if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
 		if (es->s_last_orphan)
 			jbd_debug(1, "Errors on filesystem, "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2ede7e2..486a641 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1321,6 +1321,12 @@ static void ext4_orphan_cleanup (struct super_block * sb,
 		return;
 	}
 
+	if (bdev_read_only(sb->s_bdev)) {
+		printk(KERN_ERR "EXT4-fs: write access "
+			"unavailable, skipping orphan cleanup.\n");
+		return;
+	}
+
 	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
 		if (es->s_last_orphan)
 			jbd_debug(1, "Errors on filesystem, "
-- 
cgit v0.10.2


From 045f147f3290395661b56b9231fc4d221e150963 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:15 -0800
Subject: [PATCH] remove EXPORT_UNUSED_SYMBOL'ed symbols

In time for 2.6.20, we can get rid of this junk.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/printk.c b/kernel/printk.c
index ba59c2a..c3d90a5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,8 +53,6 @@ int console_printk[4] = {
 	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
 };
 
-EXPORT_UNUSED_SYMBOL(console_printk);  /*  June 2006  */
-
 /*
  * Low lever drivers may need that to know if they can schedule in
  * their unblank() callback or not. So let's export it.
@@ -772,7 +770,6 @@ int is_console_locked(void)
 {
 	return console_locked;
 }
-EXPORT_UNUSED_SYMBOL(is_console_locked);  /*  June 2006  */
 
 /**
  * release_console_sem - unlock the console system
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9425342..00a9697 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-EXPORT_UNUSED_SYMBOL(max_pfn);  /*  June 2006  */
-
 static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
diff --git a/mm/memory.c b/mm/memory.c
index a07120d..4198df0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1902,7 +1902,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 
 	return 0;
 }
-EXPORT_UNUSED_SYMBOL(vmtruncate_range);  /*  June 2006  */
 
 /**
  * swapin_readahead - swap in pages in hope we need them soon
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c..eb58386 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void)
 	return NODE_DATA(first_online_node);
 }
 
-EXPORT_UNUSED_SYMBOL(first_online_pgdat);  /*  June 2006  */
-
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
 	int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
-EXPORT_UNUSED_SYMBOL(next_online_pgdat);  /*  June 2006  */
-
 
 /*
  * next_zone - helper magic for for_each_zone()
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone)
 	}
 	return zone;
 }
-EXPORT_UNUSED_SYMBOL(next_zone);  /*  June 2006  */
 
-- 
cgit v0.10.2


From 8de61e69c2feb10e5391cca67a3faf1d2bf77ce0 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Wed, 6 Dec 2006 20:40:16 -0800
Subject: [PATCH] fs: remove unused variable

Removed unused 'have_pt_gnu_stack' variable.

Reported by David Binderman <dcb314@hotmail.com>

Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 14ea630..be5869d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -544,7 +544,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long reloc_func_desc = 0;
 	char passed_fileno[6];
 	struct files_struct *files;
-	int have_pt_gnu_stack, executable_stack = EXSTACK_DEFAULT;
+	int executable_stack = EXSTACK_DEFAULT;
 	unsigned long def_flags = 0;
 	struct {
 		struct elfhdr elf_ex;
@@ -707,7 +707,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 				executable_stack = EXSTACK_DISABLE_X;
 			break;
 		}
-	have_pt_gnu_stack = (i < loc->elf_ex.e_phnum);
 
 	/* Some simple consistency checks for the interpreter */
 	if (elf_interpreter) {
-- 
cgit v0.10.2


From a09c17a6fdad9ae5b5ea1c3383080f84ec76ab20 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Wed, 6 Dec 2006 20:40:18 -0800
Subject: [PATCH] sys: remove unused variable

Remove unused 'new_ruid' variable.

Reported by David Binderman <dcb314@hotmail.com>.

Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/sys.c b/kernel/sys.c
index c87b461..a0c1a29 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 asmlinkage long sys_setuid(uid_t uid)
 {
 	int old_euid = current->euid;
-	int old_ruid, old_suid, new_ruid, new_suid;
+	int old_ruid, old_suid, new_suid;
 	int retval;
 
 	retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
 	if (retval)
 		return retval;
 
-	old_ruid = new_ruid = current->uid;
+	old_ruid = current->uid;
 	old_suid = current->suid;
 	new_suid = old_suid;
 	
-- 
cgit v0.10.2


From 012d3ca8d85bf3ae5278ba897dd1ca3999247107 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@freedesktop.org>
Date: Wed, 6 Dec 2006 20:40:19 -0800
Subject: [PATCH] Add Sparse annotations to SRCU wrapper functions in
 rcutorture

The SRCU wrapper functions srcu_torture_read_lock and
srcu_torture_read_unlock in rcutorture intentionally change the SRCU
context; annotate them accordingly, to avoid a warning.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18..c52f981 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
 	cleanup_srcu_struct(&srcu_ctl);
 }
 
-static int srcu_torture_read_lock(void)
+static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
 {
 	return srcu_read_lock(&srcu_ctl);
 }
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
 		schedule_timeout_interruptible(longdelay);
 }
 
-static void srcu_torture_read_unlock(int idx)
+static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
 {
 	srcu_read_unlock(&srcu_ctl, idx);
 }
-- 
cgit v0.10.2


From 30aaa5c67e72849ccf290b9a7cad58e43bbb47ac Mon Sep 17 00:00:00 2001
From: Torben Mathiasen <torben.mathiasen@hp.com>
Date: Wed, 6 Dec 2006 20:40:20 -0800
Subject: [PATCH] New updated devices.txt - LANANA

[jengelh@linux01.gwdg.de: cleanups]
Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Acked-by: Torben Mathiasen <torben.mathiasen@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 70690f1..8de132a 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -3,7 +3,7 @@
 
 	     Maintained by Torben Mathiasen <device@lanana.org>
 
-		      Last revised: 15 May 2006
+		      Last revised: 29 November 2006
 
 This list is the Linux Device List, the official registry of allocated
 device numbers and /dev directory nodes for the Linux operating
@@ -94,6 +94,7 @@ Your cooperation is appreciated.
 		  9 = /dev/urandom	Faster, less secure random number gen.
 		 10 = /dev/aio		Asynchronous I/O notification interface
 		 11 = /dev/kmsg		Writes to this come out as printk's
+
   1 block	RAM disk
 		  0 = /dev/ram0		First RAM disk
 		  1 = /dev/ram1		Second RAM disk
@@ -122,7 +123,7 @@ Your cooperation is appreciated.
 		devices are on major 128 and above and use the PTY
 		master multiplex (/dev/ptmx) to acquire a PTY on
 		demand.
-  
+
   2 block	Floppy disks
 		  0 = /dev/fd0		Controller 0, drive 0, autodetect
 		  1 = /dev/fd1		Controller 0, drive 1, autodetect
@@ -257,7 +258,7 @@ Your cooperation is appreciated.
 		129 = /dev/vcsa1	tty1 text/attribute contents
 		    ...
 		191 = /dev/vcsa63	tty63 text/attribute contents
-	
+
 		NOTE: These devices permit both read and write access.
 
   7 block	Loopback devices
@@ -411,7 +412,7 @@ Your cooperation is appreciated.
 		207 = /dev/video/em8300_sp	EM8300 DVD decoder subpicture
 		208 = /dev/compaq/cpqphpc	Compaq PCI Hot Plug Controller
 		209 = /dev/compaq/cpqrid	Compaq Remote Insight Driver
-		210 = /dev/impi/bt	IMPI coprocessor block transfer	
+		210 = /dev/impi/bt	IMPI coprocessor block transfer
 		211 = /dev/impi/smic	IMPI coprocessor stream interface
 		212 = /dev/watchdogs/0	First watchdog device
 		213 = /dev/watchdogs/1	Second watchdog device
@@ -506,6 +507,7 @@ Your cooperation is appreciated.
 		 33 = /dev/patmgr1	Sequencer patch manager
 		 34 = /dev/midi02	Third MIDI port
 		 50 = /dev/midi03	Fourth MIDI port
+
  14 block	BIOS harddrive callback support {2.6}
 		  0 = /dev/dos_hda	First BIOS harddrive whole disk
 		 64 = /dev/dos_hdb	Second BIOS harddrive whole disk
@@ -527,6 +529,7 @@ Your cooperation is appreciated.
 
  16 char	Non-SCSI scanners
 		  0 = /dev/gs4500	Genius 4500 handheld scanner
+
  16 block	GoldStar CD-ROM
 		  0 = /dev/gscd		GoldStar CD-ROM
 
@@ -548,6 +551,7 @@ Your cooperation is appreciated.
 		  0 = /dev/ttyC0	First Cyclades port
 		    ...
 		 31 = /dev/ttyC31	32nd Cyclades port
+
  19 block	"Double" compressed disk
 		  0 = /dev/double0	First compressed disk
 		    ...
@@ -563,6 +567,7 @@ Your cooperation is appreciated.
 		  0 = /dev/cub0		Callout device for ttyC0
 		    ...
 		 31 = /dev/cub31	Callout device for ttyC31
+
  20 block	Hitachi CD-ROM (under development)
 		  0 = /dev/hitcd	Hitachi CD-ROM
 
@@ -582,7 +587,7 @@ Your cooperation is appreciated.
 
 		This device is used on the ARM-based Acorn RiscPC.
 		Partitions are handled the same way as for IDE disks
-		(see major number 3). 
+		(see major number 3).
 
  22 char	Digiboard serial card
 		  0 = /dev/ttyD0	First Digiboard port
@@ -591,7 +596,7 @@ Your cooperation is appreciated.
  22 block	Second IDE hard disk/CD-ROM interface
 		  0 = /dev/hdc		Master: whole disk (or CD-ROM)
 		 64 = /dev/hdd		Slave: whole disk (or CD-ROM)
-		
+
 		Partitions are handled the same way as for the first
 		interface (see major number 3).
 
@@ -639,6 +644,7 @@ Your cooperation is appreciated.
 
  26 char	Quanta WinVision frame grabber {2.6}
 		  0 = /dev/wvisfgrab	Quanta WinVision frame grabber
+
  26 block	Second Matsushita (Panasonic/SoundBlaster) CD-ROM
 		  0 = /dev/sbpcd4	Panasonic CD-ROM controller 1 unit 0
 		  1 = /dev/sbpcd5	Panasonic CD-ROM controller 1 unit 1
@@ -670,6 +676,7 @@ Your cooperation is appreciated.
 		 37 = /dev/nrawqft1	Unit 1, no rewind-on-close, no file marks
 		 38 = /dev/nrawqft2	Unit 2, no rewind-on-close, no file marks
 		 39 = /dev/nrawqft3	Unit 3, no rewind-on-close, no file marks
+
  27 block	Third Matsushita (Panasonic/SoundBlaster) CD-ROM
 		  0 = /dev/sbpcd8	Panasonic CD-ROM controller 2 unit 0
 		  1 = /dev/sbpcd9	Panasonic CD-ROM controller 2 unit 1
@@ -681,6 +688,7 @@ Your cooperation is appreciated.
 		  1 = /dev/staliomem1	Second Stallion card I/O memory
 		  2 = /dev/staliomem2	Third Stallion card I/O memory
 		  3 = /dev/staliomem3	Fourth Stallion card I/O memory
+
  28 char	Atari SLM ACSI laser printer (68k/Atari)
 		  0 = /dev/slm0		First SLM laser printer
 		  1 = /dev/slm1		Second SLM laser printer
@@ -690,6 +698,7 @@ Your cooperation is appreciated.
 		  1 = /dev/sbpcd13	Panasonic CD-ROM controller 3 unit 1
 		  2 = /dev/sbpcd14	Panasonic CD-ROM controller 3 unit 2
 		  3 = /dev/sbpcd15	Panasonic CD-ROM controller 3 unit 3
+
  28 block	ACSI disk (68k/Atari)
 		  0 = /dev/ada		First ACSI disk whole disk
 		 16 = /dev/adb		Second ACSI disk whole disk
@@ -750,6 +759,7 @@ Your cooperation is appreciated.
  31 char	MPU-401 MIDI
 		  0 = /dev/mpu401data	MPU-401 data port
 		  1 = /dev/mpu401stat	MPU-401 status port
+
  31 block	ROM/flash memory card
 		  0 = /dev/rom0		First ROM card (rw)
 		      ...
@@ -801,7 +811,7 @@ Your cooperation is appreciated.
  34 block	Fourth IDE hard disk/CD-ROM interface
 		  0 = /dev/hdg		Master: whole disk (or CD-ROM)
 		 64 = /dev/hdh		Slave: whole disk (or CD-ROM)
-		
+
 		Partitions are handled the same way as for the first
 		interface (see major number 3).
 
@@ -818,6 +828,7 @@ Your cooperation is appreciated.
 		129 = /dev/smpte1	Second MIDI port, SMPTE timed
 		130 = /dev/smpte2	Third MIDI port, SMPTE timed
 		131 = /dev/smpte3	Fourth MIDI port, SMPTE timed
+
  35 block	Slow memory ramdisk
 		  0 = /dev/slram	Slow memory ramdisk
 
@@ -828,6 +839,7 @@ Your cooperation is appreciated.
 		 16 = /dev/tap0		First Ethertap device
 		    ...
 		 31 = /dev/tap15	16th Ethertap device
+
  36 block	MCA ESDI hard disk
 		  0 = /dev/eda		First ESDI disk whole disk
 		 64 = /dev/edb		Second ESDI disk whole disk
@@ -882,6 +894,7 @@ Your cooperation is appreciated.
 
  40 char	Matrox Meteor frame grabber {2.6}
 		  0 = /dev/mmetfgrab	Matrox Meteor frame grabber
+
  40 block	Syquest EZ135 parallel port removable drive
 		  0 = /dev/eza		Parallel EZ135 drive, whole disk
 
@@ -893,6 +906,7 @@ Your cooperation is appreciated.
 
  41 char	Yet Another Micro Monitor
 		  0 = /dev/yamm		Yet Another Micro Monitor
+
  41 block	MicroSolutions BackPack parallel port CD-ROM
 		  0 = /dev/bpcd		BackPack CD-ROM
 
@@ -901,6 +915,7 @@ Your cooperation is appreciated.
 		the parallel port ATAPI CD-ROM driver at major number 46.
 
  42 char	Demo/sample use
+
  42 block	Demo/sample use
 
 		This number is intended for use in sample code, as
@@ -918,6 +933,7 @@ Your cooperation is appreciated.
 		  0 = /dev/ttyI0	First virtual modem
 		    ...
 		 63 = /dev/ttyI63	64th virtual modem
+
  43 block	Network block devices
 		  0 = /dev/nb0		First network block device
 		  1 = /dev/nb1		Second network block device
@@ -934,12 +950,13 @@ Your cooperation is appreciated.
 		  0 = /dev/cui0		Callout device for ttyI0
 		    ...
 		 63 = /dev/cui63	Callout device for ttyI63
+
  44 block	Flash Translation Layer (FTL) filesystems
 		  0 = /dev/ftla		FTL on first Memory Technology Device
 		 16 = /dev/ftlb		FTL on second Memory Technology Device
 		 32 = /dev/ftlc		FTL on third Memory Technology Device
 		    ...
-		240 = /dev/ftlp		FTL on 16th Memory Technology Device 
+		240 = /dev/ftlp		FTL on 16th Memory Technology Device
 
 		Partitions are handled in the same way as for IDE
 		disks (see major number 3) except that the partition
@@ -958,6 +975,7 @@ Your cooperation is appreciated.
 		191 = /dev/ippp63	64th SyncPPP device
 
 		255 = /dev/isdninfo	ISDN monitor interface
+
  45 block	Parallel port IDE disk devices
 		  0 = /dev/pda		First parallel port IDE disk
 		 16 = /dev/pdb		Second parallel port IDE disk
@@ -1044,6 +1062,7 @@ Your cooperation is appreciated.
 		  1 = /dev/dcbri1	Second DataComm card
 		  2 = /dev/dcbri2	Third DataComm card
 		  3 = /dev/dcbri3	Fourth DataComm card
+
  52 block	Mylex DAC960 PCI RAID controller; fifth controller
 		  0 = /dev/rd/c4d0	First disk, whole disk
 		  8 = /dev/rd/c4d1	Second disk, whole disk
@@ -1093,6 +1112,7 @@ Your cooperation is appreciated.
 
  55 char	DSP56001 digital signal processor
 		  0 = /dev/dsp56k	First DSP56001
+
  55 block	Mylex DAC960 PCI RAID controller; eighth controller
 		  0 = /dev/rd/c7d0	First disk, whole disk
 		  8 = /dev/rd/c7d1	Second disk, whole disk
@@ -1130,6 +1150,7 @@ Your cooperation is appreciated.
 		  0 = /dev/cup0		Callout device for ttyP0
 		  1 = /dev/cup1		Callout device for ttyP1
 		    ...
+
  58 block	Reserved for logical volume manager
 
  59 char	sf firewall package
@@ -1149,6 +1170,7 @@ Your cooperation is appreciated.
 		NAMING CONFLICT -- PROPOSED REVISED NAME /dev/rpda0 etc
 
  60-63 char	LOCAL/EXPERIMENTAL USE
+
  60-63 block	LOCAL/EXPERIMENTAL USE
 		Allocated for local/experimental use.  For devices not
 		assigned official numbers, these ranges should be
@@ -1434,7 +1456,6 @@ Your cooperation is appreciated.
 		DAC960 (see major number 48) except that the limit on
 		partitions is 15.
 
-
  78 char	PAM Software's multimodem boards
 		  0 = /dev/ttyM0	First PAM modem
 		  1 = /dev/ttyM1	Second PAM modem
@@ -1450,7 +1471,6 @@ Your cooperation is appreciated.
 		DAC960 (see major number 48) except that the limit on
 		partitions is 15.
 
-
  79 char	PAM Software's multimodem boards - alternate devices
 		  0 = /dev/cum0		Callout device for ttyM0
 		  1 = /dev/cum1		Callout device for ttyM1
@@ -1466,7 +1486,6 @@ Your cooperation is appreciated.
 		DAC960 (see major number 48) except that the limit on
 		partitions is 15.
 
-
  80 char	Photometrics AT200 CCD camera
 		  0 = /dev/at200	Photometrics AT200 CCD camera
 
@@ -1679,7 +1698,7 @@ Your cooperation is appreciated.
 		  1 = /dev/dcxx1	Second capture card
 		    ...
 
- 94 block IBM S/390 DASD block storage
+ 94 block	IBM S/390 DASD block storage
     		  0 = /dev/dasda First DASD device, major
     		  1 = /dev/dasda1 First DASD device, block 1
 	    	  2 = /dev/dasda2 First DASD device, block 2
@@ -1695,7 +1714,7 @@ Your cooperation is appreciated.
 		  1 = /dev/ipnat	NAT control device/log file
 		  2 = /dev/ipstate	State information log file
 		  3 = /dev/ipauth	Authentication control device/log file
-		    ...		
+		    ...
 
  96 char	Parallel port ATAPI tape devices
 		  0 = /dev/pt0		First parallel port ATAPI tape
@@ -1705,7 +1724,7 @@ Your cooperation is appreciated.
 		129 = /dev/npt1		Second p.p. ATAPI tape, no rewind
 		    ...
 
- 96 block Inverse NAND Flash Translation Layer
+ 96 block	Inverse NAND Flash Translation Layer
 		  0 = /dev/inftla First INFTL layer
 		 16 = /dev/inftlb Second INFTL layer
 		    ...
@@ -1937,7 +1956,6 @@ Your cooperation is appreciated.
 		    ...
 
 113 block	IBM iSeries virtual CD-ROM
-
 		  0 = /dev/iseries/vcda	First virtual CD-ROM
 		  1 = /dev/iseries/vcdb	Second virtual CD-ROM
 		    ...
@@ -2059,11 +2077,12 @@ Your cooperation is appreciated.
 		    ...
 
 119 char	VMware virtual network control
-		  0 = /dev/vnet0	1st virtual network
-		  1 = /dev/vnet1	2nd virtual network
+		  0 = /dev/vmnet0	1st virtual network
+		  1 = /dev/vmnet1	2nd virtual network
 		    ...
 
 120-127 char	LOCAL/EXPERIMENTAL USE
+
 120-127 block	LOCAL/EXPERIMENTAL USE
 		Allocated for local/experimental use.  For devices not
 		assigned official numbers, these ranges should be
@@ -2075,7 +2094,6 @@ Your cooperation is appreciated.
 		nodes; instead they should be accessed through the
 		/dev/ptmx cloning interface.
 
-
 128 block       SCSI disk devices (128-143)
                   0 = /dev/sddy         129th SCSI disk whole disk
                  16 = /dev/sddz         130th SCSI disk whole disk
@@ -2087,7 +2105,6 @@ Your cooperation is appreciated.
 		disks (see major number 3) except that the limit on
 		partitions is 15.
 
-
 129 block       SCSI disk devices (144-159)
                   0 = /dev/sdeo         145th SCSI disk whole disk
                  16 = /dev/sdep         146th SCSI disk whole disk
@@ -2123,7 +2140,6 @@ Your cooperation is appreciated.
 		disks (see major number 3) except that the limit on
 		partitions is 15.
 
-
 132 block       SCSI disk devices (192-207)
                   0 = /dev/sdgk         193rd SCSI disk whole disk
                  16 = /dev/sdgl         194th SCSI disk whole disk
@@ -2135,7 +2151,6 @@ Your cooperation is appreciated.
 		disks (see major number 3) except that the limit on
 		partitions is 15.
 
-
 133 block       SCSI disk devices (208-223)
                   0 = /dev/sdha         209th SCSI disk whole disk
                  16 = /dev/sdhb         210th SCSI disk whole disk
@@ -2147,7 +2162,6 @@ Your cooperation is appreciated.
 		disks (see major number 3) except that the limit on
 		partitions is 15.
 
-
 134 block       SCSI disk devices (224-239)
                   0 = /dev/sdhq         225th SCSI disk whole disk
                  16 = /dev/sdhr         226th SCSI disk whole disk
@@ -2159,7 +2173,6 @@ Your cooperation is appreciated.
 		disks (see major number 3) except that the limit on
 		partitions is 15.
 
-
 135 block       SCSI disk devices (240-255)
                   0 = /dev/sdig         241st SCSI disk whole disk
                  16 = /dev/sdih         242nd SCSI disk whole disk
@@ -2171,7 +2184,6 @@ Your cooperation is appreciated.
 		disks (see major number 3) except that the limit on
 		partitions is 15.
 
-
 136-143 char	Unix98 PTY slaves
 		  0 = /dev/pts/0	First Unix98 pseudo-TTY
 		  1 = /dev/pts/1	Second Unix98 pesudo-TTY
@@ -2384,6 +2396,7 @@ Your cooperation is appreciated.
 		    ...
 
 159 char	RESERVED
+
 159 block	RESERVED
 
 160 char	General Purpose Instrument Bus (GPIB)
@@ -2427,7 +2440,7 @@ Your cooperation is appreciated.
 
 		Partitions are handled in the same way as for IDE
 		disks (see major number 3) except that the limit on
-		partitions is 31. 
+		partitions is 31.
 
 162 char	Raw block device interface
 		  0 = /dev/rawctl	Raw I/O control device
@@ -2483,7 +2496,6 @@ Your cooperation is appreciated.
 
 171 char	Reserved for IEEE 1394 (Firewire)
 
-
 172 char	Moxa Intellio serial card
 		  0 = /dev/ttyMX0	First Moxa port
 		  1 = /dev/ttyMX1	Second Moxa port
@@ -2543,9 +2555,6 @@ Your cooperation is appreciated.
 		 64 = /dev/usb/rio500	Diamond Rio 500
 		 65 = /dev/usb/usblcd	USBLCD Interface (info@usblcd.de)
 		 66 = /dev/usb/cpad0	Synaptics cPad (mouse/LCD)
-		 67 = /dev/usb/adutux0	1st Ontrak ADU device
-		    ...
-		 76 = /dev/usb/adutux10	10th Ontrak ADU device
 		 96 = /dev/usb/hiddev0	1st USB HID device
 		    ...
 		111 = /dev/usb/hiddev15	16th USB HID device
@@ -2558,7 +2567,7 @@ Your cooperation is appreciated.
 		132 = /dev/usb/idmouse	ID Mouse (fingerprint scanner) device
 		133 = /dev/usb/sisusbvga1	First SiSUSB VGA device
 		    ...
-		140 = /dev/usb/sisusbvga8	Eigth SISUSB VGA device
+		140 = /dev/usb/sisusbvga8	Eighth SISUSB VGA device
 		144 = /dev/usb/lcd	USB LCD device
 		160 = /dev/usb/legousbtower0	1st USB Legotower device
 		    ...
@@ -2571,7 +2580,7 @@ Your cooperation is appreciated.
 		  0 = /dev/uba		First USB block device
 		  8 = /dev/ubb		Second USB block device
 		 16 = /dev/ubc		Third USB block device
-		    ...
+ 		    ...
 
 181 char	Conrad Electronic parallel port radio clocks
 		  0 = /dev/pcfclock0	First Conrad radio clock
@@ -2657,7 +2666,7 @@ Your cooperation is appreciated.
 		 32 = /dev/mvideo/status2	Third device
 		    ...
 		    ...
-		240 = /dev/mvideo/status15	16th device 
+		240 = /dev/mvideo/status15	16th device
 		    ...
 
 195 char	Nvidia graphics devices
@@ -2795,6 +2804,10 @@ Your cooperation is appreciated.
 		    ...
 		 185 = /dev/ttyNX15		Hilscher netX serial port 15
 		 186 = /dev/ttyJ0		JTAG1 DCC protocol based serial port emulation
+		 187 = /dev/ttyUL0		Xilinx uartlite - port 0
+		    ...
+		 190 = /dev/ttyUL3		Xilinx uartlite - port 3
+		 191 = /dev/xvc0		Xen virtual console - port 0
 
 205 char	Low-density serial ports (alternate device)
 		  0 = /dev/culu0		Callout device for ttyLU0
@@ -2832,7 +2845,6 @@ Your cooperation is appreciated.
 		 82 = /dev/cuvr0		Callout device for ttyVR0
 		 83 = /dev/cuvr1		Callout device for ttyVR1
 
-
 206 char	OnStream SC-x0 tape devices
 		  0 = /dev/osst0		First OnStream SCSI tape, mode 0
 		  1 = /dev/osst1		Second OnStream SCSI tape, mode 0
@@ -2922,7 +2934,6 @@ Your cooperation is appreciated.
 		    ...
 
 212 char	LinuxTV.org DVB driver subsystem
-
 		  0 = /dev/dvb/adapter0/video0    first video decoder of first card
 		  1 = /dev/dvb/adapter0/audio0    first audio decoder of first card
 		  2 = /dev/dvb/adapter0/sec0      (obsolete/unused)
@@ -3008,9 +3019,9 @@ Your cooperation is appreciated.
 		  2 = /dev/3270/tub2		Second 3270 terminal
 		    ...
 
-229 char	IBM iSeries virtual console
-		  0 = /dev/iseries/vtty0	First console port
-		  1 = /dev/iseries/vtty1	Second console port
+229 char	IBM iSeries/pSeries virtual console
+		  0 = /dev/hvc0			First console port
+		  1 = /dev/hvc1			Second console port
 		    ...
 
 230 char	IBM iSeries virtual tape
@@ -3083,12 +3094,14 @@ Your cooperation is appreciated.
 234-239		UNASSIGNED
 
 240-254 char	LOCAL/EXPERIMENTAL USE
+
 240-254 block	LOCAL/EXPERIMENTAL USE
 		Allocated for local/experimental use.  For devices not
 		assigned official numbers, these ranges should be
 		used in order to avoid conflicting with future assignments.
 
 255 char	RESERVED
+
 255 block	RESERVED
 
 		This major is reserved to assist the expansion to a
@@ -3115,7 +3128,20 @@ Your cooperation is appreciated.
 257 char	Phoenix Technologies Cryptographic Services Driver
 		  0 = /dev/ptlsec	Crypto Services Driver
 
-
+257 block	SSFDC Flash Translation Layer filesystem
+		  0 = /dev/ssfdca	First SSFDC layer
+		  8 = /dev/ssfdcb	Second SSFDC layer
+		 16 = /dev/ssfdcc	Third SSFDC layer
+		 24 = /dev/ssfdcd	4th SSFDC layer
+		 32 = /dev/ssfdce	5th SSFDC layer
+		 40 = /dev/ssfdcf	6th SSFDC layer
+		 48 = /dev/ssfdcg	7th SSFDC layer
+		 56 = /dev/ssfdch	8th SSFDC layer
+
+258 block	ROM/Flash read-only translation layer
+		  0 = /dev/blockrom0	First ROM card's translation layer interface
+		  1 = /dev/blockrom1	Second ROM card's translation layer interface
+		  ...
 
  ****	ADDITIONAL /dev DIRECTORY ENTRIES
 
-- 
cgit v0.10.2


From bb8cc641653d785e3f3b8d3b0182a69edf825802 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:21 -0800
Subject: [PATCH] include/asm-cris/: "extern inline" -> "static inline"

"extern inline" generates a warning with -Wmissing-prototypes and I'm
currently working on getting the kernel cleaned up for adding this to the
CFLAGS since it will help us to avoid a nasty class of runtime errors.

If there are places that really need a forced inline, __always_inline would be
the correct solution.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Mikael Starvik <starvik@axis.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-cris/arch-v10/bitops.h b/include/asm-cris/arch-v10/bitops.h
index b73f539..be85f6d 100644
--- a/include/asm-cris/arch-v10/bitops.h
+++ b/include/asm-cris/arch-v10/bitops.h
@@ -10,7 +10,7 @@
  * number.  They differ in that the first function also inverts all bits
  * in the input.
  */
-extern inline unsigned long cris_swapnwbrlz(unsigned long w)
+static inline unsigned long cris_swapnwbrlz(unsigned long w)
 {
 	/* Let's just say we return the result in the same register as the
 	   input.  Saying we clobber the input but can return the result
@@ -26,7 +26,7 @@ extern inline unsigned long cris_swapnwbrlz(unsigned long w)
 	return res;
 }
 
-extern inline unsigned long cris_swapwbrlz(unsigned long w)
+static inline unsigned long cris_swapwbrlz(unsigned long w)
 {
 	unsigned res;
 	__asm__ ("swapwbr %0 \n\t"
@@ -40,7 +40,7 @@ extern inline unsigned long cris_swapwbrlz(unsigned long w)
  * ffz = Find First Zero in word. Undefined if no zero exists,
  * so code should check against ~0UL first..
  */
-extern inline unsigned long ffz(unsigned long w)
+static inline unsigned long ffz(unsigned long w)
 {
 	return cris_swapnwbrlz(w);
 }
@@ -51,7 +51,7 @@ extern inline unsigned long ffz(unsigned long w)
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-extern inline unsigned long __ffs(unsigned long word)
+static inline unsigned long __ffs(unsigned long word)
 {
 	return cris_swapnwbrlz(~word);
 }
@@ -65,7 +65,7 @@ extern inline unsigned long __ffs(unsigned long word)
  * differs in spirit from the above ffz (man ffs).
  */
 
-extern inline unsigned long kernel_ffs(unsigned long w)
+static inline unsigned long kernel_ffs(unsigned long w)
 {
 	return w ? cris_swapwbrlz (w) + 1 : 0;
 }
diff --git a/include/asm-cris/semaphore-helper.h b/include/asm-cris/semaphore-helper.h
index dbd0f30..a8e1e6c 100644
--- a/include/asm-cris/semaphore-helper.h
+++ b/include/asm-cris/semaphore-helper.h
@@ -20,12 +20,12 @@
 /*
  * These two _must_ execute atomically wrt each other.
  */
-extern inline void wake_one_more(struct semaphore * sem)
+static inline void wake_one_more(struct semaphore * sem)
 {
 	atomic_inc(&sem->waking);
 }
 
-extern inline int waking_non_zero(struct semaphore *sem)
+static inline int waking_non_zero(struct semaphore *sem)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -40,7 +40,7 @@ extern inline int waking_non_zero(struct semaphore *sem)
 	return ret;
 }
 
-extern inline int waking_non_zero_interruptible(struct semaphore *sem,
+static inline int waking_non_zero_interruptible(struct semaphore *sem,
 						struct task_struct *tsk)
 {
 	int ret = 0;
@@ -59,7 +59,7 @@ extern inline int waking_non_zero_interruptible(struct semaphore *sem,
 	return ret;
 }
 
-extern inline int waking_non_zero_trylock(struct semaphore *sem)
+static inline int waking_non_zero_trylock(struct semaphore *sem)
 {
         int ret = 1;
 	unsigned long flags;
-- 
cgit v0.10.2


From 219576e127c4f7770aa985f719e815936bc54367 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:22 -0800
Subject: [PATCH] include/asm-h8300/: "extern inline" -> "static inline"

"extern inline" generates a warning with -Wmissing-prototypes and I'm
currently working on getting the kernel cleaned up for adding this to the
CFLAGS since it will help us to avoid a nasty class of runtime errors.

If there are places that really need a forced inline, __always_inline would be
the correct solution.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-h8300/delay.h b/include/asm-h8300/delay.h
index cbccbbd..743beba 100644
--- a/include/asm-h8300/delay.h
+++ b/include/asm-h8300/delay.h
@@ -9,7 +9,7 @@
  * Delay routines, using a pre-computed "loops_per_second" value.
  */
 
-extern __inline__ void __delay(unsigned long loops)
+static inline void __delay(unsigned long loops)
 {
 	__asm__ __volatile__ ("1:\n\t"
 			      "dec.l #1,%0\n\t"
@@ -27,7 +27,7 @@ extern __inline__ void __delay(unsigned long loops)
 
 extern unsigned long loops_per_jiffy;
 
-extern __inline__ void udelay(unsigned long usecs)
+static inline void udelay(unsigned long usecs)
 {
 	usecs *= 4295;		/* 2**32 / 1000000 */
 	usecs /= (loops_per_jiffy*HZ);
diff --git a/include/asm-h8300/mmu_context.h b/include/asm-h8300/mmu_context.h
index 855721a..5c165f7 100644
--- a/include/asm-h8300/mmu_context.h
+++ b/include/asm-h8300/mmu_context.h
@@ -9,7 +9,7 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 }
 
-extern inline int
+static inline int
 init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	// mm->context = virt_to_phys(mm->pgd);
@@ -23,7 +23,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, str
 {
 }
 
-extern inline void activate_mm(struct mm_struct *prev_mm,
+static inline void activate_mm(struct mm_struct *prev_mm,
 			       struct mm_struct *next_mm)
 {
 }
diff --git a/include/asm-h8300/pci.h b/include/asm-h8300/pci.h
index 5edad5b..0c771b0 100644
--- a/include/asm-h8300/pci.h
+++ b/include/asm-h8300/pci.h
@@ -10,12 +10,12 @@
 #define pcibios_assign_all_busses()	0
 #define pcibios_scan_all_fns(a, b)	0
 
-extern inline void pcibios_set_master(struct pci_dev *dev)
+static inline void pcibios_set_master(struct pci_dev *dev)
 {
 	/* No special bus mastering setup handling */
 }
 
-extern inline void pcibios_penalize_isa_irq(int irq, int active)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-h8300/tlbflush.h b/include/asm-h8300/tlbflush.h
index bbdffbee..9a2c5c9 100644
--- a/include/asm-h8300/tlbflush.h
+++ b/include/asm-h8300/tlbflush.h
@@ -47,12 +47,12 @@ static inline void flush_tlb_range(struct mm_struct *mm,
 	BUG();
 }
 
-extern inline void flush_tlb_kernel_page(unsigned long addr)
+static inline void flush_tlb_kernel_page(unsigned long addr)
 {
 	BUG();
 }
 
-extern inline void flush_tlb_pgtables(struct mm_struct *mm,
+static inline void flush_tlb_pgtables(struct mm_struct *mm,
 				      unsigned long start, unsigned long end)
 {
 	BUG();
-- 
cgit v0.10.2


From 3ee6f61ca0720c71086a9eaf3f5bd0f7c51fe139 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:23 -0800
Subject: [PATCH] remove NFSD_OPTIMIZE_SPACE

This patch removes the unused NFSD_OPTIMIZE_SPACE.

Additionally, it does differently what NFSD_OPTIMIZE_SPACE was supposed to do:

Nowadays, gcc knows best when to inline code, and CONFIG_CC_OPTIMIZE_FOR_SIZE
even tells gcc globally whether to optimize for size or for speed.  Therefore,
this patch also removes all inline's from these files.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index b4baca3..277df40 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -24,10 +24,6 @@
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
-#ifdef NFSD_OPTIMIZE_SPACE
-# define inline
-#endif
-
 
 /*
  * Mapping of S_IF* types to NFS file types
@@ -42,14 +38,14 @@ static u32	nfs3_ftypes[] = {
 /*
  * XDR functions for basic NFS types
  */
-static inline __be32 *
+static __be32 *
 encode_time3(__be32 *p, struct timespec *time)
 {
 	*p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 decode_time3(__be32 *p, struct timespec *time)
 {
 	time->tv_sec = ntohl(*p++);
@@ -57,7 +53,7 @@ decode_time3(__be32 *p, struct timespec *time)
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size;
@@ -77,7 +73,7 @@ __be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 	return decode_fh(p, fhp);
 }
 
-static inline __be32 *
+static __be32 *
 encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size = fhp->fh_handle.fh_size;
@@ -91,7 +87,7 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
  * no slashes or null bytes.
  */
-static inline __be32 *
+static __be32 *
 decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
@@ -107,7 +103,7 @@ decode_filename(__be32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 decode_sattr3(__be32 *p, struct iattr *iap)
 {
 	u32	tmp;
@@ -153,7 +149,7 @@ decode_sattr3(__be32 *p, struct iattr *iap)
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	      struct kstat *stat)
 {
@@ -186,7 +182,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct inode	*inode = fhp->fh_dentry->d_inode;
@@ -776,7 +772,7 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
 		return xdr_ressize_check(rqstp, p);
 }
 
-static inline __be32 *
+static __be32 *
 encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
 	     int namlen, ino_t ino)
 {
@@ -790,7 +786,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
 		struct svc_fh *fhp)
 {
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 56ebb14..f5243f9 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -18,11 +18,6 @@
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
-
-#ifdef NFSD_OPTIMIZE_SPACE
-# define inline
-#endif
-
 /*
  * Mapping of S_IF* types to NFS file types
  */
@@ -55,7 +50,7 @@ __be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 	return decode_fh(p, fhp);
 }
 
-static inline __be32 *
+static __be32 *
 encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
@@ -66,7 +61,7 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
  * no slashes or null bytes.
  */
-static inline __be32 *
+static __be32 *
 decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
@@ -82,7 +77,7 @@ decode_filename(__be32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 decode_pathname(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
@@ -98,7 +93,7 @@ decode_pathname(__be32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline __be32 *
+static __be32 *
 decode_sattr(__be32 *p, struct iattr *iap)
 {
 	u32	tmp, tmp1;
-- 
cgit v0.10.2


From af69c7f924b272927f9aea378f34f4548d3888d9 Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Wed, 6 Dec 2006 20:40:24 -0800
Subject: [PATCH] generic HDLC synclink config mismatch fix

Fix compile errors on mismatch between generic HDLC and synclink drivers.

Notes:

generic HDLC support for synclink drivers is *optional* so you can't just
use depend on in Kconfig

This solution is deemed the best after 7 months of review and criticism by
many developers including AKPM.  Read the threads on LKML before posting
about this solution.

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 1bd1229..74d21c1 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -75,8 +75,10 @@
 #include <pcmcia/cisreg.h>
 #include <pcmcia/ds.h>
 
-#ifdef CONFIG_HDLC_MODULE
-#define CONFIG_HDLC 1
+#if defined(CONFIG_HDLC) || (defined(CONFIG_HDLC_MODULE) && defined(CONFIG_SYNCLINK_CS_MODULE))
+#define SYNCLINK_GENERIC_HDLC 1
+#else
+#define SYNCLINK_GENERIC_HDLC 0
 #endif
 
 #define GET_USER(error,value,addr) error = get_user(value,addr)
@@ -235,7 +237,7 @@ typedef struct _mgslpc_info {
 	int dosyncppp;
 	spinlock_t netlock;
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	struct net_device *netdev;
 #endif
 
@@ -392,7 +394,7 @@ static void tx_timeout(unsigned long context);
 
 static int ioctl_common(MGSLPC_INFO *info, unsigned int cmd, unsigned long arg);
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 #define dev_to_port(D) (dev_to_hdlc(D)->priv)
 static void hdlcdev_tx_done(MGSLPC_INFO *info);
 static void hdlcdev_rx(MGSLPC_INFO *info, char *buf, int size);
@@ -1053,7 +1055,7 @@ static void tx_done(MGSLPC_INFO *info)
 		info->drop_rts_on_tx_done = 0;
 	}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount)
 		hdlcdev_tx_done(info);
 	else 
@@ -1164,7 +1166,7 @@ static void dcd_change(MGSLPC_INFO *info)
 	}
 	else
 		info->input_signal_events.dcd_down++;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount) {
 		if (info->serial_signals & SerialSignal_DCD)
 			netif_carrier_on(info->netdev);
@@ -2953,7 +2955,7 @@ static void mgslpc_add_device(MGSLPC_INFO *info)
 	printk( "SyncLink PC Card %s:IO=%04X IRQ=%d\n",
 		info->device_name, info->io_base, info->irq_level);
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	hdlcdev_init(info);
 #endif
 }
@@ -2969,7 +2971,7 @@ static void mgslpc_remove_device(MGSLPC_INFO *remove_info)
 				last->next_device = info->next_device;
 			else
 				mgslpc_device_list = info->next_device;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 			hdlcdev_exit(info);
 #endif
 			release_resources(info);
@@ -3901,7 +3903,7 @@ static int rx_get_frame(MGSLPC_INFO *info)
 				return_frame = 1;
 		}
 		framesize = 0;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		{
 			struct net_device_stats *stats = hdlc_stats(info->netdev);
 			stats->rx_errors++;
@@ -3935,7 +3937,7 @@ static int rx_get_frame(MGSLPC_INFO *info)
 				++framesize;
 			}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 			if (info->netcount)
 				hdlcdev_rx(info, buf->data, framesize);
 			else
@@ -4091,7 +4093,7 @@ static void tx_timeout(unsigned long context)
 
 	spin_unlock_irqrestore(&info->lock,flags);
 	
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount)
 		hdlcdev_tx_done(info);
 	else
@@ -4099,7 +4101,7 @@ static void tx_timeout(unsigned long context)
 		bh_transmit(info);
 }
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 
 /**
  * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.)
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index 147c30d..645187b 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -101,8 +101,10 @@
 #include <linux/hdlc.h>
 #include <linux/dma-mapping.h>
 
-#ifdef CONFIG_HDLC_MODULE
-#define CONFIG_HDLC 1
+#if defined(CONFIG_HDLC) || (defined(CONFIG_HDLC_MODULE) && defined(CONFIG_SYNCLINK_MODULE))
+#define SYNCLINK_GENERIC_HDLC 1
+#else
+#define SYNCLINK_GENERIC_HDLC 0
 #endif
 
 #define GET_USER(error,value,addr) error = get_user(value,addr)
@@ -320,7 +322,7 @@ struct mgsl_struct {
 	int dosyncppp;
 	spinlock_t netlock;
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	struct net_device *netdev;
 #endif
 };
@@ -728,7 +730,7 @@ static void usc_loopmode_send_done( struct mgsl_struct * info );
 
 static int mgsl_ioctl_common(struct mgsl_struct *info, unsigned int cmd, unsigned long arg);
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 #define dev_to_port(D) (dev_to_hdlc(D)->priv)
 static void hdlcdev_tx_done(struct mgsl_struct *info);
 static void hdlcdev_rx(struct mgsl_struct *info, char *buf, int size);
@@ -1277,7 +1279,7 @@ static void mgsl_isr_transmit_status( struct mgsl_struct *info )
 		info->drop_rts_on_tx_done = 0;
 	}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount)
 		hdlcdev_tx_done(info);
 	else 
@@ -1342,7 +1344,7 @@ static void mgsl_isr_io_pin( struct mgsl_struct *info )
 				info->input_signal_events.dcd_up++;
 			} else
 				info->input_signal_events.dcd_down++;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 			if (info->netcount) {
 				if (status & MISCSTATUS_DCD)
 					netif_carrier_on(info->netdev);
@@ -4313,7 +4315,7 @@ static void mgsl_add_device( struct mgsl_struct *info )
 		     	info->max_frame_size );
 	}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	hdlcdev_init(info);
 #endif
 
@@ -4471,7 +4473,7 @@ static void synclink_cleanup(void)
 
 	info = mgsl_device_list;
 	while(info) {
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		hdlcdev_exit(info);
 #endif
 		mgsl_release_resources(info);
@@ -6645,7 +6647,7 @@ static int mgsl_get_rx_frame(struct mgsl_struct *info)
 				return_frame = 1;
 		}
 		framesize = 0;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		{
 			struct net_device_stats *stats = hdlc_stats(info->netdev);
 			stats->rx_errors++;
@@ -6721,7 +6723,7 @@ static int mgsl_get_rx_frame(struct mgsl_struct *info)
 						*ptmp);
 			}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 			if (info->netcount)
 				hdlcdev_rx(info,info->intermediate_rxbuffer,framesize);
 			else
@@ -7625,7 +7627,7 @@ static void mgsl_tx_timeout(unsigned long context)
 
 	spin_unlock_irqrestore(&info->irq_spinlock,flags);
 	
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount)
 		hdlcdev_tx_done(info);
 	else
@@ -7701,7 +7703,7 @@ static int usc_loopmode_active( struct mgsl_struct * info)
  	return usc_InReg( info, CCSR ) & BIT7 ? 1 : 0 ;
 }
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 
 /**
  * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.)
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 83b5d37..e4730a7 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -83,8 +83,10 @@
 
 #include "linux/synclink.h"
 
-#ifdef CONFIG_HDLC_MODULE
-#define CONFIG_HDLC 1
+#if defined(CONFIG_HDLC) || (defined(CONFIG_HDLC_MODULE) && defined(CONFIG_SYNCLINK_GT_MODULE))
+#define SYNCLINK_GENERIC_HDLC 1
+#else
+#define SYNCLINK_GENERIC_HDLC 0
 #endif
 
 /*
@@ -171,7 +173,7 @@ static void set_break(struct tty_struct *tty, int break_state);
 /*
  * generic HDLC support and callbacks
  */
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 #define dev_to_port(D) (dev_to_hdlc(D)->priv)
 static void hdlcdev_tx_done(struct slgt_info *info);
 static void hdlcdev_rx(struct slgt_info *info, char *buf, int size);
@@ -359,7 +361,7 @@ struct slgt_info {
 	int netcount;
 	int dosyncppp;
 	spinlock_t netlock;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	struct net_device *netdev;
 #endif
 
@@ -1354,7 +1356,7 @@ static void set_break(struct tty_struct *tty, int break_state)
 	spin_unlock_irqrestore(&info->lock,flags);
 }
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 
 /**
  * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.)
@@ -2002,7 +2004,7 @@ static void dcd_change(struct slgt_info *info)
 	} else {
 		info->input_signal_events.dcd_down++;
 	}
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount) {
 		if (info->signals & SerialSignal_DCD)
 			netif_carrier_on(info->netdev);
@@ -2180,7 +2182,7 @@ static void isr_txeom(struct slgt_info *info, unsigned short status)
 			set_signals(info);
 		}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		if (info->netcount)
 			hdlcdev_tx_done(info);
 		else
@@ -3306,7 +3308,7 @@ static void add_device(struct slgt_info *info)
 		devstr, info->device_name, info->phys_reg_addr,
 		info->irq_level, info->max_frame_size);
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	hdlcdev_init(info);
 #endif
 }
@@ -3488,7 +3490,7 @@ static void slgt_cleanup(void)
 	/* release devices */
 	info = slgt_device_list;
 	while(info) {
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		hdlcdev_exit(info);
 #endif
 		free_dma_bufs(info);
@@ -4434,7 +4436,7 @@ check_again:
 			framesize = 0;
 	}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (framesize == 0) {
 		struct net_device_stats *stats = hdlc_stats(info->netdev);
 		stats->rx_errors++;
@@ -4477,7 +4479,7 @@ check_again:
 				framesize++;
 			}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 			if (info->netcount)
 				hdlcdev_rx(info,info->tmp_rbuf, framesize);
 			else
@@ -4780,7 +4782,7 @@ static void tx_timeout(unsigned long context)
 	info->tx_count = 0;
 	spin_unlock_irqrestore(&info->lock,flags);
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount)
 		hdlcdev_tx_done(info);
 	else
diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index 13a5724..20a96ef 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -67,8 +67,10 @@
 #include <linux/workqueue.h>
 #include <linux/hdlc.h>
 
-#ifdef CONFIG_HDLC_MODULE
-#define CONFIG_HDLC 1
+#if defined(CONFIG_HDLC) || (defined(CONFIG_HDLC_MODULE) && defined(CONFIG_SYNCLINKMP_MODULE))
+#define SYNCLINK_GENERIC_HDLC 1
+#else
+#define SYNCLINK_GENERIC_HDLC 0
 #endif
 
 #define GET_USER(error,value,addr) error = get_user(value,addr)
@@ -280,7 +282,7 @@ typedef struct _synclinkmp_info {
 	int dosyncppp;
 	spinlock_t netlock;
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	struct net_device *netdev;
 #endif
 
@@ -536,7 +538,7 @@ static void throttle(struct tty_struct * tty);
 static void unthrottle(struct tty_struct * tty);
 static void set_break(struct tty_struct *tty, int break_state);
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 #define dev_to_port(D) (dev_to_hdlc(D)->priv)
 static void hdlcdev_tx_done(SLMP_INFO *info);
 static void hdlcdev_rx(SLMP_INFO *info, char *buf, int size);
@@ -1607,7 +1609,7 @@ static void set_break(struct tty_struct *tty, int break_state)
 	spin_unlock_irqrestore(&info->lock,flags);
 }
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 
 /**
  * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.)
@@ -2339,7 +2341,7 @@ static void isr_txeom(SLMP_INFO * info, unsigned char status)
 			set_signals(info);
 		}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		if (info->netcount)
 			hdlcdev_tx_done(info);
 		else
@@ -2523,7 +2525,7 @@ void isr_io_pin( SLMP_INFO *info, u16 status )
 				info->input_signal_events.dcd_up++;
 			} else
 				info->input_signal_events.dcd_down++;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 			if (info->netcount) {
 				if (status & SerialSignal_DCD)
 					netif_carrier_on(info->netdev);
@@ -3783,7 +3785,7 @@ void add_device(SLMP_INFO *info)
 		info->irq_level,
 		info->max_frame_size );
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	hdlcdev_init(info);
 #endif
 }
@@ -3977,7 +3979,7 @@ static void synclinkmp_cleanup(void)
 	/* release devices */
 	info = synclinkmp_device_list;
 	while(info) {
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		hdlcdev_exit(info);
 #endif
 		free_dma_bufs(info);
@@ -4979,7 +4981,7 @@ CheckAgain:
 			info->icount.rxcrc++;
 
 		framesize = 0;
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 		{
 			struct net_device_stats *stats = hdlc_stats(info->netdev);
 			stats->rx_errors++;
@@ -5020,7 +5022,7 @@ CheckAgain:
 					index = 0;
 			}
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 			if (info->netcount)
 				hdlcdev_rx(info,info->tmp_rx_buf,framesize);
 			else
@@ -5531,7 +5533,7 @@ void tx_timeout(unsigned long context)
 
 	spin_unlock_irqrestore(&info->lock,flags);
 
-#ifdef CONFIG_HDLC
+#if SYNCLINK_GENERIC_HDLC
 	if (info->netcount)
 		hdlcdev_tx_done(info);
 	else
-- 
cgit v0.10.2


From e384bd1692bab6a1d9aa40c849ac373a68769de2 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:25 -0800
Subject: [PATCH] MAINTAINERS: remove the non-existing sun3 list

sun3-list@redhat.com does no longer exist.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index c7d895a..e3f0831 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2913,7 +2913,6 @@ S:	Maintained
 SUN3/3X
 P:	Sam Creasey
 M:	sammy@sammy.net
-L:	sun3-list@redhat.com
 W:	http://sammy.net/sun3/
 S:	Maintained
 
-- 
cgit v0.10.2


From ccdea2f88b5689f0fd29c3804be43a3acf0311e3 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 6 Dec 2006 20:40:26 -0800
Subject: [PATCH] futex: remove unneeded barrier

When disassembling a kernel I found around over 90 sync Instructions from
mb, rmb and wmb calls in the kernel and only few of those make any sense to
me.  So here's the first one - I think the wmb() in kernel/futex.c is not
needed on uniprocessors so should become an smb_wmb().

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/futex.c b/kernel/futex.c
index d60b7f7..a8302a1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -552,7 +552,7 @@ static void wake_futex(struct futex_q *q)
 	 * at the end of wake_up_all() does not prevent this store from
 	 * moving.
 	 */
-	wmb();
+	smp_wmb();
 	q->lock_ptr = NULL;
 }
 
-- 
cgit v0.10.2


From 4b358e22064b4551aa8b4dcfe3efe70a13548676 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:28 -0800
Subject: [PATCH] cleanup include/asm-generic/atomic.h

cleanup asm-generic/atomic.h

 - no longer a userspace header
 - remove the unneeded #include <asm/types.h>
 - #else/#endif comments

[akpm@osdl.org: fix arm build]
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 3c06be3..fa14f8c 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -1,4 +1,3 @@
-header-y += atomic.h
 header-y += errno-base.h
 header-y += errno.h
 header-y += fcntl.h
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 42a95d9..b7e4a04 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -66,7 +66,7 @@ static inline void atomic_long_sub(long i, atomic_long_t *l)
 	atomic64_sub(i, v);
 }
 
-#else
+#else  /*  BITS_PER_LONG == 64  */
 
 typedef atomic_t atomic_long_t;
 
@@ -113,5 +113,6 @@ static inline void atomic_long_sub(long i, atomic_long_t *l)
 	atomic_sub(i, v);
 }
 
-#endif
-#endif
+#endif  /*  BITS_PER_LONG == 64  */
+
+#endif  /*  _ASM_GENERIC_ATOMIC_H  */
-- 
cgit v0.10.2


From 9828673d7d24a09fcc3931b3b84c539146979224 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:29 -0800
Subject: [PATCH] paride: remove parport #ifdef's

CONFIG_PARIDE depends on CONFIG_PARPORT_PC, so there's no reason for
these #ifdef's.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index da6b5d2..ad12452 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -31,10 +31,7 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <asm/io.h>
-
-#if defined(CONFIG_PARPORT_MODULE)||defined(CONFIG_PARPORT)
 #include <linux/parport.h>
-#endif
 
 #include "ppc6lnx.c"
 #include "paride.h"
@@ -139,11 +136,6 @@ static int bpck6_test_port ( PIA *pi )   /* check for 8-bit port */
 	PPCSTRUCT(pi)->ppc_id=pi->unit;
 	PPCSTRUCT(pi)->lpt_addr=pi->port;
 
-#ifdef CONFIG_PARPORT_PC_MODULE
-#define CONFIG_PARPORT_PC
-#endif
-
-#ifdef CONFIG_PARPORT_PC
 	/* look at the parport device to see if what modes we can use */
 	if(((struct pardevice *)(pi->pardev))->port->modes & 
 		(PARPORT_MODE_EPP)
@@ -161,11 +153,6 @@ static int bpck6_test_port ( PIA *pi )   /* check for 8-bit port */
 	{
 		return 1;
 	}
-#else
-	/* there is no way of knowing what kind of port we have
-	   default to the highest mode possible */
-	return 5;
-#endif
 }
 
 static int bpck6_probe_unit ( PIA *pi )
diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c
index e4c55e0..48c50f1 100644
--- a/drivers/block/paride/paride.c
+++ b/drivers/block/paride/paride.c
@@ -29,14 +29,7 @@
 #include <linux/spinlock.h>
 #include <linux/wait.h>
 #include <linux/sched.h>	/* TASK_* */
-
-#ifdef CONFIG_PARPORT_MODULE
-#define CONFIG_PARPORT
-#endif
-
-#ifdef CONFIG_PARPORT
 #include <linux/parport.h>
-#endif
 
 #include "paride.h"
 
@@ -76,8 +69,6 @@ void pi_read_block(PIA * pi, char *buf, int count)
 
 EXPORT_SYMBOL(pi_read_block);
 
-#ifdef CONFIG_PARPORT
-
 static void pi_wake_up(void *p)
 {
 	PIA *pi = (PIA *) p;
@@ -100,11 +91,8 @@ static void pi_wake_up(void *p)
 		cont();
 }
 
-#endif
-
 int pi_schedule_claimed(PIA * pi, void (*cont) (void))
 {
-#ifdef CONFIG_PARPORT
 	unsigned long flags;
 
 	spin_lock_irqsave(&pi_spinlock, flags);
@@ -115,7 +103,6 @@ int pi_schedule_claimed(PIA * pi, void (*cont) (void))
 	}
 	pi->claimed = 1;
 	spin_unlock_irqrestore(&pi_spinlock, flags);
-#endif
 	return 1;
 }
 EXPORT_SYMBOL(pi_schedule_claimed);
@@ -133,20 +120,16 @@ static void pi_claim(PIA * pi)
 	if (pi->claimed)
 		return;
 	pi->claimed = 1;
-#ifdef CONFIG_PARPORT
 	if (pi->pardev)
 		wait_event(pi->parq,
 			   !parport_claim((struct pardevice *) pi->pardev));
-#endif
 }
 
 static void pi_unclaim(PIA * pi)
 {
 	pi->claimed = 0;
-#ifdef CONFIG_PARPORT
 	if (pi->pardev)
 		parport_release((struct pardevice *) (pi->pardev));
-#endif
 }
 
 void pi_connect(PIA * pi)
@@ -167,21 +150,15 @@ EXPORT_SYMBOL(pi_disconnect);
 
 static void pi_unregister_parport(PIA * pi)
 {
-#ifdef CONFIG_PARPORT
 	if (pi->pardev) {
 		parport_unregister_device((struct pardevice *) (pi->pardev));
 		pi->pardev = NULL;
 	}
-#endif
 }
 
 void pi_release(PIA * pi)
 {
 	pi_unregister_parport(pi);
-#ifndef CONFIG_PARPORT
-	if (pi->reserved)
-		release_region(pi->port, pi->reserved);
-#endif				/* !CONFIG_PARPORT */
 	if (pi->proto->release_proto)
 		pi->proto->release_proto(pi);
 	module_put(pi->proto->owner);
@@ -269,8 +246,6 @@ EXPORT_SYMBOL(paride_unregister);
 
 static int pi_register_parport(PIA * pi, int verbose)
 {
-#ifdef CONFIG_PARPORT
-
 	struct parport *port;
 
 	port = parport_find_base(pi->port);
@@ -290,7 +265,6 @@ static int pi_register_parport(PIA * pi, int verbose)
 		printk("%s: 0x%x is %s\n", pi->device, pi->port, port->name);
 
 	pi->parname = (char *) port->name;
-#endif
 
 	return 1;
 }
@@ -447,13 +421,6 @@ int pi_init(PIA * pi, int autoprobe, int port, int mode,
 			printk("%s: Adapter not found\n", device);
 		return 0;
 	}
-#ifndef CONFIG_PARPORT
-	if (!request_region(pi->port, pi->reserved, pi->device)) {
-		printk(KERN_WARNING "paride: Unable to request region 0x%x\n",
-		       pi->port);
-		return 0;
-	}
-#endif				/* !CONFIG_PARPORT */
 
 	if (pi->parname)
 		printk("%s: Sharing %s at 0x%x\n", pi->device,
-- 
cgit v0.10.2


From 4d2d3cec1da18123b301270381eb748e5ba4f638 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:31 -0800
Subject: [PATCH] remove drivers/block/paride/jumbo

Let's remove this pre-historic paride building script.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/paride/jumbo b/drivers/block/paride/jumbo
deleted file mode 100644
index e793b9c..0000000
--- a/drivers/block/paride/jumbo
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/sh
-#
-# This script can be used to build "jumbo" modules that contain the
-# base PARIDE support, one protocol module and one high-level driver.
-#
-echo -n "High level driver [pcd] : "
-read X
-HLD=${X:-pcd}
-#
-echo -n "Protocol module [bpck] : "
-read X
-PROTO=${X:-bpck}
-#
-echo -n "Use MODVERSIONS [y] ? "
-read X
-UMODV=${X:-y}
-#
-echo -n "For SMP kernel [n] ? "
-read X
-USMP=${X:-n}
-#
-echo -n "Support PARPORT [n] ? "
-read X
-UPARP=${X:-n}
-#
-echo
-#
-case $USMP in
-	y* | Y* ) FSMP="-DCONFIG_SMP"
-		  ;;
-	*)	  FSMP=""
-		  ;;
-esac
-#
-MODI="-include ../../../include/linux/modversions.h"
-#
-case $UMODV in
-	y* | Y* ) FMODV="-DMODVERSIONS $MODI"
-		  ;;
-	*)	  FMODV=""
-		  ;;
-esac
-#
-case $UPARP in
-	y* | Y* ) FPARP="-DCONFIG_PARPORT"
-		  ;;
-	*)	  FPARP=""
-		  ;;
-esac
-#
-TARG=$HLD-$PROTO.o
-FPROTO=-DCONFIG_PARIDE_`echo "$PROTO" | tr [a-z] [A-Z]`
-FK="-D__KERNEL__ -I ../../../include"
-FLCH=-D_LINUX_CONFIG_H
-#
-echo cc $FK $FSMP $FLCH $FPARP $FPROTO $FMODV -Wall -O2 -o Jb.o -c paride.c
-cc $FK $FSMP $FLCH $FPARP $FPROTO $FMODV -Wall -O2 -o Jb.o -c paride.c
-#
-echo cc $FK $FSMP $FMODV -Wall -O2 -o Jp.o -c $PROTO.c
-cc $FK $FSMP $FMODV -Wall -O2 -o Jp.o -c $PROTO.c
-#
-echo cc $FK $FSMP $FMODV -DMODULE -DPARIDE_JUMBO -Wall -O2 -o Jd.o -c $HLD.c
-cc $FK $FSMP $FMODV -DMODULE -DPARIDE_JUMBO -Wall -O2 -o Jd.o -c $HLD.c
-#
-echo ld -r -o $TARG Jp.o Jb.o Jd.o
-ld -r -o $TARG Jp.o Jb.o Jd.o
-#
-#
-rm Jp.o Jb.o Jd.o
-#
-- 
cgit v0.10.2


From b593e48d2bf09c40c0f5165f2f2a10cc3983ea51 Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Wed, 6 Dec 2006 20:40:32 -0800
Subject: [PATCH] affs: replace kmalloc+memset with kzalloc

Replace kmalloc+memset with kzalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index b0b9536..b330009 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -289,12 +289,11 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
 	sbi->s_bmap_count = (sbi->s_partition_size - sbi->s_reserved +
 				 sbi->s_bmap_bits - 1) / sbi->s_bmap_bits;
 	size = sbi->s_bmap_count * sizeof(*bm);
-	bm = sbi->s_bitmap = kmalloc(size, GFP_KERNEL);
+	bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL);
 	if (!sbi->s_bitmap) {
 		printk(KERN_ERR "AFFS: Bitmap allocation failed\n");
 		return -ENOMEM;
 	}
-	memset(sbi->s_bitmap, 0, size);
 
 	bmap_blk = (__be32 *)sbi->s_root_bh->b_data;
 	blk = sb->s_blocksize / 4 - 49;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 22afaae..44aff81 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -55,13 +55,12 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
 	_enter("%p,%08x,", cell, ntohl(addr->s_addr));
 
 	/* allocate and initialise a server record */
-	server = kmalloc(sizeof(struct afs_server), GFP_KERNEL);
+	server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
 	if (!server) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
 
-	memset(server, 0, sizeof(struct afs_server));
 	atomic_set(&server->usage, 1);
 
 	INIT_LIST_HEAD(&server->link);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 9a351c4..18d9b77 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -242,14 +242,12 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
 	kenter("");
 
 	/* allocate a superblock info record */
-	as = kmalloc(sizeof(struct afs_super_info), GFP_KERNEL);
+	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
 	if (!as) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
 
-	memset(as, 0, sizeof(struct afs_super_info));
-
 	afs_get_volume(params->volume);
 	as->volume = params->volume;
 
-- 
cgit v0.10.2


From a0172a6dd906cf813cd56aeb133a85abf9a36c8e Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Wed, 6 Dec 2006 20:40:33 -0800
Subject: [PATCH] arm26: replace kmalloc+memset with kzalloc

Replace kmalloc+memset with kzalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
CC: Ian Molton <spyro@f2s.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/arm26/kernel/ecard.c b/arch/arm26/kernel/ecard.c
index 047d0a4..43dd41b 100644
--- a/arch/arm26/kernel/ecard.c
+++ b/arch/arm26/kernel/ecard.c
@@ -620,12 +620,10 @@ ecard_probe(int slot, card_type_t type)
 	struct ex_ecid cid;
 	int i, rc = -ENOMEM;
 
-	ec = kmalloc(sizeof(ecard_t), GFP_KERNEL);
+	ec = kzalloc(sizeof(ecard_t), GFP_KERNEL);
 	if (!ec)
 		goto nomem;
 
-	memset(ec, 0, sizeof(ecard_t));
-
 	ec->slot_no	= slot;
 	ec->type        = type;
 	ec->irq		= NO_IRQ;
-- 
cgit v0.10.2


From 4a08a9f68168e547c2baf100020e9b96cae5fbd1 Mon Sep 17 00:00:00 2001
From: Yan Burman <burman.yan@gmail.com>
Date: Wed, 6 Dec 2006 20:40:34 -0800
Subject: [PATCH] jffs: replace kmalloc+memset with kzalloc

Replace kmalloc+memset with kzalloc

Signed-off-by: Yan Burman <burman.yan@gmail.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 478a74e..d0e783f 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -592,7 +592,7 @@ jffs_add_virtual_root(struct jffs_control *c)
 	D2(printk("jffs_add_virtual_root(): "
 		  "Creating a virtual root directory.\n"));
 
-	if (!(root = kmalloc(sizeof(struct jffs_file), GFP_KERNEL))) {
+	if (!(root = kzalloc(sizeof(struct jffs_file), GFP_KERNEL))) {
 		return -ENOMEM;
 	}
 	no_jffs_file++;
@@ -604,7 +604,6 @@ jffs_add_virtual_root(struct jffs_control *c)
 	DJM(no_jffs_node++);
 	memset(node, 0, sizeof(struct jffs_node));
 	node->ino = JFFS_MIN_INO;
-	memset(root, 0, sizeof(struct jffs_file));
 	root->ino = JFFS_MIN_INO;
 	root->mode = S_IFDIR | S_IRWXU | S_IRGRP
 		     | S_IXGRP | S_IROTH | S_IXOTH;
-- 
cgit v0.10.2


From 15ad7cdcfd76450d4beebc789ec646664238184d Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Wed, 6 Dec 2006 20:40:36 -0800
Subject: [PATCH] struct seq_operations and struct file_operations
 constification

 - move some file_operations structs into the .rodata section

 - move static strings from policy_types[] array into the .rodata section

 - fix generic seq_operations usages, so that those structs may be defined
   as "const" as well

[akpm@osdl.org: couple of fixes]
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555b9ac..10690aa 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -26,7 +26,7 @@
  *	ERR_PTR(error).  In the end of sequence they return %NULL. ->show()
  *	returns 0 in case of success and negative number in case of error.
  */
-int seq_open(struct file *file, struct seq_operations *op)
+int seq_open(struct file *file, const struct seq_operations *op)
 {
 	struct seq_file *p = file->private_data;
 
@@ -408,7 +408,7 @@ EXPORT_SYMBOL(single_open);
 
 int single_release(struct inode *inode, struct file *file)
 {
-	struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
+	const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
 	int res = seq_release(inode, file);
 	kfree(op);
 	return res;
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 748d2c9..8821e1f 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -46,7 +46,7 @@ extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
 extern int cpuset_memory_pressure_enabled;
 extern void __cpuset_memory_pressure_bump(void);
 
-extern struct file_operations proc_cpuset_operations;
+extern const struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
 extern void cpuset_lock(void);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index da6002d..e339a73 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -278,7 +278,7 @@ struct zone {
 	/*
 	 * rarely used fields:
 	 */
-	char			*name;
+	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
 /*
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 0e3d91b..c6a48bf 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -274,7 +274,7 @@ static inline void subbuf_start_reserve(struct rchan_buf *buf,
 /*
  * exported relay file operations, kernel/relay.c
  */
-extern struct file_operations relay_file_operations;
+extern const struct file_operations relay_file_operations;
 
 #endif /* _LINUX_RELAY_H */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3a76724..dede82c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -573,7 +573,7 @@ struct sched_info {
 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
 
 #ifdef CONFIG_SCHEDSTATS
-extern struct file_operations proc_schedstat_operations;
+extern const struct file_operations proc_schedstat_operations;
 #endif /* CONFIG_SCHEDSTATS */
 
 #ifdef CONFIG_TASK_DELAY_ACCT
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index b95f6eb..3e3cccb 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -20,7 +20,7 @@ struct seq_file {
 	loff_t index;
 	loff_t version;
 	struct mutex lock;
-	struct seq_operations *op;
+	const struct seq_operations *op;
 	void *private;
 };
 
@@ -31,7 +31,7 @@ struct seq_operations {
 	int (*show) (struct seq_file *m, void *v);
 };
 
-int seq_open(struct file *, struct seq_operations *);
+int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
 int seq_release(struct inode *, struct file *);
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e3197..8fa1fb2 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
 	return count;
 }
 
-static struct file_operations ikconfig_file_ops = {
+static const struct file_operations ikconfig_file_ops = {
 	.owner = THIS_MODULE,
 	.read = ikconfig_read_current,
 };
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9b62b4c..4ef1d29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1531,7 +1531,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
 
-static struct file_operations cpuset_file_operations = {
+static const struct file_operations cpuset_file_operations = {
 	.read = cpuset_file_read,
 	.write = cpuset_file_write,
 	.llseek = generic_file_llseek,
@@ -2605,7 +2605,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cpuset_show, pid);
 }
 
-struct file_operations proc_cpuset_operations = {
+const struct file_operations proc_cpuset_operations = {
 	.open		= cpuset_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644..937b13c 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_dma_show, NULL);
 }
 
-static struct file_operations proc_dma_operations = {
+static const struct file_operations proc_dma_operations = {
 	.open		= proc_dma_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/futex.c b/kernel/futex.c
index a8302a1..95989a3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1492,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
 	return ret;
 }
 
-static struct file_operations futex_fops = {
+static const struct file_operations futex_fops = {
 	.release	= futex_close,
 	.poll		= futex_poll,
 };
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 54befe3..ab63cfc 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -398,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static struct seq_operations kallsyms_op = {
+static const struct seq_operations kallsyms_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -433,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
 	return seq_release(inode, file);
 }
 
-static struct file_operations kallsyms_operations = {
+static const struct file_operations kallsyms_operations = {
 	.open = kallsyms_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72ea..b554b40 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations lockdep_ops = {
+static const struct seq_operations lockdep_ops = {
 	.start	= l_start,
 	.next	= l_next,
 	.stop	= l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-static struct file_operations proc_lockdep_operations = {
+static const struct file_operations proc_lockdep_operations = {
 	.open		= lockdep_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
 	return single_open(file, lockdep_stats_show, NULL);
 }
 
-static struct file_operations proc_lockdep_stats_operations = {
+static const struct file_operations proc_lockdep_stats_operations = {
 	.open		= lockdep_stats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index e2d09d6..d9eae45 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2209,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p)
    Where refcount is a number or -, and deps is a comma-separated list
    of depends or -.
 */
-struct seq_operations modules_op = {
+const struct seq_operations modules_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 069732e..89443b8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -383,7 +383,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	return error;
 }
 
-static struct file_operations snapshot_fops = {
+static const struct file_operations snapshot_fops = {
 	.open = snapshot_open,
 	.release = snapshot_release,
 	.read = snapshot_read,
diff --git a/kernel/profile.c b/kernel/profile.c
index 0961d93..fb5e03d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -501,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
 	return count;
 }
 
-static struct file_operations proc_profile_operations = {
+static const struct file_operations proc_profile_operations = {
 	.read		= read_profile,
 	.write		= write_profile,
 };
diff --git a/kernel/relay.c b/kernel/relay.c
index 2b92e8e..75a3a9a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1013,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
 				       actor, &desc);
 }
 
-struct file_operations relay_file_operations = {
+const struct file_operations relay_file_operations = {
 	.open		= relay_file_open,
 	.poll		= relay_file_poll,
 	.mmap		= relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c1..7b9a497 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations resource_op = {
+static const struct seq_operations resource_op = {
 	.start	= r_start,
 	.next	= r_next,
 	.stop	= r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-static struct file_operations proc_ioports_operations = {
+static const struct file_operations proc_ioports_operations = {
 	.open		= ioports_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
 
-static struct file_operations proc_iomem_operations = {
+static const struct file_operations proc_iomem_operations = {
 	.open		= iomem_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/sched.c b/kernel/sched.c
index b43cef0..f385eff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -505,7 +505,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-struct file_operations proc_schedstat_operations = {
+const struct file_operations proc_schedstat_operations = {
 	.open    = schedstat_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6d7147c..758dbbf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -170,7 +170,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
 static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
 static int proc_opensys(struct inode *, struct file *);
 
-struct file_operations proc_sys_file_operations = {
+const struct file_operations proc_sys_file_operations = {
 	.open		= proc_opensys,
 	.read		= proc_readsys,
 	.write		= proc_writesys,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ad864f8..b917d6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1707,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  * Display pages allocated per node and memory policy via /proc.
  */
 
-static const char *policy_types[] = { "default", "prefer", "bind",
-				      "interleave" };
+static const char * const policy_types[] =
+	{ "default", "prefer", "bind", "interleave" };
 
 /*
  * Convert a mempolicy into a string.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27ec7a1..cace22b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,7 +83,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 
 EXPORT_SYMBOL(totalram_pages);
 
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
 	 "DMA",
 #ifdef CONFIG_ZONE_DMA32
 	 "DMA32",
diff --git a/mm/shmem.c b/mm/shmem.c
index 0076536..c820b4f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 
 static struct super_operations shmem_ops;
 static const struct address_space_operations shmem_aops;
-static struct file_operations shmem_file_operations;
+static const struct file_operations shmem_file_operations;
 static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
 static struct inode_operations shmem_special_inode_operations;
@@ -2319,7 +2319,7 @@ static const struct address_space_operations shmem_aops = {
 	.migratepage	= migrate_page,
 };
 
-static struct file_operations shmem_file_operations = {
+static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
 	.llseek		= generic_file_llseek,
diff --git a/mm/slab.c b/mm/slab.c
index 86f5d6e..068cb45 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4142,7 +4142,7 @@ static int s_show(struct seq_file *m, void *p)
  * + further values on SMP and with statistics enabled
  */
 
-struct seq_operations slabinfo_op = {
+const struct seq_operations slabinfo_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -4340,7 +4340,7 @@ static int leaks_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-struct seq_operations slabstats_op = {
+const struct seq_operations slabstats_op = {
 	.start = leaks_start,
 	.next = s_next,
 	.stop = s_stop,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5524236..c543107 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1368,7 +1368,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	return 0;
 }
 
-static struct seq_operations swaps_op = {
+static const struct seq_operations swaps_op = {
 	.start =	swap_start,
 	.next =		swap_next,
 	.stop =		swap_stop,
@@ -1380,7 +1380,7 @@ static int swaps_open(struct inode *inode, struct file *file)
 	return seq_open(file, &swaps_op);
 }
 
-static struct file_operations proc_swaps_operations = {
+static const struct file_operations proc_swaps_operations = {
 	.open		= swaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ef41768..dc005a0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-struct seq_operations fragmentation_op = {
+const struct seq_operations fragmentation_op = {
 	.start	= frag_start,
 	.next	= frag_next,
 	.stop	= frag_stop,
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = {
 #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
 					TEXT_FOR_HIGHMEM(xx)
 
-static char *vmstat_text[] = {
+static const char * const vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_anon_pages",
 	"nr_mapped",
@@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-struct seq_operations zoneinfo_op = {
+const struct seq_operations zoneinfo_op = {
 	.start	= frag_start, /* iterate over all zones. The same as in
 			       * fragmentation. */
 	.next	= frag_next,
@@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg)
 	m->private = NULL;
 }
 
-struct seq_operations vmstat_op = {
+const struct seq_operations vmstat_op = {
 	.start	= vmstat_start,
 	.next	= vmstat_next,
 	.stop	= vmstat_stop,
-- 
cgit v0.10.2


From f46ba2235feab5e686b1234c328a0577cde86e21 Mon Sep 17 00:00:00 2001
From: Jun Chen <jimcgnu@yahoo.com>
Date: Wed, 6 Dec 2006 20:40:37 -0800
Subject: [PATCH] fs: make nls_cp936.c handle some U00XY characters and U20AC
 correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Twenty characters in cp936 are not correctly handled.  They're all in the
U00 plane.  nls_cp936 converts all U00XY to XY but this is not correct for
some characters.(e.g.  U00B7 -> A1A4, U00A8 -> A1A7).

This problem is fixed by generating u2c_00 based on all c2u_xx and changing
uni2char() to give U00 plane a special handling.  The "â¬"(U20AC,80 in
cp936) is also be handled properly.

Acked-by: Gang Chen <cgdlut@gmail.com>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c
index 046fde8..65e640c 100644
--- a/fs/nls/nls_cp936.c
+++ b/fs/nls/nls_cp936.c
@@ -4421,6 +4421,73 @@ static wchar_t *page_charset2uni[256] = {
 	c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, c2u_FE, NULL,   
 };
 
+static unsigned char u2c_00[512] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
+	0xA1, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, /* 0xA4-0xA7 */
+	0xA1, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
+	0xA1, 0xE3, 0xA1, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA4, /* 0xB4-0xB7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC1, /* 0xD4-0xD7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
+	0xA8, 0xA4, 0xA8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
+	0xA8, 0xA8, 0xA8, 0xA6, 0xA8, 0xBA, 0x00, 0x00, /* 0xE8-0xEB */
+	0xA8, 0xAC, 0xA8, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
+	0x00, 0x00, 0x00, 0x00, 0xA8, 0xB0, 0xA8, 0xAE, /* 0xF0-0xF3 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC2, /* 0xF4-0xF7 */
+	0x00, 0x00, 0xA8, 0xB4, 0xA8, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */
+	0xA8, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
+};
+
 static unsigned char u2c_01[512] = {
 	0xA8, 0xA1, 0xA8, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
@@ -10825,7 +10892,7 @@ static unsigned char u2c_FF[512] = {
 };
 
 static unsigned char *page_uni2charset[256] = {
-	NULL,   u2c_01, u2c_02, u2c_03, u2c_04, NULL,   NULL,   NULL,   
+	u2c_00, u2c_01, u2c_02, u2c_03, u2c_04, NULL,   NULL,   NULL,
 	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
 	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
 	NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
@@ -10936,11 +11003,34 @@ static int uni2char(const wchar_t uni,
 	unsigned char *uni2charset;
 	unsigned char cl = uni&0xFF;
 	unsigned char ch = (uni>>8)&0xFF;
-	int n;
+	unsigned char out0,out1;
 
 	if (boundlen <= 0)
 		return -ENAMETOOLONG;
 
+	if (uni == 0x20ac) {/* Euro symbol.The only exception with a non-ascii unicode */
+		out[0] = 0x80;
+		return 1;
+	}
+
+	if (ch == 0) { /* handle the U00 plane*/
+		/* if (cl == 0) return -EINVAL;*/ /*U0000 is legal in cp936*/
+		out0 = u2c_00[cl*2];
+		out1 = u2c_00[cl*2+1];
+		if (out0 == 0x00 && out1 == 0x00) {
+			if (cl<0x80) {
+				out[0] = cl;
+				return 1;
+			}
+			return -EINVAL;
+		} else {
+			if (boundlen <= 1)
+				return -ENAMETOOLONG;
+			out[0] = out0;
+			out[1] = out1;
+			return 2;
+		}
+	}
 
 	uni2charset = page_uni2charset[ch];
 	if (uni2charset) {
@@ -10950,15 +11040,10 @@ static int uni2char(const wchar_t uni,
 		out[1] = uni2charset[cl*2+1];
 		if (out[0] == 0x00 && out[1] == 0x00)
 			return -EINVAL;
-		n = 2;
-	} else if (ch==0 && cl) {
-		out[0] = cl;
-		n = 1;
+		return 2;
 	}
 	else
 		return -EINVAL;
-
-	return n;
 }
 
 static int char2uni(const unsigned char *rawstring, int boundlen,
@@ -10972,7 +11057,11 @@ static int char2uni(const unsigned char *rawstring, int boundlen,
 		return -ENAMETOOLONG;
 
 	if (boundlen == 1) {
-		*uni = rawstring[0];
+		if (rawstring[0]==0x80) { /* Euro symbol.The only exception with a non-ascii unicode */
+			*uni = 0x20ac;
+		} else {
+			*uni = rawstring[0];
+		}
 		return 1;
 	}
 
@@ -10986,7 +11075,11 @@ static int char2uni(const unsigned char *rawstring, int boundlen,
 			return -EINVAL;
 		n = 2;
 	} else{
-		*uni = ch;
+		if (ch==0x80) {/* Euro symbol.The only exception with a non-ascii unicode */
+			*uni = 0x20ac;
+		} else {
+			*uni = ch;
+		}
 		n = 1;
 	}
 	return n;
-- 
cgit v0.10.2


From 7d1362c0d05b8543807ab403ac8ce813cab41fa4 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:38 -0800
Subject: [PATCH] cleanup asm/setup.h userspace visibility

Make the contents of the userspace asm/setup.h header consistent on all
architectures:

 - export setup.h to userspace on all architectures
 - export only COMMAND_LINE_SIZE to userspace
 - frv: move COMMAND_LINE_SIZE from param.h
 - i386: remove duplicate COMMAND_LINE_SIZE from param.h
 - arm:
   - export ATAGs to userspace
   - change u8/u16/u32 to __u8/__u16/__u32

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/asm-arm/setup.h b/include/asm-arm/setup.h
index aa4b578..e540739 100644
--- a/include/asm-arm/setup.h
+++ b/include/asm-arm/setup.h
@@ -14,55 +14,57 @@
 #ifndef __ASMARM_SETUP_H
 #define __ASMARM_SETUP_H
 
+#include <asm/types.h>
+
 #define COMMAND_LINE_SIZE 1024
 
 /* The list ends with an ATAG_NONE node. */
 #define ATAG_NONE	0x00000000
 
 struct tag_header {
-	u32 size;
-	u32 tag;
+	__u32 size;
+	__u32 tag;
 };
 
 /* The list must start with an ATAG_CORE node */
 #define ATAG_CORE	0x54410001
 
 struct tag_core {
-	u32 flags;		/* bit 0 = read-only */
-	u32 pagesize;
-	u32 rootdev;
+	__u32 flags;		/* bit 0 = read-only */
+	__u32 pagesize;
+	__u32 rootdev;
 };
 
 /* it is allowed to have multiple ATAG_MEM nodes */
 #define ATAG_MEM	0x54410002
 
 struct tag_mem32 {
-	u32	size;
-	u32	start;	/* physical start address */
+	__u32	size;
+	__u32	start;	/* physical start address */
 };
 
 /* VGA text type displays */
 #define ATAG_VIDEOTEXT	0x54410003
 
 struct tag_videotext {
-	u8		x;
-	u8		y;
-	u16		video_page;
-	u8		video_mode;
-	u8		video_cols;
-	u16		video_ega_bx;
-	u8		video_lines;
-	u8		video_isvga;
-	u16		video_points;
+	__u8		x;
+	__u8		y;
+	__u16		video_page;
+	__u8		video_mode;
+	__u8		video_cols;
+	__u16		video_ega_bx;
+	__u8		video_lines;
+	__u8		video_isvga;
+	__u16		video_points;
 };
 
 /* describes how the ramdisk will be used in kernel */
 #define ATAG_RAMDISK	0x54410004
 
 struct tag_ramdisk {
-	u32 flags;	/* bit 0 = load, bit 1 = prompt */
-	u32 size;	/* decompressed ramdisk size in _kilo_ bytes */
-	u32 start;	/* starting block of floppy-based RAM disk image */
+	__u32 flags;	/* bit 0 = load, bit 1 = prompt */
+	__u32 size;	/* decompressed ramdisk size in _kilo_ bytes */
+	__u32 start;	/* starting block of floppy-based RAM disk image */
 };
 
 /* describes where the compressed ramdisk image lives (virtual address) */
@@ -76,23 +78,23 @@ struct tag_ramdisk {
 #define ATAG_INITRD2	0x54420005
 
 struct tag_initrd {
-	u32 start;	/* physical start address */
-	u32 size;	/* size of compressed ramdisk image in bytes */
+	__u32 start;	/* physical start address */
+	__u32 size;	/* size of compressed ramdisk image in bytes */
 };
 
 /* board serial number. "64 bits should be enough for everybody" */
 #define ATAG_SERIAL	0x54410006
 
 struct tag_serialnr {
-	u32 low;
-	u32 high;
+	__u32 low;
+	__u32 high;
 };
 
 /* board revision */
 #define ATAG_REVISION	0x54410007
 
 struct tag_revision {
-	u32 rev;
+	__u32 rev;
 };
 
 /* initial values for vesafb-type framebuffers. see struct screen_info
@@ -101,20 +103,20 @@ struct tag_revision {
 #define ATAG_VIDEOLFB	0x54410008
 
 struct tag_videolfb {
-	u16		lfb_width;
-	u16		lfb_height;
-	u16		lfb_depth;
-	u16		lfb_linelength;
-	u32		lfb_base;
-	u32		lfb_size;
-	u8		red_size;
-	u8		red_pos;
-	u8		green_size;
-	u8		green_pos;
-	u8		blue_size;
-	u8		blue_pos;
-	u8		rsvd_size;
-	u8		rsvd_pos;
+	__u16		lfb_width;
+	__u16		lfb_height;
+	__u16		lfb_depth;
+	__u16		lfb_linelength;
+	__u32		lfb_base;
+	__u32		lfb_size;
+	__u8		red_size;
+	__u8		red_pos;
+	__u8		green_size;
+	__u8		green_pos;
+	__u8		blue_size;
+	__u8		blue_pos;
+	__u8		rsvd_size;
+	__u8		rsvd_pos;
 };
 
 /* command line: \0 terminated string */
@@ -128,17 +130,17 @@ struct tag_cmdline {
 #define ATAG_ACORN	0x41000101
 
 struct tag_acorn {
-	u32 memc_control_reg;
-	u32 vram_pages;
-	u8 sounddefault;
-	u8 adfsdrives;
+	__u32 memc_control_reg;
+	__u32 vram_pages;
+	__u8 sounddefault;
+	__u8 adfsdrives;
 };
 
 /* footbridge memory clock, see arch/arm/mach-footbridge/arch.c */
 #define ATAG_MEMCLK	0x41000402
 
 struct tag_memclk {
-	u32 fmemclk;
+	__u32 fmemclk;
 };
 
 struct tag {
@@ -167,24 +169,26 @@ struct tag {
 };
 
 struct tagtable {
-	u32 tag;
+	__u32 tag;
 	int (*parse)(const struct tag *);
 };
 
-#define __tag __attribute_used__ __attribute__((__section__(".taglist.init")))
-#define __tagtable(tag, fn) \
-static struct tagtable __tagtable_##fn __tag = { tag, fn }
-
 #define tag_member_present(tag,member)				\
 	((unsigned long)(&((struct tag *)0L)->member + 1)	\
 		<= (tag)->hdr.size * 4)
 
-#define tag_next(t)	((struct tag *)((u32 *)(t) + (t)->hdr.size))
+#define tag_next(t)	((struct tag *)((__u32 *)(t) + (t)->hdr.size))
 #define tag_size(type)	((sizeof(struct tag_header) + sizeof(struct type)) >> 2)
 
 #define for_each_tag(t,base)		\
 	for (t = base; t->hdr.size; t = tag_next(t))
 
+#ifdef __KERNEL__
+
+#define __tag __attribute_used__ __attribute__((__section__(".taglist.init")))
+#define __tagtable(tag, fn) \
+static struct tagtable __tagtable_##fn __tag = { tag, fn }
+
 /*
  * Memory map description
  */
@@ -217,4 +221,6 @@ struct early_params {
 static struct early_params __early_##fn __attribute_used__	\
 __attribute__((__section__(".early_param.init"))) = { name, fn }
 
+#endif  /*  __KERNEL__  */
+
 #endif
diff --git a/include/asm-arm26/setup.h b/include/asm-arm26/setup.h
index 6348931..1a867b4 100644
--- a/include/asm-arm26/setup.h
+++ b/include/asm-arm26/setup.h
@@ -16,6 +16,8 @@
 
 #define COMMAND_LINE_SIZE 1024
 
+#ifdef __KERNEL__
+
 /* The list ends with an ATAG_NONE node. */
 #define ATAG_NONE	0x00000000
 
@@ -202,4 +204,6 @@ struct meminfo {
 
 extern struct meminfo meminfo;
 
+#endif  /*  __KERNEL__  */
+
 #endif
diff --git a/include/asm-avr32/setup.h b/include/asm-avr32/setup.h
index 10193da..0a52242 100644
--- a/include/asm-avr32/setup.h
+++ b/include/asm-avr32/setup.h
@@ -13,6 +13,8 @@
 
 #define COMMAND_LINE_SIZE 256
 
+#ifdef __KERNEL__
+
 /* Magic number indicating that a tag table is present */
 #define ATAG_MAGIC	0xa2a25441
 
@@ -138,4 +140,6 @@ void chip_enable_sdram(void);
 
 #endif /* !__ASSEMBLY__ */
 
+#endif  /*  __KERNEL__  */
+
 #endif /* __ASM_AVR32_SETUP_H__ */
diff --git a/include/asm-frv/param.h b/include/asm-frv/param.h
index 168381e..365653b 100644
--- a/include/asm-frv/param.h
+++ b/include/asm-frv/param.h
@@ -18,6 +18,5 @@
 #endif
 
 #define MAXHOSTNAMELEN		64	/* max length of hostname */
-#define COMMAND_LINE_SIZE	512
 
 #endif /* _ASM_PARAM_H */
diff --git a/include/asm-frv/setup.h b/include/asm-frv/setup.h
index 0d293b9..afd787c 100644
--- a/include/asm-frv/setup.h
+++ b/include/asm-frv/setup.h
@@ -12,6 +12,10 @@
 #ifndef _ASM_SETUP_H
 #define _ASM_SETUP_H
 
+#define COMMAND_LINE_SIZE       512
+
+#ifdef __KERNEL__
+
 #include <linux/init.h>
 
 #ifndef __ASSEMBLY__
@@ -22,4 +26,6 @@ extern unsigned long __initdata num_mappedpages;
 
 #endif /* !__ASSEMBLY__ */
 
+#endif  /*  __KERNEL__  */
+
 #endif /* _ASM_SETUP_H */
diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm
index a84c3d8..a37e95f 100644
--- a/include/asm-generic/Kbuild.asm
+++ b/include/asm-generic/Kbuild.asm
@@ -14,6 +14,7 @@ unifdef-y += posix_types.h
 unifdef-y += ptrace.h
 unifdef-y += resource.h
 unifdef-y += sembuf.h
+unifdef-y += setup.h
 unifdef-y += shmbuf.h
 unifdef-y += sigcontext.h
 unifdef-y += siginfo.h
diff --git a/include/asm-i386/Kbuild b/include/asm-i386/Kbuild
index 147e4ac..5ae93af 100644
--- a/include/asm-i386/Kbuild
+++ b/include/asm-i386/Kbuild
@@ -7,5 +7,4 @@ header-y += ptrace-abi.h
 header-y += ucontext.h
 
 unifdef-y += mtrr.h
-unifdef-y += setup.h
 unifdef-y += vm86.h
diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h
index 745dc5b..21b3246 100644
--- a/include/asm-i386/param.h
+++ b/include/asm-i386/param.h
@@ -18,6 +18,5 @@
 #endif
 
 #define MAXHOSTNAMELEN	64	/* max length of hostname */
-#define COMMAND_LINE_SIZE 256
 
 #endif
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 2734909..c5b504b 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -6,6 +6,8 @@
 #ifndef _i386_SETUP_H
 #define _i386_SETUP_H
 
+#define COMMAND_LINE_SIZE 256
+
 #ifdef __KERNEL__
 #include <linux/pfn.h>
 
@@ -14,10 +16,8 @@
  */
 #define MAXMEM_PFN	PFN_DOWN(MAXMEM)
 #define MAX_NONPAE_PFN	(1 << 20)
-#endif
 
 #define PARAM_SIZE 4096
-#define COMMAND_LINE_SIZE 256
 
 #define OLD_CL_MAGIC_ADDR	0x90020
 #define OLD_CL_MAGIC		0xA33F
@@ -78,4 +78,6 @@ void __init add_memory_region(unsigned long long start,
 
 #endif /* __ASSEMBLY__ */
 
+#endif  /*  __KERNEL__  */
+
 #endif /* _i386_SETUP_H */
diff --git a/include/asm-ia64/Kbuild b/include/asm-ia64/Kbuild
index 15818a1..4a1e48b 100644
--- a/include/asm-ia64/Kbuild
+++ b/include/asm-ia64/Kbuild
@@ -10,7 +10,6 @@ header-y += intrinsics.h
 header-y += perfmon_default_smpl.h
 header-y += ptrace_offsets.h
 header-y += rse.h
-header-y += setup.h
 header-y += ucontext.h
 
 unifdef-y += perfmon.h
diff --git a/include/asm-m32r/setup.h b/include/asm-m32r/setup.h
index 52f4fa2..6a0b322 100644
--- a/include/asm-m32r/setup.h
+++ b/include/asm-m32r/setup.h
@@ -1,6 +1,11 @@
 /*
  * This is set up by the setup-routine at boot-time
  */
+
+#define COMMAND_LINE_SIZE       512
+
+#ifdef __KERNEL__
+
 #define PARAM			((unsigned char *)empty_zero_page)
 
 #define MOUNT_ROOT_RDONLY	(*(unsigned long *) (PARAM+0x000))
@@ -18,8 +23,6 @@
 
 #define SCREEN_INFO		(*(struct screen_info *) (PARAM+0x200))
 
-#define COMMAND_LINE_SIZE	(512)
-
 #define RAMDISK_IMAGE_START_MASK	(0x07FF)
 #define RAMDISK_PROMPT_FLAG		(0x8000)
 #define RAMDISK_LOAD_FLAG		(0x4000)
@@ -27,3 +30,5 @@
 extern unsigned long memory_start;
 extern unsigned long memory_end;
 
+#endif  /*  __KERNEL__  */
+
diff --git a/include/asm-m68k/setup.h b/include/asm-m68k/setup.h
index 7facc9a..2a8853c 100644
--- a/include/asm-m68k/setup.h
+++ b/include/asm-m68k/setup.h
@@ -41,8 +41,12 @@
 #define MACH_Q40     10
 #define MACH_SUN3X   11
 
+#define COMMAND_LINE_SIZE 256
+
 #ifdef __KERNEL__
 
+#define CL_SIZE COMMAND_LINE_SIZE
+
 #ifndef __ASSEMBLY__
 extern unsigned long m68k_machtype;
 #endif /* !__ASSEMBLY__ */
@@ -355,8 +359,6 @@ extern int m68k_is040or060;
      */
 
 #define NUM_MEMINFO	4
-#define CL_SIZE		256
-#define COMMAND_LINE_SIZE	CL_SIZE
 
 #ifndef __ASSEMBLY__
 struct mem_info {
diff --git a/include/asm-m68knommu/setup.h b/include/asm-m68knommu/setup.h
index d2b0fcc..fb86bb2 100644
--- a/include/asm-m68knommu/setup.h
+++ b/include/asm-m68knommu/setup.h
@@ -1,5 +1,10 @@
+#ifdef __KERNEL__
+
 #include <asm-m68k/setup.h>
 
 /* We have a bigger command line buffer. */
 #undef COMMAND_LINE_SIZE
+
+#endif  /*  __KERNEL__  */
+
 #define COMMAND_LINE_SIZE	512
diff --git a/include/asm-mips/setup.h b/include/asm-mips/setup.h
index 737fa4a..70009a9 100644
--- a/include/asm-mips/setup.h
+++ b/include/asm-mips/setup.h
@@ -1,8 +1,6 @@
-#ifdef __KERNEL__
 #ifndef _MIPS_SETUP_H
 #define _MIPS_SETUP_H
 
 #define COMMAND_LINE_SIZE	256
 
 #endif /* __SETUP_H */
-#endif /* __KERNEL__ */
diff --git a/include/asm-powerpc/setup.h b/include/asm-powerpc/setup.h
index 3d9740a..817fac0 100644
--- a/include/asm-powerpc/setup.h
+++ b/include/asm-powerpc/setup.h
@@ -1,9 +1,6 @@
 #ifndef _ASM_POWERPC_SETUP_H
 #define _ASM_POWERPC_SETUP_H
 
-#ifdef __KERNEL__
-
 #define COMMAND_LINE_SIZE	512
 
-#endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_SETUP_H */
diff --git a/include/asm-s390/setup.h b/include/asm-s390/setup.h
index 7664bac..9574fe8 100644
--- a/include/asm-s390/setup.h
+++ b/include/asm-s390/setup.h
@@ -8,12 +8,13 @@
 #ifndef _ASM_S390_SETUP_H
 #define _ASM_S390_SETUP_H
 
+#define COMMAND_LINE_SIZE 	896
+
 #ifdef __KERNEL__
 
 #include <asm/types.h>
 
 #define PARMAREA		0x10400
-#define COMMAND_LINE_SIZE 	896
 #define MEMORY_CHUNKS		16	/* max 0x7fff */
 #define IPL_PARMBLOCK_ORIGIN	0x2000
 
diff --git a/include/asm-sh/setup.h b/include/asm-sh/setup.h
index 34ca8a7..1583c6b 100644
--- a/include/asm-sh/setup.h
+++ b/include/asm-sh/setup.h
@@ -1,10 +1,12 @@
-#ifdef __KERNEL__
 #ifndef _SH_SETUP_H
 #define _SH_SETUP_H
 
 #define COMMAND_LINE_SIZE 256
 
+#ifdef __KERNEL__
+
 int setup_early_printk(char *);
 
-#endif /* _SH_SETUP_H */
 #endif /* __KERNEL__ */
+
+#endif /* _SH_SETUP_H */
diff --git a/include/asm-sh64/setup.h b/include/asm-sh64/setup.h
index ebd42eb..5b07b14 100644
--- a/include/asm-sh64/setup.h
+++ b/include/asm-sh64/setup.h
@@ -1,6 +1,10 @@
 #ifndef __ASM_SH64_SETUP_H
 #define __ASM_SH64_SETUP_H
 
+#define COMMAND_LINE_SIZE 256
+
+#ifdef __KERNEL__
+
 #define PARAM ((unsigned char *)empty_zero_page)
 #define MOUNT_ROOT_RDONLY (*(unsigned long *) (PARAM+0x000))
 #define RAMDISK_FLAGS (*(unsigned long *) (PARAM+0x004))
@@ -12,5 +16,7 @@
 #define COMMAND_LINE ((char *) (PARAM+256))
 #define COMMAND_LINE_SIZE 256
 
+#endif  /*  __KERNEL__  */
+
 #endif /* __ASM_SH64_SETUP_H */
 
diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild
index 1ee9b07..7635213 100644
--- a/include/asm-x86_64/Kbuild
+++ b/include/asm-x86_64/Kbuild
@@ -12,7 +12,6 @@ header-y += ldt.h
 header-y += msr.h
 header-y += prctl.h
 header-y += ptrace-abi.h
-header-y += setup.h
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += vsyscall32.h
-- 
cgit v0.10.2


From 6d4df677f8a60ea6bc0ef1a596c1a3a79b1d4882 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 6 Dec 2006 20:40:39 -0800
Subject: [PATCH] do_coredump() and not stopping rewrite attacks?

On Sat, Dec 02, 2006 at 11:47:44PM +0300, Alexey Dobriyan wrote:
> David Binderman compiled 2.6.19 with icc and grepped for "was set but never
> used". Many warnings are on
> 	http://coderock.org/kj/unused-2.6.19-fs

Heh, the very first line:
fs/exec.c(1465): remark #593: variable "flag" was set but never used

fs/exec.c:
  1477		/*
  1478		 *	We cannot trust fsuid as being the "true" uid of the
  1479		 *	process nor do we know its entire history. We only know it
  1480		 *	was tainted so we dump it as root in mode 2.
  1481		 */
  1482		if (mm->dumpable == 2) {	/* Setuid core dump mode */
  1483			flag = O_EXCL;		/* Stop rewrite attacks */
  1484			current->fsuid = 0;	/* Dump root private */
  1485		}

And then filp_open follows with "flag" totally ignored.

(akpm: this restores the code to Alan's original version.  Andi's "Support
piping into commands in /proc/sys/kernel/core_pattern" (cset d025c9db) broke
it).

Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <stable@kerenl.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/exec.c b/fs/exec.c
index 2092bd2..add0e03 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1515,7 +1515,8 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		ispipe = 1;
  	} else
  		file = filp_open(corename,
-				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600);
+				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+				 0600);
 	if (IS_ERR(file))
 		goto fail_unlock;
 	inode = file->f_dentry->d_inode;
-- 
cgit v0.10.2


From 85916f8166b59eeac63d2b4f7f1df8de849334b4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Wed, 6 Dec 2006 20:40:41 -0800
Subject: [PATCH] Kexec / Kdump: Unify elf note code

The elf note saving code is currently duplicated over several
architectures.  This cleanup patch simply adds code to a common file and
then replaces the arch-specific code with calls to the newly added code.

The only drawback with this approach is that s390 doesn't fully support
kexec-on-panic which for that arch leads to introduction of unused code.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 144b432..a5e0e99 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,68 +31,6 @@
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
 
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-							       size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) +3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= NR_CPUS))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that, so there is no need to invent something new.
-	 */
-	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
-	if (!buf)
-		return;
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
-				sizeof(prstatus));
-	final_note(buf);
-}
-
-static void crash_save_self(struct pt_regs *regs)
-{
-	int cpu;
-
-	cpu = safe_smp_processor_id();
-	crash_save_this_cpu(regs, cpu);
-}
-
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 static atomic_t waiting_for_crash_ipi;
 
@@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 		crash_fixup_ss_esp(&fixed_regs, regs);
 		regs = &fixed_regs;
 	}
-	crash_save_this_cpu(regs, cpu);
+	crash_save_cpu(regs, cpu);
 	disable_local_APIC();
 	atomic_dec(&waiting_for_crash_ipi);
 	/* Assume hlt works */
@@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
 #if defined(CONFIG_X86_IO_APIC)
 	disable_IO_APIC();
 #endif
-	crash_save_self(regs);
+	crash_save_cpu(regs, safe_smp_processor_id());
 }
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 89b03c8..d3f2080 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -46,61 +46,6 @@ int crashing_cpu = -1;
 static cpumask_t cpus_in_crash = CPU_MASK_NONE;
 cpumask_t cpus_in_sr = CPU_MASK_NONE;
 
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-							       size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) +3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= NR_CPUS))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that that no need to invent something new.
-	 */
-	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
-	if (!buf) 
-		return;
-
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
-			sizeof(prstatus));
-	final_note(buf);
-}
-
 #ifdef CONFIG_SMP
 static atomic_t enter_on_soft_reset = ATOMIC_INIT(0);
 
@@ -113,7 +58,7 @@ void crash_ipi_callback(struct pt_regs *regs)
 
 	hard_irq_disable();
 	if (!cpu_isset(cpu, cpus_in_crash))
-		crash_save_this_cpu(regs, cpu);
+		crash_save_cpu(regs, cpu);
 	cpu_set(cpu, cpus_in_crash);
 
 	/*
@@ -306,7 +251,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	 * such that another IPI will not be sent.
 	 */
 	crashing_cpu = smp_processor_id();
-	crash_save_this_cpu(regs, crashing_cpu);
+	crash_save_cpu(regs, crashing_cpu);
 	crash_kexec_prepare_cpus(crashing_cpu);
 	cpu_set(crashing_cpu, cpus_in_crash);
 	if (ppc_md.kexec_cpu_down)
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 3525f88..95a7a2c 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -28,71 +28,6 @@
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
 
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type,
-						void *data, size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) +3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= NR_CPUS))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that, no need to invent something new.
-	 */
-
-	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
-
-	if (!buf)
-		return;
-
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
-					sizeof(prstatus));
-	final_note(buf);
-}
-
-static void crash_save_self(struct pt_regs *regs)
-{
-	int cpu;
-
-	cpu = smp_processor_id();
-	crash_save_this_cpu(regs, cpu);
-}
-
 #ifdef CONFIG_SMP
 static atomic_t waiting_for_crash_ipi;
 
@@ -117,7 +52,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 		return NOTIFY_STOP;
 	local_irq_disable();
 
-	crash_save_this_cpu(regs, cpu);
+	crash_save_cpu(regs, cpu);
 	disable_local_APIC();
 	atomic_dec(&waiting_for_crash_ipi);
 	/* Assume hlt works */
@@ -196,5 +131,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
 
 	disable_IO_APIC();
 
-	crash_save_self(regs);
+	crash_save_cpu(regs, smp_processor_id());
 }
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index a4ede62..e3abcec 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -105,6 +105,7 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
+void crash_save_cpu(struct pt_regs *regs, int cpu);
 extern struct kimage *kexec_image;
 extern struct kimage *kexec_crash_image;
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d43692c..afbbbe9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
 #include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1066,6 +1068,60 @@ void crash_kexec(struct pt_regs *regs)
 	}
 }
 
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
 static int __init crash_notes_memory_init(void)
 {
 	/* Allocate memory for saving cpu registers. */
-- 
cgit v0.10.2


From b62e8ec2ac580b47c11eb76e8852ac1ec7d617cd Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 6 Dec 2006 20:40:43 -0800
Subject: [PATCH] aio: kill pointless ki_nbytes assignment in
 aio_setup_single_vector

io_submit_one assigns ki_left = ki_nbytes = iocb->aio_nbytes, then calls
down to aio_setup_iocb, then to aio_setup_single_vector.  In there,
ki_nbytes is reassigned to the same value it got two call stack above it.
There is no need to do so.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Zach Brown <zach.brown@oracle.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/aio.c b/fs/aio.c
index 13aa929..f02ad26 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1413,7 +1413,6 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
 	kiocb->ki_iovec->iov_len = kiocb->ki_left;
 	kiocb->ki_nr_segs = 1;
 	kiocb->ki_cur_seg = 0;
-	kiocb->ki_nbytes = kiocb->ki_left;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 97d2a80584b30b5cd32da411deca1986ef61877a Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Wed, 6 Dec 2006 20:40:45 -0800
Subject: [PATCH] aio: remove ki_retried debugging member

Remove the ki_retried member from struct kiocb.  I think the idea was
bounced around a while back, but Arnaldo pointed out another reason that we
should dig it up when he pointed out that the last cacheline of struct
kiocb only contains 4 bytes.  By removing the debugging member, we save
more than the 8 byte on 64 bit machines.

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Acked-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/aio.c b/fs/aio.c
index f02ad26..d3a6ec2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -666,17 +666,6 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	ssize_t (*retry)(struct kiocb *);
 	ssize_t ret;
 
-	if (iocb->ki_retried++ > 1024*1024) {
-		printk("Maximal retry count.  Bytes done %Zd\n",
-			iocb->ki_nbytes - iocb->ki_left);
-		return -EAGAIN;
-	}
-
-	if (!(iocb->ki_retried & 0xff)) {
-		pr_debug("%ld retry: %zd of %zd\n", iocb->ki_retried,
-			iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
-	}
-
 	if (!(retry = iocb->ki_retry)) {
 		printk("aio_run_iocb: iocb->ki_retry = NULL\n");
 		return 0;
@@ -1005,9 +994,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 	kunmap_atomic(ring, KM_IRQ1);
 
 	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
-
-	pr_debug("%ld retries: %zd of %zd\n", iocb->ki_retried,
-		iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
 put_rq:
 	/* everything turned out well, dispose of the aiocb. */
 	ret = __aio_put_req(ctx, iocb);
@@ -1590,7 +1576,6 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_opcode = iocb->aio_lio_opcode;
 	init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
 	INIT_LIST_HEAD(&req->ki_wait.task_list);
-	req->ki_retried = 0;
 
 	ret = aio_setup_iocb(req);
 
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 9e350fd..3372ec6 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -111,7 +111,6 @@ struct kiocb {
 	size_t			ki_nbytes; 	/* copy of iocb->aio_nbytes */
 	char 			__user *ki_buf;	/* remaining iocb->aio_buf */
 	size_t			ki_left; 	/* remaining bytes */
-	long			ki_retried; 	/* just for testing */
 	struct iovec		ki_inline_vec;	/* inline vector */
  	struct iovec		*ki_iovec;
  	unsigned long		ki_nr_segs;
@@ -238,7 +237,6 @@ do {									\
 } while (0)
 
 #define io_wait_to_kiocb(wait) container_of(wait, struct kiocb, ki_wait)
-#define is_retried_kiocb(iocb) ((iocb)->ki_retried > 1)
 
 #include <linux/aio_abi.h>
 
-- 
cgit v0.10.2


From feb189274638ea357750a9e809f5a6e2223a082e Mon Sep 17 00:00:00 2001
From: Johann Lombardi <johann.lombardi@bull.net>
Date: Wed, 6 Dec 2006 20:40:46 -0800
Subject: [PATCH] ext4: fix credit calculation in
 ext4_ext_calc_credits_for_insert

Fix a nit in ext4_ext_calc_credits_for_insert().  Besides, credits for the
new root are already added in the index split accounting.

Signed-off-by: Johann Lombardi <johann.lombardi@bull.net>
Signed-off-by: Alex Tomas <alex@clusterfs.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 1442ccb..994a6e4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1531,16 +1531,17 @@ int inline ext4_ext_calc_credits_for_insert(struct inode *inode,
 
 	/*
 	 * tree can be full, so it would need to grow in depth:
-	 * allocation + old root + new root
+	 * we need one credit to modify old root, credits for
+	 * new root will be added in split accounting
 	 */
-	needed += 2 + 1 + 1;
+	needed += 1;
 
 	/*
 	 * Index split can happen, we would need:
 	 *    allocate intermediate indexes (bitmap + group)
 	 *  + change two blocks at each level, but root (already included)
 	 */
-	needed = (depth * 2) + (depth * 2);
+	needed += (depth * 2) + (depth * 2);
 
 	/* any allocation modifies superblock */
 	needed += 1;
-- 
cgit v0.10.2


From 72be2ccfff0e0e332b32f7ef8372890e39b7c4cb Mon Sep 17 00:00:00 2001
From: Erik Mouw <erik@harddisk-recovery.com>
Date: Wed, 6 Dec 2006 20:40:49 -0800
Subject: [PATCH] Update ext[23] mailing list address

The ext[23] mailing list moved from sf.net to vger.kernel.org so update
the MAINTAINERS file.

Signed-off-by: Erik Mouw <erik@harddisk-recovery.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index e3f0831..cf24400 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1091,13 +1091,19 @@ M:	miku@iki.fi
 S:	Maintained
 
 EXT2 FILE SYSTEM
-L:	ext2-devel@lists.sourceforge.net
+L:	linux-ext4@vger.kernel.org
 S:	Maintained
 
 EXT3 FILE SYSTEM
 P:	Stephen Tweedie, Andrew Morton
 M:	sct@redhat.com, akpm@osdl.org, adilger@clusterfs.com
-L:	ext2-devel@lists.sourceforge.net
+L:	linux-ext4@vger.kernel.org
+S:	Maintained
+
+EXT4 FILE SYSTEM
+P:	Stephen Tweedie, Andrew Morton
+M:	sct@redhat.com, akpm@osdl.org, adilger@clusterfs.com
+L:	linux-ext4@vger.kernel.org
 S:	Maintained
 
 F71805F HARDWARE MONITORING DRIVER
@@ -1673,7 +1679,7 @@ S:	Supported
 JOURNALLING LAYER FOR BLOCK DEVICES (JBD)
 P:	Stephen Tweedie, Andrew Morton
 M:	sct@redhat.com, akpm@osdl.org
-L:	ext2-devel@lists.sourceforge.net
+L:	linux-ext4@vger.kernel.org
 S:	Maintained
 
 K8TEMP HARDWARE MONITORING DRIVER
-- 
cgit v0.10.2


From 70e4506765602cca047cfa31933836e354c61a63 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:40:50 -0800
Subject: [PATCH] lockdep: register_lock_class() fix

The hash_lock must only ever be taken with irqs disabled.  This happens in
all the important places, except one codepath: register_lock_class().  The
race should trigger rarely because register_lock_class() is quite rare and
single-threaded (happens during init most of the time).

The fix is to disable irqs.

( bug found live in -rt: there preemption is alot more agressive and
  preempting with the hash-lock held caused a lockup.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3926c36..62e73ce 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1182,6 +1182,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
+	unsigned long flags;
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -1203,6 +1204,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
+	raw_local_irq_save(flags);
 	__raw_spin_lock(&hash_lock);
 	/*
 	 * We have to do the hash-walk again, to avoid races
@@ -1217,6 +1219,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		debug_locks_off();
 		printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
 		printk("turning off the locking correctness validator.\n");
@@ -1239,15 +1242,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
 			printk("#%d", class->name_version);
 		printk("\n");
 		dump_stack();
+		raw_local_irq_save(flags);
 		__raw_spin_lock(&hash_lock);
 	}
 out_unlock_set:
 	__raw_spin_unlock(&hash_lock);
+	raw_local_irq_restore(flags);
 
 	if (!subclass || force)
 		lock->class_cache = class;
-- 
cgit v0.10.2


From 792908225064b1d841a8990b9d1d1cfc4e0e5bb2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:40:51 -0800
Subject: [PATCH] add ignore_loglevel boot option

Sometimes the kernel prints something interesting while userspace bootup
keeps messages turned off via loglevel.  Enable the printing of /all/
kernel messages via the "ignore_loglevel" boot option.  Off by default.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2a40d9f..6b3c3e3 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -650,6 +650,10 @@ and is between 256 and 4096 characters. It is defined in the file
 	idle=		[HW]
 			Format: idle=poll or idle=halt
 
+	ignore_loglevel	[KNL]
+			Ignore loglevel setting - this will print /all/
+			kernel messages to the console. Useful for debugging.
+
 	ihash_entries=	[KNL]
 			Set number of hash buckets for inode cache.
 
diff --git a/kernel/printk.c b/kernel/printk.c
index c3d90a5..185bb45 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -333,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
 	}
 }
 
+static int __read_mostly ignore_loglevel;
+
+int __init ignore_loglevel_setup(char *str)
+{
+	ignore_loglevel = 1;
+	printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+	return 1;
+}
+
+__setup("ignore_loglevel", ignore_loglevel_setup);
+
 /*
  * Write out chars from start to end - 1 inclusive
  */
 static void _call_console_drivers(unsigned long start,
 				unsigned long end, int msg_log_level)
 {
-	if (msg_log_level < console_loglevel &&
+	if ((msg_log_level < console_loglevel || ignore_loglevel) &&
 			console_drivers && start != end) {
 		if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
 			/* wrapped write */
-- 
cgit v0.10.2


From 6b39bb6548d60b9a18826134b5ccd5c3cef85fe2 Mon Sep 17 00:00:00 2001
From: Paul Clements <paul.clements@steeleye.com>
Date: Wed, 6 Dec 2006 20:40:53 -0800
Subject: [PATCH] nbd: show nbd client pid in sysfs

Allow nbd to expose the nbd-client daemon's PID in /sys/block/nbd<x>/pid.

This is helpful for tracking connection status of a device and for
determining which nbd devices are currently in use.

Signed-off-by: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9d1035e..7bf2cfb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -355,14 +355,30 @@ harderror:
 	return NULL;
 }
 
+static ssize_t pid_show(struct gendisk *disk, char *page)
+{
+	return sprintf(page, "%ld\n",
+		(long) ((struct nbd_device *)disk->private_data)->pid);
+}
+
+static struct disk_attribute pid_attr = {
+	.attr = { .name = "pid", .mode = S_IRUGO },
+	.show = pid_show,
+};
+
 static void nbd_do_it(struct nbd_device *lo)
 {
 	struct request *req;
 
 	BUG_ON(lo->magic != LO_MAGIC);
 
+	lo->pid = current->pid;
+	sysfs_create_file(&lo->disk->kobj, &pid_attr.attr);
+
 	while ((req = nbd_read_stat(lo)) != NULL)
 		nbd_end_request(req);
+
+	sysfs_remove_file(&lo->disk->kobj, &pid_attr.attr);
 	return;
 }
 
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index d6b6dc0..0f3e693 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -64,6 +64,7 @@ struct nbd_device {
 	struct gendisk *disk;
 	int blksize;
 	u64 bytesize;
+	pid_t pid; /* pid of nbd-client, if attached */
 };
 
 #endif
-- 
cgit v0.10.2


From f0b55da0d2701230e973866c9dfbf932d8b884cb Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:40:54 -0800
Subject: [PATCH] IPMI: Fix device model name

Add the product id to the driver model platform device name, in addition to
the device id.  The IPMI spec does not require that individual BMCs in a
system have unique devices IDs, but it does require that the product id/device
id combination be unique.

This also removes a redundant check and cleans up error handling
when the sysfs registration fails.

Signed-off-by: Corey Minyard <minyard@acm.org>
Cc: Carol Hebert <cah@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index c47add8..da13df4 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -1817,13 +1817,12 @@ static int __find_bmc_prod_dev_id(struct device *dev, void *data)
 	struct bmc_device *bmc = dev_get_drvdata(dev);
 
 	return (bmc->id.product_id == id->product_id
-		&& bmc->id.product_id == id->product_id
 		&& bmc->id.device_id == id->device_id);
 }
 
 static struct bmc_device *ipmi_find_bmc_prod_dev_id(
 	struct device_driver *drv,
-	unsigned char product_id, unsigned char device_id)
+	unsigned int product_id, unsigned char device_id)
 {
 	struct prod_dev_id id = {
 		.product_id = product_id,
@@ -1940,6 +1939,9 @@ static ssize_t guid_show(struct device *dev, struct device_attribute *attr,
 
 static void remove_files(struct bmc_device *bmc)
 {
+	if (!bmc->dev)
+		return;
+
 	device_remove_file(&bmc->dev->dev,
 			   &bmc->device_id_attr);
 	device_remove_file(&bmc->dev->dev,
@@ -1973,7 +1975,8 @@ cleanup_bmc_device(struct kref *ref)
 	bmc = container_of(ref, struct bmc_device, refcount);
 
 	remove_files(bmc);
-	platform_device_unregister(bmc->dev);
+	if (bmc->dev)
+		platform_device_unregister(bmc->dev);
 	kfree(bmc);
 }
 
@@ -1990,6 +1993,7 @@ static void ipmi_bmc_unregister(ipmi_smi_t intf)
 
 	mutex_lock(&ipmidriver_mutex);
 	kref_put(&bmc->refcount, cleanup_bmc_device);
+	intf->bmc = NULL;
 	mutex_unlock(&ipmidriver_mutex);
 }
 
@@ -1997,6 +2001,56 @@ static int create_files(struct bmc_device *bmc)
 {
 	int err;
 
+	bmc->device_id_attr.attr.name = "device_id";
+	bmc->device_id_attr.attr.owner = THIS_MODULE;
+	bmc->device_id_attr.attr.mode = S_IRUGO;
+	bmc->device_id_attr.show = device_id_show;
+
+	bmc->provides_dev_sdrs_attr.attr.name = "provides_device_sdrs";
+	bmc->provides_dev_sdrs_attr.attr.owner = THIS_MODULE;
+	bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO;
+	bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show;
+
+	bmc->revision_attr.attr.name = "revision";
+	bmc->revision_attr.attr.owner = THIS_MODULE;
+	bmc->revision_attr.attr.mode = S_IRUGO;
+	bmc->revision_attr.show = revision_show;
+
+	bmc->firmware_rev_attr.attr.name = "firmware_revision";
+	bmc->firmware_rev_attr.attr.owner = THIS_MODULE;
+	bmc->firmware_rev_attr.attr.mode = S_IRUGO;
+	bmc->firmware_rev_attr.show = firmware_rev_show;
+
+	bmc->version_attr.attr.name = "ipmi_version";
+	bmc->version_attr.attr.owner = THIS_MODULE;
+	bmc->version_attr.attr.mode = S_IRUGO;
+	bmc->version_attr.show = ipmi_version_show;
+
+	bmc->add_dev_support_attr.attr.name = "additional_device_support";
+	bmc->add_dev_support_attr.attr.owner = THIS_MODULE;
+	bmc->add_dev_support_attr.attr.mode = S_IRUGO;
+	bmc->add_dev_support_attr.show = add_dev_support_show;
+
+	bmc->manufacturer_id_attr.attr.name = "manufacturer_id";
+	bmc->manufacturer_id_attr.attr.owner = THIS_MODULE;
+	bmc->manufacturer_id_attr.attr.mode = S_IRUGO;
+	bmc->manufacturer_id_attr.show = manufacturer_id_show;
+
+	bmc->product_id_attr.attr.name = "product_id";
+	bmc->product_id_attr.attr.owner = THIS_MODULE;
+	bmc->product_id_attr.attr.mode = S_IRUGO;
+	bmc->product_id_attr.show = product_id_show;
+
+	bmc->guid_attr.attr.name = "guid";
+	bmc->guid_attr.attr.owner = THIS_MODULE;
+	bmc->guid_attr.attr.mode = S_IRUGO;
+	bmc->guid_attr.show = guid_show;
+
+	bmc->aux_firmware_rev_attr.attr.name = "aux_firmware_revision";
+	bmc->aux_firmware_rev_attr.attr.owner = THIS_MODULE;
+	bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO;
+	bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show;
+
 	err = device_create_file(&bmc->dev->dev,
 			   &bmc->device_id_attr);
 	if (err) goto out;
@@ -2106,9 +2160,39 @@ static int ipmi_bmc_register(ipmi_smi_t intf)
 		       bmc->id.product_id,
 		       bmc->id.device_id);
 	} else {
-		bmc->dev = platform_device_alloc("ipmi_bmc",
-						 bmc->id.device_id);
+		char name[14];
+		unsigned char orig_dev_id = bmc->id.device_id;
+		int warn_printed = 0;
+
+		snprintf(name, sizeof(name),
+			 "ipmi_bmc.%4.4x", bmc->id.product_id);
+
+		while (ipmi_find_bmc_prod_dev_id(&ipmidriver,
+						 bmc->id.product_id,
+						 bmc->id.device_id))
+		{
+			if (!warn_printed) {
+				printk(KERN_WARNING PFX
+				       "This machine has two different BMCs"
+				       " with the same product id and device"
+				       " id.  This is an error in the"
+				       " firmware, but incrementing the"
+				       " device id to work around the problem."
+				       " Prod ID = 0x%x, Dev ID = 0x%x\n",
+				       bmc->id.product_id, bmc->id.device_id);
+				warn_printed = 1;
+			}
+			bmc->id.device_id++; /* Wraps at 255 */
+			if (bmc->id.device_id == orig_dev_id) {
+				printk(KERN_ERR PFX
+				       "Out of device ids!\n");
+				break;
+			}
+		}
+
+		bmc->dev = platform_device_alloc(name, bmc->id.device_id);
 		if (!bmc->dev) {
+			mutex_unlock(&ipmidriver_mutex);
 			printk(KERN_ERR
 			       "ipmi_msghandler:"
 			       " Unable to allocate platform device\n");
@@ -2121,6 +2205,8 @@ static int ipmi_bmc_register(ipmi_smi_t intf)
 		rv = platform_device_add(bmc->dev);
 		mutex_unlock(&ipmidriver_mutex);
 		if (rv) {
+			platform_device_put(bmc->dev);
+			bmc->dev = NULL;
 			printk(KERN_ERR
 			       "ipmi_msghandler:"
 			       " Unable to register bmc device: %d\n",
@@ -2130,57 +2216,6 @@ static int ipmi_bmc_register(ipmi_smi_t intf)
 			return rv;
 		}
 
-		bmc->device_id_attr.attr.name = "device_id";
-		bmc->device_id_attr.attr.owner = THIS_MODULE;
-		bmc->device_id_attr.attr.mode = S_IRUGO;
-		bmc->device_id_attr.show = device_id_show;
-
-		bmc->provides_dev_sdrs_attr.attr.name = "provides_device_sdrs";
-		bmc->provides_dev_sdrs_attr.attr.owner = THIS_MODULE;
-		bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO;
-		bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show;
-
-		bmc->revision_attr.attr.name = "revision";
-		bmc->revision_attr.attr.owner = THIS_MODULE;
-		bmc->revision_attr.attr.mode = S_IRUGO;
-		bmc->revision_attr.show = revision_show;
-
-		bmc->firmware_rev_attr.attr.name = "firmware_revision";
-		bmc->firmware_rev_attr.attr.owner = THIS_MODULE;
-		bmc->firmware_rev_attr.attr.mode = S_IRUGO;
-		bmc->firmware_rev_attr.show = firmware_rev_show;
-
-		bmc->version_attr.attr.name = "ipmi_version";
-		bmc->version_attr.attr.owner = THIS_MODULE;
-		bmc->version_attr.attr.mode = S_IRUGO;
-		bmc->version_attr.show = ipmi_version_show;
-
-		bmc->add_dev_support_attr.attr.name
-			= "additional_device_support";
-		bmc->add_dev_support_attr.attr.owner = THIS_MODULE;
-		bmc->add_dev_support_attr.attr.mode = S_IRUGO;
-		bmc->add_dev_support_attr.show = add_dev_support_show;
-
-		bmc->manufacturer_id_attr.attr.name = "manufacturer_id";
-		bmc->manufacturer_id_attr.attr.owner = THIS_MODULE;
-		bmc->manufacturer_id_attr.attr.mode = S_IRUGO;
-		bmc->manufacturer_id_attr.show = manufacturer_id_show;
-
-		bmc->product_id_attr.attr.name = "product_id";
-		bmc->product_id_attr.attr.owner = THIS_MODULE;
-		bmc->product_id_attr.attr.mode = S_IRUGO;
-		bmc->product_id_attr.show = product_id_show;
-
-		bmc->guid_attr.attr.name = "guid";
-		bmc->guid_attr.attr.owner = THIS_MODULE;
-		bmc->guid_attr.attr.mode = S_IRUGO;
-		bmc->guid_attr.show = guid_show;
-
-		bmc->aux_firmware_rev_attr.attr.name = "aux_firmware_revision";
-		bmc->aux_firmware_rev_attr.attr.owner = THIS_MODULE;
-		bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO;
-		bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show;
-
 		rv = create_files(bmc);
 		if (rv) {
 			mutex_lock(&ipmidriver_mutex);
-- 
cgit v0.10.2


From bca0324d09e413ee089f44cc71263ae1fc582b35 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:40:57 -0800
Subject: [PATCH] IPMI: remove interface number limits

Remove the arbitrary limit of number of IPMI interfaces.  This has been tested
with 8 interfaces.

Signed-off-by: Corey Minyard <minyard@acm.org>
Cc: Carol Hebert <cah@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index da13df4..761ed26 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -193,6 +193,9 @@ struct ipmi_smi
 
 	struct kref refcount;
 
+	/* Used for a list of interfaces. */
+	struct list_head link;
+
 	/* The list of upper layers that are using me.  seq_lock
 	 * protects this. */
 	struct list_head users;
@@ -338,13 +341,6 @@ struct ipmi_smi
 };
 #define to_si_intf_from_dev(device) container_of(device, struct ipmi_smi, dev)
 
-/* Used to mark an interface entry that cannot be used but is not a
- * free entry, either, primarily used at creation and deletion time so
- * a slot doesn't get reused too quickly. */
-#define IPMI_INVALID_INTERFACE_ENTRY ((ipmi_smi_t) ((long) 1))
-#define IPMI_INVALID_INTERFACE(i) (((i) == NULL) \
-				   || (i == IPMI_INVALID_INTERFACE_ENTRY))
-
 /**
  * The driver model view of the IPMI messaging driver.
  */
@@ -354,11 +350,8 @@ static struct device_driver ipmidriver = {
 };
 static DEFINE_MUTEX(ipmidriver_mutex);
 
-#define MAX_IPMI_INTERFACES 4
-static ipmi_smi_t ipmi_interfaces[MAX_IPMI_INTERFACES];
-
-/* Directly protects the ipmi_interfaces data structure. */
-static DEFINE_SPINLOCK(interfaces_lock);
+static struct list_head ipmi_interfaces = LIST_HEAD_INIT(ipmi_interfaces);
+static DEFINE_MUTEX(ipmi_interfaces_mutex);
 
 /* List of watchers that want to know when smi's are added and
    deleted. */
@@ -423,25 +416,50 @@ static void intf_free(struct kref *ref)
 	kfree(intf);
 }
 
+struct watcher_entry {
+	struct list_head link;
+	int intf_num;
+};
+
 int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher)
 {
-	int           i;
-	unsigned long flags;
+	ipmi_smi_t intf;
+	struct list_head to_deliver = LIST_HEAD_INIT(to_deliver);
+	struct watcher_entry *e, *e2;
+
+	mutex_lock(&ipmi_interfaces_mutex);
+
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
+		if (intf->intf_num == -1)
+			continue;
+		e = kmalloc(sizeof(*e), GFP_KERNEL);
+		if (!e)
+			goto out_err;
+		e->intf_num = intf->intf_num;
+		list_add_tail(&e->link, &to_deliver);
+	}
 
 	down_write(&smi_watchers_sem);
 	list_add(&(watcher->link), &smi_watchers);
 	up_write(&smi_watchers_sem);
-	spin_lock_irqsave(&interfaces_lock, flags);
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
-		ipmi_smi_t intf = ipmi_interfaces[i];
-		if (IPMI_INVALID_INTERFACE(intf))
-			continue;
-		spin_unlock_irqrestore(&interfaces_lock, flags);
-		watcher->new_smi(i, intf->si_dev);
-		spin_lock_irqsave(&interfaces_lock, flags);
+
+	mutex_unlock(&ipmi_interfaces_mutex);
+
+	list_for_each_entry_safe(e, e2, &to_deliver, link) {
+		list_del(&e->link);
+		watcher->new_smi(e->intf_num, intf->si_dev);
+		kfree(e);
 	}
-	spin_unlock_irqrestore(&interfaces_lock, flags);
+
+
 	return 0;
+
+ out_err:
+	list_for_each_entry_safe(e, e2, &to_deliver, link) {
+		list_del(&e->link);
+		kfree(e);
+	}
+	return -ENOMEM;
 }
 
 int ipmi_smi_watcher_unregister(struct ipmi_smi_watcher *watcher)
@@ -776,17 +794,19 @@ int ipmi_create_user(unsigned int          if_num,
 	if (!new_user)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&interfaces_lock, flags);
-	intf = ipmi_interfaces[if_num];
-	if ((if_num >= MAX_IPMI_INTERFACES) || IPMI_INVALID_INTERFACE(intf)) {
-		spin_unlock_irqrestore(&interfaces_lock, flags);
-		rv = -EINVAL;
-		goto out_kfree;
+	rcu_read_lock();
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
+		if (intf->intf_num == if_num)
+			goto found;
 	}
+	rcu_read_unlock();
+	rv = -EINVAL;
+	goto out_kfree;
 
+ found:
 	/* Note that each existing user holds a refcount to the interface. */
 	kref_get(&intf->refcount);
-	spin_unlock_irqrestore(&interfaces_lock, flags);
+	rcu_read_unlock();
 
 	kref_init(&new_user->refcount);
 	new_user->handler = handler;
@@ -2449,9 +2469,10 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	int              i, j;
 	int              rv;
 	ipmi_smi_t       intf;
-	unsigned long    flags;
+	ipmi_smi_t       tintf;
 	int              version_major;
 	int              version_minor;
+	struct list_head *link;
 
 	version_major = ipmi_version_major(device_id);
 	version_minor = ipmi_version_minor(device_id);
@@ -2477,7 +2498,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		kfree(intf);
 		return -ENOMEM;
 	}
-	intf->intf_num = -1;
+	intf->intf_num = -1; /* Mark it invalid for now. */
 	kref_init(&intf->refcount);
 	intf->bmc->id = *device_id;
 	intf->si_dev = si_dev;
@@ -2511,20 +2532,22 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	spin_lock_init(&intf->counter_lock);
 	intf->proc_dir = NULL;
 
-	rv = -ENOMEM;
-	spin_lock_irqsave(&interfaces_lock, flags);
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
-		if (ipmi_interfaces[i] == NULL) {
-			intf->intf_num = i;
-			/* Reserve the entry till we are done. */
-			ipmi_interfaces[i] = IPMI_INVALID_INTERFACE_ENTRY;
-			rv = 0;
+	mutex_lock(&ipmi_interfaces_mutex);
+	/* Look for a hole in the numbers. */
+	i = 0;
+	link = &ipmi_interfaces;
+	list_for_each_entry_rcu(tintf, &ipmi_interfaces, link) {
+		if (tintf->intf_num != i) {
+			link = &tintf->link;
 			break;
 		}
+		i++;
 	}
-	spin_unlock_irqrestore(&interfaces_lock, flags);
-	if (rv)
-		goto out;
+	/* Add the new interface in numeric order. */
+	if (i == 0)
+		list_add_rcu(&intf->link, &ipmi_interfaces);
+	else
+		list_add_tail_rcu(&intf->link, link);
 
 	rv = handlers->start_processing(send_info, intf);
 	if (rv)
@@ -2562,16 +2585,14 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	if (rv) {
 		if (intf->proc_dir)
 			remove_proc_entries(intf);
+		list_del_rcu(&intf->link);
+		mutex_unlock(&ipmi_interfaces_mutex);
+		synchronize_rcu();
 		kref_put(&intf->refcount, intf_free);
-		if (i < MAX_IPMI_INTERFACES) {
-			spin_lock_irqsave(&interfaces_lock, flags);
-			ipmi_interfaces[i] = NULL;
-			spin_unlock_irqrestore(&interfaces_lock, flags);
-		}
 	} else {
-		spin_lock_irqsave(&interfaces_lock, flags);
-		ipmi_interfaces[i] = intf;
-		spin_unlock_irqrestore(&interfaces_lock, flags);
+		/* After this point the interface is legal to use. */
+		intf->intf_num = i;
+		mutex_unlock(&ipmi_interfaces_mutex);
 		call_smi_watchers(i, intf->si_dev);
 	}
 
@@ -2580,26 +2601,14 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 
 int ipmi_unregister_smi(ipmi_smi_t intf)
 {
-	int                     i;
 	struct ipmi_smi_watcher *w;
-	unsigned long           flags;
 
 	ipmi_bmc_unregister(intf);
 
-	spin_lock_irqsave(&interfaces_lock, flags);
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
-		if (ipmi_interfaces[i] == intf) {
-			/* Set the interface number reserved until we
-			 * are done. */
-			ipmi_interfaces[i] = IPMI_INVALID_INTERFACE_ENTRY;
-			intf->intf_num = -1;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&interfaces_lock,flags);
-
-	if (i == MAX_IPMI_INTERFACES)
-		return -ENODEV;
+	mutex_lock(&ipmi_interfaces_mutex);
+	list_del_rcu(&intf->link);
+	mutex_unlock(&ipmi_interfaces_mutex);
+	synchronize_rcu();
 
 	remove_proc_entries(intf);
 
@@ -2607,14 +2616,9 @@ int ipmi_unregister_smi(ipmi_smi_t intf)
 	   an interface is gone. */
 	down_read(&smi_watchers_sem);
 	list_for_each_entry(w, &smi_watchers, link)
-		w->smi_gone(i);
+		w->smi_gone(intf->intf_num);
 	up_read(&smi_watchers_sem);
 
-	/* Allow the entry to be reused now. */
-	spin_lock_irqsave(&interfaces_lock, flags);
-	ipmi_interfaces[i] = NULL;
-	spin_unlock_irqrestore(&interfaces_lock,flags);
-
 	kref_put(&intf->refcount, intf_free);
 	return 0;
 }
@@ -3446,18 +3450,12 @@ static void ipmi_timeout_handler(long timeout_period)
 	struct ipmi_recv_msg *msg, *msg2;
 	struct ipmi_smi_msg  *smi_msg, *smi_msg2;
 	unsigned long        flags;
-	int                  i, j;
+	int                  i;
 
 	INIT_LIST_HEAD(&timeouts);
 
-	spin_lock(&interfaces_lock);
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
-		intf = ipmi_interfaces[i];
-		if (IPMI_INVALID_INTERFACE(intf))
-			continue;
-		kref_get(&intf->refcount);
-		spin_unlock(&interfaces_lock);
-
+	rcu_read_lock();
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
 		/* See if any waiting messages need to be processed. */
 		spin_lock_irqsave(&intf->waiting_msgs_lock, flags);
 		list_for_each_entry_safe(smi_msg, smi_msg2,
@@ -3477,35 +3475,26 @@ static void ipmi_timeout_handler(long timeout_period)
 		   have timed out, putting them in the timeouts
 		   list. */
 		spin_lock_irqsave(&intf->seq_lock, flags);
-		for (j = 0; j < IPMI_IPMB_NUM_SEQ; j++)
-			check_msg_timeout(intf, &(intf->seq_table[j]),
-					  &timeouts, timeout_period, j,
+		for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++)
+			check_msg_timeout(intf, &(intf->seq_table[i]),
+					  &timeouts, timeout_period, i,
 					  &flags);
 		spin_unlock_irqrestore(&intf->seq_lock, flags);
 
 		list_for_each_entry_safe(msg, msg2, &timeouts, link)
 			handle_msg_timeout(msg);
-
-		kref_put(&intf->refcount, intf_free);
-		spin_lock(&interfaces_lock);
 	}
-	spin_unlock(&interfaces_lock);
+	rcu_read_unlock();
 }
 
 static void ipmi_request_event(void)
 {
 	ipmi_smi_t intf;
-	int        i;
-
-	spin_lock(&interfaces_lock);
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
-		intf = ipmi_interfaces[i];
-		if (IPMI_INVALID_INTERFACE(intf))
-			continue;
 
+	rcu_read_lock();
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link)
 		intf->handlers->request_events(intf->send_info);
-	}
-	spin_unlock(&interfaces_lock);
+	rcu_read_unlock();
 }
 
 static struct timer_list ipmi_timer;
@@ -3634,7 +3623,6 @@ static void send_panic_events(char *str)
 	struct kernel_ipmi_msg            msg;
 	ipmi_smi_t                        intf;
 	unsigned char                     data[16];
-	int                               i;
 	struct ipmi_system_interface_addr *si;
 	struct ipmi_addr                  addr;
 	struct ipmi_smi_msg               smi_msg;
@@ -3668,9 +3656,9 @@ static void send_panic_events(char *str)
 	recv_msg.done = dummy_recv_done_handler;
 
 	/* For every registered interface, send the event. */
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
-		intf = ipmi_interfaces[i];
-		if (IPMI_INVALID_INTERFACE(intf))
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
+		if (intf->intf_num == -1)
+			/* Interface was not ready yet. */
 			continue;
 
 		/* Send the event announcing the panic. */
@@ -3695,13 +3683,14 @@ static void send_panic_events(char *str)
 	if (!str) 
 		return;
 
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
+	/* For every registered interface, send the event. */
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
 		char                  *p = str;
 		struct ipmi_ipmb_addr *ipmb;
 		int                   j;
 
-		intf = ipmi_interfaces[i];
-		if (IPMI_INVALID_INTERFACE(intf))
+		if (intf->intf_num == -1)
+			/* Interface was not ready yet. */
 			continue;
 
 		/* First job here is to figure out where to send the
@@ -3827,7 +3816,6 @@ static int panic_event(struct notifier_block *this,
 		       unsigned long         event,
                        void                  *ptr)
 {
-	int        i;
 	ipmi_smi_t intf;
 
 	if (has_panicked)
@@ -3835,9 +3823,9 @@ static int panic_event(struct notifier_block *this,
 	has_panicked = 1;
 
 	/* For every registered interface, set it to run to completion. */
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
-		intf = ipmi_interfaces[i];
-		if (IPMI_INVALID_INTERFACE(intf))
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
+		if (intf->intf_num == -1)
+			/* Interface was not ready yet. */
 			continue;
 
 		intf->handlers->set_run_to_completion(intf->send_info, 1);
@@ -3858,7 +3846,6 @@ static struct notifier_block panic_block = {
 
 static int ipmi_init_msghandler(void)
 {
-	int i;
 	int rv;
 
 	if (initialized)
@@ -3873,9 +3860,6 @@ static int ipmi_init_msghandler(void)
 	printk(KERN_INFO "ipmi message handler version "
 	       IPMI_DRIVER_VERSION "\n");
 
-	for (i = 0; i < MAX_IPMI_INTERFACES; i++)
-		ipmi_interfaces[i] = NULL;
-
 #ifdef CONFIG_PROC_FS
 	proc_ipmi_root = proc_mkdir("ipmi", NULL);
 	if (!proc_ipmi_root) {
-- 
cgit v0.10.2


From 759643b874907e76ae81e34df62f41ab6683f5c2 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:40:59 -0800
Subject: [PATCH] IPMI: pass sysfs name from lower level driver

Pass in the sysfs name from the lower-level IPMI driver, as the coming IPMI
serial driver will need that to link properly from the serial device sysfs
directory.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 761ed26..6a77b26 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -205,6 +205,7 @@ struct ipmi_smi
 
 	struct bmc_device *bmc;
 	char *my_dev_name;
+	char *sysfs_name;
 
 	/* This is the lower-layer's sender routine. */
 	struct ipmi_smi_handlers *handlers;
@@ -2004,7 +2005,11 @@ static void ipmi_bmc_unregister(ipmi_smi_t intf)
 {
 	struct bmc_device *bmc = intf->bmc;
 
-	sysfs_remove_link(&intf->si_dev->kobj, "bmc");
+	if (intf->sysfs_name) {
+		sysfs_remove_link(&intf->si_dev->kobj, intf->sysfs_name);
+		kfree(intf->sysfs_name);
+		intf->sysfs_name = NULL;
+	}
 	if (intf->my_dev_name) {
 		sysfs_remove_link(&bmc->dev->dev.kobj, intf->my_dev_name);
 		kfree(intf->my_dev_name);
@@ -2140,7 +2145,8 @@ out:
 	return err;
 }
 
-static int ipmi_bmc_register(ipmi_smi_t intf)
+static int ipmi_bmc_register(ipmi_smi_t intf, int ifnum,
+			     const char *sysfs_name)
 {
 	int               rv;
 	struct bmc_device *bmc = intf->bmc;
@@ -2257,29 +2263,44 @@ static int ipmi_bmc_register(ipmi_smi_t intf)
 	 * create symlink from system interface device to bmc device
 	 * and back.
 	 */
+	intf->sysfs_name = kstrdup(sysfs_name, GFP_KERNEL);
+	if (!intf->sysfs_name) {
+		rv = -ENOMEM;
+		printk(KERN_ERR
+		       "ipmi_msghandler: allocate link to BMC: %d\n",
+		       rv);
+		goto out_err;
+	}
+
 	rv = sysfs_create_link(&intf->si_dev->kobj,
-			       &bmc->dev->dev.kobj, "bmc");
+			       &bmc->dev->dev.kobj, intf->sysfs_name);
 	if (rv) {
+		kfree(intf->sysfs_name);
+		intf->sysfs_name = NULL;
 		printk(KERN_ERR
 		       "ipmi_msghandler: Unable to create bmc symlink: %d\n",
 		       rv);
 		goto out_err;
 	}
 
-	size = snprintf(dummy, 0, "ipmi%d", intf->intf_num);
+	size = snprintf(dummy, 0, "ipmi%d", ifnum);
 	intf->my_dev_name = kmalloc(size+1, GFP_KERNEL);
 	if (!intf->my_dev_name) {
+		kfree(intf->sysfs_name);
+		intf->sysfs_name = NULL;
 		rv = -ENOMEM;
 		printk(KERN_ERR
 		       "ipmi_msghandler: allocate link from BMC: %d\n",
 		       rv);
 		goto out_err;
 	}
-	snprintf(intf->my_dev_name, size+1, "ipmi%d", intf->intf_num);
+	snprintf(intf->my_dev_name, size+1, "ipmi%d", ifnum);
 
 	rv = sysfs_create_link(&bmc->dev->dev.kobj, &intf->si_dev->kobj,
 			       intf->my_dev_name);
 	if (rv) {
+		kfree(intf->sysfs_name);
+		intf->sysfs_name = NULL;
 		kfree(intf->my_dev_name);
 		intf->my_dev_name = NULL;
 		printk(KERN_ERR
@@ -2464,6 +2485,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		      void		       *send_info,
 		      struct ipmi_device_id    *device_id,
 		      struct device            *si_dev,
+		      const char               *sysfs_name,
 		      unsigned char            slave_addr)
 {
 	int              i, j;
@@ -2579,7 +2601,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	if (rv == 0)
 		rv = add_proc_entries(intf, i);
 
-	rv = ipmi_bmc_register(intf);
+	rv = ipmi_bmc_register(intf, i, sysfs_name);
 
  out:
 	if (rv) {
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index bb1fac1..89cdb92 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -2362,6 +2362,7 @@ static int try_smi_init(struct smi_info *new_smi)
 			       new_smi,
 			       &new_smi->device_id,
 			       new_smi->dev,
+			       "bmc",
 			       new_smi->slave_addr);
 	if (rv) {
 		printk(KERN_ERR
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index 6d9c7e4..2cc960d 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -173,6 +173,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		      void                     *send_info,
 		      struct ipmi_device_id    *device_id,
 		      struct device            *dev,
+		      const char               *sysfs_name,
 		      unsigned char            slave_addr);
 
 /*
-- 
cgit v0.10.2


From b2c03941b50944a268ee4d5823872f220809a3ba Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:00 -0800
Subject: [PATCH] IPMI: Allow hot system interface remove

This modifies the IPMI driver so that a lower-level interface can be
dynamically removed while in use so it can support hot-removal of hardware.

It also adds the ability to specify and dynamically change the IPMI interface
the watchdog timer and the poweroff code use.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 0e3924e..9101cbf 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -502,7 +502,10 @@ used to control it:
 
   modprobe ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
       preaction=<preaction type> preop=<preop type> start_now=x
-      nowayout=x
+      nowayout=x ifnum_to_use=n
+
+ifnum_to_use specifies which interface the watchdog timer should use.
+The default is -1, which means to pick the first one registered.
 
 The timeout is the number of seconds to the action, and the pretimeout
 is the amount of seconds before the reset that the pre-timeout panic will
@@ -624,5 +627,9 @@ command line.  The parameter is also available via the proc filesystem
 in /proc/sys/dev/ipmi/poweroff_powercycle.  Note that if the system
 does not support power cycling, it will always do the power off.
 
+The "ifnum_to_use" parameter specifies which interface the poweroff
+code should use.  The default is -1, which means to pick the first one
+registered.
+
 Note that if you have ACPI enabled, the system will prefer using ACPI to
 power off.
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 6a77b26..03f3261 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -200,6 +200,10 @@ struct ipmi_smi
 	 * protects this. */
 	struct list_head users;
 
+	/* Information to supply to users. */
+	unsigned char ipmi_version_major;
+	unsigned char ipmi_version_minor;
+
 	/* Used for wake ups at startup. */
 	wait_queue_head_t waitq;
 
@@ -207,7 +211,10 @@ struct ipmi_smi
 	char *my_dev_name;
 	char *sysfs_name;
 
-	/* This is the lower-layer's sender routine. */
+	/* This is the lower-layer's sender routine.  Note that you
+	 * must either be holding the ipmi_interfaces_mutex or be in
+	 * an umpreemptible region to use this.  You must fetch the
+	 * value into a local variable and make sure it is not NULL. */
 	struct ipmi_smi_handlers *handlers;
 	void                     *send_info;
 
@@ -246,6 +253,7 @@ struct ipmi_smi
 	spinlock_t       events_lock; /* For dealing with event stuff. */
 	struct list_head waiting_events;
 	unsigned int     waiting_events_count; /* How many events in queue? */
+	int              delivering_events;
 
 	/* The event receiver for my BMC, only really used at panic
 	   shutdown as a place to store this. */
@@ -357,7 +365,7 @@ static DEFINE_MUTEX(ipmi_interfaces_mutex);
 /* List of watchers that want to know when smi's are added and
    deleted. */
 static struct list_head smi_watchers = LIST_HEAD_INIT(smi_watchers);
-static DECLARE_RWSEM(smi_watchers_sem);
+static DEFINE_MUTEX(smi_watchers_mutex);
 
 
 static void free_recv_msg_list(struct list_head *q)
@@ -418,8 +426,9 @@ static void intf_free(struct kref *ref)
 }
 
 struct watcher_entry {
+	int              intf_num;
+	ipmi_smi_t       intf;
 	struct list_head link;
-	int intf_num;
 };
 
 int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher)
@@ -428,36 +437,45 @@ int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher)
 	struct list_head to_deliver = LIST_HEAD_INIT(to_deliver);
 	struct watcher_entry *e, *e2;
 
+	mutex_lock(&smi_watchers_mutex);
+
 	mutex_lock(&ipmi_interfaces_mutex);
 
+	/* Build a list of things to deliver. */
 	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
 		if (intf->intf_num == -1)
 			continue;
 		e = kmalloc(sizeof(*e), GFP_KERNEL);
 		if (!e)
 			goto out_err;
+		kref_get(&intf->refcount);
+		e->intf = intf;
 		e->intf_num = intf->intf_num;
 		list_add_tail(&e->link, &to_deliver);
 	}
 
-	down_write(&smi_watchers_sem);
-	list_add(&(watcher->link), &smi_watchers);
-	up_write(&smi_watchers_sem);
+	/* We will succeed, so add it to the list. */
+	list_add(&watcher->link, &smi_watchers);
 
 	mutex_unlock(&ipmi_interfaces_mutex);
 
 	list_for_each_entry_safe(e, e2, &to_deliver, link) {
 		list_del(&e->link);
-		watcher->new_smi(e->intf_num, intf->si_dev);
+		watcher->new_smi(e->intf_num, e->intf->si_dev);
+		kref_put(&e->intf->refcount, intf_free);
 		kfree(e);
 	}
 
+	mutex_unlock(&smi_watchers_mutex);
 
 	return 0;
 
  out_err:
+	mutex_unlock(&ipmi_interfaces_mutex);
+	mutex_unlock(&smi_watchers_mutex);
 	list_for_each_entry_safe(e, e2, &to_deliver, link) {
 		list_del(&e->link);
+		kref_put(&e->intf->refcount, intf_free);
 		kfree(e);
 	}
 	return -ENOMEM;
@@ -465,25 +483,26 @@ int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher)
 
 int ipmi_smi_watcher_unregister(struct ipmi_smi_watcher *watcher)
 {
-	down_write(&smi_watchers_sem);
+	mutex_lock(&smi_watchers_mutex);
 	list_del(&(watcher->link));
-	up_write(&smi_watchers_sem);
+	mutex_unlock(&smi_watchers_mutex);
 	return 0;
 }
 
+/*
+ * Must be called with smi_watchers_mutex held.
+ */
 static void
 call_smi_watchers(int i, struct device *dev)
 {
 	struct ipmi_smi_watcher *w;
 
-	down_read(&smi_watchers_sem);
 	list_for_each_entry(w, &smi_watchers, link) {
 		if (try_module_get(w->owner)) {
 			w->new_smi(i, dev);
 			module_put(w->owner);
 		}
 	}
-	up_read(&smi_watchers_sem);
 }
 
 static int
@@ -609,6 +628,17 @@ static void deliver_response(struct ipmi_recv_msg *msg)
 	}
 }
 
+static void
+deliver_err_response(struct ipmi_recv_msg *msg, int err)
+{
+	msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
+	msg->msg_data[0] = err;
+	msg->msg.netfn |= 1; /* Convert to a response. */
+	msg->msg.data_len = 1;
+	msg->msg.data = msg->msg_data;
+	deliver_response(msg);
+}
+
 /* Find the next sequence number not being used and add the given
    message with the given timeout to the sequence table.  This must be
    called with the interface's seq_lock held. */
@@ -746,14 +776,8 @@ static int intf_err_seq(ipmi_smi_t   intf,
 	}
 	spin_unlock_irqrestore(&(intf->seq_lock), flags);
 
-	if (msg) {
-		msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
-		msg->msg_data[0] = err;
-		msg->msg.netfn |= 1; /* Convert to a response. */
-		msg->msg.data_len = 1;
-		msg->msg.data = msg->msg_data;
-		deliver_response(msg);
-	}
+	if (msg)
+		deliver_err_response(msg, err);
 
 	return rv;
 }
@@ -795,19 +819,18 @@ int ipmi_create_user(unsigned int          if_num,
 	if (!new_user)
 		return -ENOMEM;
 
-	rcu_read_lock();
+	mutex_lock(&ipmi_interfaces_mutex);
 	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
 		if (intf->intf_num == if_num)
 			goto found;
 	}
-	rcu_read_unlock();
+	/* Not found, return an error */
 	rv = -EINVAL;
 	goto out_kfree;
 
  found:
 	/* Note that each existing user holds a refcount to the interface. */
 	kref_get(&intf->refcount);
-	rcu_read_unlock();
 
 	kref_init(&new_user->refcount);
 	new_user->handler = handler;
@@ -828,6 +851,10 @@ int ipmi_create_user(unsigned int          if_num,
 		}
 	}
 
+	/* Hold the lock so intf->handlers is guaranteed to be good
+	 * until now */
+	mutex_unlock(&ipmi_interfaces_mutex);
+
 	new_user->valid = 1;
 	spin_lock_irqsave(&intf->seq_lock, flags);
 	list_add_rcu(&new_user->link, &intf->users);
@@ -838,6 +865,7 @@ int ipmi_create_user(unsigned int          if_num,
 out_kref:
 	kref_put(&intf->refcount, intf_free);
 out_kfree:
+	mutex_unlock(&ipmi_interfaces_mutex);
 	kfree(new_user);
 	return rv;
 }
@@ -867,6 +895,7 @@ int ipmi_destroy_user(ipmi_user_t user)
 		    && (intf->seq_table[i].recv_msg->user == user))
 		{
 			intf->seq_table[i].inuse = 0;
+			ipmi_free_recv_msg(intf->seq_table[i].recv_msg);
 		}
 	}
 	spin_unlock_irqrestore(&intf->seq_lock, flags);
@@ -893,9 +922,13 @@ int ipmi_destroy_user(ipmi_user_t user)
 		kfree(rcvr);
 	}
 
-	module_put(intf->handlers->owner);
-	if (intf->handlers->dec_usecount)
-		intf->handlers->dec_usecount(intf->send_info);
+	mutex_lock(&ipmi_interfaces_mutex);
+	if (intf->handlers) {
+		module_put(intf->handlers->owner);
+		if (intf->handlers->dec_usecount)
+			intf->handlers->dec_usecount(intf->send_info);
+	}
+	mutex_unlock(&ipmi_interfaces_mutex);
 
 	kref_put(&intf->refcount, intf_free);
 
@@ -908,8 +941,8 @@ void ipmi_get_version(ipmi_user_t   user,
 		      unsigned char *major,
 		      unsigned char *minor)
 {
-	*major = ipmi_version_major(&user->intf->bmc->id);
-	*minor = ipmi_version_minor(&user->intf->bmc->id);
+	*major = user->intf->ipmi_version_major;
+	*minor = user->intf->ipmi_version_minor;
 }
 
 int ipmi_set_my_address(ipmi_user_t   user,
@@ -964,20 +997,33 @@ int ipmi_set_gets_events(ipmi_user_t user, int val)
 	spin_lock_irqsave(&intf->events_lock, flags);
 	user->gets_events = val;
 
-	if (val) {
-		/* Deliver any queued events. */
+	if (intf->delivering_events)
+		/*
+		 * Another thread is delivering events for this, so
+		 * let it handle any new events.
+		 */
+		goto out;
+
+	/* Deliver any queued events. */
+	while (user->gets_events && !list_empty(&intf->waiting_events)) {
 		list_for_each_entry_safe(msg, msg2, &intf->waiting_events, link)
 			list_move_tail(&msg->link, &msgs);
 		intf->waiting_events_count = 0;
-	}
 
-	/* Hold the events lock while doing this to preserve order. */
-	list_for_each_entry_safe(msg, msg2, &msgs, link) {
-		msg->user = user;
-		kref_get(&user->refcount);
-		deliver_response(msg);
+		intf->delivering_events = 1;
+		spin_unlock_irqrestore(&intf->events_lock, flags);
+
+		list_for_each_entry_safe(msg, msg2, &msgs, link) {
+			msg->user = user;
+			kref_get(&user->refcount);
+			deliver_response(msg);
+		}
+
+		spin_lock_irqsave(&intf->events_lock, flags);
+		intf->delivering_events = 0;
 	}
 
+ out:
 	spin_unlock_irqrestore(&intf->events_lock, flags);
 
 	return 0;
@@ -1088,7 +1134,8 @@ int ipmi_unregister_for_cmd(ipmi_user_t   user,
 void ipmi_user_set_run_to_completion(ipmi_user_t user, int val)
 {
 	ipmi_smi_t intf = user->intf;
-	intf->handlers->set_run_to_completion(intf->send_info, val);
+	if (intf->handlers)
+		intf->handlers->set_run_to_completion(intf->send_info, val);
 }
 
 static unsigned char
@@ -1199,10 +1246,11 @@ static int i_ipmi_request(ipmi_user_t          user,
 			  int                  retries,
 			  unsigned int         retry_time_ms)
 {
-	int                  rv = 0;
-	struct ipmi_smi_msg  *smi_msg;
-	struct ipmi_recv_msg *recv_msg;
-	unsigned long        flags;
+	int                      rv = 0;
+	struct ipmi_smi_msg      *smi_msg;
+	struct ipmi_recv_msg     *recv_msg;
+	unsigned long            flags;
+	struct ipmi_smi_handlers *handlers;
 
 
 	if (supplied_recv) {
@@ -1225,6 +1273,13 @@ static int i_ipmi_request(ipmi_user_t          user,
 		}
 	}
 
+	rcu_read_lock();
+	handlers = intf->handlers;
+	if (!handlers) {
+		rv = -ENODEV;
+		goto out_err;
+	}
+
 	recv_msg->user = user;
 	if (user)
 		kref_get(&user->refcount);
@@ -1541,11 +1596,14 @@ static int i_ipmi_request(ipmi_user_t          user,
 		printk("\n");
 	}
 #endif
-	intf->handlers->sender(intf->send_info, smi_msg, priority);
+
+	handlers->sender(intf->send_info, smi_msg, priority);
+	rcu_read_unlock();
 
 	return 0;
 
  out_err:
+	rcu_read_unlock();
 	ipmi_free_smi_msg(smi_msg);
 	ipmi_free_recv_msg(recv_msg);
 	return rv;
@@ -2492,13 +2550,8 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	int              rv;
 	ipmi_smi_t       intf;
 	ipmi_smi_t       tintf;
-	int              version_major;
-	int              version_minor;
 	struct list_head *link;
 
-	version_major = ipmi_version_major(device_id);
-	version_minor = ipmi_version_minor(device_id);
-
 	/* Make sure the driver is actually initialized, this handles
 	   problems with initialization order. */
 	if (!initialized) {
@@ -2515,6 +2568,10 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	if (!intf)
 		return -ENOMEM;
 	memset(intf, 0, sizeof(*intf));
+
+	intf->ipmi_version_major = ipmi_version_major(device_id);
+	intf->ipmi_version_minor = ipmi_version_minor(device_id);
+
 	intf->bmc = kzalloc(sizeof(*intf->bmc), GFP_KERNEL);
 	if (!intf->bmc) {
 		kfree(intf);
@@ -2554,6 +2611,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	spin_lock_init(&intf->counter_lock);
 	intf->proc_dir = NULL;
 
+	mutex_lock(&smi_watchers_mutex);
 	mutex_lock(&ipmi_interfaces_mutex);
 	/* Look for a hole in the numbers. */
 	i = 0;
@@ -2577,8 +2635,9 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 
 	get_guid(intf);
 
-	if ((version_major > 1)
-	    || ((version_major == 1) && (version_minor >= 5)))
+	if ((intf->ipmi_version_major > 1)
+	    || ((intf->ipmi_version_major == 1)
+		&& (intf->ipmi_version_minor >= 5)))
 	{
 		/* Start scanning the channels to see what is
 		   available. */
@@ -2607,8 +2666,10 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	if (rv) {
 		if (intf->proc_dir)
 			remove_proc_entries(intf);
+		intf->handlers = NULL;
 		list_del_rcu(&intf->link);
 		mutex_unlock(&ipmi_interfaces_mutex);
+		mutex_unlock(&smi_watchers_mutex);
 		synchronize_rcu();
 		kref_put(&intf->refcount, intf_free);
 	} else {
@@ -2616,30 +2677,50 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		intf->intf_num = i;
 		mutex_unlock(&ipmi_interfaces_mutex);
 		call_smi_watchers(i, intf->si_dev);
+		mutex_unlock(&smi_watchers_mutex);
 	}
 
 	return rv;
 }
 
+static void cleanup_smi_msgs(ipmi_smi_t intf)
+{
+	int              i;
+	struct seq_table *ent;
+
+	/* No need for locks, the interface is down. */
+	for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++) {
+		ent = &(intf->seq_table[i]);
+		if (!ent->inuse)
+			continue;
+		deliver_err_response(ent->recv_msg, IPMI_ERR_UNSPECIFIED);
+	}
+}
+
 int ipmi_unregister_smi(ipmi_smi_t intf)
 {
 	struct ipmi_smi_watcher *w;
+	int    intf_num = intf->intf_num;
 
 	ipmi_bmc_unregister(intf);
 
+	mutex_lock(&smi_watchers_mutex);
 	mutex_lock(&ipmi_interfaces_mutex);
+	intf->intf_num = -1;
+	intf->handlers = NULL;
 	list_del_rcu(&intf->link);
 	mutex_unlock(&ipmi_interfaces_mutex);
 	synchronize_rcu();
 
+	cleanup_smi_msgs(intf);
+
 	remove_proc_entries(intf);
 
 	/* Call all the watcher interfaces to tell them that
 	   an interface is gone. */
-	down_read(&smi_watchers_sem);
 	list_for_each_entry(w, &smi_watchers, link)
-		w->smi_gone(intf->intf_num);
-	up_read(&smi_watchers_sem);
+		w->smi_gone(intf_num);
+	mutex_unlock(&smi_watchers_mutex);
 
 	kref_put(&intf->refcount, intf_free);
 	return 0;
@@ -2721,6 +2802,7 @@ static int handle_ipmb_get_msg_cmd(ipmi_smi_t          intf,
 	struct ipmi_ipmb_addr    *ipmb_addr;
 	struct ipmi_recv_msg     *recv_msg;
 	unsigned long            flags;
+	struct ipmi_smi_handlers *handlers;
 
 	if (msg->rsp_size < 10) {
 		/* Message not big enough, just ignore it. */
@@ -2777,10 +2859,16 @@ static int handle_ipmb_get_msg_cmd(ipmi_smi_t          intf,
 		printk("\n");
 	}
 #endif
-		intf->handlers->sender(intf->send_info, msg, 0);
-
-		rv = -1; /* We used the message, so return the value that
-			    causes it to not be freed or queued. */
+		rcu_read_lock();
+		handlers = intf->handlers;
+		if (handlers) {
+			handlers->sender(intf->send_info, msg, 0);
+			/* We used the message, so return the value
+			   that causes it to not be freed or
+			   queued. */
+			rv = -1;
+		}
+		rcu_read_unlock();
 	} else {
 		/* Deliver the message to the user. */
 		spin_lock_irqsave(&intf->counter_lock, flags);
@@ -3370,16 +3458,6 @@ void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf)
 	rcu_read_unlock();
 }
 
-static void
-handle_msg_timeout(struct ipmi_recv_msg *msg)
-{
-	msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
-	msg->msg_data[0] = IPMI_TIMEOUT_COMPLETION_CODE;
-	msg->msg.netfn |= 1; /* Convert to a response. */
-	msg->msg.data_len = 1;
-	msg->msg.data = msg->msg_data;
-	deliver_response(msg);
-}
 
 static struct ipmi_smi_msg *
 smi_from_recv_msg(ipmi_smi_t intf, struct ipmi_recv_msg *recv_msg,
@@ -3411,7 +3489,11 @@ static void check_msg_timeout(ipmi_smi_t intf, struct seq_table *ent,
 			      struct list_head *timeouts, long timeout_period,
 			      int slot, unsigned long *flags)
 {
-	struct ipmi_recv_msg *msg;
+	struct ipmi_recv_msg     *msg;
+	struct ipmi_smi_handlers *handlers;
+
+	if (intf->intf_num == -1)
+		return;
 
 	if (!ent->inuse)
 		return;
@@ -3454,13 +3536,19 @@ static void check_msg_timeout(ipmi_smi_t intf, struct seq_table *ent,
 			return;
 
 		spin_unlock_irqrestore(&intf->seq_lock, *flags);
+
 		/* Send the new message.  We send with a zero
 		 * priority.  It timed out, I doubt time is
 		 * that critical now, and high priority
 		 * messages are really only for messages to the
 		 * local MC, which don't get resent. */
-		intf->handlers->sender(intf->send_info,
-				       smi_msg, 0);
+		handlers = intf->handlers;
+		if (handlers)
+			intf->handlers->sender(intf->send_info,
+					       smi_msg, 0);
+		else
+			ipmi_free_smi_msg(smi_msg);
+
 		spin_lock_irqsave(&intf->seq_lock, *flags);
 	}
 }
@@ -3504,18 +3592,24 @@ static void ipmi_timeout_handler(long timeout_period)
 		spin_unlock_irqrestore(&intf->seq_lock, flags);
 
 		list_for_each_entry_safe(msg, msg2, &timeouts, link)
-			handle_msg_timeout(msg);
+			deliver_err_response(msg, IPMI_TIMEOUT_COMPLETION_CODE);
 	}
 	rcu_read_unlock();
 }
 
 static void ipmi_request_event(void)
 {
-	ipmi_smi_t intf;
+	ipmi_smi_t               intf;
+	struct ipmi_smi_handlers *handlers;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(intf, &ipmi_interfaces, link)
-		intf->handlers->request_events(intf->send_info);
+	/* Called from the timer, no need to check if handlers is
+	 * valid. */
+	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
+		handlers = intf->handlers;
+		if (handlers)
+			handlers->request_events(intf->send_info);
+	}
 	rcu_read_unlock();
 }
 
@@ -3679,8 +3773,8 @@ static void send_panic_events(char *str)
 
 	/* For every registered interface, send the event. */
 	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
-		if (intf->intf_num == -1)
-			/* Interface was not ready yet. */
+		if (!intf->handlers)
+			/* Interface is not ready. */
 			continue;
 
 		/* Send the event announcing the panic. */
@@ -3846,8 +3940,8 @@ static int panic_event(struct notifier_block *this,
 
 	/* For every registered interface, set it to run to completion. */
 	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
-		if (intf->intf_num == -1)
-			/* Interface was not ready yet. */
+		if (!intf->handlers)
+			/* Interface is not ready. */
 			continue;
 
 		intf->handlers->set_run_to_completion(intf->send_info, 1);
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index 8d941db..85f8071 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -43,6 +43,9 @@
 
 #define PFX "IPMI poweroff: "
 
+static void ipmi_po_smi_gone(int if_num);
+static void ipmi_po_new_smi(int if_num, struct device *device);
+
 /* Definitions for controlling power off (if the system supports it).  It
  * conveniently matches the IPMI chassis control values. */
 #define IPMI_CHASSIS_POWER_DOWN		0	/* power down, the default. */
@@ -51,6 +54,37 @@
 /* the IPMI data command */
 static int poweroff_powercycle;
 
+/* Which interface to use, -1 means the first we see. */
+static int ifnum_to_use = -1;
+
+/* Our local state. */
+static int ready = 0;
+static ipmi_user_t ipmi_user;
+static int ipmi_ifnum;
+static void (*specific_poweroff_func)(ipmi_user_t user) = NULL;
+
+/* Holds the old poweroff function so we can restore it on removal. */
+static void (*old_poweroff_func)(void);
+
+static int set_param_ifnum(const char *val, struct kernel_param *kp)
+{
+	int rv = param_set_int(val, kp);
+	if (rv)
+		return rv;
+	if ((ifnum_to_use < 0) || (ifnum_to_use == ipmi_ifnum))
+		return 0;
+
+	ipmi_po_smi_gone(ipmi_ifnum);
+	ipmi_po_new_smi(ifnum_to_use, NULL);
+	return 0;
+}
+
+module_param_call(ifnum_to_use, set_param_ifnum, param_get_int,
+		  &ifnum_to_use, 0644);
+MODULE_PARM_DESC(ifnum_to_use, "The interface number to use for the watchdog "
+		 "timer.  Setting to -1 defaults to the first registered "
+		 "interface");
+
 /* parameter definition to allow user to flag power cycle */
 module_param(poweroff_powercycle, int, 0644);
 MODULE_PARM_DESC(poweroff_powercycle, " Set to non-zero to enable power cycle instead of power down. Power cycle is contingent on hardware support, otherwise it defaults back to power down.");
@@ -440,15 +474,6 @@ static struct poweroff_function poweroff_functions[] = {
 		      / sizeof(struct poweroff_function))
 
 
-/* Our local state. */
-static int ready = 0;
-static ipmi_user_t ipmi_user;
-static void (*specific_poweroff_func)(ipmi_user_t user) = NULL;
-
-/* Holds the old poweroff function so we can restore it on removal. */
-static void (*old_poweroff_func)(void);
-
-
 /* Called on a powerdown request. */
 static void ipmi_poweroff_function (void)
 {
@@ -473,6 +498,9 @@ static void ipmi_po_new_smi(int if_num, struct device *device)
 	if (ready)
 		return;
 
+	if ((ifnum_to_use >= 0) && (ifnum_to_use != if_num))
+		return;
+
 	rv = ipmi_create_user(if_num, &ipmi_poweroff_handler, NULL,
 			      &ipmi_user);
 	if (rv) {
@@ -481,6 +509,8 @@ static void ipmi_po_new_smi(int if_num, struct device *device)
 		return;
 	}
 
+	ipmi_ifnum = if_num;
+
         /*
          * Do a get device ide and store some results, since this is
 	 * used by several functions.
@@ -541,9 +571,15 @@ static void ipmi_po_new_smi(int if_num, struct device *device)
 
 static void ipmi_po_smi_gone(int if_num)
 {
-	/* This can never be called, because once poweroff driver is
-	   registered, the interface can't go away until the power
-	   driver is unregistered. */
+	if (!ready)
+		return;
+
+	if (ipmi_ifnum != if_num)
+		return;
+
+	ready = 0;
+	ipmi_destroy_user(ipmi_user);
+	pm_power_off = old_poweroff_func;
 }
 
 static struct ipmi_smi_watcher smi_watcher =
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 73f759e..90fb2a5 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -135,6 +135,7 @@
 static int nowayout = WATCHDOG_NOWAYOUT;
 
 static ipmi_user_t watchdog_user = NULL;
+static int watchdog_ifnum;
 
 /* Default the timeout to 10 seconds. */
 static int timeout = 10;
@@ -161,6 +162,8 @@ static struct fasync_struct *fasync_q = NULL;
 static char pretimeout_since_last_heartbeat = 0;
 static char expect_close;
 
+static int ifnum_to_use = -1;
+
 static DECLARE_RWSEM(register_sem);
 
 /* Parameters to ipmi_set_timeout */
@@ -169,6 +172,8 @@ static DECLARE_RWSEM(register_sem);
 #define IPMI_SET_TIMEOUT_FORCE_HB		2
 
 static int ipmi_set_timeout(int do_heartbeat);
+static void ipmi_register_watchdog(int ipmi_intf);
+static void ipmi_unregister_watchdog(int ipmi_intf);
 
 /* If true, the driver will start running as soon as it is configured
    and ready. */
@@ -245,6 +250,26 @@ static int get_param_str(char *buffer, struct kernel_param *kp)
 	return strlen(buffer);
 }
 
+
+static int set_param_wdog_ifnum(const char *val, struct kernel_param *kp)
+{
+	int rv = param_set_int(val, kp);
+	if (rv)
+		return rv;
+	if ((ifnum_to_use < 0) || (ifnum_to_use == watchdog_ifnum))
+		return 0;
+
+	ipmi_unregister_watchdog(watchdog_ifnum);
+	ipmi_register_watchdog(ifnum_to_use);
+	return 0;
+}
+
+module_param_call(ifnum_to_use, set_param_wdog_ifnum, get_param_int,
+		  &ifnum_to_use, 0644);
+MODULE_PARM_DESC(ifnum_to_use, "The interface number to use for the watchdog "
+		 "timer.  Setting to -1 defaults to the first registered "
+		 "interface");
+
 module_param_call(timeout, set_param_int, get_param_int, &timeout, 0644);
 MODULE_PARM_DESC(timeout, "Timeout value in seconds.");
 
@@ -263,12 +288,13 @@ module_param_call(preop, set_param_str, get_param_str, preop_op, 0644);
 MODULE_PARM_DESC(preop, "Pretimeout driver operation.  One of: "
 		 "preop_none, preop_panic, preop_give_data.");
 
-module_param(start_now, int, 0);
+module_param(start_now, int, 0444);
 MODULE_PARM_DESC(start_now, "Set to 1 to start the watchdog as"
 		 "soon as the driver is loaded.");
 
 module_param(nowayout, int, 0644);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
+		 "(default=CONFIG_WATCHDOG_NOWAYOUT)");
 
 /* Default state of the timer. */
 static unsigned char ipmi_watchdog_state = WDOG_TIMEOUT_NONE;
@@ -872,6 +898,11 @@ static void ipmi_register_watchdog(int ipmi_intf)
 	if (watchdog_user)
 		goto out;
 
+	if ((ifnum_to_use >= 0) && (ifnum_to_use != ipmi_intf))
+		goto out;
+
+	watchdog_ifnum = ipmi_intf;
+
 	rv = ipmi_create_user(ipmi_intf, &ipmi_hndlrs, NULL, &watchdog_user);
 	if (rv < 0) {
 		printk(KERN_CRIT PFX "Unable to register with ipmi\n");
@@ -901,6 +932,39 @@ static void ipmi_register_watchdog(int ipmi_intf)
 	}
 }
 
+static void ipmi_unregister_watchdog(int ipmi_intf)
+{
+	int rv;
+
+	down_write(&register_sem);
+
+	if (!watchdog_user)
+		goto out;
+
+	if (watchdog_ifnum != ipmi_intf)
+		goto out;
+
+	/* Make sure no one can call us any more. */
+	misc_deregister(&ipmi_wdog_miscdev);
+
+	/* Wait to make sure the message makes it out.  The lower layer has
+	   pointers to our buffers, we want to make sure they are done before
+	   we release our memory. */
+	while (atomic_read(&set_timeout_tofree))
+		schedule_timeout_uninterruptible(1);
+
+	/* Disconnect from IPMI. */
+	rv = ipmi_destroy_user(watchdog_user);
+	if (rv) {
+		printk(KERN_WARNING PFX "error unlinking from IPMI: %d\n",
+		       rv);
+	}
+	watchdog_user = NULL;
+
+ out:
+	up_write(&register_sem);
+}
+
 #ifdef HAVE_NMI_HANDLER
 static int
 ipmi_nmi(void *dev_id, int cpu, int handled)
@@ -1004,9 +1068,7 @@ static void ipmi_new_smi(int if_num, struct device *device)
 
 static void ipmi_smi_gone(int if_num)
 {
-	/* This can never be called, because once the watchdog is
-	   registered, the interface can't go away until the watchdog
-	   is unregistered. */
+	ipmi_unregister_watchdog(if_num);
 }
 
 static struct ipmi_smi_watcher smi_watcher =
@@ -1148,30 +1210,32 @@ static int __init ipmi_wdog_init(void)
 
 	check_parms();
 
+	register_reboot_notifier(&wdog_reboot_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&wdog_panic_notifier);
+
 	rv = ipmi_smi_watcher_register(&smi_watcher);
 	if (rv) {
 #ifdef HAVE_NMI_HANDLER
 		if (preaction_val == WDOG_PRETIMEOUT_NMI)
 			release_nmi(&ipmi_nmi_handler);
 #endif
+		atomic_notifier_chain_unregister(&panic_notifier_list,
+						 &wdog_panic_notifier);
+		unregister_reboot_notifier(&wdog_reboot_notifier);
 		printk(KERN_WARNING PFX "can't register smi watcher\n");
 		return rv;
 	}
 
-	register_reboot_notifier(&wdog_reboot_notifier);
-	atomic_notifier_chain_register(&panic_notifier_list,
-			&wdog_panic_notifier);
-
 	printk(KERN_INFO PFX "driver initialized\n");
 
 	return 0;
 }
 
-static __exit void ipmi_unregister_watchdog(void)
+static void __exit ipmi_wdog_exit(void)
 {
-	int rv;
-
-	down_write(&register_sem);
+	ipmi_smi_watcher_unregister(&smi_watcher);
+	ipmi_unregister_watchdog(watchdog_ifnum);
 
 #ifdef HAVE_NMI_HANDLER
 	if (nmi_handler_registered)
@@ -1179,37 +1243,8 @@ static __exit void ipmi_unregister_watchdog(void)
 #endif
 
 	atomic_notifier_chain_unregister(&panic_notifier_list,
-			&wdog_panic_notifier);
+					 &wdog_panic_notifier);
 	unregister_reboot_notifier(&wdog_reboot_notifier);
-
-	if (! watchdog_user)
-		goto out;
-
-	/* Make sure no one can call us any more. */
-	misc_deregister(&ipmi_wdog_miscdev);
-
-	/* Wait to make sure the message makes it out.  The lower layer has
-	   pointers to our buffers, we want to make sure they are done before
-	   we release our memory. */
-	while (atomic_read(&set_timeout_tofree))
-		schedule_timeout_uninterruptible(1);
-
-	/* Disconnect from IPMI. */
-	rv = ipmi_destroy_user(watchdog_user);
-	if (rv) {
-		printk(KERN_WARNING PFX "error unlinking from IPMI: %d\n",
-		       rv);
-	}
-	watchdog_user = NULL;
-
- out:
-	up_write(&register_sem);
-}
-
-static void __exit ipmi_wdog_exit(void)
-{
-	ipmi_smi_watcher_unregister(&smi_watcher);
-	ipmi_unregister_watchdog();
 }
 module_exit(ipmi_wdog_exit);
 module_init(ipmi_wdog_init);
-- 
cgit v0.10.2


From b9675136e2ad95156fb93be6155f17590bb26fd7 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:02 -0800
Subject: [PATCH] IPMI: Add maintenance mode

Some commands and operations on a BMC can cause the BMC to "go away" for a
while.  This can cause the automatic flag processing and other things of that
nature to timeout and generate annoying logs, or possibly cause other bad
things to happen when in firmware update mode.

Add detection of those commands (cold reset, warm reset, and any firmware
command) and turns off automatic processing for 30 seconds.  It also add a
manual override either way.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index 81fcf0c..375d337 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -596,6 +596,31 @@ static int ipmi_ioctl(struct inode  *inode,
 		rv = 0;
 		break;
 	}
+
+	case IPMICTL_GET_MAINTENANCE_MODE_CMD:
+	{
+		int mode;
+
+		mode = ipmi_get_maintenance_mode(priv->user);
+		if (copy_to_user(arg, &mode, sizeof(mode))) {
+			rv = -EFAULT;
+			break;
+		}
+		rv = 0;
+		break;
+	}
+
+	case IPMICTL_SET_MAINTENANCE_MODE_CMD:
+	{
+		int mode;
+
+		if (copy_from_user(&mode, arg, sizeof(mode))) {
+			rv = -EFAULT;
+			break;
+		}
+		rv = ipmi_set_maintenance_mode(priv->user, mode);
+		break;
+	}
 	}
   
 	return rv;
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 03f3261..ff0e68f 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -48,7 +48,7 @@
 
 #define PFX "IPMI message handler: "
 
-#define IPMI_DRIVER_VERSION "39.0"
+#define IPMI_DRIVER_VERSION "39.1"
 
 static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void);
 static int ipmi_init_msghandler(void);
@@ -59,6 +59,9 @@ static int initialized = 0;
 static struct proc_dir_entry *proc_ipmi_root = NULL;
 #endif /* CONFIG_PROC_FS */
 
+/* Remain in auto-maintenance mode for this amount of time (in ms). */
+#define IPMI_MAINTENANCE_MODE_TIMEOUT 30000
+
 #define MAX_EVENTS_IN_QUEUE	25
 
 /* Don't let a message sit in a queue forever, always time it with at lest
@@ -262,6 +265,12 @@ struct ipmi_smi
 	unsigned char local_sel_device;
 	unsigned char local_event_generator;
 
+	/* For handling of maintenance mode. */
+	int maintenance_mode;
+	int maintenance_mode_enable;
+	int auto_maintenance_timeout;
+	spinlock_t maintenance_mode_lock; /* Used in a timer... */
+
 	/* A cheap hack, if this is non-null and a message to an
 	   interface comes in with a NULL user, call this routine with
 	   it.  Note that the message will still be freed by the
@@ -985,6 +994,65 @@ int ipmi_get_my_LUN(ipmi_user_t   user,
 	return 0;
 }
 
+int ipmi_get_maintenance_mode(ipmi_user_t user)
+{
+	int           mode;
+	unsigned long flags;
+
+	spin_lock_irqsave(&user->intf->maintenance_mode_lock, flags);
+	mode = user->intf->maintenance_mode;
+	spin_unlock_irqrestore(&user->intf->maintenance_mode_lock, flags);
+
+	return mode;
+}
+EXPORT_SYMBOL(ipmi_get_maintenance_mode);
+
+static void maintenance_mode_update(ipmi_smi_t intf)
+{
+	if (intf->handlers->set_maintenance_mode)
+		intf->handlers->set_maintenance_mode(
+			intf->send_info, intf->maintenance_mode_enable);
+}
+
+int ipmi_set_maintenance_mode(ipmi_user_t user, int mode)
+{
+	int           rv = 0;
+	unsigned long flags;
+	ipmi_smi_t    intf = user->intf;
+
+	spin_lock_irqsave(&intf->maintenance_mode_lock, flags);
+	if (intf->maintenance_mode != mode) {
+		switch (mode) {
+		case IPMI_MAINTENANCE_MODE_AUTO:
+			intf->maintenance_mode = mode;
+			intf->maintenance_mode_enable
+				= (intf->auto_maintenance_timeout > 0);
+			break;
+
+		case IPMI_MAINTENANCE_MODE_OFF:
+			intf->maintenance_mode = mode;
+			intf->maintenance_mode_enable = 0;
+			break;
+
+		case IPMI_MAINTENANCE_MODE_ON:
+			intf->maintenance_mode = mode;
+			intf->maintenance_mode_enable = 1;
+			break;
+
+		default:
+			rv = -EINVAL;
+			goto out_unlock;
+		}
+
+		maintenance_mode_update(intf);
+	}
+ out_unlock:
+	spin_unlock_irqrestore(&intf->maintenance_mode_lock, flags);
+
+	return rv;
+}
+EXPORT_SYMBOL(ipmi_set_maintenance_mode);
+
 int ipmi_set_gets_events(ipmi_user_t user, int val)
 {
 	unsigned long        flags;
@@ -1322,6 +1390,24 @@ static int i_ipmi_request(ipmi_user_t          user,
 			goto out_err;
 		}
 
+		if (((msg->netfn == IPMI_NETFN_APP_REQUEST)
+		      && ((msg->cmd == IPMI_COLD_RESET_CMD)
+			  || (msg->cmd == IPMI_WARM_RESET_CMD)))
+		     || (msg->netfn == IPMI_NETFN_FIRMWARE_REQUEST))
+		{
+			spin_lock_irqsave(&intf->maintenance_mode_lock, flags);
+			intf->auto_maintenance_timeout
+				= IPMI_MAINTENANCE_MODE_TIMEOUT;
+			if (!intf->maintenance_mode
+			    && !intf->maintenance_mode_enable)
+			{
+				intf->maintenance_mode_enable = 1;
+				maintenance_mode_update(intf);
+			}
+			spin_unlock_irqrestore(&intf->maintenance_mode_lock,
+					       flags);
+		}
+
 		if ((msg->data_len + 2) > IPMI_MAX_MSG_LENGTH) {
 			spin_lock_irqsave(&intf->counter_lock, flags);
 			intf->sent_invalid_commands++;
@@ -2605,6 +2691,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	INIT_LIST_HEAD(&intf->waiting_events);
 	intf->waiting_events_count = 0;
 	mutex_init(&intf->cmd_rcvrs_mutex);
+	spin_lock_init(&intf->maintenance_mode_lock);
 	INIT_LIST_HEAD(&intf->cmd_rcvrs);
 	init_waitqueue_head(&intf->waitq);
 
@@ -3593,6 +3680,30 @@ static void ipmi_timeout_handler(long timeout_period)
 
 		list_for_each_entry_safe(msg, msg2, &timeouts, link)
 			deliver_err_response(msg, IPMI_TIMEOUT_COMPLETION_CODE);
+
+		/*
+		 * Maintenance mode handling.  Check the timeout
+		 * optimistically before we claim the lock.  It may
+		 * mean a timeout gets missed occasionally, but that
+		 * only means the timeout gets extended by one period
+		 * in that case.  No big deal, and it avoids the lock
+		 * most of the time.
+		 */
+		if (intf->auto_maintenance_timeout > 0) {
+			spin_lock_irqsave(&intf->maintenance_mode_lock, flags);
+			if (intf->auto_maintenance_timeout > 0) {
+				intf->auto_maintenance_timeout
+					-= timeout_period;
+				if (!intf->maintenance_mode
+				    && (intf->auto_maintenance_timeout <= 0))
+				{
+					intf->maintenance_mode_enable = 0;
+					maintenance_mode_update(intf);
+				}
+			}
+			spin_unlock_irqrestore(&intf->maintenance_mode_lock,
+					       flags);
+		}
 	}
 	rcu_read_unlock();
 }
@@ -3606,6 +3717,10 @@ static void ipmi_request_event(void)
 	/* Called from the timer, no need to check if handlers is
 	 * valid. */
 	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
+		/* No event requests when in maintenance mode. */
+		if (intf->maintenance_mode_enable)
+			continue;
+
 		handlers = intf->handlers;
 		if (handlers)
 			handlers->request_events(intf->send_info);
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 89cdb92..f04bee7 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -949,12 +949,21 @@ static int smi_start_processing(void       *send_info,
 	return 0;
 }
 
+static void set_maintenance_mode(void *send_info, int enable)
+{
+	struct smi_info   *smi_info = send_info;
+
+	if (!enable)
+		atomic_set(&smi_info->req_events, 0);
+}
+
 static struct ipmi_smi_handlers handlers =
 {
 	.owner                  = THIS_MODULE,
 	.start_processing       = smi_start_processing,
 	.sender			= sender,
 	.request_events		= request_events,
+	.set_maintenance_mode   = set_maintenance_mode,
 	.set_run_to_completion  = set_run_to_completion,
 	.poll			= poll,
 };
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 796ca00..7a9db390 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -208,6 +208,15 @@ struct kernel_ipmi_msg
    code as the first byte of the incoming data, unlike a response. */
 
 
+/*
+ * Modes for ipmi_set_maint_mode() and the userland IOCTL.  The AUTO
+ * setting is the default and means it will be set on certain
+ * commands.  Hard setting it on and off will override automatic
+ * operation.
+ */
+#define IPMI_MAINTENANCE_MODE_AUTO	0
+#define IPMI_MAINTENANCE_MODE_OFF	1
+#define IPMI_MAINTENANCE_MODE_ON	2
 
 #ifdef __KERNEL__
 
@@ -374,6 +383,35 @@ int ipmi_unregister_for_cmd(ipmi_user_t   user,
 			    unsigned int  chans);
 
 /*
+ * Go into a mode where the driver will not autonomously attempt to do
+ * things with the interface.  It will still respond to attentions and
+ * interrupts, and it will expect that commands will complete.  It
+ * will not automatcially check for flags, events, or things of that
+ * nature.
+ *
+ * This is primarily used for firmware upgrades.  The idea is that
+ * when you go into firmware upgrade mode, you do this operation
+ * and the driver will not attempt to do anything but what you tell
+ * it or what the BMC asks for.
+ *
+ * Note that if you send a command that resets the BMC, the driver
+ * will still expect a response from that command.  So the BMC should
+ * reset itself *after* the response is sent.  Resetting before the
+ * response is just silly.
+ *
+ * If in auto maintenance mode, the driver will automatically go into
+ * maintenance mode for 30 seconds if it sees a cold reset, a warm
+ * reset, or a firmware NetFN.  This means that code that uses only
+ * firmware NetFN commands to do upgrades will work automatically
+ * without change, assuming it sends a message every 30 seconds or
+ * less.
+ *
+ * See the IPMI_MAINTENANCE_MODE_xxx defines for what the mode means.
+ */
+int ipmi_get_maintenance_mode(ipmi_user_t user);
+int ipmi_set_maintenance_mode(ipmi_user_t user, int mode);
+
+/*
  * Allow run-to-completion mode to be set for the interface of
  * a specific user.
  */
@@ -656,4 +694,11 @@ struct ipmi_timing_parms
 #define IPMICTL_GET_TIMING_PARMS_CMD	_IOR(IPMI_IOC_MAGIC, 23, \
 					     struct ipmi_timing_parms)
 
+/*
+ * Set the maintenance mode.  See ipmi_set_maintenance_mode() above
+ * for a description of what this does.
+ */
+#define IPMICTL_GET_MAINTENANCE_MODE_CMD	_IOR(IPMI_IOC_MAGIC, 30, int)
+#define IPMICTL_SET_MAINTENANCE_MODE_CMD	_IOW(IPMI_IOC_MAGIC, 31, int)
+
 #endif /* __LINUX_IPMI_H */
diff --git a/include/linux/ipmi_msgdefs.h b/include/linux/ipmi_msgdefs.h
index 4d04d8b..8d6759c 100644
--- a/include/linux/ipmi_msgdefs.h
+++ b/include/linux/ipmi_msgdefs.h
@@ -46,6 +46,8 @@
 #define IPMI_NETFN_APP_REQUEST			0x06
 #define IPMI_NETFN_APP_RESPONSE			0x07
 #define IPMI_GET_DEVICE_ID_CMD		0x01
+#define IPMI_COLD_RESET_CMD		0x02
+#define IPMI_WARM_RESET_CMD		0x03
 #define IPMI_CLEAR_MSG_FLAGS_CMD	0x30
 #define IPMI_GET_DEVICE_GUID_CMD	0x08
 #define IPMI_GET_MSG_FLAGS_CMD		0x31
@@ -60,6 +62,9 @@
 #define IPMI_NETFN_STORAGE_RESPONSE		0x0b
 #define IPMI_ADD_SEL_ENTRY_CMD		0x44
 
+#define IPMI_NETFN_FIRMWARE_REQUEST		0x08
+#define IPMI_NETFN_FIRMWARE_RESPONSE		0x09
+
 /* The default slave address */
 #define IPMI_BMC_SLAVE_ADDR	0x20
 
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index 2cc960d..c063310 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -115,6 +115,13 @@ struct ipmi_smi_handlers
 	   poll for operations during things like crash dumps. */
 	void (*poll)(void *send_info);
 
+	/* Enable/disable firmware maintenance mode.  Note that this
+	   is *not* the modes defined, this is simply an on/off
+	   setting.  The message handler does the mode handling.  Note
+	   that this is called from interupt context, so it cannot
+	   block. */
+	void (*set_maintenance_mode)(void *send_info, int enable);
+
 	/* Tell the handler that we are using it/not using it.  The
 	   message handler get the modules that this handler belongs
 	   to; this function lets the SMI claim any modules that it
-- 
cgit v0.10.2


From 55162fb165571b3ba4283fe657f1a5968cef361a Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:04 -0800
Subject: [PATCH] IPMI: fix request events

When the IPMI message handler requested that the interface look for events,
the ipmi_si driver would request flags, see if the event buffer full flag was
set, then request events.  It's better to just send the request for events, as
it cuts one message out of the transaction if there happens to be events, and
it will fetch events even if the event buffer was not full.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index f04bee7..c02f517 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -684,22 +684,24 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info,
 	{
 		/* We are idle and the upper layer requested that I fetch
 		   events, so do so. */
-		unsigned char msg[2];
+		atomic_set(&smi_info->req_events, 0);
 
-		spin_lock(&smi_info->count_lock);
-		smi_info->flag_fetches++;
-		spin_unlock(&smi_info->count_lock);
+		smi_info->curr_msg = ipmi_alloc_smi_msg();
+		if (!smi_info->curr_msg)
+			goto out;
 
-		atomic_set(&smi_info->req_events, 0);
-		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
+		smi_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		smi_info->curr_msg->data[1] = IPMI_READ_EVENT_MSG_BUFFER_CMD;
+		smi_info->curr_msg->data_size = 2;
 
 		smi_info->handlers->start_transaction(
-			smi_info->si_sm, msg, 2);
-		smi_info->si_state = SI_GETTING_FLAGS;
+			smi_info->si_sm,
+			smi_info->curr_msg->data,
+			smi_info->curr_msg->data_size);
+		smi_info->si_state = SI_GETTING_EVENTS;
 		goto restart;
 	}
-
+ out:
 	return si_sm_result;
 }
 
-- 
cgit v0.10.2


From 15c62e10bb49eebf3da2b010a9196f5095947b0b Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:06 -0800
Subject: [PATCH] IPMI: add poll delay

Make sure to delay a little in the IPMI poll routine so we can pass in a
timeout time and thus time things out.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index c02f517..bedd763 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -807,7 +807,12 @@ static void poll(void *send_info)
 {
 	struct smi_info *smi_info = send_info;
 
-	smi_event_handler(smi_info, 0);
+	/*
+	 * Make sure there is some delay in the poll loop so we can
+	 * drive time forward and timeout things.
+	 */
+	udelay(10);
+	smi_event_handler(smi_info, 10);
 }
 
 static void request_events(void *send_info)
-- 
cgit v0.10.2


From b361e27bba261ba59c73df464fa640f7c0fe3553 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:07 -0800
Subject: [PATCH] IPMI: system interface hotplug

Add the ability to hot add and remove interfaces in the ipmi_si driver.  Any
users who have the device open will get errors if they try to send a message.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 9101cbf..24dc3fc 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -365,6 +365,7 @@ You can change this at module load time (for a module) with:
        regshifts=<shift1>,<shift2>,...
        slave_addrs=<addr1>,<addr2>,...
        force_kipmid=<enable1>,<enable2>,...
+       unload_when_empty=[0|1]
 
 Each of these except si_trydefaults is a list, the first item for the
 first interface, second item for the second interface, etc.
@@ -416,6 +417,11 @@ by the driver, but systems with broken interrupts might need an enable,
 or users that don't want the daemon (don't need the performance, don't
 want the CPU hit) can disable it.
 
+If unload_when_empty is set to 1, the driver will be unloaded if it
+doesn't find any interfaces or all the interfaces fail to work.  The
+default is one.  Setting to 0 is useful with the hotmod, but is
+obviously only useful for modules.
+
 When compiled into the kernel, the parameters can be specified on the
 kernel command line as:
 
@@ -441,6 +447,25 @@ have high-res timers enabled in the kernel and you don't have
 interrupts enabled, the driver will run VERY slowly.  Don't blame me,
 these interfaces suck.
 
+The driver supports a hot add and remove of interfaces.  This way,
+interfaces can be added or removed after the kernel is up and running.
+This is done using /sys/modules/ipmi_si/hotmod, which is a write-only
+parameter.  You write a string to this interface.  The string has the
+format:
+   <op1>[:op2[:op3...]]
+The "op"s are:
+   add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
+You can specify more than one interface on the line.  The "opt"s are:
+   rsp=<regspacing>
+   rsi=<regsize>
+   rsh=<regshift>
+   irq=<irq>
+   ipmb=<ipmb slave addr>
+and these have the same meanings as discussed above.  Note that you
+can also use this on the kernel command line for a more compact format
+for specifying an interface.  Note that when removing an interface,
+only the first three parameters (si type, address type, and address)
+are used for the comparison.  Any options are ignored for removing.
 
 The SMBus Driver
 ----------------
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index bedd763..8b36b50 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -61,6 +61,10 @@
 #include "ipmi_si_sm.h"
 #include <linux/init.h>
 #include <linux/dmi.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
+#define PFX "ipmi_si: "
 
 /* Measure times between events in the driver. */
 #undef DEBUG_TIMING
@@ -92,7 +96,7 @@ enum si_intf_state {
 enum si_type {
     SI_KCS, SI_SMIC, SI_BT
 };
-static char *si_to_str[] = { "KCS", "SMIC", "BT" };
+static char *si_to_str[] = { "kcs", "smic", "bt" };
 
 #define DEVICE_NAME "ipmi_si"
 
@@ -222,7 +226,10 @@ struct smi_info
 static int force_kipmid[SI_MAX_PARMS];
 static int num_force_kipmid;
 
+static int unload_when_empty = 1;
+
 static int try_smi_init(struct smi_info *smi);
+static void cleanup_one_si(struct smi_info *to_clean);
 
 static ATOMIC_NOTIFIER_HEAD(xaction_notifier_list);
 static int register_xaction_notifier(struct notifier_block * nb)
@@ -247,7 +254,7 @@ static void return_hosed_msg(struct smi_info *smi_info)
 	/* Make it a reponse */
 	msg->rsp[0] = msg->data[0] | 4;
 	msg->rsp[1] = msg->data[1];
-	msg->rsp[2] = 0xFF; /* Unknown error. */
+	msg->rsp[2] = IPMI_ERR_UNSPECIFIED;
 	msg->rsp_size = 3;
 
 	smi_info->curr_msg = NULL;
@@ -716,6 +723,15 @@ static void sender(void                *send_info,
 	struct timeval    t;
 #endif
 
+	if (atomic_read(&smi_info->stop_operation)) {
+		msg->rsp[0] = msg->data[0] | 4;
+		msg->rsp[1] = msg->data[1];
+		msg->rsp[2] = IPMI_ERR_UNSPECIFIED;
+		msg->rsp_size = 3;
+		deliver_recv_msg(smi_info, msg);
+		return;
+	}
+
 	spin_lock_irqsave(&(smi_info->msg_lock), flags);
 #ifdef DEBUG_TIMING
 	do_gettimeofday(&t);
@@ -819,6 +835,9 @@ static void request_events(void *send_info)
 {
 	struct smi_info *smi_info = send_info;
 
+	if (atomic_read(&smi_info->stop_operation))
+		return;
+
 	atomic_set(&smi_info->req_events, 1);
 }
 
@@ -1003,6 +1022,16 @@ static int num_regshifts = 0;
 static int slave_addrs[SI_MAX_PARMS];
 static int num_slave_addrs = 0;
 
+#define IPMI_IO_ADDR_SPACE  0
+#define IPMI_MEM_ADDR_SPACE 1
+static char *addr_space_to_str[] = { "I/O", "mem" };
+
+static int hotmod_handler(const char *val, struct kernel_param *kp);
+
+module_param_call(hotmod, hotmod_handler, NULL, NULL, 0200);
+MODULE_PARM_DESC(hotmod, "Add and remove interfaces.  See"
+		 " Documentation/IPMI.txt in the kernel sources for the"
+		 " gory details.");
 
 module_param_named(trydefaults, si_trydefaults, bool, 0);
 MODULE_PARM_DESC(trydefaults, "Setting this to 'false' will disable the"
@@ -1054,12 +1083,12 @@ module_param_array(force_kipmid, int, &num_force_kipmid, 0);
 MODULE_PARM_DESC(force_kipmid, "Force the kipmi daemon to be enabled (1) or"
 		 " disabled(0).  Normally the IPMI driver auto-detects"
 		 " this, but the value may be overridden by this parm.");
+module_param(unload_when_empty, int, 0);
+MODULE_PARM_DESC(unload_when_empty, "Unload the module if no interfaces are"
+		 " specified or found, default is 1.  Setting to 0"
+		 " is useful for hot add of devices using hotmod.");
 
 
-#define IPMI_IO_ADDR_SPACE  0
-#define IPMI_MEM_ADDR_SPACE 1
-static char *addr_space_to_str[] = { "I/O", "memory" };
-
 static void std_irq_cleanup(struct smi_info *info)
 {
 	if (info->si_type == SI_BT)
@@ -1333,6 +1362,234 @@ static int mem_setup(struct smi_info *info)
 	return 0;
 }
 
+/*
+ * Parms come in as <op1>[:op2[:op3...]].  ops are:
+ *   add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
+ * Options are:
+ *   rsp=<regspacing>
+ *   rsi=<regsize>
+ *   rsh=<regshift>
+ *   irq=<irq>
+ *   ipmb=<ipmb addr>
+ */
+enum hotmod_op { HM_ADD, HM_REMOVE };
+struct hotmod_vals {
+	char *name;
+	int  val;
+};
+static struct hotmod_vals hotmod_ops[] = {
+	{ "add",	HM_ADD },
+	{ "remove",	HM_REMOVE },
+	{ NULL }
+};
+static struct hotmod_vals hotmod_si[] = {
+	{ "kcs",	SI_KCS },
+	{ "smic",	SI_SMIC },
+	{ "bt",		SI_BT },
+	{ NULL }
+};
+static struct hotmod_vals hotmod_as[] = {
+	{ "mem",	IPMI_MEM_ADDR_SPACE },
+	{ "i/o",	IPMI_IO_ADDR_SPACE },
+	{ NULL }
+};
+static int ipmi_strcasecmp(const char *s1, const char *s2)
+{
+	while (*s1 || *s2) {
+		if (!*s1)
+			return -1;
+		if (!*s2)
+			return 1;
+		if (*s1 != *s2)
+			return *s1 - *s2;
+		s1++;
+		s2++;
+	}
+	return 0;
+}
+static int parse_str(struct hotmod_vals *v, int *val, char *name, char **curr)
+{
+	char *s;
+	int  i;
+
+	s = strchr(*curr, ',');
+	if (!s) {
+		printk(KERN_WARNING PFX "No hotmod %s given.\n", name);
+		return -EINVAL;
+	}
+	*s = '\0';
+	s++;
+	for (i = 0; hotmod_ops[i].name; i++) {
+		if (ipmi_strcasecmp(*curr, v[i].name) == 0) {
+			*val = v[i].val;
+			*curr = s;
+			return 0;
+		}
+	}
+
+	printk(KERN_WARNING PFX "Invalid hotmod %s '%s'\n", name, *curr);
+	return -EINVAL;
+}
+
+static int hotmod_handler(const char *val, struct kernel_param *kp)
+{
+	char *str = kstrdup(val, GFP_KERNEL);
+	int  rv = -EINVAL;
+	char *next, *curr, *s, *n, *o;
+	enum hotmod_op op;
+	enum si_type si_type;
+	int  addr_space;
+	unsigned long addr;
+	int regspacing;
+	int regsize;
+	int regshift;
+	int irq;
+	int ipmb;
+	int ival;
+	struct smi_info *info;
+
+	if (!str)
+		return -ENOMEM;
+
+	/* Kill any trailing spaces, as we can get a "\n" from echo. */
+	ival = strlen(str) - 1;
+	while ((ival >= 0) && isspace(str[ival])) {
+		str[ival] = '\0';
+		ival--;
+	}
+
+	for (curr = str; curr; curr = next) {
+		regspacing = 1;
+		regsize = 1;
+		regshift = 0;
+		irq = 0;
+		ipmb = 0x20;
+
+		next = strchr(curr, ':');
+		if (next) {
+			*next = '\0';
+			next++;
+		}
+
+		rv = parse_str(hotmod_ops, &ival, "operation", &curr);
+		if (rv)
+			break;
+		op = ival;
+
+		rv = parse_str(hotmod_si, &ival, "interface type", &curr);
+		if (rv)
+			break;
+		si_type = ival;
+
+		rv = parse_str(hotmod_as, &addr_space, "address space", &curr);
+		if (rv)
+			break;
+
+		s = strchr(curr, ',');
+		if (s) {
+			*s = '\0';
+			s++;
+		}
+		addr = simple_strtoul(curr, &n, 0);
+		if ((*n != '\0') || (*curr == '\0')) {
+			printk(KERN_WARNING PFX "Invalid hotmod address"
+			       " '%s'\n", curr);
+			break;
+		}
+
+		while (s) {
+			curr = s;
+			s = strchr(curr, ',');
+			if (s) {
+				*s = '\0';
+				s++;
+			}
+			o = strchr(curr, '=');
+			if (o) {
+				*o = '\0';
+				o++;
+			}
+#define HOTMOD_INT_OPT(name, val) \
+			if (ipmi_strcasecmp(curr, name) == 0) {		\
+				if (!o) {				\
+					printk(KERN_WARNING PFX		\
+					       "No option given for '%s'\n", \
+						curr);			\
+					goto out;			\
+				}					\
+				val = simple_strtoul(o, &n, 0);		\
+				if ((*n != '\0') || (*o == '\0')) {	\
+					printk(KERN_WARNING PFX		\
+					       "Bad option given for '%s'\n", \
+					       curr);			\
+					goto out;			\
+				}					\
+			}
+
+			HOTMOD_INT_OPT("rsp", regspacing)
+			else HOTMOD_INT_OPT("rsi", regsize)
+			else HOTMOD_INT_OPT("rsh", regshift)
+			else HOTMOD_INT_OPT("irq", irq)
+			else HOTMOD_INT_OPT("ipmb", ipmb)
+			else {
+				printk(KERN_WARNING PFX
+				       "Invalid hotmod option '%s'\n",
+				       curr);
+				goto out;
+			}
+#undef HOTMOD_INT_OPT
+		}
+
+		if (op == HM_ADD) {
+			info = kzalloc(sizeof(*info), GFP_KERNEL);
+			if (!info) {
+				rv = -ENOMEM;
+				goto out;
+			}
+
+			info->addr_source = "hotmod";
+			info->si_type = si_type;
+			info->io.addr_data = addr;
+			info->io.addr_type = addr_space;
+			if (addr_space == IPMI_MEM_ADDR_SPACE)
+				info->io_setup = mem_setup;
+			else
+				info->io_setup = port_setup;
+
+			info->io.addr = NULL;
+			info->io.regspacing = regspacing;
+			if (!info->io.regspacing)
+				info->io.regspacing = DEFAULT_REGSPACING;
+			info->io.regsize = regsize;
+			if (!info->io.regsize)
+				info->io.regsize = DEFAULT_REGSPACING;
+			info->io.regshift = regshift;
+			info->irq = irq;
+			if (info->irq)
+				info->irq_setup = std_irq_setup;
+			info->slave_addr = ipmb;
+
+			try_smi_init(info);
+		} else {
+			/* remove */
+			struct smi_info *e, *tmp_e;
+
+			mutex_lock(&smi_infos_lock);
+			list_for_each_entry_safe(e, tmp_e, &smi_infos, link) {
+				if (e->io.addr_type != addr_space)
+					continue;
+				if (e->si_type != si_type)
+					continue;
+				if (e->io.addr_data == addr)
+					cleanup_one_si(e);
+			}
+			mutex_unlock(&smi_infos_lock);
+		}
+	}
+ out:
+	kfree(str);
+	return rv;
+}
 
 static __devinit void hardcode_find_bmc(void)
 {
@@ -1349,11 +1606,11 @@ static __devinit void hardcode_find_bmc(void)
 
 		info->addr_source = "hardcoded";
 
-		if (!si_type[i] || strcmp(si_type[i], "kcs") == 0) {
+		if (!si_type[i] || ipmi_strcasecmp(si_type[i], "kcs") == 0) {
 			info->si_type = SI_KCS;
-		} else if (strcmp(si_type[i], "smic") == 0) {
+		} else if (ipmi_strcasecmp(si_type[i], "smic") == 0) {
 			info->si_type = SI_SMIC;
-		} else if (strcmp(si_type[i], "bt") == 0) {
+		} else if (ipmi_strcasecmp(si_type[i], "bt") == 0) {
 			info->si_type = SI_BT;
 		} else {
 			printk(KERN_WARNING
@@ -1968,19 +2225,9 @@ static int try_get_dev_id(struct smi_info *smi_info)
 static int type_file_read_proc(char *page, char **start, off_t off,
 			       int count, int *eof, void *data)
 {
-	char            *out = (char *) page;
 	struct smi_info *smi = data;
 
-	switch (smi->si_type) {
-	    case SI_KCS:
-		return sprintf(out, "kcs\n");
-	    case SI_SMIC:
-		return sprintf(out, "smic\n");
-	    case SI_BT:
-		return sprintf(out, "bt\n");
-	    default:
-		return 0;
-	}
+	return sprintf(page, "%s\n", si_to_str[smi->si_type]);
 }
 
 static int stat_file_read_proc(char *page, char **start, off_t off,
@@ -2016,7 +2263,24 @@ static int stat_file_read_proc(char *page, char **start, off_t off,
 	out += sprintf(out, "incoming_messages:     %ld\n",
 		       smi->incoming_messages);
 
-	return (out - ((char *) page));
+	return out - page;
+}
+
+static int param_read_proc(char *page, char **start, off_t off,
+			   int count, int *eof, void *data)
+{
+	struct smi_info *smi = data;
+
+	return sprintf(page,
+		       "%s,%s,0x%lx,rsp=%d,rsi=%d,rsh=%d,irq=%d,ipmb=%d\n",
+		       si_to_str[smi->si_type],
+		       addr_space_to_str[smi->io.addr_type],
+		       smi->io.addr_data,
+		       smi->io.regspacing,
+		       smi->io.regsize,
+		       smi->io.regshift,
+		       smi->irq,
+		       smi->slave_addr);
 }
 
 /*
@@ -2407,6 +2671,16 @@ static int try_smi_init(struct smi_info *new_smi)
 		goto out_err_stop_timer;
 	}
 
+	rv = ipmi_smi_add_proc_entry(new_smi->intf, "params",
+				     param_read_proc, NULL,
+				     new_smi, THIS_MODULE);
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to create proc entry: %d\n",
+		       rv);
+		goto out_err_stop_timer;
+	}
+
 	list_add_tail(&new_smi->link, &smi_infos);
 
 	mutex_unlock(&smi_infos_lock);
@@ -2515,7 +2789,7 @@ static __devinit int init_ipmi_si(void)
 	}
 
 	mutex_lock(&smi_infos_lock);
-	if (list_empty(&smi_infos)) {
+	if (unload_when_empty && list_empty(&smi_infos)) {
 		mutex_unlock(&smi_infos_lock);
 #ifdef CONFIG_PCI
 		pci_unregister_driver(&ipmi_pci_driver);
@@ -2530,7 +2804,7 @@ static __devinit int init_ipmi_si(void)
 }
 module_init(init_ipmi_si);
 
-static void __devexit cleanup_one_si(struct smi_info *to_clean)
+static void cleanup_one_si(struct smi_info *to_clean)
 {
 	int           rv;
 	unsigned long flags;
-- 
cgit v0.10.2


From 7d8ba9669a28f650001093ebbb6ecc3b0a72926f Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:09 -0800
Subject: [PATCH] IPMI: add pigeonpoint poweroff

X86 boards generally use ACPI for the power management interactions with the
BMC.  However, non-x86 boards need some help.  This patch adds the help for
the Motorola PigeonPoint-based IPMCs.

Signed-off-by: Joseph Barnett <jbarnett@motorola.com>
Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index 85f8071..8ce447a 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -176,6 +176,42 @@ static int ipmi_request_in_rc_mode(ipmi_user_t            user,
 #define IPMI_ATCA_GET_ADDR_INFO_CMD	0x01
 #define IPMI_PICMG_ID			0
 
+#define IPMI_NETFN_OEM				0x2e
+#define IPMI_ATCA_PPS_GRACEFUL_RESTART		0x11
+#define IPMI_ATCA_PPS_IANA			"\x00\x40\x0A"
+#define IPMI_MOTOROLA_MANUFACTURER_ID		0x0000A1
+#define IPMI_MOTOROLA_PPS_IPMC_PRODUCT_ID	0x0051
+
+static void (*atca_oem_poweroff_hook)(ipmi_user_t user) = NULL;
+
+static void pps_poweroff_atca (ipmi_user_t user)
+{
+        struct ipmi_system_interface_addr smi_addr;
+        struct kernel_ipmi_msg            send_msg;
+        int                               rv;
+        /*
+         * Configure IPMI address for local access
+         */
+        smi_addr.addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE;
+        smi_addr.channel = IPMI_BMC_CHANNEL;
+        smi_addr.lun = 0;
+
+        printk(KERN_INFO PFX "PPS powerdown hook used");
+
+        send_msg.netfn = IPMI_NETFN_OEM;
+        send_msg.cmd = IPMI_ATCA_PPS_GRACEFUL_RESTART;
+        send_msg.data = IPMI_ATCA_PPS_IANA;
+        send_msg.data_len = 3;
+        rv = ipmi_request_in_rc_mode(user,
+                                  (struct ipmi_addr *) &smi_addr,
+                                   &send_msg);
+        if (rv && rv != IPMI_UNKNOWN_ERR_COMPLETION_CODE) {
+                printk(KERN_ERR PFX "Unable to send ATCA ,"
+                       " IPMI error 0x%x\n", rv);
+        }
+	return;
+}
+
 static int ipmi_atca_detect (ipmi_user_t user)
 {
 	struct ipmi_system_interface_addr smi_addr;
@@ -201,6 +237,13 @@ static int ipmi_atca_detect (ipmi_user_t user)
 	rv = ipmi_request_wait_for_response(user,
 					    (struct ipmi_addr *) &smi_addr,
 					    &send_msg);
+
+        printk(KERN_INFO PFX "ATCA Detect mfg 0x%X prod 0x%X\n", mfg_id, prod_id);
+        if((mfg_id == IPMI_MOTOROLA_MANUFACTURER_ID)
+            && (prod_id == IPMI_MOTOROLA_PPS_IPMC_PRODUCT_ID)) {
+		printk(KERN_INFO PFX "Installing Pigeon Point Systems Poweroff Hook\n");
+		atca_oem_poweroff_hook = pps_poweroff_atca;
+	}
 	return !rv;
 }
 
@@ -234,12 +277,19 @@ static void ipmi_poweroff_atca (ipmi_user_t user)
 	rv = ipmi_request_in_rc_mode(user,
 				     (struct ipmi_addr *) &smi_addr,
 				     &send_msg);
-	if (rv) {
+        /** At this point, the system may be shutting down, and most
+         ** serial drivers (if used) will have interrupts turned off
+         ** it may be better to ignore IPMI_UNKNOWN_ERR_COMPLETION_CODE
+         ** return code
+         **/
+        if (rv && rv != IPMI_UNKNOWN_ERR_COMPLETION_CODE) {
 		printk(KERN_ERR PFX "Unable to send ATCA powerdown message,"
 		       " IPMI error 0x%x\n", rv);
 		goto out;
 	}
 
+	if(atca_oem_poweroff_hook)
+		return atca_oem_poweroff_hook(user);
  out:
 	return;
 }
-- 
cgit v0.10.2


From 168b35a7f67c5a8189e6b92780dfb5262604057c Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:11 -0800
Subject: [PATCH] IPMI: fix pci warning

Change pci_module_init() to the new interface, and check the return code to
avoid warnings and give the user useful information if this fails.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 8b36b50..c7de2e8 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -2774,7 +2774,12 @@ static __devinit int init_ipmi_si(void)
 #endif
 
 #ifdef CONFIG_PCI
-	pci_module_init(&ipmi_pci_driver);
+	rv = pci_register_driver(&ipmi_pci_driver);
+	if (rv){
+		printk(KERN_ERR
+		       "init_ipmi_si: Unable to register PCI driver: %d\n",
+		       rv);
+	}
 #endif
 
 	if (si_trydefaults) {
-- 
cgit v0.10.2


From 4d7cbac7c870ca66d8fb27d68188efbb5de2dffa Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:14 -0800
Subject: [PATCH] IPMI: Fix BT long busy

The IPMI BT subdriver has been patched to survive "long busy" timeouts seen
during firmware upgrades and resets.  The patch never returns the HOSED state,
synthesizes response messages with meaningful completion codes, and recovers
gracefully when the hardware finishes the long busy.  The subdriver now issues
a "Get BT Capabilities" command and properly uses those results.  More
informative completion codes are returned on error from transaction starts;
this logic was propogated to the KCS and SMIC subdrivers.  Finally, indent and
other style quirks were normalized.

Signed-off-by: Rocky Craig <rocky.craig@hp.com>
Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c
index 0030cd8..6c59baa 100644
--- a/drivers/char/ipmi/ipmi_bt_sm.c
+++ b/drivers/char/ipmi/ipmi_bt_sm.c
@@ -33,11 +33,13 @@
 #include <linux/ipmi_msgdefs.h>		/* for completion codes */
 #include "ipmi_si_sm.h"
 
-static int bt_debug = 0x00;	/* Production value 0, see following flags */
+#define BT_DEBUG_OFF	0	/* Used in production */
+#define BT_DEBUG_ENABLE	1	/* Generic messages */
+#define BT_DEBUG_MSG	2	/* Prints all request/response buffers */
+#define BT_DEBUG_STATES	4	/* Verbose look at state changes */
+
+static int bt_debug = BT_DEBUG_OFF;
 
-#define	BT_DEBUG_ENABLE	1
-#define BT_DEBUG_MSG	2
-#define BT_DEBUG_STATES	4
 module_param(bt_debug, int, 0644);
 MODULE_PARM_DESC(bt_debug, "debug bitmask, 1=enable, 2=messages, 4=states");
 
@@ -47,38 +49,54 @@ MODULE_PARM_DESC(bt_debug, "debug bitmask, 1=enable, 2=messages, 4=states");
    Since the Open IPMI architecture is single-message oriented at this
    stage, the queue depth of BT is of no concern. */
 
-#define BT_NORMAL_TIMEOUT	5000000	/* seconds in microseconds */
-#define BT_RETRY_LIMIT		2
-#define BT_RESET_DELAY		6000000	/* 6 seconds after warm reset */
+#define BT_NORMAL_TIMEOUT	5	/* seconds */
+#define BT_NORMAL_RETRY_LIMIT	2
+#define BT_RESET_DELAY		6	/* seconds after warm reset */
+
+/* States are written in chronological order and usually cover
+   multiple rows of the state table discussion in the IPMI spec. */
 
 enum bt_states {
-	BT_STATE_IDLE,
+	BT_STATE_IDLE = 0,	/* Order is critical in this list */
 	BT_STATE_XACTION_START,
 	BT_STATE_WRITE_BYTES,
-	BT_STATE_WRITE_END,
 	BT_STATE_WRITE_CONSUME,
-	BT_STATE_B2H_WAIT,
-	BT_STATE_READ_END,
-	BT_STATE_RESET1,		/* These must come last */
+	BT_STATE_READ_WAIT,
+	BT_STATE_CLEAR_B2H,
+	BT_STATE_READ_BYTES,
+	BT_STATE_RESET1,	/* These must come last */
 	BT_STATE_RESET2,
 	BT_STATE_RESET3,
 	BT_STATE_RESTART,
-	BT_STATE_HOSED
+	BT_STATE_PRINTME,
+	BT_STATE_CAPABILITIES_BEGIN,
+	BT_STATE_CAPABILITIES_END,
+	BT_STATE_LONG_BUSY	/* BT doesn't get hosed :-) */
 };
 
+/* Macros seen at the end of state "case" blocks.  They help with legibility
+   and debugging. */
+
+#define BT_STATE_CHANGE(X,Y) { bt->state = X; return Y; }
+
+#define BT_SI_SM_RETURN(Y)   { last_printed = BT_STATE_PRINTME; return Y; }
+
 struct si_sm_data {
 	enum bt_states	state;
-	enum bt_states	last_state;	/* assist printing and resets */
 	unsigned char	seq;		/* BT sequence number */
 	struct si_sm_io	*io;
-        unsigned char	write_data[IPMI_MAX_MSG_LENGTH];
-        int		write_count;
-        unsigned char	read_data[IPMI_MAX_MSG_LENGTH];
-        int		read_count;
-        int		truncated;
-        long		timeout;
-        unsigned int	error_retries;	/* end of "common" fields */
+	unsigned char	write_data[IPMI_MAX_MSG_LENGTH];
+	int		write_count;
+	unsigned char	read_data[IPMI_MAX_MSG_LENGTH];
+	int		read_count;
+	int		truncated;
+	long		timeout;	/* microseconds countdown */
+	int		error_retries;	/* end of "common" fields */
 	int		nonzero_status;	/* hung BMCs stay all 0 */
+	enum bt_states	complete;	/* to divert the state machine */
+	int		BT_CAP_outreqs;
+	long		BT_CAP_req2rsp;
+	int		BT_CAP_retries;	/* Recommended retries */
 };
 
 #define BT_CLR_WR_PTR	0x01	/* See IPMI 1.5 table 11.6.4 */
@@ -111,86 +129,118 @@ struct si_sm_data {
 static char *state2txt(unsigned char state)
 {
 	switch (state) {
-		case BT_STATE_IDLE:		return("IDLE");
-		case BT_STATE_XACTION_START:	return("XACTION");
-		case BT_STATE_WRITE_BYTES:	return("WR_BYTES");
-		case BT_STATE_WRITE_END:	return("WR_END");
-		case BT_STATE_WRITE_CONSUME:	return("WR_CONSUME");
-		case BT_STATE_B2H_WAIT:		return("B2H_WAIT");
-		case BT_STATE_READ_END:		return("RD_END");
-		case BT_STATE_RESET1:		return("RESET1");
-		case BT_STATE_RESET2:		return("RESET2");
-		case BT_STATE_RESET3:		return("RESET3");
-		case BT_STATE_RESTART:		return("RESTART");
-		case BT_STATE_HOSED:		return("HOSED");
+	case BT_STATE_IDLE:		return("IDLE");
+	case BT_STATE_XACTION_START:	return("XACTION");
+	case BT_STATE_WRITE_BYTES:	return("WR_BYTES");
+	case BT_STATE_WRITE_CONSUME:	return("WR_CONSUME");
+	case BT_STATE_READ_WAIT:	return("RD_WAIT");
+	case BT_STATE_CLEAR_B2H:	return("CLEAR_B2H");
+	case BT_STATE_READ_BYTES:	return("RD_BYTES");
+	case BT_STATE_RESET1:		return("RESET1");
+	case BT_STATE_RESET2:		return("RESET2");
+	case BT_STATE_RESET3:		return("RESET3");
+	case BT_STATE_RESTART:		return("RESTART");
+	case BT_STATE_LONG_BUSY:	return("LONG_BUSY");
+	case BT_STATE_CAPABILITIES_BEGIN: return("CAP_BEGIN");
+	case BT_STATE_CAPABILITIES_END:	return("CAP_END");
 	}
 	return("BAD STATE");
 }
 #define STATE2TXT state2txt(bt->state)
 
-static char *status2txt(unsigned char status, char *buf)
+static char *status2txt(unsigned char status)
 {
+	/*
+	 * This cannot be called by two threads at the same time and
+	 * the buffer is always consumed immediately, so the static is
+	 * safe to use.
+	 */
+	static char buf[40];
+
 	strcpy(buf, "[ ");
-	if (status & BT_B_BUSY) strcat(buf, "B_BUSY ");
-	if (status & BT_H_BUSY) strcat(buf, "H_BUSY ");
-	if (status & BT_OEM0) strcat(buf, "OEM0 ");
-	if (status & BT_SMS_ATN) strcat(buf, "SMS ");
-	if (status & BT_B2H_ATN) strcat(buf, "B2H ");
-	if (status & BT_H2B_ATN) strcat(buf, "H2B ");
+	if (status & BT_B_BUSY)
+		strcat(buf, "B_BUSY ");
+	if (status & BT_H_BUSY)
+		strcat(buf, "H_BUSY ");
+	if (status & BT_OEM0)
+		strcat(buf, "OEM0 ");
+	if (status & BT_SMS_ATN)
+		strcat(buf, "SMS ");
+	if (status & BT_B2H_ATN)
+		strcat(buf, "B2H ");
+	if (status & BT_H2B_ATN)
+		strcat(buf, "H2B ");
 	strcat(buf, "]");
 	return buf;
 }
-#define STATUS2TXT(buf) status2txt(status, buf)
+#define STATUS2TXT status2txt(status)
+
+/* called externally at insmod time, and internally on cleanup */
 
-/* This will be called from within this module on a hosed condition */
-#define FIRST_SEQ	0
 static unsigned int bt_init_data(struct si_sm_data *bt, struct si_sm_io *io)
 {
-	bt->state = BT_STATE_IDLE;
-	bt->last_state = BT_STATE_IDLE;
-	bt->seq = FIRST_SEQ;
-	bt->io = io;
-	bt->write_count = 0;
-	bt->read_count = 0;
-	bt->error_retries = 0;
-	bt->nonzero_status = 0;
-	bt->truncated = 0;
-	bt->timeout = BT_NORMAL_TIMEOUT;
+	memset(bt, 0, sizeof(struct si_sm_data));
+	if (bt->io != io) {		/* external: one-time only things */
+		bt->io = io;
+		bt->seq = 0;
+	}
+	bt->state = BT_STATE_IDLE;	/* start here */
+	bt->complete = BT_STATE_IDLE;	/* end here */
+	bt->BT_CAP_req2rsp = BT_NORMAL_TIMEOUT * 1000000;
+	bt->BT_CAP_retries = BT_NORMAL_RETRY_LIMIT;
+	/* BT_CAP_outreqs == zero is a flag to read BT Capabilities */
 	return 3; /* We claim 3 bytes of space; ought to check SPMI table */
 }
 
+/* Jam a completion code (probably an error) into a response */
+
+static void force_result(struct si_sm_data *bt, unsigned char completion_code)
+{
+	bt->read_data[0] = 4;				/* # following bytes */
+	bt->read_data[1] = bt->write_data[1] | 4;	/* Odd NetFn/LUN */
+	bt->read_data[2] = bt->write_data[2];		/* seq (ignored) */
+	bt->read_data[3] = bt->write_data[3];		/* Command */
+	bt->read_data[4] = completion_code;
+	bt->read_count = 5;
+}
+
+/* The upper state machine starts here */
+
 static int bt_start_transaction(struct si_sm_data *bt,
 				unsigned char *data,
 				unsigned int size)
 {
 	unsigned int i;
 
-	if ((size < 2) || (size > (IPMI_MAX_MSG_LENGTH - 2)))
-	       return -1;
+	if (size < 2)
+		return IPMI_REQ_LEN_INVALID_ERR;
+	if (size > IPMI_MAX_MSG_LENGTH)
+		return IPMI_REQ_LEN_EXCEEDED_ERR;
 
-	if ((bt->state != BT_STATE_IDLE) && (bt->state != BT_STATE_HOSED))
-		return -2;
+	if (bt->state == BT_STATE_LONG_BUSY)
+		return IPMI_NODE_BUSY_ERR;
+
+	if (bt->state != BT_STATE_IDLE)
+		return IPMI_NOT_IN_MY_STATE_ERR;
 
 	if (bt_debug & BT_DEBUG_MSG) {
-    		printk(KERN_WARNING "+++++++++++++++++++++++++++++++++++++\n");
-		printk(KERN_WARNING "BT: write seq=0x%02X:", bt->seq);
+		printk(KERN_WARNING "BT: +++++++++++++++++ New command\n");
+		printk(KERN_WARNING "BT: NetFn/LUN CMD [%d data]:", size - 2);
 		for (i = 0; i < size; i ++)
-		       printk (" %02x", data[i]);
+			printk (" %02x", data[i]);
 		printk("\n");
 	}
 	bt->write_data[0] = size + 1;	/* all data plus seq byte */
 	bt->write_data[1] = *data;	/* NetFn/LUN */
-	bt->write_data[2] = bt->seq;
+	bt->write_data[2] = bt->seq++;
 	memcpy(bt->write_data + 3, data + 1, size - 1);
 	bt->write_count = size + 2;
-
 	bt->error_retries = 0;
 	bt->nonzero_status = 0;
-	bt->read_count = 0;
 	bt->truncated = 0;
 	bt->state = BT_STATE_XACTION_START;
-	bt->last_state = BT_STATE_IDLE;
-	bt->timeout = BT_NORMAL_TIMEOUT;
+	bt->timeout = bt->BT_CAP_req2rsp;
+	force_result(bt, IPMI_ERR_UNSPECIFIED);
 	return 0;
 }
 
@@ -198,38 +248,30 @@ static int bt_start_transaction(struct si_sm_data *bt,
    it calls this.  Strip out the length and seq bytes. */
 
 static int bt_get_result(struct si_sm_data *bt,
-			   unsigned char *data,
-			   unsigned int length)
+			 unsigned char *data,
+			 unsigned int length)
 {
 	int i, msg_len;
 
 	msg_len = bt->read_count - 2;		/* account for length & seq */
-	/* Always NetFn, Cmd, cCode */
 	if (msg_len < 3 || msg_len > IPMI_MAX_MSG_LENGTH) {
-		printk(KERN_DEBUG "BT results: bad msg_len = %d\n", msg_len);
-		data[0] = bt->write_data[1] | 0x4;	/* Kludge a response */
-		data[1] = bt->write_data[3];
-		data[2] = IPMI_ERR_UNSPECIFIED;
+		force_result(bt, IPMI_ERR_UNSPECIFIED);
 		msg_len = 3;
-	} else {
-		data[0] = bt->read_data[1];
-		data[1] = bt->read_data[3];
-		if (length < msg_len)
-		       bt->truncated = 1;
-		if (bt->truncated) {	/* can be set in read_all_bytes() */
-			data[2] = IPMI_ERR_MSG_TRUNCATED;
-			msg_len = 3;
-		} else
-		       memcpy(data + 2, bt->read_data + 4, msg_len - 2);
+	}
+	data[0] = bt->read_data[1];
+	data[1] = bt->read_data[3];
+	if (length < msg_len || bt->truncated) {
+		data[2] = IPMI_ERR_MSG_TRUNCATED;
+		msg_len = 3;
+	} else
+		memcpy(data + 2, bt->read_data + 4, msg_len - 2);
 
-		if (bt_debug & BT_DEBUG_MSG) {
-			printk (KERN_WARNING "BT: res (raw)");
-			for (i = 0; i < msg_len; i++)
-			       printk(" %02x", data[i]);
-			printk ("\n");
-		}
+	if (bt_debug & BT_DEBUG_MSG) {
+		printk (KERN_WARNING "BT: result %d bytes:", msg_len);
+		for (i = 0; i < msg_len; i++)
+			printk(" %02x", data[i]);
+		printk ("\n");
 	}
-	bt->read_count = 0;	/* paranoia */
 	return msg_len;
 }
 
@@ -238,22 +280,40 @@ static int bt_get_result(struct si_sm_data *bt,
 
 static void reset_flags(struct si_sm_data *bt)
 {
+	if (bt_debug)
+		printk(KERN_WARNING "IPMI BT: flag reset %s\n",
+					status2txt(BT_STATUS));
 	if (BT_STATUS & BT_H_BUSY)
-	       BT_CONTROL(BT_H_BUSY);
-	if (BT_STATUS & BT_B_BUSY)
-	       BT_CONTROL(BT_B_BUSY);
-	BT_CONTROL(BT_CLR_WR_PTR);
-	BT_CONTROL(BT_SMS_ATN);
-
-	if (BT_STATUS & BT_B2H_ATN) {
-		int i;
-		BT_CONTROL(BT_H_BUSY);
-		BT_CONTROL(BT_B2H_ATN);
-		BT_CONTROL(BT_CLR_RD_PTR);
-		for (i = 0; i < IPMI_MAX_MSG_LENGTH + 2; i++)
-		       BMC2HOST;
-		BT_CONTROL(BT_H_BUSY);
-	}
+		BT_CONTROL(BT_H_BUSY);	/* force clear */
+	BT_CONTROL(BT_CLR_WR_PTR);	/* always reset */
+	BT_CONTROL(BT_SMS_ATN);		/* always clear */
+	BT_INTMASK_W(BT_BMC_HWRST);
+}
+
+/* Get rid of an unwanted/stale response.  This should only be needed for
+   BMCs that support multiple outstanding requests. */
+
+static void drain_BMC2HOST(struct si_sm_data *bt)
+{
+	int i, size;
+
+	if (!(BT_STATUS & BT_B2H_ATN)) 	/* Not signalling a response */
+		return;
+
+	BT_CONTROL(BT_H_BUSY);		/* now set */
+	BT_CONTROL(BT_B2H_ATN);		/* always clear */
+	BT_STATUS;			/* pause */
+	BT_CONTROL(BT_B2H_ATN);		/* some BMCs are stubborn */
+	BT_CONTROL(BT_CLR_RD_PTR);	/* always reset */
+	if (bt_debug)
+		printk(KERN_WARNING "IPMI BT: stale response %s; ",
+			status2txt(BT_STATUS));
+	size = BMC2HOST;
+	for (i = 0; i < size ; i++)
+		BMC2HOST;
+	BT_CONTROL(BT_H_BUSY);		/* now clear */
+	if (bt_debug)
+		printk("drained %d bytes\n", size + 1);
 }
 
 static inline void write_all_bytes(struct si_sm_data *bt)
@@ -261,201 +321,256 @@ static inline void write_all_bytes(struct si_sm_data *bt)
 	int i;
 
 	if (bt_debug & BT_DEBUG_MSG) {
-    		printk(KERN_WARNING "BT: write %d bytes seq=0x%02X",
+		printk(KERN_WARNING "BT: write %d bytes seq=0x%02X",
 			bt->write_count, bt->seq);
 		for (i = 0; i < bt->write_count; i++)
 			printk (" %02x", bt->write_data[i]);
 		printk ("\n");
 	}
 	for (i = 0; i < bt->write_count; i++)
-	       HOST2BMC(bt->write_data[i]);
+		HOST2BMC(bt->write_data[i]);
 }
 
 static inline int read_all_bytes(struct si_sm_data *bt)
 {
 	unsigned char i;
 
+	/* length is "framing info", minimum = 4: NetFn, Seq, Cmd, cCode.
+	   Keep layout of first four bytes aligned with write_data[] */
+
 	bt->read_data[0] = BMC2HOST;
 	bt->read_count = bt->read_data[0];
-	if (bt_debug & BT_DEBUG_MSG)
-    		printk(KERN_WARNING "BT: read %d bytes:", bt->read_count);
 
-	/* minimum: length, NetFn, Seq, Cmd, cCode == 5 total, or 4 more
-	   following the length byte. */
 	if (bt->read_count < 4 || bt->read_count >= IPMI_MAX_MSG_LENGTH) {
 		if (bt_debug & BT_DEBUG_MSG)
-			printk("bad length %d\n", bt->read_count);
+			printk(KERN_WARNING "BT: bad raw rsp len=%d\n",
+				bt->read_count);
 		bt->truncated = 1;
 		return 1;	/* let next XACTION START clean it up */
 	}
 	for (i = 1; i <= bt->read_count; i++)
-	       bt->read_data[i] = BMC2HOST;
-	bt->read_count++;	/* account for the length byte */
+		bt->read_data[i] = BMC2HOST;
+	bt->read_count++;	/* Account internally for length byte */
 
 	if (bt_debug & BT_DEBUG_MSG) {
-	    	for (i = 0; i < bt->read_count; i++)
+		int max = bt->read_count;
+
+		printk(KERN_WARNING "BT: got %d bytes seq=0x%02X",
+			max, bt->read_data[2]);
+		if (max > 16)
+			max = 16;
+		for (i = 0; i < max; i++)
 			printk (" %02x", bt->read_data[i]);
-	    	printk ("\n");
+		printk ("%s\n", bt->read_count == max ? "" : " ...");
 	}
-	if (bt->seq != bt->write_data[2])	/* idiot check */
-		printk(KERN_DEBUG "BT: internal error: sequence mismatch\n");
 
-	/* per the spec, the (NetFn, Seq, Cmd) tuples should match */
-	if ((bt->read_data[3] == bt->write_data[3]) &&		/* Cmd */
-        	(bt->read_data[2] == bt->write_data[2]) &&	/* Sequence */
-        	((bt->read_data[1] & 0xF8) == (bt->write_data[1] & 0xF8)))
+	/* per the spec, the (NetFn[1], Seq[2], Cmd[3]) tuples must match */
+	if ((bt->read_data[3] == bt->write_data[3]) &&
+	    (bt->read_data[2] == bt->write_data[2]) &&
+	    ((bt->read_data[1] & 0xF8) == (bt->write_data[1] & 0xF8)))
 			return 1;
 
 	if (bt_debug & BT_DEBUG_MSG)
-	       printk(KERN_WARNING "BT: bad packet: "
+		printk(KERN_WARNING "IPMI BT: bad packet: "
 		"want 0x(%02X, %02X, %02X) got (%02X, %02X, %02X)\n",
-		bt->write_data[1], bt->write_data[2], bt->write_data[3],
+		bt->write_data[1] | 0x04, bt->write_data[2], bt->write_data[3],
 		bt->read_data[1],  bt->read_data[2],  bt->read_data[3]);
 	return 0;
 }
 
-/* Modifies bt->state appropriately, need to get into the bt_event() switch */
+/* Restart if retries are left, or return an error completion code */
 
-static void error_recovery(struct si_sm_data *bt, char *reason)
+static enum si_sm_result error_recovery(struct si_sm_data *bt,
+					unsigned char status,
+					unsigned char cCode)
 {
-	unsigned char status;
-	char buf[40]; /* For getting status */
+	char *reason;
 
-	bt->timeout = BT_NORMAL_TIMEOUT; /* various places want to retry */
+	bt->timeout = bt->BT_CAP_req2rsp;
 
-	status = BT_STATUS;
-	printk(KERN_DEBUG "BT: %s in %s %s\n", reason, STATE2TXT,
-	       STATUS2TXT(buf));
+	switch (cCode) {
+	case IPMI_TIMEOUT_ERR:
+		reason = "timeout";
+		break;
+	default:
+		reason = "internal error";
+		break;
+	}
+
+	printk(KERN_WARNING "IPMI BT: %s in %s %s ", 	/* open-ended line */
+		reason, STATE2TXT, STATUS2TXT);
 
+	/* Per the IPMI spec, retries are based on the sequence number
+	   known only to this module, so manage a restart here. */
 	(bt->error_retries)++;
-	if (bt->error_retries > BT_RETRY_LIMIT) {
-		printk(KERN_DEBUG "retry limit (%d) exceeded\n", BT_RETRY_LIMIT);
-		bt->state = BT_STATE_HOSED;
-		if (!bt->nonzero_status)
-			printk(KERN_ERR "IPMI: BT stuck, try power cycle\n");
-		else if (bt->error_retries <= BT_RETRY_LIMIT + 1) {
-			printk(KERN_DEBUG "IPMI: BT reset (takes 5 secs)\n");
-        		bt->state = BT_STATE_RESET1;
-		}
-	return;
+	if (bt->error_retries < bt->BT_CAP_retries) {
+		printk("%d retries left\n",
+			bt->BT_CAP_retries - bt->error_retries);
+		bt->state = BT_STATE_RESTART;
+		return SI_SM_CALL_WITHOUT_DELAY;
 	}
 
-	/* Sometimes the BMC queues get in an "off-by-one" state...*/
-	if ((bt->state == BT_STATE_B2H_WAIT) && (status & BT_B2H_ATN)) {
-    		printk(KERN_DEBUG "retry B2H_WAIT\n");
-		return;
+	printk("failed %d retries, sending error response\n",
+		bt->BT_CAP_retries);
+	if (!bt->nonzero_status)
+		printk(KERN_ERR "IPMI BT: stuck, try power cycle\n");
+
+	/* this is most likely during insmod */
+	else if (bt->seq <= (unsigned char)(bt->BT_CAP_retries & 0xFF)) {
+		printk(KERN_WARNING "IPMI: BT reset (takes 5 secs)\n");
+		bt->state = BT_STATE_RESET1;
+		return SI_SM_CALL_WITHOUT_DELAY;
 	}
 
-	printk(KERN_DEBUG "restart command\n");
-	bt->state = BT_STATE_RESTART;
+	/* Concoct a useful error message, set up the next state, and
+	   be done with this sequence. */
+
+	bt->state = BT_STATE_IDLE;
+	switch (cCode) {
+	case IPMI_TIMEOUT_ERR:
+		if (status & BT_B_BUSY) {
+			cCode = IPMI_NODE_BUSY_ERR;
+			bt->state = BT_STATE_LONG_BUSY;
+		}
+		break;
+	default:
+		break;
+	}
+	force_result(bt, cCode);
+	return SI_SM_TRANSACTION_COMPLETE;
 }
 
-/* Check the status and (possibly) advance the BT state machine.  The
-   default return is SI_SM_CALL_WITH_DELAY. */
+/* Check status and (usually) take action and change this state machine. */
 
 static enum si_sm_result bt_event(struct si_sm_data *bt, long time)
 {
-	unsigned char status;
-	char buf[40]; /* For getting status */
+	unsigned char status, BT_CAP[8];
+	static enum bt_states last_printed = BT_STATE_PRINTME;
 	int i;
 
 	status = BT_STATUS;
 	bt->nonzero_status |= status;
-
-	if ((bt_debug & BT_DEBUG_STATES) && (bt->state != bt->last_state))
+	if ((bt_debug & BT_DEBUG_STATES) && (bt->state != last_printed)) {
 		printk(KERN_WARNING "BT: %s %s TO=%ld - %ld \n",
 			STATE2TXT,
-			STATUS2TXT(buf),
+			STATUS2TXT,
 			bt->timeout,
 			time);
-	bt->last_state = bt->state;
+		last_printed = bt->state;
+	}
 
-	if (bt->state == BT_STATE_HOSED)
-	       return SI_SM_HOSED;
+	/* Commands that time out may still (eventually) provide a response.
+	   This stale response will get in the way of a new response so remove
+	   it if possible (hopefully during IDLE).  Even if it comes up later
+	   it will be rejected by its (now-forgotten) seq number. */
+
+	if ((bt->state < BT_STATE_WRITE_BYTES) && (status & BT_B2H_ATN)) {
+		drain_BMC2HOST(bt);
+		BT_SI_SM_RETURN(SI_SM_CALL_WITH_DELAY);
+	}
 
-	if (bt->state != BT_STATE_IDLE) {	/* do timeout test */
+	if ((bt->state != BT_STATE_IDLE) &&
+	    (bt->state <  BT_STATE_PRINTME)) {		/* check timeout */
 		bt->timeout -= time;
-		if ((bt->timeout < 0) && (bt->state < BT_STATE_RESET1)) {
-			error_recovery(bt, "timed out");
-			return SI_SM_CALL_WITHOUT_DELAY;
-		}
+		if ((bt->timeout < 0) && (bt->state < BT_STATE_RESET1))
+			return error_recovery(bt,
+					      status,
+					      IPMI_TIMEOUT_ERR);
 	}
 
 	switch (bt->state) {
 
-    	case BT_STATE_IDLE:	/* check for asynchronous messages */
+	/* Idle state first checks for asynchronous messages from another
+	   channel, then does some opportunistic housekeeping. */
+
+	case BT_STATE_IDLE:
 		if (status & BT_SMS_ATN) {
 			BT_CONTROL(BT_SMS_ATN);	/* clear it */
 			return SI_SM_ATTN;
 		}
-		return SI_SM_IDLE;
 
-	case BT_STATE_XACTION_START:
-		if (status & BT_H_BUSY) {
+		if (status & BT_H_BUSY)		/* clear a leftover H_BUSY */
 			BT_CONTROL(BT_H_BUSY);
-			break;
-		}
-    		if (status & BT_B2H_ATN)
-		       break;
-		bt->state = BT_STATE_WRITE_BYTES;
-		return SI_SM_CALL_WITHOUT_DELAY;	/* for logging */
 
-	case BT_STATE_WRITE_BYTES:
+		/* Read BT capabilities if it hasn't been done yet */
+		if (!bt->BT_CAP_outreqs)
+			BT_STATE_CHANGE(BT_STATE_CAPABILITIES_BEGIN,
+					SI_SM_CALL_WITHOUT_DELAY);
+		bt->timeout = bt->BT_CAP_req2rsp;
+		BT_SI_SM_RETURN(SI_SM_IDLE);
+
+	case BT_STATE_XACTION_START:
 		if (status & (BT_B_BUSY | BT_H2B_ATN))
-		       break;
+			BT_SI_SM_RETURN(SI_SM_CALL_WITH_DELAY);
+		if (BT_STATUS & BT_H_BUSY)
+			BT_CONTROL(BT_H_BUSY);	/* force clear */
+		BT_STATE_CHANGE(BT_STATE_WRITE_BYTES,
+				SI_SM_CALL_WITHOUT_DELAY);
+
+	case BT_STATE_WRITE_BYTES:
+		if (status & BT_H_BUSY)
+			BT_CONTROL(BT_H_BUSY);	/* clear */
 		BT_CONTROL(BT_CLR_WR_PTR);
 		write_all_bytes(bt);
-		BT_CONTROL(BT_H2B_ATN);	/* clears too fast to catch? */
-		bt->state = BT_STATE_WRITE_CONSUME;
-		return SI_SM_CALL_WITHOUT_DELAY; /* it MIGHT sail through */
-
-	case BT_STATE_WRITE_CONSUME: /* BMCs usually blow right thru here */
-        	if (status & (BT_H2B_ATN | BT_B_BUSY))
-		       break;
-		bt->state = BT_STATE_B2H_WAIT;
-		/* fall through with status */
-
-	/* Stay in BT_STATE_B2H_WAIT until a packet matches.  However, spinning
-	   hard here, constantly reading status, seems to hold off the
-	   generation of B2H_ATN so ALWAYS return CALL_WITH_DELAY. */
-
-	case BT_STATE_B2H_WAIT:
-    		if (!(status & BT_B2H_ATN))
-		       break;
-
-		/* Assume ordered, uncached writes: no need to wait */
-		if (!(status & BT_H_BUSY))
-		       BT_CONTROL(BT_H_BUSY); /* set */
-		BT_CONTROL(BT_B2H_ATN);		/* clear it, ACK to the BMC */
-		BT_CONTROL(BT_CLR_RD_PTR);	/* reset the queue */
-		i = read_all_bytes(bt);
-		BT_CONTROL(BT_H_BUSY);		/* clear */
-		if (!i)				/* Try this state again */
-		       break;
-		bt->state = BT_STATE_READ_END;
-		return SI_SM_CALL_WITHOUT_DELAY;	/* for logging */
-
-    	case BT_STATE_READ_END:
-
-		/* I could wait on BT_H_BUSY to go clear for a truly clean
-		   exit.  However, this is already done in XACTION_START
-		   and the (possible) extra loop/status/possible wait affects
-		   performance.  So, as long as it works, just ignore H_BUSY */
-
-#ifdef MAKE_THIS_TRUE_IF_NECESSARY
+		BT_CONTROL(BT_H2B_ATN);	/* can clear too fast to catch */
+		BT_STATE_CHANGE(BT_STATE_WRITE_CONSUME,
+				SI_SM_CALL_WITHOUT_DELAY);
 
-		if (status & BT_H_BUSY)
-		       break;
-#endif
-		bt->seq++;
-		bt->state = BT_STATE_IDLE;
-		return SI_SM_TRANSACTION_COMPLETE;
+	case BT_STATE_WRITE_CONSUME:
+		if (status & (BT_B_BUSY | BT_H2B_ATN))
+			BT_SI_SM_RETURN(SI_SM_CALL_WITH_DELAY);
+		BT_STATE_CHANGE(BT_STATE_READ_WAIT,
+				SI_SM_CALL_WITHOUT_DELAY);
+
+	/* Spinning hard can suppress B2H_ATN and force a timeout */
+
+	case BT_STATE_READ_WAIT:
+		if (!(status & BT_B2H_ATN))
+			BT_SI_SM_RETURN(SI_SM_CALL_WITH_DELAY);
+		BT_CONTROL(BT_H_BUSY);		/* set */
+
+		/* Uncached, ordered writes should just proceeed serially but
+		   some BMCs don't clear B2H_ATN with one hit.  Fast-path a
+		   workaround without too much penalty to the general case. */
+
+		BT_CONTROL(BT_B2H_ATN);		/* clear it to ACK the BMC */
+		BT_STATE_CHANGE(BT_STATE_CLEAR_B2H,
+				SI_SM_CALL_WITHOUT_DELAY);
+
+	case BT_STATE_CLEAR_B2H:
+		if (status & BT_B2H_ATN) {	/* keep hitting it */
+			BT_CONTROL(BT_B2H_ATN);
+			BT_SI_SM_RETURN(SI_SM_CALL_WITH_DELAY);
+		}
+		BT_STATE_CHANGE(BT_STATE_READ_BYTES,
+				SI_SM_CALL_WITHOUT_DELAY);
+
+	case BT_STATE_READ_BYTES:
+		if (!(status & BT_H_BUSY))	/* check in case of retry */
+			BT_CONTROL(BT_H_BUSY);
+		BT_CONTROL(BT_CLR_RD_PTR);	/* start of BMC2HOST buffer */
+		i = read_all_bytes(bt);		/* true == packet seq match */
+		BT_CONTROL(BT_H_BUSY);		/* NOW clear */
+		if (!i) 			/* Not my message */
+			BT_STATE_CHANGE(BT_STATE_READ_WAIT,
+					SI_SM_CALL_WITHOUT_DELAY);
+		bt->state = bt->complete;
+		return bt->state == BT_STATE_IDLE ?	/* where to next? */
+			SI_SM_TRANSACTION_COMPLETE :	/* normal */
+			SI_SM_CALL_WITHOUT_DELAY;	/* Startup magic */
+
+	case BT_STATE_LONG_BUSY:	/* For example: after FW update */
+		if (!(status & BT_B_BUSY)) {
+			reset_flags(bt);	/* next state is now IDLE */
+			bt_init_data(bt, bt->io);
+		}
+		return SI_SM_CALL_WITH_DELAY;	/* No repeat printing */
 
 	case BT_STATE_RESET1:
-    		reset_flags(bt);
-    		bt->timeout = BT_RESET_DELAY;
-		bt->state = BT_STATE_RESET2;
-		break;
+		reset_flags(bt);
+		drain_BMC2HOST(bt);
+		BT_STATE_CHANGE(BT_STATE_RESET2,
+				SI_SM_CALL_WITH_DELAY);
 
 	case BT_STATE_RESET2:		/* Send a soft reset */
 		BT_CONTROL(BT_CLR_WR_PTR);
@@ -464,29 +579,59 @@ static enum si_sm_result bt_event(struct si_sm_data *bt, long time)
 		HOST2BMC(42);		/* Sequence number */
 		HOST2BMC(3);		/* Cmd == Soft reset */
 		BT_CONTROL(BT_H2B_ATN);
-		bt->state = BT_STATE_RESET3;
-		break;
+		bt->timeout = BT_RESET_DELAY * 1000000;
+		BT_STATE_CHANGE(BT_STATE_RESET3,
+				SI_SM_CALL_WITH_DELAY);
 
-	case BT_STATE_RESET3:
+	case BT_STATE_RESET3:		/* Hold off everything for a bit */
 		if (bt->timeout > 0)
-		       return SI_SM_CALL_WITH_DELAY;
-		bt->state = BT_STATE_RESTART;	/* printk in debug modes */
-		break;
+			return SI_SM_CALL_WITH_DELAY;
+		drain_BMC2HOST(bt);
+		BT_STATE_CHANGE(BT_STATE_RESTART,
+				SI_SM_CALL_WITH_DELAY);
 
-	case BT_STATE_RESTART:		/* don't reset retries! */
-		reset_flags(bt);
-		bt->write_data[2] = ++bt->seq;
+	case BT_STATE_RESTART:		/* don't reset retries or seq! */
 		bt->read_count = 0;
 		bt->nonzero_status = 0;
-		bt->timeout = BT_NORMAL_TIMEOUT;
-		bt->state = BT_STATE_XACTION_START;
-		break;
-
-	default:	/* HOSED is supposed to be caught much earlier */
-		error_recovery(bt, "internal logic error");
-		break;
-  	}
-  	return SI_SM_CALL_WITH_DELAY;
+		bt->timeout = bt->BT_CAP_req2rsp;
+		BT_STATE_CHANGE(BT_STATE_XACTION_START,
+				SI_SM_CALL_WITH_DELAY);
+
+	/* Get BT Capabilities, using timing of upper level state machine.
+	   Set outreqs to prevent infinite loop on timeout. */
+	case BT_STATE_CAPABILITIES_BEGIN:
+		bt->BT_CAP_outreqs = 1;
+		{
+			unsigned char GetBT_CAP[] = { 0x18, 0x36 };
+			bt->state = BT_STATE_IDLE;
+			bt_start_transaction(bt, GetBT_CAP, sizeof(GetBT_CAP));
+		}
+		bt->complete = BT_STATE_CAPABILITIES_END;
+		BT_STATE_CHANGE(BT_STATE_XACTION_START,
+				SI_SM_CALL_WITH_DELAY);
+
+	case BT_STATE_CAPABILITIES_END:
+		i = bt_get_result(bt, BT_CAP, sizeof(BT_CAP));
+		bt_init_data(bt, bt->io);
+		if ((i == 8) && !BT_CAP[2]) {
+			bt->BT_CAP_outreqs = BT_CAP[3];
+			bt->BT_CAP_req2rsp = BT_CAP[6] * 1000000;
+			bt->BT_CAP_retries = BT_CAP[7];
+		} else
+			printk(KERN_WARNING "IPMI BT: using default values\n");
+		if (!bt->BT_CAP_outreqs)
+			bt->BT_CAP_outreqs = 1;
+		printk(KERN_WARNING "IPMI BT: req2rsp=%ld secs retries=%d\n",
+			bt->BT_CAP_req2rsp / 1000000L, bt->BT_CAP_retries);
+		bt->timeout = bt->BT_CAP_req2rsp;
+		return SI_SM_CALL_WITHOUT_DELAY;
+
+	default:	/* should never occur */
+		return error_recovery(bt,
+				      status,
+				      IPMI_ERR_UNSPECIFIED);
+	}
+	return SI_SM_CALL_WITH_DELAY;
 }
 
 static int bt_detect(struct si_sm_data *bt)
@@ -497,7 +642,7 @@ static int bt_detect(struct si_sm_data *bt)
 	   test that first.  The calling routine uses negative logic. */
 
 	if ((BT_STATUS == 0xFF) && (BT_INTMASK_R == 0xFF))
-	       return 1;
+		return 1;
 	reset_flags(bt);
 	return 0;
 }
@@ -513,11 +658,11 @@ static int bt_size(void)
 
 struct si_sm_handlers bt_smi_handlers =
 {
-	.init_data         = bt_init_data,
-	.start_transaction = bt_start_transaction,
-	.get_result        = bt_get_result,
-	.event             = bt_event,
-	.detect            = bt_detect,
-	.cleanup           = bt_cleanup,
-	.size              = bt_size,
+	.init_data		= bt_init_data,
+	.start_transaction	= bt_start_transaction,
+	.get_result		= bt_get_result,
+	.event			= bt_event,
+	.detect			= bt_detect,
+	.cleanup		= bt_cleanup,
+	.size			= bt_size,
 };
diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c
index 2062675..fb46979 100644
--- a/drivers/char/ipmi/ipmi_kcs_sm.c
+++ b/drivers/char/ipmi/ipmi_kcs_sm.c
@@ -261,12 +261,14 @@ static int start_kcs_transaction(struct si_sm_data *kcs, unsigned char *data,
 {
 	unsigned int i;
 
-	if ((size < 2) || (size > MAX_KCS_WRITE_SIZE)) {
-		return -1;
-	}
-	if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED)) {
-		return -2;
-	}
+	if (size < 2)
+		return IPMI_REQ_LEN_INVALID_ERR;
+	if (size > MAX_KCS_WRITE_SIZE)
+		return IPMI_REQ_LEN_EXCEEDED_ERR;
+
+	if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED))
+		return IPMI_NOT_IN_MY_STATE_ERR;
+
 	if (kcs_debug & KCS_DEBUG_MSG) {
 		printk(KERN_DEBUG "start_kcs_transaction -");
 		for (i = 0; i < size; i ++) {
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index c7de2e8..81a0c89 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -247,14 +247,18 @@ static void deliver_recv_msg(struct smi_info *smi_info,
 	spin_lock(&(smi_info->si_lock));
 }
 
-static void return_hosed_msg(struct smi_info *smi_info)
+static void return_hosed_msg(struct smi_info *smi_info, int cCode)
 {
 	struct ipmi_smi_msg *msg = smi_info->curr_msg;
 
+	if (cCode < 0 || cCode > IPMI_ERR_UNSPECIFIED)
+		cCode = IPMI_ERR_UNSPECIFIED;
+	/* else use it as is */
+
 	/* Make it a reponse */
 	msg->rsp[0] = msg->data[0] | 4;
 	msg->rsp[1] = msg->data[1];
-	msg->rsp[2] = IPMI_ERR_UNSPECIFIED;
+	msg->rsp[2] = cCode;
 	msg->rsp_size = 3;
 
 	smi_info->curr_msg = NULL;
@@ -305,7 +309,7 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info)
 			smi_info->curr_msg->data,
 			smi_info->curr_msg->data_size);
 		if (err) {
-			return_hosed_msg(smi_info);
+			return_hosed_msg(smi_info, err);
 		}
 
 		rv = SI_SM_CALL_WITHOUT_DELAY;
@@ -647,7 +651,7 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info,
 			/* If we were handling a user message, format
                            a response to send to the upper layer to
                            tell it about the error. */
-			return_hosed_msg(smi_info);
+			return_hosed_msg(smi_info, IPMI_ERR_UNSPECIFIED);
 		}
 		si_sm_result = smi_info->handlers->event(smi_info->si_sm, 0);
 	}
diff --git a/drivers/char/ipmi/ipmi_smic_sm.c b/drivers/char/ipmi/ipmi_smic_sm.c
index 39d7e5e..e64ea7d 100644
--- a/drivers/char/ipmi/ipmi_smic_sm.c
+++ b/drivers/char/ipmi/ipmi_smic_sm.c
@@ -141,12 +141,14 @@ static int start_smic_transaction(struct si_sm_data *smic,
 {
 	unsigned int i;
 
-	if ((size < 2) || (size > MAX_SMIC_WRITE_SIZE)) {
-		return -1;
-	}
-	if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED)) {
-		return -2;
-	}
+	if (size < 2)
+		return IPMI_REQ_LEN_INVALID_ERR;
+	if (size > MAX_SMIC_WRITE_SIZE)
+		return IPMI_REQ_LEN_EXCEEDED_ERR;
+
+	if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED))
+		return IPMI_NOT_IN_MY_STATE_ERR;
+
 	if (smic_debug & SMIC_DEBUG_MSG) {
 		printk(KERN_INFO "start_smic_transaction -");
 		for (i = 0; i < size; i ++) {
diff --git a/include/linux/ipmi_msgdefs.h b/include/linux/ipmi_msgdefs.h
index 8d6759c..b56a158 100644
--- a/include/linux/ipmi_msgdefs.h
+++ b/include/linux/ipmi_msgdefs.h
@@ -71,14 +71,18 @@
 /* The BT interface on high-end HP systems supports up to 255 bytes in
  * one transfer.  Its "virtual" BMC supports some commands that are longer
  * than 128 bytes.  Use the full 256, plus NetFn/LUN, Cmd, cCode, plus
- * some overhead.  It would be nice to base this on the "BT Capabilities"
- * but that's too hard to propagate to the rest of the driver. */
+ * some overhead; it's not worth the effort to dynamically size this based
+ * on the results of the "Get BT Capabilities" command. */
 #define IPMI_MAX_MSG_LENGTH	272	/* multiple of 16 */
 
 #define IPMI_CC_NO_ERROR		0x00
 #define IPMI_NODE_BUSY_ERR		0xc0
 #define IPMI_INVALID_COMMAND_ERR	0xc1
+#define IPMI_TIMEOUT_ERR		0xc3
 #define IPMI_ERR_MSG_TRUNCATED		0xc6
+#define IPMI_REQ_LEN_INVALID_ERR	0xc7
+#define IPMI_REQ_LEN_EXCEEDED_ERR	0xc8
+#define IPMI_NOT_IN_MY_STATE_ERR	0xd5	/* IPMI 2.0 */
 #define IPMI_LOST_ARBITRATION_ERR	0x81
 #define IPMI_BUS_ERR			0x82
 #define IPMI_NAK_ON_WRITE_ERR		0x83
-- 
cgit v0.10.2


From 5a1099baa86c17c521859f417c4c1353d0ae9096 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Wed, 6 Dec 2006 20:41:16 -0800
Subject: [PATCH] IPMI: increase KCS message size

Increase the maximum message size a KCS interface supports to the maximum
message size the rest of the driver supports.  Some systems really support
messages this big.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
Cc: Patrick Schoeller <Patrick.Schoeller@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c
index fb46979..c1b8228 100644
--- a/drivers/char/ipmi/ipmi_kcs_sm.c
+++ b/drivers/char/ipmi/ipmi_kcs_sm.c
@@ -93,8 +93,8 @@ enum kcs_states {
 				   state machine. */
 };
 
-#define MAX_KCS_READ_SIZE 80
-#define MAX_KCS_WRITE_SIZE 80
+#define MAX_KCS_READ_SIZE IPMI_MAX_MSG_LENGTH
+#define MAX_KCS_WRITE_SIZE IPMI_MAX_MSG_LENGTH
 
 /* Timeouts in microseconds. */
 #define IBF_RETRY_TIMEOUT 1000000
-- 
cgit v0.10.2


From 1aa16eeacd6f10c202da1cf0cff5c81107dac942 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 6 Dec 2006 20:41:20 -0800
Subject: [PATCH] IPMI: fix PROC_FS=n warnings

Fix build warnings for PROC_FS=n.

  drivers/char/ipmi/ipmi_poweroff.c:707: warning: label 'out_err' defined but not used

  drivers/char/ipmi/ipmi_msghandler.c:1774: warning: 'ipmb_file_read_proc' defined but not used
  drivers/char/ipmi/ipmi_msghandler.c:1790: warning: 'version_file_read_proc' defined but not used
  drivers/char/ipmi/ipmi_msghandler.c:1801: warning: 'stat_file_read_proc' defined but not used

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index ff0e68f..5703ee2 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -1769,6 +1769,7 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 			      -1, 0);
 }
 
+#ifdef CONFIG_PROC_FS
 static int ipmb_file_read_proc(char *page, char **start, off_t off,
 			       int count, int *eof, void *data)
 {
@@ -1857,6 +1858,7 @@ static int stat_file_read_proc(char *page, char **start, off_t off,
 
 	return (out - ((char *) page));
 }
+#endif /* CONFIG_PROC_FS */
 
 int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
 			    read_proc_t *read_proc, write_proc_t *write_proc,
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index 8ce447a..597eb4f 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -702,9 +702,9 @@ static int ipmi_poweroff_init (void)
 		printk(KERN_ERR PFX "Unable to register SMI watcher: %d\n", rv);
 		goto out_err;
 	}
-#endif
 
  out_err:
+#endif
 	return rv;
 }
 
-- 
cgit v0.10.2


From ef5036782e619ab394daaec5c00876fa6b17d7a1 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:41:23 -0800
Subject: [PATCH] ext3 balloc: reset windowsz when full

ext3_new_blocks should reset the reservation window size to 0 when squeezing
the last blocks out of an almost full filesystem, so the retry doesn't skip
any groups with less than half that free, reporting ENOSPC too soon.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index f96c40a..db8ca34 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1552,6 +1552,7 @@ retry_alloc:
 	 */
 	if (my_rsv) {
 		my_rsv = NULL;
+		windowsz = 0;
 		group_no = goal_group;
 		goto retry_alloc;
 	}
-- 
cgit v0.10.2


From 1650242324a0c19d629a8be0d5a8ecdc174d31f8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:41:25 -0800
Subject: [PATCH] ext3 balloc: fix off-by-one against grp_goal

grp_goal 0 is a genuine goal (unlike -1), so ext3_try_to_allocate_with_rsv
should treat it as such.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index db8ca34..0706a45 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1271,7 +1271,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	}
 	/*
 	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
+	 * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
 	 * first block is a filesystem wide block number
 	 * first block is the block number of the first block in this group
 	 */
@@ -1307,7 +1307,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			if (!goal_in_my_reservation(&my_rsv->rsv_window,
 							grp_goal, group, sb))
 				grp_goal = -1;
-		} else if (grp_goal > 0) {
+		} else if (grp_goal >= 0) {
 			int curr = my_rsv->rsv_end -
 					(grp_goal + group_first_block) + 1;
 
-- 
cgit v0.10.2


From ff50dc562bf6de00fec264b11531e2745209ba12 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:41:25 -0800
Subject: [PATCH] ext3 balloc: fix off-by-one against rsv_end

rsv_end is the last block within the reservation, so alloc_new_reservation
should accept start_block == rsv_end as success.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 0706a45..57cf47a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1148,7 +1148,7 @@ retry:
 	 * check if the first free block is within the
 	 * free space we just reserved
 	 */
-	if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
+	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
 		return 0;		/* success */
 	/*
 	 * if the first free bit we found is out of the reservable space
-- 
cgit v0.10.2


From c56d2561f7189762c4fce854630d4f7f6d64ccee Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:41:27 -0800
Subject: [PATCH] ext3 balloc: say rb_entry not list_entry

The reservations tree is an rb_tree not a list, so it's less confusing to use
rb_entry() than list_entry() - though they're both just container_of().

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 57cf47a..03d39b6 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -144,7 +144,7 @@ restart:
 
 	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
 	while (n) {
-		rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node);
+		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
 		if (verbose)
 			printk("reservation window 0x%p "
 			       "start:  %lu, end:  %lu\n",
@@ -949,7 +949,7 @@ static int find_next_reservable_window(
 
 		prev = rsv;
 		next = rb_next(&rsv->rsv_node);
-		rsv = list_entry(next,struct ext3_reserve_window_node,rsv_node);
+		rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node);
 
 		/*
 		 * Reached the last reservation, we can just append to the
@@ -1193,7 +1193,7 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 	if (!next)
 		my_rsv->rsv_end += size;
 	else {
-		next_rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node);
+		next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
 
 		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
 			my_rsv->rsv_end += size;
-- 
cgit v0.10.2


From 2823b5535efad71e950ef50c2ce5f9e4dbaedc17 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:41:28 -0800
Subject: [PATCH] ext3 balloc: use io_error label

ext3_new_blocks has a nice io_error label for setting -EIO, so goto that in
the one place that doesn't already use it.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 03d39b6..5a2c198 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1515,10 +1515,8 @@ retry_alloc:
 		if (group_no >= ngroups)
 			group_no = 0;
 		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
-		if (!gdp) {
-			*errp = -EIO;
-			goto out;
-		}
+		if (!gdp)
+			goto io_error;
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
 		 * skip this group if the number of
-- 
cgit v0.10.2


From 7d1c520bb57e4b5e94ec937c13553dccf473341b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 6 Dec 2006 20:41:30 -0800
Subject: [PATCH] ext3 balloc: fix _with_rsv freeze

Port fix to the off-by-one in find_next_usable_block's memscan from ext2 to
ext3; but it didn't cause a serious problem for ext3 because the additional
ext3_test_allocatable check rescued it from the error.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 5a2c198..2216174 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -730,7 +730,7 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 		here = 0;
 
 	p = ((char *)bh->b_data) + (here >> 3);
-	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
+	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
 	next = (r - ((char *)bh->b_data)) << 3;
 
 	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
-- 
cgit v0.10.2


From 7e0289766a0750a56260565bd3b74eb544483d45 Mon Sep 17 00:00:00 2001
From: Avantika Mathur <mathur@us.ibm.com>
Date: Wed, 6 Dec 2006 20:41:33 -0800
Subject: [PATCH] ext4: if expression format

changes instances of
	if ((lhs = expression)) {

to the preferred coding style

	lhs=expression;
	if (lhs) {

Signed-off-by: Avantika Mathur <mathur@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 994a6e4..06ce8e8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -186,7 +186,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		depth = path->p_depth;
 
 		/* try to predict block placement */
-		if ((ex = path[depth].p_ext))
+		ex = path[depth].p_ext;
+		if (ex)
 			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
 
 		/* it looks like index is empty;
@@ -543,7 +544,8 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	struct ext4_extent_idx *ix;
 	int len, err;
 
-	if ((err = ext4_ext_get_access(handle, inode, curp)))
+	err = ext4_ext_get_access(handle, inode, curp);
+	if (err)
 		return err;
 
 	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
@@ -665,7 +667,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	}
 	lock_buffer(bh);
 
-	if ((err = ext4_journal_get_create_access(handle, bh)))
+	err = ext4_journal_get_create_access(handle, bh);
+	if (err)
 		goto cleanup;
 
 	neh = ext_block_hdr(bh);
@@ -702,18 +705,21 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 
-	if ((err = ext4_journal_dirty_metadata(handle, bh)))
+	err = ext4_journal_dirty_metadata(handle, bh);
+	if (err)
 		goto cleanup;
 	brelse(bh);
 	bh = NULL;
 
 	/* correct old leaf */
 	if (m) {
-		if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
 			goto cleanup;
 		path[depth].p_hdr->eh_entries =
 		     cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
-		if ((err = ext4_ext_dirty(handle, inode, path + depth)))
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
 			goto cleanup;
 
 	}
@@ -736,7 +742,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		}
 		lock_buffer(bh);
 
-		if ((err = ext4_journal_get_create_access(handle, bh)))
+		err = ext4_journal_get_create_access(handle, bh);
+		if (err)
 			goto cleanup;
 
 		neh = ext_block_hdr(bh);
@@ -780,7 +787,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
 
-		if ((err = ext4_journal_dirty_metadata(handle, bh)))
+		err = ext4_journal_dirty_metadata(handle, bh);
+		if (err)
 			goto cleanup;
 		brelse(bh);
 		bh = NULL;
@@ -854,7 +862,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	}
 	lock_buffer(bh);
 
-	if ((err = ext4_journal_get_create_access(handle, bh))) {
+	err = ext4_journal_get_create_access(handle, bh);
+	if (err) {
 		unlock_buffer(bh);
 		goto out;
 	}
@@ -874,11 +883,13 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 
-	if ((err = ext4_journal_dirty_metadata(handle, bh)))
+	err = ext4_journal_dirty_metadata(handle, bh);
+	if (err)
 		goto out;
 
 	/* create index in new top-level index: num,max,pointer */
-	if ((err = ext4_ext_get_access(handle, inode, curp)))
+	err = ext4_ext_get_access(handle, inode, curp);
+	if (err)
 		goto out;
 
 	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
@@ -1070,20 +1081,24 @@ int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 	 */
 	k = depth - 1;
 	border = path[depth].p_ext->ee_block;
-	if ((err = ext4_ext_get_access(handle, inode, path + k)))
+	err = ext4_ext_get_access(handle, inode, path + k);
+	if (err)
 		return err;
 	path[k].p_idx->ei_block = border;
-	if ((err = ext4_ext_dirty(handle, inode, path + k)))
+	err = ext4_ext_dirty(handle, inode, path + k);
+	if (err)
 		return err;
 
 	while (k--) {
 		/* change all left-side indexes */
 		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
 			break;
-		if ((err = ext4_ext_get_access(handle, inode, path + k)))
+		err = ext4_ext_get_access(handle, inode, path + k);
+		if (err)
 			break;
 		path[k].p_idx->ei_block = border;
-		if ((err = ext4_ext_dirty(handle, inode, path + k)))
+		err = ext4_ext_dirty(handle, inode, path + k);
+		if (err)
 			break;
 	}
 
@@ -1142,7 +1157,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 				le16_to_cpu(newext->ee_len),
 				le32_to_cpu(ex->ee_block),
 				le16_to_cpu(ex->ee_len), ext_pblock(ex));
-		if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
 			return err;
 		ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
 					 + le16_to_cpu(newext->ee_len));
@@ -1192,7 +1208,8 @@ repeat:
 has_space:
 	nearex = path[depth].p_ext;
 
-	if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
 		goto cleanup;
 
 	if (!nearex) {
@@ -1486,10 +1503,12 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	path--;
 	leaf = idx_pblock(path->p_idx);
 	BUG_ON(path->p_hdr->eh_entries == 0);
-	if ((err = ext4_ext_get_access(handle, inode, path)))
+	err = ext4_ext_get_access(handle, inode, path);
+	if (err)
 		return err;
 	path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
-	if ((err = ext4_ext_dirty(handle, inode, path)))
+	err = ext4_ext_dirty(handle, inode, path);
+	if (err)
 		return err;
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
 	bh = sb_find_get_block(inode->i_sb, leaf);
@@ -1930,7 +1949,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	mutex_lock(&EXT4_I(inode)->truncate_mutex);
 
 	/* check in cache */
-	if ((goal = ext4_ext_in_cache(inode, iblock, &newex))) {
+	goal = ext4_ext_in_cache(inode, iblock, &newex);
+	if (goal) {
 		if (goal == EXT4_EXT_CACHE_GAP) {
 			if (!create) {
 				/* block isn't allocated yet and
@@ -1969,7 +1989,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 */
 	BUG_ON(path[depth].p_ext == NULL && depth != 0);
 
-	if ((ex = path[depth].p_ext)) {
+	ex = path[depth].p_ext;
+	if (ex) {
 	        unsigned long ee_block = le32_to_cpu(ex->ee_block);
 		ext4_fsblk_t ee_start = ext_pblock(ex);
 		unsigned short ee_len  = le16_to_cpu(ex->ee_len);
-- 
cgit v0.10.2


From 5d4958f923f431e148d9ba8ff894209a134b943e Mon Sep 17 00:00:00 2001
From: Avantika Mathur <mathur@us.ibm.com>
Date: Wed, 6 Dec 2006 20:41:35 -0800
Subject: [PATCH] ext4: kmalloc to kzalloc

Performs kmalloc to kzalloc conversion

Signed-off-by: Avantika Mathur <mathur@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06ce8e8..e41be18 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -477,13 +477,12 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
 
 	/* account possible depth increase */
 	if (!path) {
-		path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 2),
+		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
 				GFP_NOFS);
 		if (!path)
 			return ERR_PTR(-ENOMEM);
 		alloc = 1;
 	}
-	memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
 	path[0].p_hdr = eh;
 
 	/* walk through the tree */
@@ -643,10 +642,9 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	 * We need this to handle errors and free blocks
 	 * upon them.
 	 */
-	ablocks = kmalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
+	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
 	if (!ablocks)
 		return -ENOMEM;
-	memset(ablocks, 0, sizeof(ext4_fsblk_t) * depth);
 
 	/* allocate all needed blocks */
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
@@ -1773,12 +1771,11 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
-	path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
+	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
 	if (path == NULL) {
 		ext4_journal_stop(handle);
 		return -ENOMEM;
 	}
-	memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
 	path[0].p_hdr = ext_inode_hdr(inode);
 	if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
 		err = -EIO;
-- 
cgit v0.10.2


From 09b882520bbe01f2e5044642109c1c1d19fe3559 Mon Sep 17 00:00:00 2001
From: Avantika Mathur <mathur@us.ibm.com>
Date: Wed, 6 Dec 2006 20:41:36 -0800
Subject: [PATCH] ext4: Eliminate inline functions

Removes all inline keywords, since the compiler will make static functions
inline when it is appropriate.

Signed-off-by: Avantika Mathur <mathur@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e41be18..dc2724f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -48,7 +48,7 @@
  * ext_pblock:
  * combine low and high parts of physical block number into ext4_fsblk_t
  */
-static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 {
 	ext4_fsblk_t block;
 
@@ -61,7 +61,7 @@ static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
  * idx_pblock:
  * combine low and high parts of a leaf physical block number into ext4_fsblk_t
  */
-static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
+static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
 {
 	ext4_fsblk_t block;
 
@@ -75,7 +75,7 @@ static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
  * stores a large physical block number into an extent struct,
  * breaking it into parts
  */
-static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
 {
 	ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff));
 	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
@@ -86,7 +86,7 @@ static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb
  * stores a large physical block number into an index struct,
  * breaking it into parts
  */
-static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
+static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
 {
 	ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff));
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
@@ -216,7 +216,7 @@ ext4_ext_new_block(handle_t *handle, struct inode *inode,
 	return newblock;
 }
 
-static inline int ext4_ext_space_block(struct inode *inode)
+static int ext4_ext_space_block(struct inode *inode)
 {
 	int size;
 
@@ -229,7 +229,7 @@ static inline int ext4_ext_space_block(struct inode *inode)
 	return size;
 }
 
-static inline int ext4_ext_space_block_idx(struct inode *inode)
+static int ext4_ext_space_block_idx(struct inode *inode)
 {
 	int size;
 
@@ -242,7 +242,7 @@ static inline int ext4_ext_space_block_idx(struct inode *inode)
 	return size;
 }
 
-static inline int ext4_ext_space_root(struct inode *inode)
+static int ext4_ext_space_root(struct inode *inode)
 {
 	int size;
 
@@ -256,7 +256,7 @@ static inline int ext4_ext_space_root(struct inode *inode)
 	return size;
 }
 
-static inline int ext4_ext_space_root_idx(struct inode *inode)
+static int ext4_ext_space_root_idx(struct inode *inode)
 {
 	int size;
 
@@ -1103,7 +1103,7 @@ int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 	return err;
 }
 
-static int inline
+static int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 				struct ext4_extent *ex2)
 {
@@ -1395,7 +1395,7 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
 	return err;
 }
 
-static inline void
+static void
 ext4_ext_put_in_cache(struct inode *inode, __u32 block,
 			__u32 len, __u32 start, int type)
 {
@@ -1413,7 +1413,7 @@ ext4_ext_put_in_cache(struct inode *inode, __u32 block,
  * calculate boundaries of the gap that the requested block fits into
  * and cache this gap
  */
-static inline void
+static void
 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 				unsigned long block)
 {
@@ -1454,7 +1454,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
 }
 
-static inline int
+static int
 ext4_ext_in_cache(struct inode *inode, unsigned long block,
 			struct ext4_extent *ex)
 {
@@ -1523,7 +1523,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
  * the caller should calculate credits under truncate_mutex and
  * pass the actual path.
  */
-int inline ext4_ext_calc_credits_for_insert(struct inode *inode,
+int ext4_ext_calc_credits_for_insert(struct inode *inode,
 						struct ext4_ext_path *path)
 {
 	int depth, needed;
@@ -1733,7 +1733,7 @@ out:
  * ext4_ext_more_to_rm:
  * returns 1 if current index has to be freed (even partial)
  */
-static int inline
+static int
 ext4_ext_more_to_rm(struct ext4_ext_path *path)
 {
 	BUG_ON(path->p_idx == NULL);
-- 
cgit v0.10.2


From d3ed11c35635487d34ad476e1d6a66dd298ab2de Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Wed, 6 Dec 2006 20:41:37 -0800
Subject: [PATCH] cpuset: allow a larger buffer for writes to cpuset files

When using fake NUMA setup, the number of memory nodes can greatly exceed
the number of CPUs.  So the current limit in cpuset_common_file_write() is
insufficient.

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ef1d29..0a6b4d8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1280,7 +1280,8 @@ typedef enum {
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+static ssize_t cpuset_common_file_write(struct file *file,
+					const char __user *userbuf,
 					size_t nbytes, loff_t *unused_ppos)
 {
 	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1291,7 +1292,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100 + 6 * NR_CPUS)
+	if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
 		return -E2BIG;
 
 	/* +1 for nul-terminator */
-- 
cgit v0.10.2


From 6cf24f031bc97cb5a7c9df3b6e73c45b628b2b28 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:41:39 -0800
Subject: [PATCH] elf.h: forward declare struct file

  In file included from include/asm/patch.h:14,
		 from arch/ia64/kernel/patch.c:10:
  include/linux/elf.h:375: warning: "struct file" declared inside parameter list
  include/linux/elf.h:375: warning: its scope is only this definition or declaration, which is probably not what you want

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/include/linux/elf.h b/include/linux/elf.h
index b403516..60713e6 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -6,6 +6,8 @@
 #include <linux/elf-em.h>
 #include <asm/elf.h>
 
+struct file;
+
 #ifndef elf_read_implies_exec
   /* Executables for which elf_read_implies_exec() returns TRUE will
      have the READ_IMPLIES_EXEC personality flag set automatically.
-- 
cgit v0.10.2


From 68380b581383c028830f79ec2670f4a193854aa6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.osdl.org>
Date: Thu, 7 Dec 2006 09:28:19 -0800
Subject: Add "run_scheduled_work()" workqueue function

This allows workqueue users to run just their own pending work, rather
than wait for the whole workqueue to finish running.  This solves the
deadlock with networking libphy that was due to other workqueue entries
possibly needing a lock that was held by the routine that wanted to
flush its own work.

It's not wonderful: if you absolutely need to synchronize with the work
function having been executed, any user strictly speaking should have
its own completion tracking logic, since when we run things explicitly
by hand, the generic workqueue layer can no longer help us synchronize.

Also, this is strictly only usable for work that has been scheduled
without any delayed timers.  You can not mix the new interface with
schedule_delayed_work().

But it's better than what we had currently.

Acked-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 4044bb1..e175f39 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -587,8 +587,7 @@ int phy_stop_interrupts(struct phy_device *phydev)
 	 * Finish any pending work; we might have been scheduled
 	 * to be called from keventd ourselves, though.
 	 */
-	if (!current_is_keventd())
-		flush_scheduled_work();
+	run_scheduled_work(&phydev->phy_queue);
 
 	free_irq(phydev->irq, phydev);
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index f0cb1df..edef8d5 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -162,6 +162,7 @@ extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
 
 extern int FASTCALL(schedule_work(struct work_struct *work));
+extern int FASTCALL(run_scheduled_work(struct work_struct *work));
 extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay));
 
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c525731..6b18675 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -108,6 +108,79 @@ static inline void *get_wq_data(struct work_struct *work)
 	return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
 }
 
+static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cwq->lock, flags);
+	/*
+	 * We need to re-validate the work info after we've gotten
+	 * the cpu_workqueue lock. We can run the work now iff:
+	 *
+	 *  - the wq_data still matches the cpu_workqueue_struct
+	 *  - AND the work is still marked pending
+	 *  - AND the work is still on a list (which will be this
+	 *    workqueue_struct list)
+	 *
+	 * All these conditions are important, because we
+	 * need to protect against the work being run right
+	 * now on another CPU (all but the last one might be
+	 * true if it's currently running and has not been
+	 * released yet, for example).
+	 */
+	if (get_wq_data(work) == cwq
+	    && work_pending(work)
+	    && !list_empty(&work->entry)) {
+		work_func_t f = work->func;
+		list_del_init(&work->entry);
+		spin_unlock_irqrestore(&cwq->lock, flags);
+
+		if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+			work_release(work);
+		f(work);
+
+		spin_lock_irqsave(&cwq->lock, flags);
+		cwq->remove_sequence++;
+		wake_up(&cwq->work_done);
+		ret = 1;
+	}
+	spin_unlock_irqrestore(&cwq->lock, flags);
+	return ret;
+}
+
+/**
+ * run_scheduled_work - run scheduled work synchronously
+ * @work: work to run
+ *
+ * This checks if the work was pending, and runs it
+ * synchronously if so. It returns a boolean to indicate
+ * whether it had any scheduled work to run or not.
+ *
+ * NOTE! This _only_ works for normal work_structs. You
+ * CANNOT use this for delayed work, because the wq data
+ * for delayed work will not point properly to the per-
+ * CPU workqueue struct, but will change!
+ */
+int fastcall run_scheduled_work(struct work_struct *work)
+{
+	for (;;) {
+		struct cpu_workqueue_struct *cwq;
+
+		if (!work_pending(work))
+			return 0;
+		if (list_empty(&work->entry))
+			return 0;
+		/* NOTE! This depends intimately on __queue_work! */
+		cwq = get_wq_data(work);
+		if (!cwq)
+			return 0;
+		if (__run_work(cwq, work))
+			return 1;
+	}
+}
+EXPORT_SYMBOL(run_scheduled_work);
+
 /* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
 			 struct work_struct *work)
-- 
cgit v0.10.2