From be6d3e56a6b9b3a4ee44a0685e39e595073c6f0d Mon Sep 17 00:00:00 2001
From: Kentaro Takeda <takedakn@nttdata.co.jp>
Date: Wed, 17 Dec 2008 13:24:15 +0900
Subject: introduce new LSM hooks where vfsmount is available.

Add new LSM hooks for path-based checks.  Call them on directory-modifying
operations at the points where we still know the vfsmount involved.

Signed-off-by: Kentaro Takeda <takedakn@nttdata.co.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Toshiharu Harada <haradats@nttdata.co.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index af3783f..ab441af 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1556,6 +1556,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		 * Refuse to truncate files with mandatory locks held on them.
 		 */
 		error = locks_verify_locked(inode);
+		if (!error)
+			error = security_path_truncate(&nd->path, 0,
+					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
 			DQUOT_INIT(inode);
 
@@ -1586,7 +1589,11 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
 
 	if (!IS_POSIXACL(dir->d_inode))
 		mode &= ~current->fs->umask;
+	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
+	if (error)
+		goto out_unlock;
 	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
+out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
@@ -1999,6 +2006,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_mknod(&nd.path, dentry, mode, dev);
+	if (error)
+		goto out_drop_write;
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
 			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2011,6 +2021,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
 			break;
 	}
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2070,7 +2081,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_mkdir(&nd.path, dentry, mode);
+	if (error)
+		goto out_drop_write;
 	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2180,7 +2195,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto exit3;
+	error = security_path_rmdir(&nd.path, dentry);
+	if (error)
+		goto exit4;
 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+exit4:
 	mnt_drop_write(nd.path.mnt);
 exit3:
 	dput(dentry);
@@ -2265,7 +2284,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		error = mnt_want_write(nd.path.mnt);
 		if (error)
 			goto exit2;
+		error = security_path_unlink(&nd.path, dentry);
+		if (error)
+			goto exit3;
 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+exit3:
 		mnt_drop_write(nd.path.mnt);
 	exit2:
 		dput(dentry);
@@ -2346,7 +2369,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_symlink(&nd.path, dentry, from);
+	if (error)
+		goto out_drop_write;
 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2443,7 +2470,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_link(old_path.dentry, &nd.path, new_dentry);
+	if (error)
+		goto out_drop_write;
 	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(new_dentry);
@@ -2679,8 +2710,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
 	error = mnt_want_write(oldnd.path.mnt);
 	if (error)
 		goto exit5;
+	error = security_path_rename(&oldnd.path, old_dentry,
+				     &newnd.path, new_dentry);
+	if (error)
+		goto exit6;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
 				   new_dir->d_inode, new_dentry);
+exit6:
 	mnt_drop_write(oldnd.path.mnt);
 exit5:
 	dput(new_dentry);
diff --git a/fs/open.c b/fs/open.c
index c0a426d..1cd7d40 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 		goto put_write_and_out;
 
 	error = locks_verify_truncate(inode, NULL, length);
+	if (!error)
+		error = security_path_truncate(&path, length, 0);
 	if (!error) {
 		DQUOT_INIT(inode);
 		error = do_truncate(path.dentry, length, 0, NULL);
@@ -329,6 +331,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
+		error = security_path_truncate(&file->f_path, length,
+					       ATTR_MTIME|ATTR_CTIME);
+	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
 	fput(file);
diff --git a/include/linux/security.h b/include/linux/security.h
index 3416cb8..b92b5e4 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -335,17 +335,37 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@dir contains the inode structure of the parent directory of the new link.
  *	@new_dentry contains the dentry structure for the new link.
  *	Return 0 if permission is granted.
+ * @path_link:
+ *	Check permission before creating a new hard link to a file.
+ *	@old_dentry contains the dentry structure for an existing link
+ *	to the file.
+ *	@new_dir contains the path structure of the parent directory of
+ *	the new link.
+ *	@new_dentry contains the dentry structure for the new link.
+ *	Return 0 if permission is granted.
  * @inode_unlink:
  *	Check the permission to remove a hard link to a file.
  *	@dir contains the inode structure of parent directory of the file.
  *	@dentry contains the dentry structure for file to be unlinked.
  *	Return 0 if permission is granted.
+ * @path_unlink:
+ *	Check the permission to remove a hard link to a file.
+ *	@dir contains the path structure of parent directory of the file.
+ *	@dentry contains the dentry structure for file to be unlinked.
+ *	Return 0 if permission is granted.
  * @inode_symlink:
  *	Check the permission to create a symbolic link to a file.
  *	@dir contains the inode structure of parent directory of the symbolic link.
  *	@dentry contains the dentry structure of the symbolic link.
  *	@old_name contains the pathname of file.
  *	Return 0 if permission is granted.
+ * @path_symlink:
+ *	Check the permission to create a symbolic link to a file.
+ *	@dir contains the path structure of parent directory of
+ *	the symbolic link.
+ *	@dentry contains the dentry structure of the symbolic link.
+ *	@old_name contains the pathname of file.
+ *	Return 0 if permission is granted.
  * @inode_mkdir:
  *	Check permissions to create a new directory in the existing directory
  *	associated with inode strcture @dir.
@@ -353,11 +373,25 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@dentry contains the dentry structure of new directory.
  *	@mode contains the mode of new directory.
  *	Return 0 if permission is granted.
+ * @path_mkdir:
+ *	Check permissions to create a new directory in the existing directory
+ *	associated with path strcture @path.
+ *	@dir containst the path structure of parent of the directory
+ *	to be created.
+ *	@dentry contains the dentry structure of new directory.
+ *	@mode contains the mode of new directory.
+ *	Return 0 if permission is granted.
  * @inode_rmdir:
  *	Check the permission to remove a directory.
  *	@dir contains the inode structure of parent of the directory to be removed.
  *	@dentry contains the dentry structure of directory to be removed.
  *	Return 0 if permission is granted.
+ * @path_rmdir:
+ *	Check the permission to remove a directory.
+ *	@dir contains the path structure of parent of the directory to be
+ *	removed.
+ *	@dentry contains the dentry structure of directory to be removed.
+ *	Return 0 if permission is granted.
  * @inode_mknod:
  *	Check permissions when creating a special file (or a socket or a fifo
  *	file created via the mknod system call).  Note that if mknod operation
@@ -368,6 +402,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@mode contains the mode of the new file.
  *	@dev contains the device number.
  *	Return 0 if permission is granted.
+ * @path_mknod:
+ *	Check permissions when creating a file. Note that this hook is called
+ *	even if mknod operation is being done for a regular file.
+ *	@dir contains the path structure of parent of the new file.
+ *	@dentry contains the dentry structure of the new file.
+ *	@mode contains the mode of the new file.
+ *	@dev contains the undecoded device number. Use new_decode_dev() to get
+ *	the decoded device number.
+ *	Return 0 if permission is granted.
  * @inode_rename:
  *	Check for permission to rename a file or directory.
  *	@old_dir contains the inode structure for parent of the old link.
@@ -375,6 +418,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@new_dir contains the inode structure for parent of the new link.
  *	@new_dentry contains the dentry structure of the new link.
  *	Return 0 if permission is granted.
+ * @path_rename:
+ *	Check for permission to rename a file or directory.
+ *	@old_dir contains the path structure for parent of the old link.
+ *	@old_dentry contains the dentry structure of the old link.
+ *	@new_dir contains the path structure for parent of the new link.
+ *	@new_dentry contains the dentry structure of the new link.
+ *	Return 0 if permission is granted.
  * @inode_readlink:
  *	Check the permission to read the symbolic link.
  *	@dentry contains the dentry structure for the file link.
@@ -403,6 +453,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@dentry contains the dentry structure for the file.
  *	@attr is the iattr structure containing the new file attributes.
  *	Return 0 if permission is granted.
+ * @path_truncate:
+ *	Check permission before truncating a file.
+ *	@path contains the path structure for the file.
+ *	@length is the new length of the file.
+ *	@time_attrs is the flags passed to do_truncate().
+ *	Return 0 if permission is granted.
  * @inode_getattr:
  *	Check permission before obtaining file attributes.
  *	@mnt is the vfsmount where the dentry was looked up
@@ -1331,6 +1387,22 @@ struct security_operations {
 				   struct super_block *newsb);
 	int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts);
 
+#ifdef CONFIG_SECURITY_PATH
+	int (*path_unlink) (struct path *dir, struct dentry *dentry);
+	int (*path_mkdir) (struct path *dir, struct dentry *dentry, int mode);
+	int (*path_rmdir) (struct path *dir, struct dentry *dentry);
+	int (*path_mknod) (struct path *dir, struct dentry *dentry, int mode,
+			   unsigned int dev);
+	int (*path_truncate) (struct path *path, loff_t length,
+			      unsigned int time_attrs);
+	int (*path_symlink) (struct path *dir, struct dentry *dentry,
+			     const char *old_name);
+	int (*path_link) (struct dentry *old_dentry, struct path *new_dir,
+			  struct dentry *new_dentry);
+	int (*path_rename) (struct path *old_dir, struct dentry *old_dentry,
+			    struct path *new_dir, struct dentry *new_dentry);
+#endif
+
 	int (*inode_alloc_security) (struct inode *inode);
 	void (*inode_free_security) (struct inode *inode);
 	int (*inode_init_security) (struct inode *inode, struct inode *dir,
@@ -2705,6 +2777,71 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi
 
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
+#ifdef CONFIG_SECURITY_PATH
+int security_path_unlink(struct path *dir, struct dentry *dentry);
+int security_path_mkdir(struct path *dir, struct dentry *dentry, int mode);
+int security_path_rmdir(struct path *dir, struct dentry *dentry);
+int security_path_mknod(struct path *dir, struct dentry *dentry, int mode,
+			unsigned int dev);
+int security_path_truncate(struct path *path, loff_t length,
+			   unsigned int time_attrs);
+int security_path_symlink(struct path *dir, struct dentry *dentry,
+			  const char *old_name);
+int security_path_link(struct dentry *old_dentry, struct path *new_dir,
+		       struct dentry *new_dentry);
+int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
+			 struct path *new_dir, struct dentry *new_dentry);
+#else	/* CONFIG_SECURITY_PATH */
+static inline int security_path_unlink(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static inline int security_path_mkdir(struct path *dir, struct dentry *dentry,
+				      int mode)
+{
+	return 0;
+}
+
+static inline int security_path_rmdir(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static inline int security_path_mknod(struct path *dir, struct dentry *dentry,
+				      int mode, unsigned int dev)
+{
+	return 0;
+}
+
+static inline int security_path_truncate(struct path *path, loff_t length,
+					 unsigned int time_attrs)
+{
+	return 0;
+}
+
+static inline int security_path_symlink(struct path *dir, struct dentry *dentry,
+					const char *old_name)
+{
+	return 0;
+}
+
+static inline int security_path_link(struct dentry *old_dentry,
+				     struct path *new_dir,
+				     struct dentry *new_dentry)
+{
+	return 0;
+}
+
+static inline int security_path_rename(struct path *old_dir,
+				       struct dentry *old_dentry,
+				       struct path *new_dir,
+				       struct dentry *new_dentry)
+{
+	return 0;
+}
+#endif	/* CONFIG_SECURITY_PATH */
+
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c6250d0..d1b8982 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -836,7 +836,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		err = mnt_want_write(nd.path.mnt);
 		if (err)
 			goto out_mknod_dput;
+		err = security_path_mknod(&nd.path, dentry, mode, 0);
+		if (err)
+			goto out_mknod_drop_write;
 		err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0);
+out_mknod_drop_write:
 		mnt_drop_write(nd.path.mnt);
 		if (err)
 			goto out_mknod_dput;
diff --git a/security/Kconfig b/security/Kconfig
index d9f47ce..9438535 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -81,6 +81,15 @@ config SECURITY_NETWORK_XFRM
 	  IPSec.
 	  If you are unsure how to answer this question, answer N.
 
+config SECURITY_PATH
+	bool "Security hooks for pathname based access control"
+	depends on SECURITY
+	help
+	  This enables the security hooks for pathname based access control.
+	  If enabled, a security module can use these hooks to
+	  implement pathname based access controls.
+	  If you are unsure how to answer this question, answer N.
+
 config SECURITY_FILE_CAPABILITIES
 	bool "File POSIX Capabilities"
 	default n
diff --git a/security/capability.c b/security/capability.c
index 2dce66f..c545bd1 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -263,6 +263,53 @@ static void cap_inode_getsecid(const struct inode *inode, u32 *secid)
 	*secid = 0;
 }
 
+#ifdef CONFIG_SECURITY_PATH
+static int cap_path_mknod(struct path *dir, struct dentry *dentry, int mode,
+			  unsigned int dev)
+{
+	return 0;
+}
+
+static int cap_path_mkdir(struct path *dir, struct dentry *dentry, int mode)
+{
+	return 0;
+}
+
+static int cap_path_rmdir(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static int cap_path_unlink(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static int cap_path_symlink(struct path *dir, struct dentry *dentry,
+			    const char *old_name)
+{
+	return 0;
+}
+
+static int cap_path_link(struct dentry *old_dentry, struct path *new_dir,
+			 struct dentry *new_dentry)
+{
+	return 0;
+}
+
+static int cap_path_rename(struct path *old_path, struct dentry *old_dentry,
+			   struct path *new_path, struct dentry *new_dentry)
+{
+	return 0;
+}
+
+static int cap_path_truncate(struct path *path, loff_t length,
+			     unsigned int time_attrs)
+{
+	return 0;
+}
+#endif
+
 static int cap_file_permission(struct file *file, int mask)
 {
 	return 0;
@@ -883,6 +930,16 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, inode_setsecurity);
 	set_to_cap_if_null(ops, inode_listsecurity);
 	set_to_cap_if_null(ops, inode_getsecid);
+#ifdef CONFIG_SECURITY_PATH
+	set_to_cap_if_null(ops, path_mknod);
+	set_to_cap_if_null(ops, path_mkdir);
+	set_to_cap_if_null(ops, path_rmdir);
+	set_to_cap_if_null(ops, path_unlink);
+	set_to_cap_if_null(ops, path_symlink);
+	set_to_cap_if_null(ops, path_link);
+	set_to_cap_if_null(ops, path_rename);
+	set_to_cap_if_null(ops, path_truncate);
+#endif
 	set_to_cap_if_null(ops, file_permission);
 	set_to_cap_if_null(ops, file_alloc_security);
 	set_to_cap_if_null(ops, file_free_security);
diff --git a/security/security.c b/security/security.c
index d85dbb3..678d4d0 100644
--- a/security/security.c
+++ b/security/security.c
@@ -355,6 +355,72 @@ int security_inode_init_security(struct inode *inode, struct inode *dir,
 }
 EXPORT_SYMBOL(security_inode_init_security);
 
+#ifdef CONFIG_SECURITY_PATH
+int security_path_mknod(struct path *path, struct dentry *dentry, int mode,
+			unsigned int dev)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_mknod(path, dentry, mode, dev);
+}
+EXPORT_SYMBOL(security_path_mknod);
+
+int security_path_mkdir(struct path *path, struct dentry *dentry, int mode)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_mkdir(path, dentry, mode);
+}
+
+int security_path_rmdir(struct path *path, struct dentry *dentry)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_rmdir(path, dentry);
+}
+
+int security_path_unlink(struct path *path, struct dentry *dentry)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_unlink(path, dentry);
+}
+
+int security_path_symlink(struct path *path, struct dentry *dentry,
+			  const char *old_name)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_symlink(path, dentry, old_name);
+}
+
+int security_path_link(struct dentry *old_dentry, struct path *new_dir,
+		       struct dentry *new_dentry)
+{
+	if (unlikely(IS_PRIVATE(old_dentry->d_inode)))
+		return 0;
+	return security_ops->path_link(old_dentry, new_dir, new_dentry);
+}
+
+int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
+			 struct path *new_dir, struct dentry *new_dentry)
+{
+	if (unlikely(IS_PRIVATE(old_dentry->d_inode) ||
+		     (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode))))
+		return 0;
+	return security_ops->path_rename(old_dir, old_dentry, new_dir,
+					 new_dentry);
+}
+
+int security_path_truncate(struct path *path, loff_t length,
+			   unsigned int time_attrs)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_truncate(path, length, time_attrs);
+}
+#endif
+
 int security_inode_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	if (unlikely(IS_PRIVATE(dir)))
-- 
cgit v0.10.2


From e2b689d82c0394e5239a3557a217f19e2f47f1be Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Thu, 4 Dec 2008 11:17:47 +0000
Subject: fs: reorder struct inotify_device on 64bits to remove padding

Reorder struct inotify_device to remove 8 bytes of padding on 64bit
builds, reducing size to 128 bytes . Therefore allocating from a smaller
slab & using one fewer cachelines.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>

----
Hi,
patch against 2.6.28-rc7.
built & tested on AMDX2 desktop.

I've not been able to send this to the listed inotify maintainers, I
just get mail failures. So I guessed filesystem was the best home for
it, hope that's ok.

regards
Richard
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index e2425bb..400f806 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
 	struct mutex		ev_mutex;	/* protects event queue */
 	struct mutex		up_mutex;	/* synchronizes watch updates */
 	struct list_head 	events;		/* list of queued events */
-	atomic_t		count;		/* reference count */
 	struct user_struct	*user;		/* user who opened this dev */
 	struct inotify_handle	*ih;		/* inotify handle */
 	struct fasync_struct    *fa;            /* async notification */
+	atomic_t		count;		/* reference count */
 	unsigned int		queue_size;	/* size of the queue (bytes) */
 	unsigned int		event_count;	/* number of pending events */
 	unsigned int		max_events;	/* maximum number of events */
-- 
cgit v0.10.2


From c2452f32786159ed85f0e4b21fec09258f822fc8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 1 Dec 2008 09:33:43 +0100
Subject: shrink struct dentry

struct dentry is one of the most critical structures in the kernel. So it's
sad to see it going neglected.

With CONFIG_PROFILING turned on (which is probably the common case at least
for distros and kernel developers), sizeof(struct dcache) == 208 here
(64-bit). This gives 19 objects per slab.

I packed d_mounted into a hole, and took another 4 bytes off the inline
name length to take the padding out from the end of the structure. This
shinks it to 200 bytes. I could have gone the other way and increased the
length to 40, but I'm aiming for a magic number, read on...

I then got rid of the d_cookie pointer. This shrinks it to 192 bytes. Rant:
why was this ever a good idea? The cookie system should increase its hash
size or use a tree or something if lookups are a problem. Also the "fast
dcookie lookups" in oprofile should be moved into the dcookie code -- how
can oprofile possibly care about the dcookie_mutex? It gets dropped after
get_dcookie() returns so it can't be providing any sort of protection.

At 192 bytes, 21 objects fit into a 4K page, saving about 3MB on my system
with ~140 000 entries allocated. 192 is also a multiple of 64, so we get
nice cacheline alignment on 64 and 32 byte line systems -- any given dentry
will now require 3 cachelines to touch all fields wheras previously it
would require 4.

I know the inline name size was chosen quite carefully, however with the
reduction in cacheline footprint, it should actually be just about as fast
to do a name lookup for a 36 character name as it was before the patch (and
faster for other sizes). The memory footprint savings for names which are
<= 32 or > 36 bytes long should more than make up for the memory cost for
33-36 byte names.

Performance is a feature...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 2949126..6b793ae 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -297,7 +297,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
 {
 	unsigned long cookie;
 
-	if (path->dentry->d_cookie)
+	if (path->dentry->d_flags & DCACHE_COOKIE)
 		return (unsigned long)path->dentry;
 	get_dcookie(path, &cookie);
 	return cookie;
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 737bd94..65e8294 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -200,7 +200,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
 {
 	unsigned long cookie;
 
-	if (path->dentry->d_cookie)
+	if (path->dentry->d_flags & DCACHE_COOKIE)
 		return (unsigned long)path->dentry;
 	get_dcookie(path, &cookie);
 	return cookie;
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7..fd244c7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
 #include <linux/bootmem.h>
 #include "internal.h"
 
-
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
 	dentry->d_mounted = 0;
-#ifdef CONFIG_PROFILING
-	dentry->d_cookie = NULL;
-#endif
 	INIT_HLIST_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1..180e9fe 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
 {
 	struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
 							GFP_KERNEL);
+	struct dentry *d;
 	if (!dcs)
 		return NULL;
 
-	path->dentry->d_cookie = dcs;
+	d = path->dentry;
+	spin_lock(&d->d_lock);
+	d->d_flags |= DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
 	dcs->path = *path;
 	path_get(path);
 	hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
 		goto out;
 	}
 
-	dcs = path->dentry->d_cookie;
-
-	if (!dcs)
+	if (path->dentry->d_flags & DCACHE_COOKIE) {
+		dcs = find_dcookie((unsigned long)path->dentry);
+	} else {
 		dcs = alloc_dcookie(path);
-
-	if (!dcs) {
-		err = -ENOMEM;
-		goto out;
+		if (!dcs) {
+			err = -ENOMEM;
+			goto out;
+		}
 	}
 
 	*cookie = dcookie_value(dcs);
@@ -251,7 +256,12 @@ out_kmem:
 
 static void free_dcookie(struct dcookie_struct * dcs)
 {
-	dcs->path.dentry->d_cookie = NULL;
+	struct dentry *d = dcs->path.dentry;
+
+	spin_lock(&d->d_lock);
+	d->d_flags &= ~DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
 	path_put(&dcs->path);
 	kmem_cache_free(dcookie_cache, dcs);
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a37359d..c66d224 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -75,14 +75,22 @@ full_name_hash(const unsigned char *name, unsigned int len)
 	return end_name_hash(hash);
 }
 
-struct dcookie_struct;
-
-#define DNAME_INLINE_LEN_MIN 36
+/*
+ * Try to keep struct dentry aligned on 64 byte cachelines (this will
+ * give reasonable cacheline footprint with larger lines without the
+ * large memory footprint increase).
+ */
+#ifdef CONFIG_64BIT
+#define DNAME_INLINE_LEN_MIN 32 /* 192 bytes */
+#else
+#define DNAME_INLINE_LEN_MIN 40 /* 128 bytes */
+#endif
 
 struct dentry {
 	atomic_t d_count;
 	unsigned int d_flags;		/* protected by d_lock */
 	spinlock_t d_lock;		/* per dentry lock */
+	int d_mounted;
 	struct inode *d_inode;		/* Where the name belongs to - NULL is
 					 * negative */
 	/*
@@ -107,10 +115,7 @@ struct dentry {
 	struct dentry_operations *d_op;
 	struct super_block *d_sb;	/* The root of the dentry tree */
 	void *d_fsdata;			/* fs-specific data */
-#ifdef CONFIG_PROFILING
-	struct dcookie_struct *d_cookie; /* cookie, if any */
-#endif
-	int d_mounted;
+
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
 };
 
@@ -177,6 +182,8 @@ d_iput:		no		no		no       yes
 
 #define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
 
+#define DCACHE_COOKIE		0x0040	/* For use by dcookie subsystem */
+
 extern spinlock_t dcache_lock;
 extern seqlock_t rename_lock;
 
-- 
cgit v0.10.2


From dded4f4d5048e64a01cf52eed4d27c8cb2600525 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Mon, 1 Dec 2008 14:34:50 -0800
Subject: include: linux/fs.h: put declarations in __KERNEL__

include/linux/fs.h contains externs for a bunch of variables.  That obviously
belongs under ifdef __KERNEL__.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 001ded4..c5e4c5c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -21,7 +21,6 @@
 
 /* Fixed constants first: */
 #undef NR_OPEN
-extern int sysctl_nr_open;
 #define INR_OPEN 1024		/* Initial setting for nfile rlimits */
 
 #define BLOCK_SIZE_BITS 10
@@ -38,21 +37,13 @@ struct files_stat_struct {
 	int nr_free_files;	/* read only */
 	int max_files;		/* tunable */
 };
-extern struct files_stat_struct files_stat;
-extern int get_max_files(void);
 
 struct inodes_stat_t {
 	int nr_inodes;
 	int nr_unused;
 	int dummy[5];		/* padding for sysctl ABI compatibility */
 };
-extern struct inodes_stat_t inodes_stat;
 
-extern int leases_enable, lease_break_time;
-
-#ifdef CONFIG_DNOTIFY
-extern int dir_notify_enable;
-#endif
 
 #define NR_FILE  8192	/* this can well be larger on a larger system */
 
@@ -330,6 +321,15 @@ extern void __init inode_init(void);
 extern void __init inode_init_early(void);
 extern void __init files_init(unsigned long);
 
+extern struct files_stat_struct files_stat;
+extern int get_max_files(void);
+extern int sysctl_nr_open;
+extern struct inodes_stat_t inodes_stat;
+extern int leases_enable, lease_break_time;
+#ifdef CONFIG_DNOTIFY
+extern int dir_notify_enable;
+#endif
+
 struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
-- 
cgit v0.10.2


From 5cc4a0341a1295ea56b2e62eb70d96d8fdb94ded Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 1 Dec 2008 14:34:51 -0800
Subject: fs/namespace.c: drop code after return

The extra semicolon serves no purpose.

Signed-off-by: Julia Lawall <julia@diku.dk>
Reviewed-by: Richard Genoud <richard.genoud@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 1c09cab..a40685d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	if (!new_ns->root) {
 		up_write(&namespace_sem);
 		kfree(new_ns);
-		return ERR_PTR(-ENOMEM);;
+		return ERR_PTR(-ENOMEM);
 	}
 	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-- 
cgit v0.10.2


From a17d5232de7b53d34229de79ec22f4bb04adb7e4 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:10 +0000
Subject: eCryptfs: check readlink result was not an error before using it

The result from readlink is being used to index into the link name
buffer without checking whether it is a valid length. If readlink
returns an error this will fault or cause memory corruption.

Cc: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: ecryptfs-devel@lists.launchpad.net
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Acked-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f0..5e78fc1 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -673,10 +673,11 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
 			"dentry->d_name.name = [%s]\n", dentry->d_name.name);
 	rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
-	buf[rc] = '\0';
 	set_fs(old_fs);
 	if (rc < 0)
 		goto out_free;
+	else
+		buf[rc] = '\0';
 	rc = 0;
 	nd_set_link(nd, buf);
 	goto out;
-- 
cgit v0.10.2


From 035146851cfa2fe24c1d9dc7637bb009ad06b2f7 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:11 +0000
Subject: vfs: introduce helper function to safely NUL-terminate symlinks

A number of filesystems were potentially triggering kernel bugs due to
corrupted symlink names on disk. This function helps safely terminate
the names.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/linux/namei.h b/include/linux/namei.h
index 99eb803..fc2e035 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -94,4 +94,9 @@ static inline char *nd_get_link(struct nameidata *nd)
 	return nd->saved_names[nd->depth];
 }
 
+static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
+{
+	((char *) name)[min(len, maxlen)] = '\0';
+}
+
 #endif /* _LINUX_NAMEI_H */
-- 
cgit v0.10.2


From ebd09abbd9699f328165aee50a070403fbf55a37 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:12 +0000
Subject: vfs: ensure page symlinks are NUL-terminated

On-disk data corruption could cause a page link to have its i_size set
to PAGE_SIZE (or a multiple thereof) and its contents all non-NUL.
NUL-terminate the link name to ensure this doesn't cause further
problems for the kernel.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index ab441af..9ed5e28 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2786,13 +2786,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
 /* get the link contents into pagecache */
 static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
-	struct page * page;
+	char *kaddr;
+	struct page *page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		return (char*)page;
 	*ppage = page;
-	return kmap(page);
+	kaddr = kmap(page);
+	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+	return kaddr;
 }
 
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-- 
cgit v0.10.2


From 8d6d0c4da2dbbe0a69fea3692146af39f139f8b4 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:13 +0000
Subject: ext2: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33..02b39a5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -1286,9 +1287,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 		else
 			inode->i_mapping->a_ops = &ext2_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext2_inode_is_fast_symlink(inode))
+		if (ext2_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext2_fast_symlink_inode_operations;
-		else {
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
 			inode->i_op = &ext2_symlink_inode_operations;
 			if (test_opt(inode->i_sb, NOBH))
 				inode->i_mapping->a_ops = &ext2_nobh_aops;
-- 
cgit v0.10.2


From b5ed3112b5f74c8ec1c7aa03a76c596635e85197 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:14 +0000
Subject: ext3: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad..c4bdccf 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &ext3_dir_inode_operations;
 		inode->i_fop = &ext3_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext3_inode_is_fast_symlink(inode))
+		if (ext3_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext3_fast_symlink_inode_operations;
-		else {
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
 			inode->i_op = &ext3_symlink_inode_operations;
 			ext3_set_aops(inode);
 		}
-- 
cgit v0.10.2


From e83c1397cafc4e44f868289db5e417463c0d09a4 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:15 +0000
Subject: ext4: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: adilger@sun.com
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5a..7c3325e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
+#include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
@@ -4164,9 +4165,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &ext4_dir_inode_operations;
 		inode->i_fop = &ext4_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext4_inode_is_fast_symlink(inode))
+		if (ext4_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext4_fast_symlink_inode_operations;
-		else {
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
-- 
cgit v0.10.2


From 21acaf8e8da00235be59a3e489d5fa2a8721cafc Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:16 +0000
Subject: sysv: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435..3d81bf5 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/namei.h>
 #include <asm/byteorder.h>
 #include "sysv.h"
 
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
 		if (inode->i_blocks) {
 			inode->i_op = &sysv_symlink_inode_operations;
 			inode->i_mapping->a_ops = &sysv_aops;
-		} else
+		} else {
 			inode->i_op = &sysv_fast_symlink_inode_operations;
+			nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+				sizeof(SYSV_I(inode)->i_data) - 1);
+		}
 	} else
 		init_special_inode(inode, inode->i_mode, rdev);
 }
-- 
cgit v0.10.2


From a63d0ff31a136bdf52350c4e6c2929eaf47ea2b2 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:17 +0000
Subject: freevxfs: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ce..03a6ea5 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
 		if (!VXFS_ISIMMED(vip)) {
 			ip->i_op = &page_symlink_inode_operations;
 			ip->i_mapping->a_ops = &vxfs_aops;
-		} else
+		} else {
 			ip->i_op = &vxfs_immed_symlink_iops;
+			vip->vii_immed.vi_immed[ip->i_size] = '\0';
+		}
 	} else
 		init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
 
-- 
cgit v0.10.2


From 7df5fa06de89a4ac311957e0cb9c1d87552b4325 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:18 +0000
Subject: befs: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Sergey S. Kostyliov <rathamahata@php4.ru>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee3..d06cb02 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_size = 0;
 		inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
 		strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
-			BEFS_SYMLINK_LEN);
+			BEFS_SYMLINK_LEN - 1);
+		befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
 	} else {
 		int num_blks;
 
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 			kfree(link);
 			befs_error(sb, "Failed to read entire long symlink");
 			link = ERR_PTR(-EIO);
+		} else {
+			link[len - 1] = '\0';
 		}
 	} else {
 		link = befs_ino->i_data.symlink;
-- 
cgit v0.10.2


From dc711ca35f9d95a1eec02118e0c298b5e3068315 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Nov 2008 15:03:50 -0500
Subject: fix switch_names() breakage in short-to-short case

We want ->name.len to match the resulting name on *both*
source and target

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index fd244c7..eeafc14 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1616,8 +1616,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			 */
 			memcpy(dentry->d_iname, target->d_name.name,
 					target->d_name.len + 1);
+			dentry->d_name.len = target->d_name.len;
+			return;
 		}
 	}
+	do_switch(dentry->d_name.len, target->d_name.len);
 }
 
 /*
@@ -1677,7 +1680,6 @@ already_unhashed:
 
 	/* Switch the names.. */
 	switch_names(dentry, target);
-	do_switch(dentry->d_name.len, target->d_name.len);
 	do_switch(dentry->d_name.hash, target->d_name.hash);
 
 	/* ... and switch the parents */
@@ -1787,7 +1789,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	struct dentry *dparent, *aparent;
 
 	switch_names(dentry, anon);
-	do_switch(dentry->d_name.len, anon->d_name.len);
 	do_switch(dentry->d_name.hash, anon->d_name.hash);
 
 	dparent = dentry->d_parent;
-- 
cgit v0.10.2


From be42c4c433c2c0d3f1583c08908fead00d36d222 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 1 Dec 2008 14:34:58 -0800
Subject: correct wrong function name of d_put in kernel document and source
 comment

no function named d_put(), it should be dput().

Impact: fix document and comment, no functionality changed

Signed-off-by: Zhao Lei <zhaolei@cn.fuijtsu.com>
Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5579bda..041cb77 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -931,7 +931,7 @@ manipulate dentries:
   d_lookup: look up a dentry given its parent and path name component
 	It looks up the child of that given name from the dcache
 	hash table. If it is found, the reference count is incremented
-	and the dentry is returned. The caller must use d_put()
+	and the dentry is returned. The caller must use dput()
 	to free the dentry when it finishes using it.
 
 For further information on dentry locking, please refer to the document
diff --git a/fs/dcache.c b/fs/dcache.c
index eeafc14..c231a63 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1332,7 +1332,7 @@ err_out:
  *
  * Searches the children of the parent dentry for the name in question. If
  * the dentry is found its reference count is incremented and the dentry
- * is returned. The caller must use d_put to free the entry when it has
+ * is returned. The caller must use dput to free the entry when it has
  * finished using it. %NULL is returned on failure.
  *
  * __d_lookup is dcache_lock free. The hash list is protected using RCU.
-- 
cgit v0.10.2


From 52afeefb9dac9287429642189996426a2bfd6a25 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Mon, 1 Dec 2008 14:35:00 -0800
Subject: expand some comments (d_path / seq_path)

Explain that you really need to use the return value of d_path rather than
the buffer you passed into it.

Also fix the comment for seq_path(), the function arguments changed
recently but the comment hadn't been updated in sync.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index c231a63..bdb3f50 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1908,7 +1908,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * Convert a dentry into an ASCII path name. If the entry has been deleted
  * the string " (deleted)" is appended. Note that this is ambiguous.
  *
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
  *
  * "buflen" should be positive. Caller holds the dcache_lock.
  *
@@ -1984,7 +1985,10 @@ Elong:
  * Convert a dentry into an ASCII path name. If the entry has been deleted
  * the string " (deleted)" is appended. Note that this is ambiguous.
  *
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the path was
+ * too long. Note: Callers should use the returned pointer, not the passed
+ * in buffer, to use the name! The implementation often starts at an offset
+ * into the buffer, and may leave 0 bytes at the start.
  *
  * "buflen" should be positive.
  */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c2115..99d8b8c 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -389,8 +389,14 @@ char *mangle_path(char *s, char *p, char *esc)
 }
 EXPORT_SYMBOL(mangle_path);
 
-/*
- * return the absolute path of 'dentry' residing in mount 'mnt'.
+/**
+ * seq_path - seq_file interface to print a pathname
+ * @m: the seq_file handle
+ * @path: the struct path to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path of 'path', as represented by the
+ * dentry / mnt pair in the path parameter.
  */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
-- 
cgit v0.10.2


From 66f221875dc10813aa2f06c83ad60d0eb1356406 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Nov 2008 15:04:29 +0100
Subject: remove incorrect comment in inode_permission

We now pass on all MAY_ flags to the filesystems permission routines,
so remove the comment stating the contrary.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 9ed5e28..631cfdd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -247,7 +247,6 @@ int inode_permission(struct inode *inode, int mask)
 			return -EACCES;
 	}
 
-	/* Ordinary permission routines do not understand MAY_APPEND. */
 	if (inode->i_op && inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
 	else
-- 
cgit v0.10.2


From b4091d5f6fde28ab762e1094a1a26d81f3badfa5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Nov 2008 15:07:21 +0100
Subject: kill walk_init_root

walk_init_root is a tiny helper that is marked __always_inline, has just
one caller and an unused argument.  Just merge it into the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 631cfdd..d4d0b59 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -526,18 +526,6 @@ out_unlock:
 	return result;
 }
 
-/* SMP-safe */
-static __always_inline void
-walk_init_root(const char *name, struct nameidata *nd)
-{
-	struct fs_struct *fs = current->fs;
-
-	read_lock(&fs->lock);
-	nd->path = fs->root;
-	path_get(&fs->root);
-	read_unlock(&fs->lock);
-}
-
 /*
  * Wrapper to retry pathname resolution whenever the underlying
  * file system returns an ESTALE.
@@ -575,9 +563,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		goto fail;
 
 	if (*link == '/') {
+		struct fs_struct *fs = current->fs;
+
 		path_put(&nd->path);
-		walk_init_root(link, nd);
+
+		read_lock(&fs->lock);
+		nd->path = fs->root;
+		path_get(&fs->root);
+		read_unlock(&fs->lock);
 	}
+
 	res = link_path_walk(link, nd);
 	if (nd->depth || res || nd->last_type!=LAST_NORM)
 		return res;
-- 
cgit v0.10.2


From 3fb64190aa3c23c10e6e9fd0124ac030115c99bf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2008 09:58:10 +0200
Subject: pass a struct path * to may_open

No need for the nameidata in may_open - a struct path is enough.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index d4d0b59..5cc0dc9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1487,9 +1487,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	return error;
 }
 
-int may_open(struct nameidata *nd, int acc_mode, int flag)
+int may_open(struct path *path, int acc_mode, int flag)
 {
-	struct dentry *dentry = nd->path.dentry;
+	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
 	int error;
 
@@ -1510,13 +1510,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 	    	flag &= ~O_TRUNC;
 	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
-		if (nd->path.mnt->mnt_flags & MNT_NODEV)
+		if (path->mnt->mnt_flags & MNT_NODEV)
 			return -EACCES;
 
 		flag &= ~O_TRUNC;
 	}
 
-	error = vfs_permission(nd, acc_mode);
+	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
 	/*
@@ -1551,7 +1551,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		 */
 		error = locks_verify_locked(inode);
 		if (!error)
-			error = security_path_truncate(&nd->path, 0,
+			error = security_path_truncate(path, 0,
 					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
 			DQUOT_INIT(inode);
@@ -1594,7 +1594,7 @@ out_unlock:
 	if (error)
 		return error;
 	/* Don't check for write permission, don't truncate */
-	return may_open(nd, 0, flag & ~O_TRUNC);
+	return may_open(&nd->path, 0, flag & ~O_TRUNC);
 }
 
 /*
@@ -1780,7 +1780,7 @@ ok:
 		if (error)
 			goto exit;
 	}
-	error = may_open(&nd, acc_mode, flag);
+	error = may_open(&nd.path, acc_mode, flag);
 	if (error) {
 		if (will_write)
 			mnt_drop_write(nd.path.mnt);
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6..b274519 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
 		return ERR_PTR(error);
 
 	if (flags == O_RDWR)
-		error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE);
+		error = may_open(&nd.path, MAY_READ|MAY_WRITE,
+					   FMODE_READ|FMODE_WRITE);
 	else
-		error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
+		error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
 
 	if (!error)
 		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c5e4c5c..3468df5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1869,7 +1869,7 @@ extern void free_write_pipe(struct file *);
 
 extern struct file *do_filp_open(int dfd, const char *pathname,
 		int open_flag, int mode);
-extern int may_open(struct nameidata *, int, int);
+extern int may_open(struct path *, int, int);
 
 extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
 extern struct file * open_exec(const char *);
-- 
cgit v0.10.2


From cb23beb55100171646e69e248fb45f10db6e99a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2008 09:59:29 +0200
Subject: kill vfs_permission

With all the nameidata removal there's no point anymore for this helper.
Of the three callers left two will go away with the next lookup series
anyway.

Also add proper kerneldoc to inode_permission as this is the main
permission check routine now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/exec.c b/fs/exec.c
index 02d2e12..dfbf700 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -127,7 +127,8 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
+	error = inode_permission(nd.path.dentry->d_inode,
+				 MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
 
@@ -680,7 +681,7 @@ struct file *open_exec(const char *name)
 	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
 		goto out_path_put;
 
-	err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
 	if (err)
 		goto out_path_put;
 
diff --git a/fs/namei.c b/fs/namei.c
index 5cc0dc9..3f88e04 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask,
 	return -EACCES;
 }
 
+/**
+ * inode_permission  -  check for access rights to a given inode
+ * @inode:	inode to check permission on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Used to check for read/write/execute permissions on an inode.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things.
+ */
 int inode_permission(struct inode *inode, int mask)
 {
 	int retval;
@@ -264,21 +274,6 @@ int inode_permission(struct inode *inode, int mask)
 }
 
 /**
- * vfs_permission  -  check for access rights to a given path
- * @nd:		lookup result that describes the path
- * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Used to check for read/write/execute permissions on a path.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things.
- */
-int vfs_permission(struct nameidata *nd, int mask)
-{
-	return inode_permission(nd->path.dentry->d_inode, mask);
-}
-
-/**
  * file_permission  -  check for additional access rights to a given file
  * @file:	file to check access rights for
  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
@@ -288,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask)
  *
  * Note:
  *	Do not use this function in new code.  All access checks should
- *	be done using vfs_permission().
+ *	be done using inode_permission().
  */
 int file_permission(struct file *file, int mask)
 {
@@ -853,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
 		if (err == -EAGAIN)
-			err = vfs_permission(nd, MAY_EXEC);
+			err = inode_permission(nd->path.dentry->d_inode,
+					       MAY_EXEC);
  		if (err)
 			break;
 
@@ -2882,7 +2878,6 @@ EXPORT_SYMBOL(path_lookup);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(vfs_permission);
 EXPORT_SYMBOL(file_permission);
 EXPORT_SYMBOL(unlock_rename);
 EXPORT_SYMBOL(vfs_create);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3468df5..fd61598 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1212,7 +1212,6 @@ extern void unlock_super(struct super_block *);
 /*
  * VFS helper functions..
  */
-extern int vfs_permission(struct nameidata *, int);
 extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *);
 extern int vfs_mkdir(struct inode *, struct dentry *, int);
 extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
-- 
cgit v0.10.2


From 18d8fda7c3c9439be04d7ea2e82da2513b121acb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 Dec 2008 00:35:37 -0500
Subject: take init_fs to saner place

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c
index 1f76218..c2938e5 100644
--- a/arch/alpha/kernel/init_task.c
+++ b/arch/alpha/kernel/init_task.c
@@ -8,7 +8,6 @@
 #include <asm/uaccess.h>
 
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c
index 0bbf806..e859af3 100644
--- a/arch/arm/kernel/init_task.c
+++ b/arch/arm/kernel/init_task.c
@@ -12,7 +12,6 @@
 
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c
index 4405846..993d56e 100644
--- a/arch/avr32/kernel/init_task.c
+++ b/arch/avr32/kernel/init_task.c
@@ -13,7 +13,6 @@
 
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/blackfin/kernel/init_task.c b/arch/blackfin/kernel/init_task.c
index 6bdba7b..2c228c0 100644
--- a/arch/blackfin/kernel/init_task.c
+++ b/arch/blackfin/kernel/init_task.c
@@ -33,7 +33,6 @@
 #include <linux/mqueue.h>
 #include <linux/fs.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 5933656..60816e8 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -37,7 +37,6 @@
  * setup.
  */
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c
index e219881..29429a8 100644
--- a/arch/frv/kernel/init_task.c
+++ b/arch/frv/kernel/init_task.c
@@ -10,7 +10,6 @@
 #include <asm/pgtable.h>
 
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c
index 93a4899..cb5dc55 100644
--- a/arch/h8300/kernel/init_task.c
+++ b/arch/h8300/kernel/init_task.c
@@ -12,7 +12,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
index 9d7e1c6..5b0e830 100644
--- a/arch/ia64/kernel/init_task.c
+++ b/arch/ia64/kernel/init_task.c
@@ -17,7 +17,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c
index 0d658db..016885c 100644
--- a/arch/m32r/kernel/init_task.c
+++ b/arch/m32r/kernel/init_task.c
@@ -11,7 +11,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 3042c2b..632ce01 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -40,7 +40,6 @@
  * alignment requirements and potentially different initial
  * setup.
  */
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c
index 344c01a..fe282de 100644
--- a/arch/m68knommu/kernel/init_task.c
+++ b/arch/m68knommu/kernel/init_task.c
@@ -12,7 +12,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c
index d72487a..149cd91 100644
--- a/arch/mips/kernel/init_task.c
+++ b/arch/mips/kernel/init_task.c
@@ -9,7 +9,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c
index af16f6e..5ac3566 100644
--- a/arch/mn10300/kernel/init_task.c
+++ b/arch/mn10300/kernel/init_task.c
@@ -18,7 +18,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c
index f5941c0..1e25a45 100644
--- a/arch/parisc/kernel/init_task.c
+++ b/arch/parisc/kernel/init_task.c
@@ -34,7 +34,6 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c
index 4c85b8d..688b329 100644
--- a/arch/powerpc/kernel/init_task.c
+++ b/arch/powerpc/kernel/init_task.c
@@ -7,7 +7,6 @@
 #include <linux/mqueue.h>
 #include <asm/uaccess.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c
index e807168..7db95c0 100644
--- a/arch/s390/kernel/init_task.c
+++ b/arch/s390/kernel/init_task.c
@@ -16,7 +16,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c
index b151a25..80c35ff 100644
--- a/arch/sh/kernel/init_task.c
+++ b/arch/sh/kernel/init_task.c
@@ -7,7 +7,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct pt_regs fake_swapper_regs;
diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c
index 62126e4..f28cb82 100644
--- a/arch/sparc/kernel/init_task.c
+++ b/arch/sparc/kernel/init_task.c
@@ -8,7 +8,6 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c
index 910eda8..806d381 100644
--- a/arch/um/kernel/init_task.c
+++ b/arch/um/kernel/init_task.c
@@ -10,7 +10,6 @@
 #include "linux/mqueue.h"
 #include "asm/uaccess.h"
 
-static struct fs_struct init_fs = INIT_FS;
 struct mm_struct init_mm = INIT_MM(init_mm);
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index d399180..df3bf26 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,7 +10,6 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c
index 3df469d..e07f5c9 100644
--- a/arch/xtensa/kernel/init_task.c
+++ b/arch/xtensa/kernel/init_task.c
@@ -21,7 +21,6 @@
 
 #include <asm/uaccess.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/fs/namei.c b/fs/namei.c
index 3f88e04..e203691 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2893,3 +2893,10 @@ EXPORT_SYMBOL(vfs_symlink);
 EXPORT_SYMBOL(vfs_unlink);
 EXPORT_SYMBOL(dentry_unhash);
 EXPORT_SYMBOL(generic_readlink);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+	.count		= ATOMIC_INIT(1),
+	.lock		= RW_LOCK_UNLOCKED,
+	.umask		= 0022,
+};
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 9e5a06e..a97c053 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -10,12 +10,6 @@ struct fs_struct {
 	struct path root, pwd;
 };
 
-#define INIT_FS {				\
-	.count		= ATOMIC_INIT(1),	\
-	.lock		= RW_LOCK_UNLOCKED,	\
-	.umask		= 0022, \
-}
-
 extern struct kmem_cache *fs_cachep;
 
 extern void exit_fs(struct task_struct *);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 959f552..2f3c2d4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,7 @@
 #include <net/net_namespace.h>
 
 extern struct files_struct init_files;
+extern struct fs_struct init_fs;
 
 #define INIT_KIOCTX(name, which_mm) \
 {							\
-- 
cgit v0.10.2


From 1239f26c05899f1f3c541b41e719c59d58038786 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 10 Dec 2008 18:37:28 -0500
Subject: make INIT_FS use the __RW_LOCK_UNLOCKED initialization

[AV: rediffed on top of unification of init_fs]
Initialization of init_fs still uses the deprecated RW_LOCK_UNLOCKED macro.
This patch updates it to use the __RW_LOCK_UNLOCKED(lock) macro.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index e203691..dd5c9f0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2897,6 +2897,6 @@ EXPORT_SYMBOL(generic_readlink);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.count		= ATOMIC_INIT(1),
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
-- 
cgit v0.10.2


From fd659fd6275d3426d7967da1f0e3638bbbd2fedb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 10 Dec 2008 09:35:45 -0800
Subject: fix f_count description in Documentation/filesystems/files.txt

Documentation/filesystems/files.txt was not updated when
f_count became an atomic_long_t.
atomic_long_inc_not_zero() is now used instead of atomic_inc_not_zero()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
index bb0142f..ac2facc 100644
--- a/Documentation/filesystems/files.txt
+++ b/Documentation/filesystems/files.txt
@@ -76,13 +76,13 @@ the fdtable structure -
 5. Handling of the file structures is special. Since the look-up
    of the fd (fget()/fget_light()) are lock-free, it is possible
    that look-up may race with the last put() operation on the
-   file structure. This is avoided using atomic_inc_not_zero()
+   file structure. This is avoided using atomic_long_inc_not_zero()
    on ->f_count :
 
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		if (atomic_inc_not_zero(&file->f_count))
+		if (atomic_long_inc_not_zero(&file->f_count))
 			*fput_needed = 1;
 		else
 		/* Didn't get the reference, someone's freed */
@@ -92,7 +92,7 @@ the fdtable structure -
 	....
 	return file;
 
-   atomic_inc_not_zero() detects if refcounts is already zero or
+   atomic_long_inc_not_zero() detects if refcounts is already zero or
    goes to zero during increment. If it does, we fail
    fget()/fget_light().
 
-- 
cgit v0.10.2


From b6b3fdead251d432f32f2cfce2a893ab8a658110 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 10 Dec 2008 09:35:45 -0800
Subject: filp_cachep can be static in fs/file_table.c

Instead of creating the "filp" kmem_cache in vfs_caches_init(),
we can do it a litle be later in files_init(), so that filp_cachep
is static to fs/file_table.c

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index bdb3f50..e88c23b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2314,9 +2314,6 @@ static void __init dcache_init(void)
 /* SLAB cache for __getname() consumers */
 struct kmem_cache *names_cachep __read_mostly;
 
-/* SLAB cache for file structures */
-struct kmem_cache *filp_cachep __read_mostly;
-
 EXPORT_SYMBOL(d_genocide);
 
 void __init vfs_caches_init_early(void)
@@ -2338,9 +2335,6 @@ void __init vfs_caches_init(unsigned long mempages)
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
-	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-
 	dcache_init();
 	inode_init();
 	files_init(mempages);
diff --git a/fs/file_table.c b/fs/file_table.c
index 0fbcacc..bbeeac6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,6 +32,9 @@ struct files_stat_struct files_stat = {
 /* public. Not pretty! */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
+/* SLAB cache for file structures */
+static struct kmem_cache *filp_cachep __read_mostly;
+
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
 static inline void file_free_rcu(struct rcu_head *head)
@@ -397,7 +400,12 @@ too_bad:
 void __init files_init(unsigned long mempages)
 { 
 	int n; 
-	/* One file with associated inode and dcache is very roughly 1K. 
+
+	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+	/*
+	 * One file with associated inode and dcache is very roughly 1K.
 	 * Per default don't use more than 10% of our memory for files. 
 	 */ 
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 4aab6f1..09d6c5b 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -57,8 +57,6 @@ struct files_struct {
 
 #define files_fdtable(files) (rcu_dereference((files)->fdt))
 
-extern struct kmem_cache *filp_cachep;
-
 struct file_operations;
 struct vfsmount;
 struct dentry;
-- 
cgit v0.10.2


From 6badd79bd002788aaec27b50a74ab69ef65ab8ee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 Dec 2008 00:57:40 -0500
Subject: kill ->dir_notify()

Remove the hopelessly misguided ->dir_notify().  The only instance (cifs)
has been broken by design from the very beginning; the objects it creates
are never destroyed, keep references to struct file they can outlive, nothing
that could possibly evict them exists on close(2) path *and* no locking
whatsoever is done to prevent races with close(), should the previous, er,
deficiencies someday be dealt with.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 23d2f44..ccec553 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -394,7 +394,6 @@ prototypes:
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long,
 			unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
-	int (*dir_notify)(struct file *, unsigned long);
 };
 
 locking rules:
@@ -424,7 +423,6 @@ sendfile:		no
 sendpage:		no
 get_unmapped_area:	no
 check_flags:		no
-dir_notify:		no
 
 ->llseek() locking has moved from llseek to the individual llseek
 implementations.  If your fs is not using generic_file_llseek, you
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 041cb77..ef19afa 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -733,7 +733,6 @@ struct file_operations {
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
-	int (*dir_notify)(struct file *filp, unsigned long arg);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
@@ -800,8 +799,6 @@ otherwise noted.
 
   check_flags: called by the fcntl(2) system call for F_SETFL command
 
-  dir_notify: called by the fcntl(2) system call for F_NOTIFY command
-
   flock: called by the flock(2) system call
 
   splice_write: called by the VFS to splice data from a pipe to a file. This
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c..a05287a 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
 	return -EIO;
 }
 
-static int bad_file_dir_notify(struct file *file, unsigned long arg)
-{
-	return -EIO;
-}
-
 static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
 	.check_flags	= bad_file_check_flags,
-	.dir_notify	= bad_file_dir_notify,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb..9948c00 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a19..13ea532 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
 	.readdir = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 	.unlocked_ioctl  = cifs_ioctl,
 	.llseek = generic_file_llseek,
 };
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c7..7ac4818 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
-extern int cifs_dir_notify(struct file *, unsigned long arg);
 
 /* Functions related to dir entries */
 extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581..0000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *   fs/cifs/fcntl.c
- *
- *   vfs operations that deal with the file control API
- *
- *   Copyright (C) International Business Machines  Corp., 2003,2004
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include "cifsglob.h"
-#include "cifsproto.h"
-#include "cifs_unicode.h"
-#include "cifs_debug.h"
-#include "cifsfs.h"
-
-static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
-{
-	__u32 cifs_ntfy_flags = 0;
-
-	/* No way on Linux VFS to ask to monitor xattr
-	changes (and no stream support either */
-	if (fcntl_notify_flags & DN_ACCESS)
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
-	if (fcntl_notify_flags & DN_MODIFY) {
-		/* What does this mean on directories? */
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
-			FILE_NOTIFY_CHANGE_SIZE;
-	}
-	if (fcntl_notify_flags & DN_CREATE) {
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
-			FILE_NOTIFY_CHANGE_LAST_WRITE;
-	}
-	if (fcntl_notify_flags & DN_DELETE)
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
-	if (fcntl_notify_flags & DN_RENAME) {
-		/* BB review this - checking various server behaviors */
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
-			FILE_NOTIFY_CHANGE_FILE_NAME;
-	}
-	if (fcntl_notify_flags & DN_ATTRIB) {
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
-			FILE_NOTIFY_CHANGE_ATTRIBUTES;
-	}
-/*	if (fcntl_notify_flags & DN_MULTISHOT) {
-		cifs_ntfy_flags |= ;
-	} */ /* BB fixme - not sure how to handle this with CIFS yet */
-
-	return cifs_ntfy_flags;
-}
-
-int cifs_dir_notify(struct file *file, unsigned long arg)
-{
-	int xid;
-	int rc = -EINVAL;
-	int oplock = 0;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
-	char *full_path = NULL;
-	__u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
-	__u16 netfid;
-
-	if (experimEnabled == 0)
-		return 0;
-
-	xid = GetXid();
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
-
-	full_path = build_path_from_dentry(file->f_path.dentry);
-
-	if (full_path == NULL) {
-		rc = -ENOMEM;
-	} else {
-		cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
-		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-			GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
-			&netfid, &oplock, NULL, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		/* BB fixme - add this handle to a notify handle list */
-		if (rc) {
-			cFYI(1, ("Could not open directory for notify"));
-		} else {
-			filter = convert_to_cifs_notify_flags(arg);
-			if (filter != 0) {
-				rc = CIFSSMBNotify(xid, pTcon,
-					0 /* no subdirs */, netfid,
-					filter, file, arg & DN_MULTISHOT,
-					cifs_sb->local_nls);
-			} else {
-				rc = -EINVAL;
-			}
-			/* BB add code to close file eventually (at unmount
-			it would close automatically but may be a way
-			to do it easily when inode freed or when
-			notify info is cleared/changed */
-			cFYI(1, ("notify rc %d", rc));
-		}
-	}
-
-	FreeXid(xid);
-	return rc;
-}
diff --git a/fs/dnotify.c b/fs/dnotify.c
index 676073b..b0aa2cd 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	dn->dn_next = inode->i_dnotify;
 	inode->i_dnotify = dn;
 	spin_unlock(&inode->i_lock);
-
-	if (filp->f_op && filp->f_op->dir_notify)
-		return filp->f_op->dir_notify(filp, arg);
 	return 0;
 
 out_free:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd61598..be16ce0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1309,7 +1309,6 @@ struct file_operations {
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
-	int (*dir_notify)(struct file *filp, unsigned long arg);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
-- 
cgit v0.10.2


From c2acf7b90821785fe812cc0aa05148e5a1f84204 Mon Sep 17 00:00:00 2001
From: Denis ChengRq <crquan@gmail.com>
Date: Mon, 1 Dec 2008 14:34:56 -0800
Subject: fs/block_dev.c: __read_mostly improvement and sb_is_blkdev_sb
 utilization

- iget5_locked in bdget really needs blockdev_superblock, instead of
  bd_mnt, so bd_mnt could be just a local variable;

- blockdev_superblock really needs __read_mostly, while local var bd_mnt
  not;

- make use of sb_is_blkdev_sb in bd_forget, instead of direct reference
  to blockdev_superblock.

Signed-off-by: Denis ChengRq <crquan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1..349a26c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -326,12 +326,13 @@ static struct file_system_type bd_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static struct vfsmount *bd_mnt __read_mostly;
-struct super_block *blockdev_superblock;
+struct super_block *blockdev_superblock __read_mostly;
 
 void __init bdev_cache_init(void)
 {
 	int err;
+	struct vfsmount *bd_mnt;
+
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 				SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +374,7 @@ struct block_device *bdget(dev_t dev)
 	struct block_device *bdev;
 	struct inode *inode;
 
-	inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
+	inode = iget5_locked(blockdev_superblock, hash(dev),
 			bdev_test, bdev_set, &dev);
 
 	if (!inode)
@@ -463,7 +464,7 @@ void bd_forget(struct inode *inode)
 
 	spin_lock(&bdev_lock);
 	if (inode->i_bdev) {
-		if (inode->i_sb != blockdev_superblock)
+		if (!sb_is_blkdev_sb(inode->i_sb))
 			bdev = inode->i_bdev;
 		__bd_forget(inode);
 	}
-- 
cgit v0.10.2


From 272eb01485dda98e3b8910c7c1a53d597616b0a0 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 17 Dec 2008 13:59:41 -0500
Subject: filesystem notification: create fs/notify to contain all fs
 notification

Creating a generic filesystem notification interface, fsnotify, which will be
used by inotify, dnotify, and eventually fanotify is really starting to
clutter the fs directory.  This patch simply moves inotify and dnotify into
fs/notify/inotify and fs/notify/dnotify respectively to make both current fs/
and future notification tidier.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a..ff0e819 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -270,44 +270,7 @@ config OCFS2_COMPAT_JBD
 
 endif # BLOCK
 
-config DNOTIFY
-	bool "Dnotify support"
-	default y
-	help
-	  Dnotify is a directory-based per-fd file change notification system
-	  that uses signals to communicate events to user-space.  There exist
-	  superior alternatives, but some applications may still rely on
-	  dnotify.
-
-	  If unsure, say Y.
-
-config INOTIFY
-	bool "Inotify file change notification support"
-	default y
-	---help---
-	  Say Y here to enable inotify support.  Inotify is a file change
-	  notification system and a replacement for dnotify.  Inotify fixes
-	  numerous shortcomings in dnotify and introduces several new features
-	  including multiple file events, one-shot support, and unmount
-	  notification.
-
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
-
-	  If unsure, say Y.
-
-config INOTIFY_USER
-	bool "Inotify support for userspace"
-	depends on INOTIFY
-	default y
-	---help---
-	  Say Y here to enable inotify support for userspace, including the
-	  associated system calls.  Inotify allows monitoring of both files and
-	  directories via a single open fd.  Events are read from the file
-	  descriptor, which is also select()- and poll()-able.
-
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
-
-	  If unsure, say Y.
+source "fs/notify/Kconfig"
 
 config QUOTA
 	bool "Quota support"
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe..e6f423d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y +=	no-block.o
 endif
 
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
-obj-$(CONFIG_INOTIFY)		+= inotify.o
-obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
+obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
@@ -57,8 +56,6 @@ obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 
-obj-$(CONFIG_DNOTIFY)		+= dnotify.o
-
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
 obj-$(CONFIG_SYSFS)		+= sysfs/
diff --git a/fs/dnotify.c b/fs/dnotify.c
deleted file mode 100644
index b0aa2cd..0000000
--- a/fs/dnotify.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Directory notifications for Linux.
- *
- * Copyright (C) 2000,2001,2002 Stephen Rothwell
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/dnotify.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/fdtable.h>
-
-int dir_notify_enable __read_mostly = 1;
-
-static struct kmem_cache *dn_cache __read_mostly;
-
-static void redo_inode_mask(struct inode *inode)
-{
-	unsigned long new_mask;
-	struct dnotify_struct *dn;
-
-	new_mask = 0;
-	for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next)
-		new_mask |= dn->dn_mask & ~DN_MULTISHOT;
-	inode->i_dnotify_mask = new_mask;
-}
-
-void dnotify_flush(struct file *filp, fl_owner_t id)
-{
-	struct dnotify_struct *dn;
-	struct dnotify_struct **prev;
-	struct inode *inode;
-
-	inode = filp->f_path.dentry->d_inode;
-	if (!S_ISDIR(inode->i_mode))
-		return;
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((dn = *prev) != NULL) {
-		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
-			*prev = dn->dn_next;
-			redo_inode_mask(inode);
-			kmem_cache_free(dn_cache, dn);
-			break;
-		}
-		prev = &dn->dn_next;
-	}
-	spin_unlock(&inode->i_lock);
-}
-
-int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
-{
-	struct dnotify_struct *dn;
-	struct dnotify_struct *odn;
-	struct dnotify_struct **prev;
-	struct inode *inode;
-	fl_owner_t id = current->files;
-	struct file *f;
-	int error = 0;
-
-	if ((arg & ~DN_MULTISHOT) == 0) {
-		dnotify_flush(filp, id);
-		return 0;
-	}
-	if (!dir_notify_enable)
-		return -EINVAL;
-	inode = filp->f_path.dentry->d_inode;
-	if (!S_ISDIR(inode->i_mode))
-		return -ENOTDIR;
-	dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
-	if (dn == NULL)
-		return -ENOMEM;
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((odn = *prev) != NULL) {
-		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
-			odn->dn_fd = fd;
-			odn->dn_mask |= arg;
-			inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-			goto out_free;
-		}
-		prev = &odn->dn_next;
-	}
-
-	rcu_read_lock();
-	f = fcheck(fd);
-	rcu_read_unlock();
-	/* we'd lost the race with close(), sod off silently */
-	/* note that inode->i_lock prevents reordering problems
-	 * between accesses to descriptor table and ->i_dnotify */
-	if (f != filp)
-		goto out_free;
-
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	if (error)
-		goto out_free;
-
-	dn->dn_mask = arg;
-	dn->dn_fd = fd;
-	dn->dn_filp = filp;
-	dn->dn_owner = id;
-	inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-	dn->dn_next = inode->i_dnotify;
-	inode->i_dnotify = dn;
-	spin_unlock(&inode->i_lock);
-	return 0;
-
-out_free:
-	spin_unlock(&inode->i_lock);
-	kmem_cache_free(dn_cache, dn);
-	return error;
-}
-
-void __inode_dir_notify(struct inode *inode, unsigned long event)
-{
-	struct dnotify_struct *	dn;
-	struct dnotify_struct **prev;
-	struct fown_struct *	fown;
-	int			changed = 0;
-
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((dn = *prev) != NULL) {
-		if ((dn->dn_mask & event) == 0) {
-			prev = &dn->dn_next;
-			continue;
-		}
-		fown = &dn->dn_filp->f_owner;
-		send_sigio(fown, dn->dn_fd, POLL_MSG);
-		if (dn->dn_mask & DN_MULTISHOT)
-			prev = &dn->dn_next;
-		else {
-			*prev = dn->dn_next;
-			changed = 1;
-			kmem_cache_free(dn_cache, dn);
-		}
-	}
-	if (changed)
-		redo_inode_mask(inode);
-	spin_unlock(&inode->i_lock);
-}
-
-EXPORT_SYMBOL(__inode_dir_notify);
-
-/*
- * This is hopelessly wrong, but unfixable without API changes.  At
- * least it doesn't oops the kernel...
- *
- * To safely access ->d_parent we need to keep d_move away from it.  Use the
- * dentry's d_lock for this.
- */
-void dnotify_parent(struct dentry *dentry, unsigned long event)
-{
-	struct dentry *parent;
-
-	if (!dir_notify_enable)
-		return;
-
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	if (parent->d_inode->i_dnotify_mask & event) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		__inode_dir_notify(parent->d_inode, event);
-		dput(parent);
-	} else {
-		spin_unlock(&dentry->d_lock);
-	}
-}
-EXPORT_SYMBOL_GPL(dnotify_parent);
-
-static int __init dnotify_init(void)
-{
-	dn_cache = kmem_cache_create("dnotify_cache",
-		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
-	return 0;
-}
-
-module_init(dnotify_init)
diff --git a/fs/inotify.c b/fs/inotify.c
deleted file mode 100644
index dae3f28..0000000
--- a/fs/inotify.c
+++ /dev/null
@@ -1,913 +0,0 @@
-/*
- * fs/inotify.c - inode-based file event notifications
- *
- * Authors:
- *	John McCutchan	<ttb@tentacle.dhs.org>
- *	Robert Love	<rml@novell.com>
- *
- * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
- *
- * Copyright (C) 2005 John McCutchan
- * Copyright 2006 Hewlett-Packard Development Company, L.P.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/writeback.h>
-#include <linux/inotify.h>
-
-static atomic_t inotify_cookie;
-
-/*
- * Lock ordering:
- *
- * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
- * iprune_mutex (synchronize shrink_icache_memory())
- * 	inode_lock (protects the super_block->s_inodes list)
- * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
- *
- * The inode->inotify_mutex and inotify_handle->mutex and held during execution
- * of a caller's event handler.  Thus, the caller must not hold any locks
- * taken in their event handler while calling any of the published inotify
- * interfaces.
- */
-
-/*
- * Lifetimes of the three main data structures--inotify_handle, inode, and
- * inotify_watch--are managed by reference count.
- *
- * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
- * Additional references can bump the count via get_inotify_handle() and drop
- * the count via put_inotify_handle().
- *
- * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
- * to remove_watch_no_event().  Additional references can bump the count via
- * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
- * is reponsible for the final put after receiving IN_IGNORED, or when using
- * IN_ONESHOT after receiving the first event.  Inotify does the final put if
- * inotify_destroy() is called.
- *
- * inode: Pinned so long as the inode is associated with a watch, from
- * inotify_add_watch() to the final put_inotify_watch().
- */
-
-/*
- * struct inotify_handle - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_handle {
-	struct idr		idr;		/* idr mapping wd -> watch */
-	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head	watches;	/* list of watches */
-	atomic_t		count;		/* reference count */
-	u32			last_wd;	/* the last wd allocated */
-	const struct inotify_operations *in_ops; /* inotify caller operations */
-};
-
-static inline void get_inotify_handle(struct inotify_handle *ih)
-{
-	atomic_inc(&ih->count);
-}
-
-static inline void put_inotify_handle(struct inotify_handle *ih)
-{
-	if (atomic_dec_and_test(&ih->count)) {
-		idr_destroy(&ih->idr);
-		kfree(ih);
-	}
-}
-
-/**
- * get_inotify_watch - grab a reference to an inotify_watch
- * @watch: watch to grab
- */
-void get_inotify_watch(struct inotify_watch *watch)
-{
-	atomic_inc(&watch->count);
-}
-EXPORT_SYMBOL_GPL(get_inotify_watch);
-
-int pin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	spin_lock(&sb_lock);
-	if (sb->s_count >= S_BIAS) {
-		atomic_inc(&sb->s_active);
-		spin_unlock(&sb_lock);
-		atomic_inc(&watch->count);
-		return 1;
-	}
-	spin_unlock(&sb_lock);
-	return 0;
-}
-
-/**
- * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * watch references if the count reaches zero.  inotify_watch is freed by
- * inotify callers via the destroy_watch() op.
- * @watch: watch to release
- */
-void put_inotify_watch(struct inotify_watch *watch)
-{
-	if (atomic_dec_and_test(&watch->count)) {
-		struct inotify_handle *ih = watch->ih;
-
-		iput(watch->inode);
-		ih->in_ops->destroy_watch(watch);
-		put_inotify_handle(ih);
-	}
-}
-EXPORT_SYMBOL_GPL(put_inotify_watch);
-
-void unpin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	deactivate_super(sb);
-}
-
-/*
- * inotify_handle_get_wd - returns the next WD for use by the given handle
- *
- * Callers must hold ih->mutex.  This function can sleep.
- */
-static int inotify_handle_get_wd(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	int ret;
-
-	do {
-		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
-			return -ENOSPC;
-		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
-	} while (ret == -EAGAIN);
-
-	if (likely(!ret))
-		ih->last_wd = watch->wd;
-
-	return ret;
-}
-
-/*
- * inotify_inode_watched - returns nonzero if there are watches on this inode
- * and zero otherwise.  We call this lockless, we do not care if we race.
- */
-static inline int inotify_inode_watched(struct inode *inode)
-{
-	return !list_empty(&inode->inotify_watches);
-}
-
-/*
- * Get child dentry flag into synch with parent inode.
- * Flag should always be clear for negative dentrys.
- */
-static void set_dentry_child_flags(struct inode *inode, int watched)
-{
-	struct dentry *alias;
-
-	spin_lock(&dcache_lock);
-	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
-		struct dentry *child;
-
-		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
-			if (!child->d_inode)
-				continue;
-
-			spin_lock(&child->d_lock);
-			if (watched)
-				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-			else
-				child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
-			spin_unlock(&child->d_lock);
-		}
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * inotify_find_handle - find the watch associated with the given inode and
- * handle
- *
- * Callers must hold inode->inotify_mutex.
- */
-static struct inotify_watch *inode_find_handle(struct inode *inode,
-					       struct inotify_handle *ih)
-{
-	struct inotify_watch *watch;
-
-	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->ih == ih)
-			return watch;
-	}
-
-	return NULL;
-}
-
-/*
- * remove_watch_no_event - remove watch without the IN_IGNORED event.
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_handle *ih)
-{
-	list_del(&watch->i_list);
-	list_del(&watch->h_list);
-
-	if (!inotify_inode_watched(watch->inode))
-		set_dentry_child_flags(watch->inode, 0);
-
-	idr_remove(&ih->idr, watch->wd);
-}
-
-/**
- * inotify_remove_watch_locked - Remove a watch from both the handle and the
- * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
- * watched.  May be invoked from a caller's event handler.
- * @ih: inotify handle associated with watch
- * @watch: watch to remove
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-void inotify_remove_watch_locked(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	remove_watch_no_event(watch, ih);
-	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
-}
-EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
-
-/* Kernel API for producing events */
-
-/*
- * inotify_d_instantiate - instantiate dcache entry for inode
- */
-void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
-{
-	struct dentry *parent;
-
-	if (!inode)
-		return;
-
-	spin_lock(&entry->d_lock);
-	parent = entry->d_parent;
-	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	spin_unlock(&entry->d_lock);
-}
-
-/*
- * inotify_d_move - dcache entry has been moved
- */
-void inotify_d_move(struct dentry *entry)
-{
-	struct dentry *parent;
-
-	parent = entry->d_parent;
-	if (inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	else
-		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
-}
-
-/**
- * inotify_inode_queue_event - queue an event to all watches on this inode
- * @inode: inode event is originating from
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- * @n_inode: inode associated with name
- */
-void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-			       const char *name, struct inode *n_inode)
-{
-	struct inotify_watch *watch, *next;
-
-	if (!inotify_inode_watched(inode))
-		return;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		u32 watch_mask = watch->mask;
-		if (watch_mask & mask) {
-			struct inotify_handle *ih= watch->ih;
-			mutex_lock(&ih->mutex);
-			if (watch_mask & IN_ONESHOT)
-				remove_watch_no_event(watch, ih);
-			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
-						 name, n_inode);
-			mutex_unlock(&ih->mutex);
-		}
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
-
-/**
- * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
- * @dentry: the dentry in question, we queue against this dentry's parent
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- */
-void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
-				       u32 cookie, const char *name)
-{
-	struct dentry *parent;
-	struct inode *inode;
-
-	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
-		return;
-
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	inode = parent->d_inode;
-
-	if (inotify_inode_watched(inode)) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		inotify_inode_queue_event(inode, mask, cookie, name,
-					  dentry->d_inode);
-		dput(parent);
-	} else
-		spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
-
-/**
- * inotify_get_cookie - return a unique cookie for use in synchronizing events.
- */
-u32 inotify_get_cookie(void)
-{
-	return atomic_inc_return(&inotify_cookie);
-}
-EXPORT_SYMBOL_GPL(inotify_get_cookie);
-
-/**
- * inotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @list: list of inodes being unmounted (sb->s_inodes)
- *
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
- */
-void inotify_unmount_inodes(struct list_head *list)
-{
-	struct inode *inode, *next_i, *need_iput = NULL;
-
-	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
-		struct inotify_watch *watch, *next_w;
-		struct inode *need_iput_tmp;
-		struct list_head *watches;
-
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 */
-		if (!atomic_read(&inode->i_count))
-			continue;
-
-		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
-		 * I_WILL_FREE which is fine because by that point the inode
-		 * cannot have any associated watches.
-		 */
-		if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
-			continue;
-
-		need_iput_tmp = need_iput;
-		need_iput = NULL;
-		/* In case inotify_remove_watch_locked() drops a reference. */
-		if (inode != need_iput_tmp)
-			__iget(inode);
-		else
-			need_iput_tmp = NULL;
-		/* In case the dropping of a reference would nuke next_i. */
-		if ((&next_i->i_sb_list != list) &&
-				atomic_read(&next_i->i_count) &&
-				!(next_i->i_state & (I_CLEAR | I_FREEING |
-					I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
-		}
-
-		/*
-		 * We can safely drop inode_lock here because we hold
-		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.  Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
-		 */
-		spin_unlock(&inode_lock);
-
-		if (need_iput_tmp)
-			iput(need_iput_tmp);
-
-		/* for each watch, send IN_UNMOUNT and then remove it */
-		mutex_lock(&inode->inotify_mutex);
-		watches = &inode->inotify_watches;
-		list_for_each_entry_safe(watch, next_w, watches, i_list) {
-			struct inotify_handle *ih= watch->ih;
-			get_inotify_watch(watch);
-			mutex_lock(&ih->mutex);
-			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
-						 NULL, NULL);
-			inotify_remove_watch_locked(ih, watch);
-			mutex_unlock(&ih->mutex);
-			put_inotify_watch(watch);
-		}
-		mutex_unlock(&inode->inotify_mutex);
-		iput(inode);		
-
-		spin_lock(&inode_lock);
-	}
-}
-EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
-
-/**
- * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
- * @inode: inode that is about to be removed
- */
-void inotify_inode_is_dead(struct inode *inode)
-{
-	struct inotify_watch *watch, *next;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		struct inotify_handle *ih = watch->ih;
-		mutex_lock(&ih->mutex);
-		inotify_remove_watch_locked(ih, watch);
-		mutex_unlock(&ih->mutex);
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
-
-/* Kernel Consumer API */
-
-/**
- * inotify_init - allocate and initialize an inotify instance
- * @ops: caller's inotify operations
- */
-struct inotify_handle *inotify_init(const struct inotify_operations *ops)
-{
-	struct inotify_handle *ih;
-
-	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
-	if (unlikely(!ih))
-		return ERR_PTR(-ENOMEM);
-
-	idr_init(&ih->idr);
-	INIT_LIST_HEAD(&ih->watches);
-	mutex_init(&ih->mutex);
-	ih->last_wd = 0;
-	ih->in_ops = ops;
-	atomic_set(&ih->count, 0);
-	get_inotify_handle(ih);
-
-	return ih;
-}
-EXPORT_SYMBOL_GPL(inotify_init);
-
-/**
- * inotify_init_watch - initialize an inotify watch
- * @watch: watch to initialize
- */
-void inotify_init_watch(struct inotify_watch *watch)
-{
-	INIT_LIST_HEAD(&watch->h_list);
-	INIT_LIST_HEAD(&watch->i_list);
-	atomic_set(&watch->count, 0);
-	get_inotify_watch(watch); /* initial get */
-}
-EXPORT_SYMBOL_GPL(inotify_init_watch);
-
-/*
- * Watch removals suck violently.  To kick the watch out we need (in this
- * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
- * a hold on inode; however, for all other cases we need to make damn sure
- * we don't race with umount.  We can *NOT* just grab a reference to a
- * watch - inotify_unmount_inodes() will happily sail past it and we'll end
- * with reference to inode potentially outliving its superblock.  Ideally
- * we just want to grab an active reference to superblock if we can; that
- * will make sure we won't go into inotify_umount_inodes() until we are
- * done.  Cleanup is just deactivate_super().  However, that leaves a messy
- * case - what if we *are* racing with umount() and active references to
- * superblock can't be acquired anymore?  We can bump ->s_count, grab
- * ->s_umount, which will almost certainly wait until the superblock is shut
- * down and the watch in question is pining for fjords.  That's fine, but
- * there is a problem - we might have hit the window between ->s_active
- * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
- * is past the point of no return and is heading for shutdown) and the
- * moment when deactivate_super() acquires ->s_umount.  We could just do
- * drop_super() yield() and retry, but that's rather antisocial and this
- * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
- * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
- * that we won't race with inotify_umount_inodes().  So we could grab a
- * reference to watch and do the rest as above, just with drop_super() instead
- * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
- * could grab ->s_umount.  So the watch could've been gone already.
- *
- * That still can be dealt with - we need to save watch->wd, do idr_find()
- * and compare its result with our pointer.  If they match, we either have
- * the damn thing still alive or we'd lost not one but two races at once,
- * the watch had been killed and a new one got created with the same ->wd
- * at the same address.  That couldn't have happened in inotify_destroy(),
- * but inotify_rm_wd() could run into that.  Still, "new one got created"
- * is not a problem - we have every right to kill it or leave it alone,
- * whatever's more convenient.
- *
- * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
- * "grab it and kill it" check.  If it's been our original watch, we are
- * fine, if it's a newcomer - nevermind, just pretend that we'd won the
- * race and kill the fscker anyway; we are safe since we know that its
- * superblock won't be going away.
- *
- * And yes, this is far beyond mere "not very pretty"; so's the entire
- * concept of inotify to start with.
- */
-
-/**
- * pin_to_kill - pin the watch down for removal
- * @ih: inotify handle
- * @watch: watch to kill
- *
- * Called with ih->mutex held, drops it.  Possible return values:
- * 0 - nothing to do, it has died
- * 1 - remove it, drop the reference and deactivate_super()
- * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
- * that variant, since it involved a lot of PITA, but that's the best that
- * could've been done.
- */
-static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	s32 wd = watch->wd;
-
-	spin_lock(&sb_lock);
-	if (sb->s_count >= S_BIAS) {
-		atomic_inc(&sb->s_active);
-		spin_unlock(&sb_lock);
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
-		return 1;	/* the best outcome */
-	}
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
-	down_read(&sb->s_umount);
-	if (likely(!sb->s_root)) {
-		/* fs is already shut down; the watch is dead */
-		drop_super(sb);
-		return 0;
-	}
-	/* raced with the final deactivate_super() */
-	mutex_lock(&ih->mutex);
-	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
-		/* the watch is dead */
-		mutex_unlock(&ih->mutex);
-		drop_super(sb);
-		return 0;
-	}
-	/* still alive or freed and reused with the same sb and wd; kill */
-	get_inotify_watch(watch);
-	mutex_unlock(&ih->mutex);
-	return 2;
-}
-
-static void unpin_and_kill(struct inotify_watch *watch, int how)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	switch (how) {
-	case 1:
-		deactivate_super(sb);
-		break;
-	case 2:
-		drop_super(sb);
-	}
-}
-
-/**
- * inotify_destroy - clean up and destroy an inotify instance
- * @ih: inotify handle
- */
-void inotify_destroy(struct inotify_handle *ih)
-{
-	/*
-	 * Destroy all of the watches for this handle. Unfortunately, not very
-	 * pretty.  We cannot do a simple iteration over the list, because we
-	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_mutex before ih->mutex.  The following works.
-	 *
-	 * AV: it had to become even uglier to start working ;-/
-	 */
-	while (1) {
-		struct inotify_watch *watch;
-		struct list_head *watches;
-		struct super_block *sb;
-		struct inode *inode;
-		int how;
-
-		mutex_lock(&ih->mutex);
-		watches = &ih->watches;
-		if (list_empty(watches)) {
-			mutex_unlock(&ih->mutex);
-			break;
-		}
-		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		sb = watch->inode->i_sb;
-		how = pin_to_kill(ih, watch);
-		if (!how)
-			continue;
-
-		inode = watch->inode;
-		mutex_lock(&inode->inotify_mutex);
-		mutex_lock(&ih->mutex);
-
-		/* make sure we didn't race with another list removal */
-		if (likely(idr_find(&ih->idr, watch->wd))) {
-			remove_watch_no_event(watch, ih);
-			put_inotify_watch(watch);
-		}
-
-		mutex_unlock(&ih->mutex);
-		mutex_unlock(&inode->inotify_mutex);
-		unpin_and_kill(watch, how);
-	}
-
-	/* free this handle: the put matching the get in inotify_init() */
-	put_inotify_handle(ih);
-}
-EXPORT_SYMBOL_GPL(inotify_destroy);
-
-/**
- * inotify_find_watch - find an existing watch for an (ih,inode) pair
- * @ih: inotify handle
- * @inode: inode to watch
- * @watchp: pointer to existing inotify_watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
-		       struct inotify_watch **watchp)
-{
-	struct inotify_watch *old;
-	int ret = -ENOENT;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	old = inode_find_handle(inode, ih);
-	if (unlikely(old)) {
-		get_inotify_watch(old); /* caller must put watch */
-		*watchp = old;
-		ret = old->wd;
-	}
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_watch);
-
-/**
- * inotify_find_update_watch - find and update the mask of an existing watch
- * @ih: inotify handle
- * @inode: inode's watch to update
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
-			      u32 mask)
-{
-	struct inotify_watch *old;
-	int mask_add = 0;
-	int ret;
-
-	if (mask & IN_MASK_ADD)
-		mask_add = 1;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/*
-	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
-	 * are already watching.  We just update the mask and return its wd.
-	 */
-	old = inode_find_handle(inode, ih);
-	if (unlikely(!old)) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	if (mask_add)
-		old->mask |= mask;
-	else
-		old->mask = mask;
-	ret = old->wd;
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_update_watch);
-
-/**
- * inotify_add_watch - add a watch to an inotify instance
- * @ih: inotify handle
- * @watch: caller allocated watch structure
- * @inode: inode to watch
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- * Caller must ensure it only calls inotify_add_watch() once per watch.
- * Calls inotify_handle_get_wd() so may sleep.
- */
-s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
-		      struct inode *inode, u32 mask)
-{
-	int ret = 0;
-	int newly_watched;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-	watch->mask = mask;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, watch);
-	if (unlikely(ret))
-		goto out;
-	ret = watch->wd;
-
-	/* save a reference to handle and bump the count to make it official */
-	get_inotify_handle(ih);
-	watch->ih = ih;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* Add the watch to the handle's and the inode's list */
-	newly_watched = !inotify_inode_watched(inode);
-	list_add(&watch->h_list, &ih->watches);
-	list_add(&watch->i_list, &inode->inotify_watches);
-	/*
-	 * Set child flags _after_ adding the watch, so there is no race
-	 * windows where newly instantiated children could miss their parent's
-	 * watched flag.
-	 */
-	if (newly_watched)
-		set_dentry_child_flags(inode, 1);
-
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_add_watch);
-
-/**
- * inotify_clone_watch - put the watch next to existing one
- * @old: already installed watch
- * @new: new watch
- *
- * Caller must hold the inotify_mutex of inode we are dealing with;
- * it is expected to remove the old watch before unlocking the inode.
- */
-s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
-{
-	struct inotify_handle *ih = old->ih;
-	int ret = 0;
-
-	new->mask = old->mask;
-	new->ih = ih;
-
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, new);
-	if (unlikely(ret))
-		goto out;
-	ret = new->wd;
-
-	get_inotify_handle(ih);
-
-	new->inode = igrab(old->inode);
-
-	list_add(&new->h_list, &ih->watches);
-	list_add(&new->i_list, &old->inode->inotify_watches);
-out:
-	mutex_unlock(&ih->mutex);
-	return ret;
-}
-
-void inotify_evict_watch(struct inotify_watch *watch)
-{
-	get_inotify_watch(watch);
-	mutex_lock(&watch->ih->mutex);
-	inotify_remove_watch_locked(watch->ih, watch);
-	mutex_unlock(&watch->ih->mutex);
-}
-
-/**
- * inotify_rm_wd - remove a watch from an inotify instance
- * @ih: inotify handle
- * @wd: watch descriptor to remove
- *
- * Can sleep.
- */
-int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
-{
-	struct inotify_watch *watch;
-	struct super_block *sb;
-	struct inode *inode;
-	int how;
-
-	mutex_lock(&ih->mutex);
-	watch = idr_find(&ih->idr, wd);
-	if (unlikely(!watch)) {
-		mutex_unlock(&ih->mutex);
-		return -EINVAL;
-	}
-	sb = watch->inode->i_sb;
-	how = pin_to_kill(ih, watch);
-	if (!how)
-		return 0;
-
-	inode = watch->inode;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* make sure that we did not race */
-	if (likely(idr_find(&ih->idr, wd) == watch))
-		inotify_remove_watch_locked(ih, watch);
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	unpin_and_kill(watch, how);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(inotify_rm_wd);
-
-/**
- * inotify_rm_watch - remove a watch from an inotify instance
- * @ih: inotify handle
- * @watch: watch to remove
- *
- * Can sleep.
- */
-int inotify_rm_watch(struct inotify_handle *ih,
-		     struct inotify_watch *watch)
-{
-	return inotify_rm_wd(ih, watch->wd);
-}
-EXPORT_SYMBOL_GPL(inotify_rm_watch);
-
-/*
- * inotify_setup - core initialization function
- */
-static int __init inotify_setup(void)
-{
-	atomic_set(&inotify_cookie, 0);
-
-	return 0;
-}
-
-module_init(inotify_setup);
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
deleted file mode 100644
index 400f806..0000000
--- a/fs/inotify_user.c
+++ /dev/null
@@ -1,778 +0,0 @@
-/*
- * fs/inotify_user.c - inotify support for userspace
- *
- * Authors:
- *	John McCutchan	<ttb@tentacle.dhs.org>
- *	Robert Love	<rml@novell.com>
- *
- * Copyright (C) 2005 John McCutchan
- * Copyright 2006 Hewlett-Packard Development Company, L.P.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/inotify.h>
-#include <linux/syscalls.h>
-#include <linux/magic.h>
-
-#include <asm/ioctls.h>
-
-static struct kmem_cache *watch_cachep __read_mostly;
-static struct kmem_cache *event_cachep __read_mostly;
-
-static struct vfsmount *inotify_mnt __read_mostly;
-
-/* these are configurable via /proc/sys/fs/inotify/ */
-static int inotify_max_user_instances __read_mostly;
-static int inotify_max_user_watches __read_mostly;
-static int inotify_max_queued_events __read_mostly;
-
-/*
- * Lock ordering:
- *
- * inotify_dev->up_mutex (ensures we don't re-add the same watch)
- * 	inode->inotify_mutex (protects inode's watch list)
- * 		inotify_handle->mutex (protects inotify_handle's watch list)
- * 			inotify_dev->ev_mutex (protects device's event queue)
- */
-
-/*
- * Lifetimes of the main data structures:
- *
- * inotify_device: Lifetime is managed by reference count, from
- * sys_inotify_init() until release.  Additional references can bump the count
- * via get_inotify_dev() and drop the count via put_inotify_dev().
- *
- * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
- * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
- * first event, or to inotify_destroy().
- */
-
-/*
- * struct inotify_device - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_device {
-	wait_queue_head_t 	wq;		/* wait queue for i/o */
-	struct mutex		ev_mutex;	/* protects event queue */
-	struct mutex		up_mutex;	/* synchronizes watch updates */
-	struct list_head 	events;		/* list of queued events */
-	struct user_struct	*user;		/* user who opened this dev */
-	struct inotify_handle	*ih;		/* inotify handle */
-	struct fasync_struct    *fa;            /* async notification */
-	atomic_t		count;		/* reference count */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
-};
-
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->ev_mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head        list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_user_watch - our version of an inotify_watch, we add
- * a reference to the associated inotify_device.
- */
-struct inotify_user_watch {
-	struct inotify_device	*dev;	/* associated device */
-	struct inotify_watch	wdata;	/* inotify watch data */
-};
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/sysctl.h>
-
-static int zero;
-
-ctl_table inotify_table[] = {
-	{
-		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
-		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
-		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
-		.procname	= "max_queued_events",
-		.data		= &inotify_max_queued_events,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero
-	},
-	{ .ctl_name = 0 }
-};
-#endif /* CONFIG_SYSCTL */
-
-static inline void get_inotify_dev(struct inotify_device *dev)
-{
-	atomic_inc(&dev->count);
-}
-
-static inline void put_inotify_dev(struct inotify_device *dev)
-{
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		kfree(dev);
-	}
-}
-
-/*
- * free_inotify_user_watch - cleans up the watch and its references
- */
-static void free_inotify_user_watch(struct inotify_watch *w)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
-
-	atomic_dec(&dev->user->inotify_watches);
-	put_inotify_dev(dev);
-	kmem_cache_free(watch_cachep, watch);
-}
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_KERNEL);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_get_last_event - return the last event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_last_event(struct inotify_device *dev)
-{
-	if (list_empty(&dev->events))
-		return NULL;
-	return list_entry(dev->events.prev, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_queue_event - event handler registered with core inotify, adds
- * a new event to the given device
- *
- * Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
-				    u32 cookie, const char *name,
-				    struct inode *ignored)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-	struct inotify_kernel_event *kevent, *last;
-
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
-
-	mutex_lock(&dev->ev_mutex);
-
-	/* we can safely put the watch as we don't reference it while
-	 * generating the event
-	 */
-	if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
-		put_inotify_watch(w); /* final put */
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_last_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			goto out;
-		if (name && lastname && !strcmp(lastname, name))
-			goto out;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		goto out;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		goto out;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-	kill_fasync(&dev->fa, SIGIO, POLL_IN);
-
-out:
-	mutex_unlock(&dev->ev_mutex);
-}
-
-/*
- * remove_kevent - cleans up the given kevent
- *
- * Caller must hold dev->ev_mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
-
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-}
-
-/*
- * free_kevent - frees the given kevent.
- */
-static void free_kevent(struct inotify_kernel_event *kevent)
-{
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
-
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->ev_mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
-		free_kevent(kevent);
-	}
-}
-
-/*
- * find_inode - resolve a user-given path to a specific inode
- */
-static int find_inode(const char __user *dirname, struct path *path,
-		      unsigned flags)
-{
-	int error;
-
-	error = user_path_at(AT_FDCWD, dirname, flags, path);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = inode_permission(path->dentry->d_inode, MAY_READ);
-	if (error)
-		path_put(path);
-	return error;
-}
-
-/*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->up_mutex.
- */
-static int create_watch(struct inotify_device *dev, struct inode *inode,
-			u32 mask)
-{
-	struct inotify_user_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return -ENOSPC;
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return -ENOMEM;
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	inotify_init_watch(&watch->wdata);
-	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
-	if (ret < 0)
-		free_inotify_user_watch(&watch->wdata);
-
-	return ret;
-}
-
-/* Device Interface */
-
-static unsigned int inotify_poll(struct file *file, poll_table *wait)
-{
-	struct inotify_device *dev = file->private_data;
-	int ret = 0;
-
-	poll_wait(file, &dev->wq, wait);
-	mutex_lock(&dev->ev_mutex);
-	if (!list_empty(&dev->events))
-		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&dev->ev_mutex);
-
-	return ret;
-}
-
-static ssize_t inotify_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *pos)
-{
-	size_t event_size = sizeof (struct inotify_event);
-	struct inotify_device *dev;
-	char __user *start;
-	int ret;
-	DEFINE_WAIT(wait);
-
-	start = buf;
-	dev = file->private_data;
-
-	while (1) {
-
-		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
-
-		mutex_lock(&dev->ev_mutex);
-		if (!list_empty(&dev->events)) {
-			ret = 0;
-			break;
-		}
-		mutex_unlock(&dev->ev_mutex);
-
-		if (file->f_flags & O_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-
-		schedule();
-	}
-
-	finish_wait(&dev->wq, &wait);
-	if (ret)
-		return ret;
-
-	while (1) {
-		struct inotify_kernel_event *kevent;
-
-		ret = buf - start;
-		if (list_empty(&dev->events))
-			break;
-
-		kevent = inotify_dev_get_event(dev);
-		if (event_size + kevent->event.len > count) {
-			if (ret == 0 && count > 0) {
-				/*
-				 * could not get a single event because we
-				 * didn't have enough buffer space.
-				 */
-				ret = -EINVAL;
-			}
-			break;
-		}
-		remove_kevent(dev, kevent);
-
-		/*
-		 * Must perform the copy_to_user outside the mutex in order
-		 * to avoid a lock order reversal with mmap_sem.
-		 */
-		mutex_unlock(&dev->ev_mutex);
-
-		if (copy_to_user(buf, &kevent->event, event_size)) {
-			ret = -EFAULT;
-			break;
-		}
-		buf += event_size;
-		count -= event_size;
-
-		if (kevent->name) {
-			if (copy_to_user(buf, kevent->name, kevent->event.len)){
-				ret = -EFAULT;
-				break;
-			}
-			buf += kevent->event.len;
-			count -= kevent->event.len;
-		}
-
-		free_kevent(kevent);
-
-		mutex_lock(&dev->ev_mutex);
-	}
-	mutex_unlock(&dev->ev_mutex);
-
-	return ret;
-}
-
-static int inotify_fasync(int fd, struct file *file, int on)
-{
-	struct inotify_device *dev = file->private_data;
-
-	return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO;
-}
-
-static int inotify_release(struct inode *ignored, struct file *file)
-{
-	struct inotify_device *dev = file->private_data;
-
-	inotify_destroy(dev->ih);
-
-	/* destroy all of the events on this device */
-	mutex_lock(&dev->ev_mutex);
-	while (!list_empty(&dev->events))
-		inotify_dev_event_dequeue(dev);
-	mutex_unlock(&dev->ev_mutex);
-
-	/* free this device: the put matching the get in inotify_init() */
-	put_inotify_dev(dev);
-
-	return 0;
-}
-
-static long inotify_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	struct inotify_device *dev;
-	void __user *p;
-	int ret = -ENOTTY;
-
-	dev = file->private_data;
-	p = (void __user *) arg;
-
-	switch (cmd) {
-	case FIONREAD:
-		ret = put_user(dev->queue_size, (int __user *) p);
-		break;
-	}
-
-	return ret;
-}
-
-static const struct file_operations inotify_fops = {
-	.poll           = inotify_poll,
-	.read           = inotify_read,
-	.fasync         = inotify_fasync,
-	.release        = inotify_release,
-	.unlocked_ioctl = inotify_ioctl,
-	.compat_ioctl	= inotify_ioctl,
-};
-
-static const struct inotify_operations inotify_user_ops = {
-	.handle_event	= inotify_dev_queue_event,
-	.destroy_watch	= free_inotify_user_watch,
-};
-
-asmlinkage long sys_inotify_init1(int flags)
-{
-	struct inotify_device *dev;
-	struct inotify_handle *ih;
-	struct user_struct *user;
-	struct file *filp;
-	int fd, ret;
-
-	/* Check the IN_* constants for consistency.  */
-	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
-	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
-
-	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
-		return -EINVAL;
-
-	fd = get_unused_fd_flags(flags & O_CLOEXEC);
-	if (fd < 0)
-		return fd;
-
-	filp = get_empty_filp();
-	if (!filp) {
-		ret = -ENFILE;
-		goto out_put_fd;
-	}
-
-	user = get_current_user();
-	if (unlikely(atomic_read(&user->inotify_devs) >=
-			inotify_max_user_instances)) {
-		ret = -EMFILE;
-		goto out_free_uid;
-	}
-
-	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
-	if (unlikely(!dev)) {
-		ret = -ENOMEM;
-		goto out_free_uid;
-	}
-
-	ih = inotify_init(&inotify_user_ops);
-	if (IS_ERR(ih)) {
-		ret = PTR_ERR(ih);
-		goto out_free_dev;
-	}
-	dev->ih = ih;
-	dev->fa = NULL;
-
-	filp->f_op = &inotify_fops;
-	filp->f_path.mnt = mntget(inotify_mnt);
-	filp->f_path.dentry = dget(inotify_mnt->mnt_root);
-	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
-	filp->f_mode = FMODE_READ;
-	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-	filp->private_data = dev;
-
-	INIT_LIST_HEAD(&dev->events);
-	init_waitqueue_head(&dev->wq);
-	mutex_init(&dev->ev_mutex);
-	mutex_init(&dev->up_mutex);
-	dev->event_count = 0;
-	dev->queue_size = 0;
-	dev->max_events = inotify_max_queued_events;
-	dev->user = user;
-	atomic_set(&dev->count, 0);
-
-	get_inotify_dev(dev);
-	atomic_inc(&user->inotify_devs);
-	fd_install(fd, filp);
-
-	return fd;
-out_free_dev:
-	kfree(dev);
-out_free_uid:
-	free_uid(user);
-	put_filp(filp);
-out_put_fd:
-	put_unused_fd(fd);
-	return ret;
-}
-
-asmlinkage long sys_inotify_init(void)
-{
-	return sys_inotify_init1(0);
-}
-
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
-{
-	struct inode *inode;
-	struct inotify_device *dev;
-	struct path path;
-	struct file *filp;
-	int ret, fput_needed;
-	unsigned flags = 0;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
-
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto fput_and_out;
-	}
-
-	if (!(mask & IN_DONT_FOLLOW))
-		flags |= LOOKUP_FOLLOW;
-	if (mask & IN_ONLYDIR)
-		flags |= LOOKUP_DIRECTORY;
-
-	ret = find_inode(pathname, &path, flags);
-	if (unlikely(ret))
-		goto fput_and_out;
-
-	/* inode held in place by reference to path; dev by fget on fd */
-	inode = path.dentry->d_inode;
-	dev = filp->private_data;
-
-	mutex_lock(&dev->up_mutex);
-	ret = inotify_find_update_watch(dev->ih, inode, mask);
-	if (ret == -ENOENT)
-		ret = create_watch(dev, inode, mask);
-	mutex_unlock(&dev->up_mutex);
-
-	path_put(&path);
-fput_and_out:
-	fput_light(filp, fput_needed);
-	return ret;
-}
-
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
-{
-	struct file *filp;
-	struct inotify_device *dev;
-	int ret, fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
-
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	dev = filp->private_data;
-
-	/* we free our watch data when we get IN_IGNORED */
-	ret = inotify_rm_wd(dev->ih, wd);
-
-out:
-	fput_light(filp, fput_needed);
-	return ret;
-}
-
-static int
-inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "inotify", NULL,
-			INOTIFYFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
-};
-
-/*
- * inotify_user_setup - Our initialization function.  Note that we cannnot return
- * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
- * must result in panic().
- */
-static int __init inotify_user_setup(void)
-{
-	int ret;
-
-	ret = register_filesystem(&inotify_fs_type);
-	if (unlikely(ret))
-		panic("inotify: register_filesystem returned %d!\n", ret);
-
-	inotify_mnt = kern_mount(&inotify_fs_type);
-	if (IS_ERR(inotify_mnt))
-		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-
-	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
-
-	watch_cachep = kmem_cache_create("inotify_watch_cache",
-					 sizeof(struct inotify_user_watch),
-					 0, SLAB_PANIC, NULL);
-	event_cachep = kmem_cache_create("inotify_event_cache",
-					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL);
-
-	return 0;
-}
-
-module_init(inotify_user_setup);
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 0000000..50914d7
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 0000000..5a95b60
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
+obj-y			+= dnotify/
+obj-y			+= inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 0000000..26adf5d
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
+config DNOTIFY
+	bool "Dnotify support"
+	default y
+	help
+	  Dnotify is a directory-based per-fd file change notification system
+	  that uses signals to communicate events to user-space.  There exist
+	  superior alternatives, but some applications may still rely on
+	  dnotify.
+
+	  If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 0000000..f145251
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DNOTIFY)		+= dnotify.o
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
new file mode 100644
index 0000000..b0aa2cd
--- /dev/null
+++ b/fs/notify/dnotify/dnotify.c
@@ -0,0 +1,191 @@
+/*
+ * Directory notifications for Linux.
+ *
+ * Copyright (C) 2000,2001,2002 Stephen Rothwell
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/dnotify.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/fdtable.h>
+
+int dir_notify_enable __read_mostly = 1;
+
+static struct kmem_cache *dn_cache __read_mostly;
+
+static void redo_inode_mask(struct inode *inode)
+{
+	unsigned long new_mask;
+	struct dnotify_struct *dn;
+
+	new_mask = 0;
+	for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next)
+		new_mask |= dn->dn_mask & ~DN_MULTISHOT;
+	inode->i_dnotify_mask = new_mask;
+}
+
+void dnotify_flush(struct file *filp, fl_owner_t id)
+{
+	struct dnotify_struct *dn;
+	struct dnotify_struct **prev;
+	struct inode *inode;
+
+	inode = filp->f_path.dentry->d_inode;
+	if (!S_ISDIR(inode->i_mode))
+		return;
+	spin_lock(&inode->i_lock);
+	prev = &inode->i_dnotify;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
+			*prev = dn->dn_next;
+			redo_inode_mask(inode);
+			kmem_cache_free(dn_cache, dn);
+			break;
+		}
+		prev = &dn->dn_next;
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+{
+	struct dnotify_struct *dn;
+	struct dnotify_struct *odn;
+	struct dnotify_struct **prev;
+	struct inode *inode;
+	fl_owner_t id = current->files;
+	struct file *f;
+	int error = 0;
+
+	if ((arg & ~DN_MULTISHOT) == 0) {
+		dnotify_flush(filp, id);
+		return 0;
+	}
+	if (!dir_notify_enable)
+		return -EINVAL;
+	inode = filp->f_path.dentry->d_inode;
+	if (!S_ISDIR(inode->i_mode))
+		return -ENOTDIR;
+	dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
+	if (dn == NULL)
+		return -ENOMEM;
+	spin_lock(&inode->i_lock);
+	prev = &inode->i_dnotify;
+	while ((odn = *prev) != NULL) {
+		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
+			odn->dn_fd = fd;
+			odn->dn_mask |= arg;
+			inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
+			goto out_free;
+		}
+		prev = &odn->dn_next;
+	}
+
+	rcu_read_lock();
+	f = fcheck(fd);
+	rcu_read_unlock();
+	/* we'd lost the race with close(), sod off silently */
+	/* note that inode->i_lock prevents reordering problems
+	 * between accesses to descriptor table and ->i_dnotify */
+	if (f != filp)
+		goto out_free;
+
+	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+	if (error)
+		goto out_free;
+
+	dn->dn_mask = arg;
+	dn->dn_fd = fd;
+	dn->dn_filp = filp;
+	dn->dn_owner = id;
+	inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
+	dn->dn_next = inode->i_dnotify;
+	inode->i_dnotify = dn;
+	spin_unlock(&inode->i_lock);
+	return 0;
+
+out_free:
+	spin_unlock(&inode->i_lock);
+	kmem_cache_free(dn_cache, dn);
+	return error;
+}
+
+void __inode_dir_notify(struct inode *inode, unsigned long event)
+{
+	struct dnotify_struct *	dn;
+	struct dnotify_struct **prev;
+	struct fown_struct *	fown;
+	int			changed = 0;
+
+	spin_lock(&inode->i_lock);
+	prev = &inode->i_dnotify;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_mask & event) == 0) {
+			prev = &dn->dn_next;
+			continue;
+		}
+		fown = &dn->dn_filp->f_owner;
+		send_sigio(fown, dn->dn_fd, POLL_MSG);
+		if (dn->dn_mask & DN_MULTISHOT)
+			prev = &dn->dn_next;
+		else {
+			*prev = dn->dn_next;
+			changed = 1;
+			kmem_cache_free(dn_cache, dn);
+		}
+	}
+	if (changed)
+		redo_inode_mask(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+EXPORT_SYMBOL(__inode_dir_notify);
+
+/*
+ * This is hopelessly wrong, but unfixable without API changes.  At
+ * least it doesn't oops the kernel...
+ *
+ * To safely access ->d_parent we need to keep d_move away from it.  Use the
+ * dentry's d_lock for this.
+ */
+void dnotify_parent(struct dentry *dentry, unsigned long event)
+{
+	struct dentry *parent;
+
+	if (!dir_notify_enable)
+		return;
+
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	if (parent->d_inode->i_dnotify_mask & event) {
+		dget(parent);
+		spin_unlock(&dentry->d_lock);
+		__inode_dir_notify(parent->d_inode, event);
+		dput(parent);
+	} else {
+		spin_unlock(&dentry->d_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(dnotify_parent);
+
+static int __init dnotify_init(void)
+{
+	dn_cache = kmem_cache_create("dnotify_cache",
+		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
+	return 0;
+}
+
+module_init(dnotify_init)
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 0000000..4467928
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
+config INOTIFY
+	bool "Inotify file change notification support"
+	default y
+	---help---
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
+	  notification.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
+
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 0000000..e290f3b
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
new file mode 100644
index 0000000..dae3f28
--- /dev/null
+++ b/fs/notify/inotify/inotify.c
@@ -0,0 +1,913 @@
+/*
+ * fs/inotify.c - inode-based file event notifications
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/writeback.h>
+#include <linux/inotify.h>
+
+static atomic_t inotify_cookie;
+
+/*
+ * Lock ordering:
+ *
+ * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
+ * iprune_mutex (synchronize shrink_icache_memory())
+ * 	inode_lock (protects the super_block->s_inodes list)
+ * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
+ * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
+ *
+ * The inode->inotify_mutex and inotify_handle->mutex and held during execution
+ * of a caller's event handler.  Thus, the caller must not hold any locks
+ * taken in their event handler while calling any of the published inotify
+ * interfaces.
+ */
+
+/*
+ * Lifetimes of the three main data structures--inotify_handle, inode, and
+ * inotify_watch--are managed by reference count.
+ *
+ * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
+ * Additional references can bump the count via get_inotify_handle() and drop
+ * the count via put_inotify_handle().
+ *
+ * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
+ * to remove_watch_no_event().  Additional references can bump the count via
+ * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
+ * is reponsible for the final put after receiving IN_IGNORED, or when using
+ * IN_ONESHOT after receiving the first event.  Inotify does the final put if
+ * inotify_destroy() is called.
+ *
+ * inode: Pinned so long as the inode is associated with a watch, from
+ * inotify_add_watch() to the final put_inotify_watch().
+ */
+
+/*
+ * struct inotify_handle - represents an inotify instance
+ *
+ * This structure is protected by the mutex 'mutex'.
+ */
+struct inotify_handle {
+	struct idr		idr;		/* idr mapping wd -> watch */
+	struct mutex		mutex;		/* protects this bad boy */
+	struct list_head	watches;	/* list of watches */
+	atomic_t		count;		/* reference count */
+	u32			last_wd;	/* the last wd allocated */
+	const struct inotify_operations *in_ops; /* inotify caller operations */
+};
+
+static inline void get_inotify_handle(struct inotify_handle *ih)
+{
+	atomic_inc(&ih->count);
+}
+
+static inline void put_inotify_handle(struct inotify_handle *ih)
+{
+	if (atomic_dec_and_test(&ih->count)) {
+		idr_destroy(&ih->idr);
+		kfree(ih);
+	}
+}
+
+/**
+ * get_inotify_watch - grab a reference to an inotify_watch
+ * @watch: watch to grab
+ */
+void get_inotify_watch(struct inotify_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+EXPORT_SYMBOL_GPL(get_inotify_watch);
+
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		atomic_inc(&watch->count);
+		return 1;
+	}
+	spin_unlock(&sb_lock);
+	return 0;
+}
+
+/**
+ * put_inotify_watch - decrements the ref count on a given watch.  cleans up
+ * watch references if the count reaches zero.  inotify_watch is freed by
+ * inotify callers via the destroy_watch() op.
+ * @watch: watch to release
+ */
+void put_inotify_watch(struct inotify_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		struct inotify_handle *ih = watch->ih;
+
+		iput(watch->inode);
+		ih->in_ops->destroy_watch(watch);
+		put_inotify_handle(ih);
+	}
+}
+EXPORT_SYMBOL_GPL(put_inotify_watch);
+
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	deactivate_super(sb);
+}
+
+/*
+ * inotify_handle_get_wd - returns the next WD for use by the given handle
+ *
+ * Callers must hold ih->mutex.  This function can sleep.
+ */
+static int inotify_handle_get_wd(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
+{
+	int ret;
+
+	do {
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+			return -ENOSPC;
+		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
+	} while (ret == -EAGAIN);
+
+	if (likely(!ret))
+		ih->last_wd = watch->wd;
+
+	return ret;
+}
+
+/*
+ * inotify_inode_watched - returns nonzero if there are watches on this inode
+ * and zero otherwise.  We call this lockless, we do not care if we race.
+ */
+static inline int inotify_inode_watched(struct inode *inode)
+{
+	return !list_empty(&inode->inotify_watches);
+}
+
+/*
+ * Get child dentry flag into synch with parent inode.
+ * Flag should always be clear for negative dentrys.
+ */
+static void set_dentry_child_flags(struct inode *inode, int watched)
+{
+	struct dentry *alias;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		struct dentry *child;
+
+		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+			if (!child->d_inode)
+				continue;
+
+			spin_lock(&child->d_lock);
+			if (watched)
+				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+			else
+				child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
+			spin_unlock(&child->d_lock);
+		}
+	}
+	spin_unlock(&dcache_lock);
+}
+
+/*
+ * inotify_find_handle - find the watch associated with the given inode and
+ * handle
+ *
+ * Callers must hold inode->inotify_mutex.
+ */
+static struct inotify_watch *inode_find_handle(struct inode *inode,
+					       struct inotify_handle *ih)
+{
+	struct inotify_watch *watch;
+
+	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
+		if (watch->ih == ih)
+			return watch;
+	}
+
+	return NULL;
+}
+
+/*
+ * remove_watch_no_event - remove watch without the IN_IGNORED event.
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
+ */
+static void remove_watch_no_event(struct inotify_watch *watch,
+				  struct inotify_handle *ih)
+{
+	list_del(&watch->i_list);
+	list_del(&watch->h_list);
+
+	if (!inotify_inode_watched(watch->inode))
+		set_dentry_child_flags(watch->inode, 0);
+
+	idr_remove(&ih->idr, watch->wd);
+}
+
+/**
+ * inotify_remove_watch_locked - Remove a watch from both the handle and the
+ * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
+ * watched.  May be invoked from a caller's event handler.
+ * @ih: inotify handle associated with watch
+ * @watch: watch to remove
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
+ */
+void inotify_remove_watch_locked(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
+{
+	remove_watch_no_event(watch, ih);
+	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
+
+/* Kernel API for producing events */
+
+/*
+ * inotify_d_instantiate - instantiate dcache entry for inode
+ */
+void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
+{
+	struct dentry *parent;
+
+	if (!inode)
+		return;
+
+	spin_lock(&entry->d_lock);
+	parent = entry->d_parent;
+	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	spin_unlock(&entry->d_lock);
+}
+
+/*
+ * inotify_d_move - dcache entry has been moved
+ */
+void inotify_d_move(struct dentry *entry)
+{
+	struct dentry *parent;
+
+	parent = entry->d_parent;
+	if (inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	else
+		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
+}
+
+/**
+ * inotify_inode_queue_event - queue an event to all watches on this inode
+ * @inode: inode event is originating from
+ * @mask: event mask describing this event
+ * @cookie: cookie for synchronization, or zero
+ * @name: filename, if any
+ * @n_inode: inode associated with name
+ */
+void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
+			       const char *name, struct inode *n_inode)
+{
+	struct inotify_watch *watch, *next;
+
+	if (!inotify_inode_watched(inode))
+		return;
+
+	mutex_lock(&inode->inotify_mutex);
+	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
+		u32 watch_mask = watch->mask;
+		if (watch_mask & mask) {
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
+			if (watch_mask & IN_ONESHOT)
+				remove_watch_no_event(watch, ih);
+			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
+						 name, n_inode);
+			mutex_unlock(&ih->mutex);
+		}
+	}
+	mutex_unlock(&inode->inotify_mutex);
+}
+EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
+
+/**
+ * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
+ * @dentry: the dentry in question, we queue against this dentry's parent
+ * @mask: event mask describing this event
+ * @cookie: cookie for synchronization, or zero
+ * @name: filename, if any
+ */
+void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
+				       u32 cookie, const char *name)
+{
+	struct dentry *parent;
+	struct inode *inode;
+
+	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
+		return;
+
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	inode = parent->d_inode;
+
+	if (inotify_inode_watched(inode)) {
+		dget(parent);
+		spin_unlock(&dentry->d_lock);
+		inotify_inode_queue_event(inode, mask, cookie, name,
+					  dentry->d_inode);
+		dput(parent);
+	} else
+		spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
+
+/**
+ * inotify_get_cookie - return a unique cookie for use in synchronizing events.
+ */
+u32 inotify_get_cookie(void)
+{
+	return atomic_inc_return(&inotify_cookie);
+}
+EXPORT_SYMBOL_GPL(inotify_get_cookie);
+
+/**
+ * inotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @list: list of inodes being unmounted (sb->s_inodes)
+ *
+ * Called with inode_lock held, protecting the unmounting super block's list
+ * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
+ * We temporarily drop inode_lock, however, and CAN block.
+ */
+void inotify_unmount_inodes(struct list_head *list)
+{
+	struct inode *inode, *next_i, *need_iput = NULL;
+
+	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+		struct inotify_watch *watch, *next_w;
+		struct inode *need_iput_tmp;
+		struct list_head *watches;
+
+		/*
+		 * If i_count is zero, the inode cannot have any watches and
+		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * evict all inodes with zero i_count from icache which is
+		 * unnecessarily violent and may in fact be illegal to do.
+		 */
+		if (!atomic_read(&inode->i_count))
+			continue;
+
+		/*
+		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
+		 * I_WILL_FREE which is fine because by that point the inode
+		 * cannot have any associated watches.
+		 */
+		if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
+			continue;
+
+		need_iput_tmp = need_iput;
+		need_iput = NULL;
+		/* In case inotify_remove_watch_locked() drops a reference. */
+		if (inode != need_iput_tmp)
+			__iget(inode);
+		else
+			need_iput_tmp = NULL;
+		/* In case the dropping of a reference would nuke next_i. */
+		if ((&next_i->i_sb_list != list) &&
+				atomic_read(&next_i->i_count) &&
+				!(next_i->i_state & (I_CLEAR | I_FREEING |
+					I_WILL_FREE))) {
+			__iget(next_i);
+			need_iput = next_i;
+		}
+
+		/*
+		 * We can safely drop inode_lock here because we hold
+		 * references on both inode and next_i.  Also no new inodes
+		 * will be added since the umount has begun.  Finally,
+		 * iprune_mutex keeps shrink_icache_memory() away.
+		 */
+		spin_unlock(&inode_lock);
+
+		if (need_iput_tmp)
+			iput(need_iput_tmp);
+
+		/* for each watch, send IN_UNMOUNT and then remove it */
+		mutex_lock(&inode->inotify_mutex);
+		watches = &inode->inotify_watches;
+		list_for_each_entry_safe(watch, next_w, watches, i_list) {
+			struct inotify_handle *ih= watch->ih;
+			get_inotify_watch(watch);
+			mutex_lock(&ih->mutex);
+			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
+						 NULL, NULL);
+			inotify_remove_watch_locked(ih, watch);
+			mutex_unlock(&ih->mutex);
+			put_inotify_watch(watch);
+		}
+		mutex_unlock(&inode->inotify_mutex);
+		iput(inode);		
+
+		spin_lock(&inode_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
+
+/**
+ * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
+ * @inode: inode that is about to be removed
+ */
+void inotify_inode_is_dead(struct inode *inode)
+{
+	struct inotify_watch *watch, *next;
+
+	mutex_lock(&inode->inotify_mutex);
+	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
+		struct inotify_handle *ih = watch->ih;
+		mutex_lock(&ih->mutex);
+		inotify_remove_watch_locked(ih, watch);
+		mutex_unlock(&ih->mutex);
+	}
+	mutex_unlock(&inode->inotify_mutex);
+}
+EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
+
+/* Kernel Consumer API */
+
+/**
+ * inotify_init - allocate and initialize an inotify instance
+ * @ops: caller's inotify operations
+ */
+struct inotify_handle *inotify_init(const struct inotify_operations *ops)
+{
+	struct inotify_handle *ih;
+
+	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
+	if (unlikely(!ih))
+		return ERR_PTR(-ENOMEM);
+
+	idr_init(&ih->idr);
+	INIT_LIST_HEAD(&ih->watches);
+	mutex_init(&ih->mutex);
+	ih->last_wd = 0;
+	ih->in_ops = ops;
+	atomic_set(&ih->count, 0);
+	get_inotify_handle(ih);
+
+	return ih;
+}
+EXPORT_SYMBOL_GPL(inotify_init);
+
+/**
+ * inotify_init_watch - initialize an inotify watch
+ * @watch: watch to initialize
+ */
+void inotify_init_watch(struct inotify_watch *watch)
+{
+	INIT_LIST_HEAD(&watch->h_list);
+	INIT_LIST_HEAD(&watch->i_list);
+	atomic_set(&watch->count, 0);
+	get_inotify_watch(watch); /* initial get */
+}
+EXPORT_SYMBOL_GPL(inotify_init_watch);
+
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_umount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super() yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_umount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	s32 wd = watch->wd;
+
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		get_inotify_watch(watch);
+		mutex_unlock(&ih->mutex);
+		return 1;	/* the best outcome */
+	}
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
+	down_read(&sb->s_umount);
+	if (likely(!sb->s_root)) {
+		/* fs is already shut down; the watch is dead */
+		drop_super(sb);
+		return 0;
+	}
+	/* raced with the final deactivate_super() */
+	mutex_lock(&ih->mutex);
+	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+		/* the watch is dead */
+		mutex_unlock(&ih->mutex);
+		drop_super(sb);
+		return 0;
+	}
+	/* still alive or freed and reused with the same sb and wd; kill */
+	get_inotify_watch(watch);
+	mutex_unlock(&ih->mutex);
+	return 2;
+}
+
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	switch (how) {
+	case 1:
+		deactivate_super(sb);
+		break;
+	case 2:
+		drop_super(sb);
+	}
+}
+
+/**
+ * inotify_destroy - clean up and destroy an inotify instance
+ * @ih: inotify handle
+ */
+void inotify_destroy(struct inotify_handle *ih)
+{
+	/*
+	 * Destroy all of the watches for this handle. Unfortunately, not very
+	 * pretty.  We cannot do a simple iteration over the list, because we
+	 * do not know the inode until we iterate to the watch.  But we need to
+	 * hold inode->inotify_mutex before ih->mutex.  The following works.
+	 *
+	 * AV: it had to become even uglier to start working ;-/
+	 */
+	while (1) {
+		struct inotify_watch *watch;
+		struct list_head *watches;
+		struct super_block *sb;
+		struct inode *inode;
+		int how;
+
+		mutex_lock(&ih->mutex);
+		watches = &ih->watches;
+		if (list_empty(watches)) {
+			mutex_unlock(&ih->mutex);
+			break;
+		}
+		watch = list_first_entry(watches, struct inotify_watch, h_list);
+		sb = watch->inode->i_sb;
+		how = pin_to_kill(ih, watch);
+		if (!how)
+			continue;
+
+		inode = watch->inode;
+		mutex_lock(&inode->inotify_mutex);
+		mutex_lock(&ih->mutex);
+
+		/* make sure we didn't race with another list removal */
+		if (likely(idr_find(&ih->idr, watch->wd))) {
+			remove_watch_no_event(watch, ih);
+			put_inotify_watch(watch);
+		}
+
+		mutex_unlock(&ih->mutex);
+		mutex_unlock(&inode->inotify_mutex);
+		unpin_and_kill(watch, how);
+	}
+
+	/* free this handle: the put matching the get in inotify_init() */
+	put_inotify_handle(ih);
+}
+EXPORT_SYMBOL_GPL(inotify_destroy);
+
+/**
+ * inotify_find_watch - find an existing watch for an (ih,inode) pair
+ * @ih: inotify handle
+ * @inode: inode to watch
+ * @watchp: pointer to existing inotify_watch
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+		       struct inotify_watch **watchp)
+{
+	struct inotify_watch *old;
+	int ret = -ENOENT;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	old = inode_find_handle(inode, ih);
+	if (unlikely(old)) {
+		get_inotify_watch(old); /* caller must put watch */
+		*watchp = old;
+		ret = old->wd;
+	}
+
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_find_watch);
+
+/**
+ * inotify_find_update_watch - find and update the mask of an existing watch
+ * @ih: inotify handle
+ * @inode: inode's watch to update
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
+			      u32 mask)
+{
+	struct inotify_watch *old;
+	int mask_add = 0;
+	int ret;
+
+	if (mask & IN_MASK_ADD)
+		mask_add = 1;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	/*
+	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
+	 * are already watching.  We just update the mask and return its wd.
+	 */
+	old = inode_find_handle(inode, ih);
+	if (unlikely(!old)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (mask_add)
+		old->mask |= mask;
+	else
+		old->mask = mask;
+	ret = old->wd;
+out:
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_find_update_watch);
+
+/**
+ * inotify_add_watch - add a watch to an inotify instance
+ * @ih: inotify handle
+ * @watch: caller allocated watch structure
+ * @inode: inode to watch
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ * Caller must ensure it only calls inotify_add_watch() once per watch.
+ * Calls inotify_handle_get_wd() so may sleep.
+ */
+s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+		      struct inode *inode, u32 mask)
+{
+	int ret = 0;
+	int newly_watched;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
+	watch->mask = mask;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	/* Initialize a new watch */
+	ret = inotify_handle_get_wd(ih, watch);
+	if (unlikely(ret))
+		goto out;
+	ret = watch->wd;
+
+	/* save a reference to handle and bump the count to make it official */
+	get_inotify_handle(ih);
+	watch->ih = ih;
+
+	/*
+	 * Save a reference to the inode and bump the ref count to make it
+	 * official.  We hold a reference to nameidata, which makes this safe.
+	 */
+	watch->inode = igrab(inode);
+
+	/* Add the watch to the handle's and the inode's list */
+	newly_watched = !inotify_inode_watched(inode);
+	list_add(&watch->h_list, &ih->watches);
+	list_add(&watch->i_list, &inode->inotify_watches);
+	/*
+	 * Set child flags _after_ adding the watch, so there is no race
+	 * windows where newly instantiated children could miss their parent's
+	 * watched flag.
+	 */
+	if (newly_watched)
+		set_dentry_child_flags(inode, 1);
+
+out:
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_add_watch);
+
+/**
+ * inotify_clone_watch - put the watch next to existing one
+ * @old: already installed watch
+ * @new: new watch
+ *
+ * Caller must hold the inotify_mutex of inode we are dealing with;
+ * it is expected to remove the old watch before unlocking the inode.
+ */
+s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
+{
+	struct inotify_handle *ih = old->ih;
+	int ret = 0;
+
+	new->mask = old->mask;
+	new->ih = ih;
+
+	mutex_lock(&ih->mutex);
+
+	/* Initialize a new watch */
+	ret = inotify_handle_get_wd(ih, new);
+	if (unlikely(ret))
+		goto out;
+	ret = new->wd;
+
+	get_inotify_handle(ih);
+
+	new->inode = igrab(old->inode);
+
+	list_add(&new->h_list, &ih->watches);
+	list_add(&new->i_list, &old->inode->inotify_watches);
+out:
+	mutex_unlock(&ih->mutex);
+	return ret;
+}
+
+void inotify_evict_watch(struct inotify_watch *watch)
+{
+	get_inotify_watch(watch);
+	mutex_lock(&watch->ih->mutex);
+	inotify_remove_watch_locked(watch->ih, watch);
+	mutex_unlock(&watch->ih->mutex);
+}
+
+/**
+ * inotify_rm_wd - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @wd: watch descriptor to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
+{
+	struct inotify_watch *watch;
+	struct super_block *sb;
+	struct inode *inode;
+	int how;
+
+	mutex_lock(&ih->mutex);
+	watch = idr_find(&ih->idr, wd);
+	if (unlikely(!watch)) {
+		mutex_unlock(&ih->mutex);
+		return -EINVAL;
+	}
+	sb = watch->inode->i_sb;
+	how = pin_to_kill(ih, watch);
+	if (!how)
+		return 0;
+
+	inode = watch->inode;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	/* make sure that we did not race */
+	if (likely(idr_find(&ih->idr, wd) == watch))
+		inotify_remove_watch_locked(ih, watch);
+
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	unpin_and_kill(watch, how);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inotify_rm_wd);
+
+/**
+ * inotify_rm_watch - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @watch: watch to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_watch(struct inotify_handle *ih,
+		     struct inotify_watch *watch)
+{
+	return inotify_rm_wd(ih, watch->wd);
+}
+EXPORT_SYMBOL_GPL(inotify_rm_watch);
+
+/*
+ * inotify_setup - core initialization function
+ */
+static int __init inotify_setup(void)
+{
+	atomic_set(&inotify_cookie, 0);
+
+	return 0;
+}
+
+module_init(inotify_setup);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
new file mode 100644
index 0000000..400f806
--- /dev/null
+++ b/fs/notify/inotify/inotify_user.c
@@ -0,0 +1,778 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/inotify.h>
+#include <linux/syscalls.h>
+#include <linux/magic.h>
+
+#include <asm/ioctls.h>
+
+static struct kmem_cache *watch_cachep __read_mostly;
+static struct kmem_cache *event_cachep __read_mostly;
+
+static struct vfsmount *inotify_mnt __read_mostly;
+
+/* these are configurable via /proc/sys/fs/inotify/ */
+static int inotify_max_user_instances __read_mostly;
+static int inotify_max_user_watches __read_mostly;
+static int inotify_max_queued_events __read_mostly;
+
+/*
+ * Lock ordering:
+ *
+ * inotify_dev->up_mutex (ensures we don't re-add the same watch)
+ * 	inode->inotify_mutex (protects inode's watch list)
+ * 		inotify_handle->mutex (protects inotify_handle's watch list)
+ * 			inotify_dev->ev_mutex (protects device's event queue)
+ */
+
+/*
+ * Lifetimes of the main data structures:
+ *
+ * inotify_device: Lifetime is managed by reference count, from
+ * sys_inotify_init() until release.  Additional references can bump the count
+ * via get_inotify_dev() and drop the count via put_inotify_dev().
+ *
+ * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
+ * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
+ * first event, or to inotify_destroy().
+ */
+
+/*
+ * struct inotify_device - represents an inotify instance
+ *
+ * This structure is protected by the mutex 'mutex'.
+ */
+struct inotify_device {
+	wait_queue_head_t 	wq;		/* wait queue for i/o */
+	struct mutex		ev_mutex;	/* protects event queue */
+	struct mutex		up_mutex;	/* synchronizes watch updates */
+	struct list_head 	events;		/* list of queued events */
+	struct user_struct	*user;		/* user who opened this dev */
+	struct inotify_handle	*ih;		/* inotify handle */
+	struct fasync_struct    *fa;            /* async notification */
+	atomic_t		count;		/* reference count */
+	unsigned int		queue_size;	/* size of the queue (bytes) */
+	unsigned int		event_count;	/* number of pending events */
+	unsigned int		max_events;	/* maximum number of events */
+};
+
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked and all events that can fit in the
+ * buffer are returned.
+ *
+ * Protected by dev->ev_mutex of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+	struct inotify_event	event;	/* the user-space event */
+	struct list_head        list;	/* entry in inotify_device's list */
+	char			*name;	/* filename, if any */
+};
+
+/*
+ * struct inotify_user_watch - our version of an inotify_watch, we add
+ * a reference to the associated inotify_device.
+ */
+struct inotify_user_watch {
+	struct inotify_device	*dev;	/* associated device */
+	struct inotify_watch	wdata;	/* inotify watch data */
+};
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table inotify_table[] = {
+	{
+		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
+		.procname	= "max_user_instances",
+		.data		= &inotify_max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+	atomic_inc(&dev->count);
+}
+
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+	if (atomic_dec_and_test(&dev->count)) {
+		atomic_dec(&dev->user->inotify_devs);
+		free_uid(dev->user);
+		kfree(dev);
+	}
+}
+
+/*
+ * free_inotify_user_watch - cleans up the watch and its references
+ */
+static void free_inotify_user_watch(struct inotify_watch *w)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	atomic_dec(&dev->user->inotify_watches);
+	put_inotify_dev(dev);
+	kmem_cache_free(watch_cachep, watch);
+}
+
+/*
+ * kernel_event - create a new kernel event with the given parameters
+ *
+ * This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+						  const char *name)
+{
+	struct inotify_kernel_event *kevent;
+
+	kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
+	if (unlikely(!kevent))
+		return NULL;
+
+	/* we hand this out to user-space, so zero it just in case */
+	memset(&kevent->event, 0, sizeof(struct inotify_event));
+
+	kevent->event.wd = wd;
+	kevent->event.mask = mask;
+	kevent->event.cookie = cookie;
+
+	INIT_LIST_HEAD(&kevent->list);
+
+	if (name) {
+		size_t len, rem, event_size = sizeof(struct inotify_event);
+
+		/*
+		 * We need to pad the filename so as to properly align an
+		 * array of inotify_event structures.  Because the structure is
+		 * small and the common case is a small filename, we just round
+		 * up to the next multiple of the structure's sizeof.  This is
+		 * simple and safe for all architectures.
+		 */
+		len = strlen(name) + 1;
+		rem = event_size - len;
+		if (len > event_size) {
+			rem = event_size - (len % event_size);
+			if (len % event_size == 0)
+				rem = 0;
+		}
+
+		kevent->name = kmalloc(len + rem, GFP_KERNEL);
+		if (unlikely(!kevent->name)) {
+			kmem_cache_free(event_cachep, kevent);
+			return NULL;
+		}
+		memcpy(kevent->name, name, len);
+		if (rem)
+			memset(kevent->name + len, 0, rem);
+		kevent->event.len = len + rem;
+	} else {
+		kevent->event.len = 0;
+		kevent->name = NULL;
+	}
+
+	return kevent;
+}
+
+/*
+ * inotify_dev_get_event - return the next event in the given dev's queue
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+	return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_get_last_event - return the last event in the given dev's queue
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_last_event(struct inotify_device *dev)
+{
+	if (list_empty(&dev->events))
+		return NULL;
+	return list_entry(dev->events.prev, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_queue_event - event handler registered with core inotify, adds
+ * a new event to the given device
+ *
+ * Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
+				    u32 cookie, const char *name,
+				    struct inode *ignored)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+	struct inotify_kernel_event *kevent, *last;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	mutex_lock(&dev->ev_mutex);
+
+	/* we can safely put the watch as we don't reference it while
+	 * generating the event
+	 */
+	if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
+		put_inotify_watch(w); /* final put */
+
+	/* coalescing: drop this event if it is a dupe of the previous */
+	last = inotify_dev_get_last_event(dev);
+	if (last && last->event.mask == mask && last->event.wd == wd &&
+			last->event.cookie == cookie) {
+		const char *lastname = last->name;
+
+		if (!name && !lastname)
+			goto out;
+		if (name && lastname && !strcmp(lastname, name))
+			goto out;
+	}
+
+	/* the queue overflowed and we already sent the Q_OVERFLOW event */
+	if (unlikely(dev->event_count > dev->max_events))
+		goto out;
+
+	/* if the queue overflows, we need to notify user space */
+	if (unlikely(dev->event_count == dev->max_events))
+		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+	else
+		kevent = kernel_event(wd, mask, cookie, name);
+
+	if (unlikely(!kevent))
+		goto out;
+
+	/* queue the event and wake up anyone waiting */
+	dev->event_count++;
+	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+	list_add_tail(&kevent->list, &dev->events);
+	wake_up_interruptible(&dev->wq);
+	kill_fasync(&dev->fa, SIGIO, POLL_IN);
+
+out:
+	mutex_unlock(&dev->ev_mutex);
+}
+
+/*
+ * remove_kevent - cleans up the given kevent
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void remove_kevent(struct inotify_device *dev,
+			  struct inotify_kernel_event *kevent)
+{
+	list_del(&kevent->list);
+
+	dev->event_count--;
+	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+}
+
+/*
+ * free_kevent - frees the given kevent.
+ */
+static void free_kevent(struct inotify_kernel_event *kevent)
+{
+	kfree(kevent->name);
+	kmem_cache_free(event_cachep, kevent);
+}
+
+/*
+ * inotify_dev_event_dequeue - destroy an event on the given device
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+	if (!list_empty(&dev->events)) {
+		struct inotify_kernel_event *kevent;
+		kevent = inotify_dev_get_event(dev);
+		remove_kevent(dev, kevent);
+		free_kevent(kevent);
+	}
+}
+
+/*
+ * find_inode - resolve a user-given path to a specific inode
+ */
+static int find_inode(const char __user *dirname, struct path *path,
+		      unsigned flags)
+{
+	int error;
+
+	error = user_path_at(AT_FDCWD, dirname, flags, path);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (error)
+		path_put(path);
+	return error;
+}
+
+/*
+ * create_watch - creates a watch on the given device.
+ *
+ * Callers must hold dev->up_mutex.
+ */
+static int create_watch(struct inotify_device *dev, struct inode *inode,
+			u32 mask)
+{
+	struct inotify_user_watch *watch;
+	int ret;
+
+	if (atomic_read(&dev->user->inotify_watches) >=
+			inotify_max_user_watches)
+		return -ENOSPC;
+
+	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+	if (unlikely(!watch))
+		return -ENOMEM;
+
+	/* save a reference to device and bump the count to make it official */
+	get_inotify_dev(dev);
+	watch->dev = dev;
+
+	atomic_inc(&dev->user->inotify_watches);
+
+	inotify_init_watch(&watch->wdata);
+	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
+	if (ret < 0)
+		free_inotify_user_watch(&watch->wdata);
+
+	return ret;
+}
+
+/* Device Interface */
+
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+	struct inotify_device *dev = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &dev->wq, wait);
+	mutex_lock(&dev->ev_mutex);
+	if (!list_empty(&dev->events))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static ssize_t inotify_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	size_t event_size = sizeof (struct inotify_event);
+	struct inotify_device *dev;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	dev = file->private_data;
+
+	while (1) {
+
+		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&dev->ev_mutex);
+		if (!list_empty(&dev->events)) {
+			ret = 0;
+			break;
+		}
+		mutex_unlock(&dev->ev_mutex);
+
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		schedule();
+	}
+
+	finish_wait(&dev->wq, &wait);
+	if (ret)
+		return ret;
+
+	while (1) {
+		struct inotify_kernel_event *kevent;
+
+		ret = buf - start;
+		if (list_empty(&dev->events))
+			break;
+
+		kevent = inotify_dev_get_event(dev);
+		if (event_size + kevent->event.len > count) {
+			if (ret == 0 && count > 0) {
+				/*
+				 * could not get a single event because we
+				 * didn't have enough buffer space.
+				 */
+				ret = -EINVAL;
+			}
+			break;
+		}
+		remove_kevent(dev, kevent);
+
+		/*
+		 * Must perform the copy_to_user outside the mutex in order
+		 * to avoid a lock order reversal with mmap_sem.
+		 */
+		mutex_unlock(&dev->ev_mutex);
+
+		if (copy_to_user(buf, &kevent->event, event_size)) {
+			ret = -EFAULT;
+			break;
+		}
+		buf += event_size;
+		count -= event_size;
+
+		if (kevent->name) {
+			if (copy_to_user(buf, kevent->name, kevent->event.len)){
+				ret = -EFAULT;
+				break;
+			}
+			buf += kevent->event.len;
+			count -= kevent->event.len;
+		}
+
+		free_kevent(kevent);
+
+		mutex_lock(&dev->ev_mutex);
+	}
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static int inotify_fasync(int fd, struct file *file, int on)
+{
+	struct inotify_device *dev = file->private_data;
+
+	return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO;
+}
+
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+	struct inotify_device *dev = file->private_data;
+
+	inotify_destroy(dev->ih);
+
+	/* destroy all of the events on this device */
+	mutex_lock(&dev->ev_mutex);
+	while (!list_empty(&dev->events))
+		inotify_dev_event_dequeue(dev);
+	mutex_unlock(&dev->ev_mutex);
+
+	/* free this device: the put matching the get in inotify_init() */
+	put_inotify_dev(dev);
+
+	return 0;
+}
+
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct inotify_device *dev;
+	void __user *p;
+	int ret = -ENOTTY;
+
+	dev = file->private_data;
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		ret = put_user(dev->queue_size, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations inotify_fops = {
+	.poll           = inotify_poll,
+	.read           = inotify_read,
+	.fasync         = inotify_fasync,
+	.release        = inotify_release,
+	.unlocked_ioctl = inotify_ioctl,
+	.compat_ioctl	= inotify_ioctl,
+};
+
+static const struct inotify_operations inotify_user_ops = {
+	.handle_event	= inotify_dev_queue_event,
+	.destroy_watch	= free_inotify_user_watch,
+};
+
+asmlinkage long sys_inotify_init1(int flags)
+{
+	struct inotify_device *dev;
+	struct inotify_handle *ih;
+	struct user_struct *user;
+	struct file *filp;
+	int fd, ret;
+
+	/* Check the IN_* constants for consistency.  */
+	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	filp = get_empty_filp();
+	if (!filp) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	user = get_current_user();
+	if (unlikely(atomic_read(&user->inotify_devs) >=
+			inotify_max_user_instances)) {
+		ret = -EMFILE;
+		goto out_free_uid;
+	}
+
+	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+	if (unlikely(!dev)) {
+		ret = -ENOMEM;
+		goto out_free_uid;
+	}
+
+	ih = inotify_init(&inotify_user_ops);
+	if (IS_ERR(ih)) {
+		ret = PTR_ERR(ih);
+		goto out_free_dev;
+	}
+	dev->ih = ih;
+	dev->fa = NULL;
+
+	filp->f_op = &inotify_fops;
+	filp->f_path.mnt = mntget(inotify_mnt);
+	filp->f_path.dentry = dget(inotify_mnt->mnt_root);
+	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
+	filp->f_mode = FMODE_READ;
+	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+	filp->private_data = dev;
+
+	INIT_LIST_HEAD(&dev->events);
+	init_waitqueue_head(&dev->wq);
+	mutex_init(&dev->ev_mutex);
+	mutex_init(&dev->up_mutex);
+	dev->event_count = 0;
+	dev->queue_size = 0;
+	dev->max_events = inotify_max_queued_events;
+	dev->user = user;
+	atomic_set(&dev->count, 0);
+
+	get_inotify_dev(dev);
+	atomic_inc(&user->inotify_devs);
+	fd_install(fd, filp);
+
+	return fd;
+out_free_dev:
+	kfree(dev);
+out_free_uid:
+	free_uid(user);
+	put_filp(filp);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+asmlinkage long sys_inotify_init(void)
+{
+	return sys_inotify_init1(0);
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
+{
+	struct inode *inode;
+	struct inotify_device *dev;
+	struct path path;
+	struct file *filp;
+	int ret, fput_needed;
+	unsigned flags = 0;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
+
+	if (!(mask & IN_DONT_FOLLOW))
+		flags |= LOOKUP_FOLLOW;
+	if (mask & IN_ONLYDIR)
+		flags |= LOOKUP_DIRECTORY;
+
+	ret = find_inode(pathname, &path, flags);
+	if (unlikely(ret))
+		goto fput_and_out;
+
+	/* inode held in place by reference to path; dev by fget on fd */
+	inode = path.dentry->d_inode;
+	dev = filp->private_data;
+
+	mutex_lock(&dev->up_mutex);
+	ret = inotify_find_update_watch(dev->ih, inode, mask);
+	if (ret == -ENOENT)
+		ret = create_watch(dev, inode, mask);
+	mutex_unlock(&dev->up_mutex);
+
+	path_put(&path);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+	struct file *filp;
+	struct inotify_device *dev;
+	int ret, fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	dev = filp->private_data;
+
+	/* we free our watch data when we get IN_IGNORED */
+	ret = inotify_rm_wd(dev->ih, wd);
+
+out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+static int
+inotify_get_sb(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "inotify", NULL,
+			INOTIFYFS_SUPER_MAGIC, mnt);
+}
+
+static struct file_system_type inotify_fs_type = {
+    .name           = "inotifyfs",
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+
+/*
+ * inotify_user_setup - Our initialization function.  Note that we cannnot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init inotify_user_setup(void)
+{
+	int ret;
+
+	ret = register_filesystem(&inotify_fs_type);
+	if (unlikely(ret))
+		panic("inotify: register_filesystem returned %d!\n", ret);
+
+	inotify_mnt = kern_mount(&inotify_fs_type);
+	if (IS_ERR(inotify_mnt))
+		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+
+	inotify_max_queued_events = 16384;
+	inotify_max_user_instances = 128;
+	inotify_max_user_watches = 8192;
+
+	watch_cachep = kmem_cache_create("inotify_watch_cache",
+					 sizeof(struct inotify_user_watch),
+					 0, SLAB_PANIC, NULL);
+	event_cachep = kmem_cache_create("inotify_event_cache",
+					 sizeof(struct inotify_kernel_event),
+					 0, SLAB_PANIC, NULL);
+
+	return 0;
+}
+
+module_init(inotify_user_setup);
-- 
cgit v0.10.2


From 261bca86ed4f7f391d1938167624e78da61dcc6b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 01:48:21 -0500
Subject: nfsd/create race fixes, infrastructure

new helpers - insert_inode_locked() and insert_inode_locked4().
Hash new inode, making sure that there's no such inode in icache
already.  If there is and it does not end up unhashed (as would
happen if we have nfsd trying to resolve a bogus fhandle), fail.
Otherwise insert our inode into hash and succeed.

In either case have i_state set to new+locked; cleanup ends up
being simpler with such calling conventions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/inode.c b/fs/inode.c
index 098a244..7de1cda 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1032,6 +1032,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 
 EXPORT_SYMBOL(iget_locked);
 
+int insert_inode_locked(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	ino_t ino = inode->i_ino;
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
+	struct inode *old;
+
+	inode->i_state |= I_LOCK|I_NEW;
+	while (1) {
+		spin_lock(&inode_lock);
+		old = find_inode_fast(sb, head, ino);
+		if (likely(!old)) {
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&inode_lock);
+		wait_on_inode(old);
+		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+
+EXPORT_SYMBOL(insert_inode_locked);
+
+int insert_inode_locked4(struct inode *inode, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *old;
+
+	inode->i_state |= I_LOCK|I_NEW;
+
+	while (1) {
+		spin_lock(&inode_lock);
+		old = find_inode(sb, head, test, data);
+		if (likely(!old)) {
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&inode_lock);
+		wait_on_inode(old);
+		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+
+EXPORT_SYMBOL(insert_inode_locked4);
+
 /**
  *	__insert_inode_hash - hash an inode
  *	@inode: unhashed inode
diff --git a/include/linux/fs.h b/include/linux/fs.h
index be16ce0..e2170ee 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1902,6 +1902,8 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
 
 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
 extern struct inode * iget_locked(struct super_block *, unsigned long);
+extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
+extern int insert_inode_locked(struct inode *);
 extern void unlock_new_inode(struct inode *);
 
 extern void __iget(struct inode * inode);
-- 
cgit v0.10.2


From 41080b5a240113328c607f22b849f653373db0ce Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 01:52:35 -0500
Subject: nfsd race fixes: ext2

* make ext2_new_inode() put the inode into icache in locked state
* do not unlock until the inode is fully set up; otherwise nfsd
might pick it in half-baked state.
* make sure that ext2_new_inode() does *not* lead to two inodes with the
same inumber hashed at the same time; otherwise a bogus fhandle coming
from nfsd might race with inode creation:

nfsd: iget_locked() creates inode
nfsd: try to read from disk, block on that.
ext2_new_inode(): allocate inode with that inumber
ext2_new_inode(): insert it into icache, set it up and dirty
ext2_write_inode(): get the relevant part of inode table in cache,
set the entry for our inode (and start writing to disk)
nfsd: get CPU again, look into inode table, see nice and sane on-disk
inode, set the in-core inode from it

oops - we have two in-core inodes with the same inumber live in icache,
both used for IO.  Welcome to fs corruption...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8d0add6..c454d5d 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -585,7 +585,10 @@ got:
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EINVAL;
+		goto fail_drop;
+	}
 
 	if (DQUOT_ALLOC_INODE(inode)) {
 		err = -EDQUOT;
@@ -612,6 +615,7 @@ fail_drop:
 	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
+	unlock_new_inode(inode);
 	iput(inode);
 	return ERR_PTR(err);
 
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a74725..90ea179 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
 	int err = ext2_add_link(dentry, inode);
 	if (!err) {
 		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -170,6 +172,7 @@ out:
 
 out_fail:
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput (inode);
 	goto out;
 }
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
+	int err;
 
 	if (inode->i_nlink >= EXT2_LINK_MAX)
 		return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	inode_inc_link_count(inode);
 	atomic_inc(&inode->i_count);
 
-	return ext2_add_nondir(dentry, inode);
+	err = ext2_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
 }
 
 static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 		goto out_fail;
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 out:
 	return err;
 
 out_fail:
 	inode_dec_link_count(inode);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 out_dir:
 	inode_dec_link_count(dir);
-- 
cgit v0.10.2


From c38012daa7ad902a39a4213ba2b3fe50e81157ea Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 02:02:50 -0500
Subject: nfsd race fixes: ext3

ext3 analog of the previous patch

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 490bd0e..5655fbc 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -579,7 +579,10 @@ got:
 	ext3_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
 		handle->h_sync = 1;
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EINVAL;
+		goto fail_drop;
+	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +630,7 @@ fail_drop:
 	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
+	unlock_new_inode(inode);
 	iput(inode);
 	brelse(bitmap_bh);
 	return ERR_PTR(err);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc9..297ea8d 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1652,9 +1652,11 @@ static int ext3_add_nondir(handle_t *handle,
 	if (!err) {
 		ext3_mark_inode_dirty(handle, inode);
 		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
 		return 0;
 	}
 	drop_nlink(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -1765,6 +1767,7 @@ retry:
 	dir_block = ext3_bread (handle, inode, 0, 1, &err);
 	if (!dir_block) {
 		drop_nlink(inode); /* is this nlink == 0? */
+		unlock_new_inode(inode);
 		ext3_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -1792,6 +1795,7 @@ retry:
 	err = ext3_add_entry (handle, dentry, inode);
 	if (err) {
 		inode->i_nlink = 0;
+		unlock_new_inode(inode);
 		ext3_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -1800,6 +1804,7 @@ retry:
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 out_stop:
 	ext3_journal_stop(handle);
 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2174,6 +2179,7 @@ retry:
 				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
 			drop_nlink(inode);
+			unlock_new_inode(inode);
 			ext3_mark_inode_dirty(handle, inode);
 			iput (inode);
 			goto out_stop;
@@ -2221,7 +2227,14 @@ retry:
 	inc_nlink(inode);
 	atomic_inc(&inode->i_count);
 
-	err = ext3_add_nondir(handle, dentry, inode);
+	err = ext3_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext3_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+	} else {
+		drop_nlink(inode);
+		iput(inode);
+	}
 	ext3_journal_stop(handle);
 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
-- 
cgit v0.10.2


From 6b38e842bb832a3dbeb17e382404aef3c40ac5f9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 02:03:31 -0500
Subject: nfsd race fixes: ext4

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 08cac9f..6e60528 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -826,7 +826,10 @@ got:
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
 		handle->h_sync = 1;
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EINVAL;
+		goto fail_drop;
+	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
@@ -881,6 +884,7 @@ fail_drop:
 	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
+	unlock_new_inode(inode);
 	iput(inode);
 	brelse(bitmap_bh);
 	return ERR_PTR(err);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb7..da98a90 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1693,9 +1693,11 @@ static int ext4_add_nondir(handle_t *handle,
 	if (!err) {
 		ext4_mark_inode_dirty(handle, inode);
 		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
 		return 0;
 	}
 	drop_nlink(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -1830,6 +1832,7 @@ retry:
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
+		unlock_new_inode(inode);
 		ext4_mark_inode_dirty(handle, inode);
 		iput(inode);
 		goto out_stop;
@@ -1838,6 +1841,7 @@ out_clear_inode:
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 out_stop:
 	ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -2212,6 +2216,7 @@ retry:
 				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
 			clear_nlink(inode);
+			unlock_new_inode(inode);
 			ext4_mark_inode_dirty(handle, inode);
 			iput(inode);
 			goto out_stop;
@@ -2262,7 +2267,14 @@ retry:
 	ext4_inc_count(handle, inode);
 	atomic_inc(&inode->i_count);
 
-	err = ext4_add_nondir(handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext4_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+	} else {
+		drop_nlink(inode);
+		iput(inode);
+	}
 	ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
-- 
cgit v0.10.2


From c1eaa26b671299b3ec01d40c6c71ee19a4f81517 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 02:03:58 -0500
Subject: nfsd race fixes: reiserfs

... and the same for reiserfs.  The difference here is that we need
insert_inode_locked4() to match iget5_locked().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c6..145c2d3 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		       struct inode *inode)
 {
 	struct super_block *sb;
+	struct reiserfs_iget_args args;
 	INITIALIZE_PATH(path_to_key);
 	struct cpu_key key;
 	struct item_head ih;
@@ -1780,6 +1781,14 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		err = -ENOMEM;
 		goto out_bad_inode;
 	}
+	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
+	if (insert_inode_locked4(inode, args.objectid,
+			     reiserfs_find_actor, &args) < 0) {
+		err = -EINVAL;
+		goto out_bad_inode;
+	}
 	if (old_format_only(sb))
 		/* not a perfect generation count, as object ids can be reused, but 
 		 ** this is as good as reiserfs can do right now.
@@ -1859,13 +1868,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	} else {
 		inode2sd(&sd, inode, inode->i_size);
 	}
-	// these do not go to on-disk stat data
-	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-
 	// store in in-core inode the key of stat data and version all
 	// object items will have (directory items will have old offset
 	// format, other new objects will consist of new items)
-	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
 		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
 	else
@@ -1929,7 +1934,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		reiserfs_mark_inode_private(inode);
 	}
 
-	insert_inode_hash(inode);
 	reiserfs_update_sd(th, inode);
 	reiserfs_check_path(&path_to_key);
 
@@ -1956,6 +1960,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
       out_inserted_sd:
 	inode->i_nlink = 0;
 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
+	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
 
 	/* If we were inheriting an ACL, we need to release the lock so that
 	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4f322e5..738967f 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	reiserfs_update_inode_transaction(dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	reiserfs_update_sd(&th, dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
       out_failed:
 	if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
 		err = journal_end(&th, parent_dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
       out_failed:
 	reiserfs_write_unlock(parent_dir->i_sb);
-- 
cgit v0.10.2


From 1f3403fa640f9f7b135dee79f2d39d01c8ad4a08 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Tue, 30 Dec 2008 22:08:37 -0600
Subject: nfsd race fixes: jfs

jfs version of Al Viro's nfsd race patches

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 70022fd..d4d142c 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	inode = new_inode(sb);
 	if (!inode) {
 		jfs_warn("ialloc: new_inode returned NULL!");
-		return ERR_PTR(-ENOMEM);
+		rc = -ENOMEM;
+		goto fail;
 	}
 
 	jfs_inode = JFS_IP(inode);
@@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		jfs_warn("ialloc: diAlloc returned %d!", rc);
 		if (rc == -EIO)
 			make_bad_inode(inode);
-		iput(inode);
-		return ERR_PTR(rc);
+		goto fail_put;
+	}
+
+	if (insert_inode_locked(inode) < 0) {
+		rc = -EINVAL;
+		goto fail_unlock;
 	}
 
 	inode->i_uid = current_fsuid();
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	 * Allocate inode to quota.
 	 */
 	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
-		inode->i_flags |= S_NOQUOTA;
-		inode->i_nlink = 0;
-		iput(inode);
-		return ERR_PTR(-EDQUOT);
+		rc = -EDQUOT;
+		goto fail_drop;
 	}
 
 	inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	jfs_info("ialloc returns inode = 0x%p\n", inode);
 
 	return inode;
+
+fail_drop:
+	DQUOT_DROP(inode);
+	inode->i_flags |= S_NOQUOTA;
+fail_unlock:
+	inode->i_nlink = 0;
+	unlock_new_inode(inode);
+fail_put:
+	iput(inode);
+fail:
+	return ERR_PTR(rc);
 }
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedf..b4de56b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	ip->i_fop = &jfs_file_operations;
 	ip->i_mapping->a_ops = &jfs_aops;
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out2:
 	free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	ip->i_op = &jfs_dir_inode_operations;
 	ip->i_fop = &jfs_dir_operations;
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	/* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out2:
 	free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 		goto out3;
 	}
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out2:
 	free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	jfs_ip->dev = new_encode_dev(rdev);
 	init_special_inode(ip, ip->i_mode, rdev);
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out1:
 	free_UCSname(&dname);
-- 
cgit v0.10.2