From 06777d308f8f9ddb67798d34bf193101a4bdf06c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Dec 2009 04:52:13 -0500 Subject: dio: fix use-after-free Signed-off-by: Al Viro diff --git a/fs/direct-io.c b/fs/direct-io.c index 4012885..e82adc2 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1206,7 +1206,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * NOTE: filesystems with their own locking have to handle this * on their own. */ - if (dio->flags & DIO_LOCKING) { + if (flags & DIO_LOCKING) { if (unlikely((rw & WRITE) && retval < 0)) { loff_t isize = i_size_read(inode); if (end > isize) -- cgit v0.10.2 From b9aff027b2c1d6019d237382c78fd396f9de2ea5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 20 Nov 2009 14:28:35 -0800 Subject: fs: anon_inodes implement dname Add a d_dname method for anon_inodes filesystem, the same way pipefs and sockfs pseudo filesystems. This allows us to remove the DCACHE_UNHASHED hack from anon_inodes.c (see next patch). [AV: inumber is useless here, dropped from anon_inodefs_dname()] Signed-off-by: Nick Piggin Cc: Miklos Szeredi Cc: Davide Libenzi Cc: "David S. Miller" Cc: Jens Axboe Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 94f5110..01c529a 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -45,6 +45,15 @@ static int anon_inodefs_delete_dentry(struct dentry *dentry) return 1; } +/* + * anon_inodefs_dname() is called from d_path(). + */ +static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + dentry->d_name.name); +} + static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .get_sb = anon_inodefs_get_sb, @@ -52,6 +61,7 @@ static struct file_system_type anon_inode_fs_type = { }; static const struct dentry_operations anon_inodefs_dentry_operations = { .d_delete = anon_inodefs_delete_dentry, + .d_dname = anon_inodefs_dname, }; /* -- cgit v0.10.2 From a3a065e3f13da8a3470ed09c7f38aad256083726 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Nov 2009 05:30:19 +0100 Subject: fs: no games with DCACHE_UNHASHED Filesystems outside the regular namespace do not have to clear DCACHE_UNHASHED in order to have a working /proc/$pid/fd/XXX. Nothing in proc prevents the fd link from being used if its dentry is not in the hash. Also, it does not get put into the dcache hash if DCACHE_UNHASHED is clear; that depends on the filesystem calling d_add or d_rehash. So delete the misleading comments and needless code. Acked-by: Miklos Szeredi Signed-off-by: Nick Piggin Signed-off-by: Al Viro diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 01c529a..2c99459 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -35,16 +35,6 @@ static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags, mnt); } -static int anon_inodefs_delete_dentry(struct dentry *dentry) -{ - /* - * We faked vfs to believe the dentry was hashed when we created it. - * Now we restore the flag so that dput() will work correctly. - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 1; -} - /* * anon_inodefs_dname() is called from d_path(). */ @@ -60,7 +50,6 @@ static struct file_system_type anon_inode_fs_type = { .kill_sb = kill_anon_super, }; static const struct dentry_operations anon_inodefs_dentry_operations = { - .d_delete = anon_inodefs_delete_dentry, .d_dname = anon_inodefs_dname, }; @@ -129,8 +118,6 @@ struct file *anon_inode_getfile(const char *name, atomic_inc(&anon_inode_inode->i_count); path.dentry->d_op = &anon_inodefs_dentry_operations; - /* Do not publish this dentry inside the global dentry hash table */ - path.dentry->d_flags &= ~DCACHE_UNHASHED; d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; diff --git a/fs/pipe.c b/fs/pipe.c index 43d79da..37ba29f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -906,17 +906,6 @@ void free_pipe_info(struct inode *inode) } static struct vfsmount *pipe_mnt __read_mostly; -static int pipefs_delete_dentry(struct dentry *dentry) -{ - /* - * At creation time, we pretended this dentry was hashed - * (by clearing DCACHE_UNHASHED bit in d_flags) - * At delete time, we restore the truth : not hashed. - * (so that dput() can proceed correctly) - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 0; -} /* * pipefs_dname() is called from d_path(). @@ -928,7 +917,6 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) } static const struct dentry_operations pipefs_dentry_operations = { - .d_delete = pipefs_delete_dentry, .d_dname = pipefs_dname, }; @@ -989,12 +977,6 @@ struct file *create_write_pipe(int flags) path.mnt = mntget(pipe_mnt); path.dentry->d_op = &pipefs_dentry_operations; - /* - * We dont want to publish this dentry into global dentry hash table. - * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED - * This permits a working /proc/$pid/fd/XXX on pipes - */ - path.dentry->d_flags &= ~DCACHE_UNHASHED; d_instantiate(path.dentry, inode); err = -ENFILE; diff --git a/net/socket.c b/net/socket.c index dbfdfa9..769c386 100644 --- a/net/socket.c +++ b/net/socket.c @@ -312,18 +312,6 @@ static struct file_system_type sock_fs_type = { .kill_sb = kill_anon_super, }; -static int sockfs_delete_dentry(struct dentry *dentry) -{ - /* - * At creation time, we pretended this dentry was hashed - * (by clearing DCACHE_UNHASHED bit in d_flags) - * At delete time, we restore the truth : not hashed. - * (so that dput() can proceed correctly) - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 0; -} - /* * sockfs_dname() is called from d_path(). */ @@ -334,7 +322,6 @@ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) } static const struct dentry_operations sockfs_dentry_operations = { - .d_delete = sockfs_delete_dentry, .d_dname = sockfs_dname, }; @@ -374,12 +361,6 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags) path.mnt = mntget(sock_mnt); path.dentry->d_op = &sockfs_dentry_operations; - /* - * We dont want to push this dentry into global dentry hash table. - * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED - * This permits a working /proc/$pid/fd/XXX on sockets - */ - path.dentry->d_flags &= ~DCACHE_UNHASHED; d_instantiate(path.dentry, SOCK_INODE(sock)); SOCK_INODE(sock)->i_fop = &socket_file_ops; -- cgit v0.10.2 From cb59861f03a626196a23fdef5e20ddbb8cca6466 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 16 Nov 2009 12:05:20 -0800 Subject: vfs: remove extraneous NULL d_inode check from do_filp_open We can't get to this point unless it's a valid pointer. Signed-off-by: Jeff Layton Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro diff --git a/fs/namei.c b/fs/namei.c index d2783c8..dad4b80 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1764,7 +1764,7 @@ do_last: path_to_nameidata(&path, &nd); error = -EISDIR; - if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) + if (S_ISDIR(path.dentry->d_inode->i_mode)) goto exit; ok: /* -- cgit v0.10.2 From 9afa2fb6c13501e5b3536d15344fce4e5442c469 Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Wed, 2 Dec 2009 19:51:54 -0500 Subject: fsstack/ecryptfs: remove unused get_nlinks param to fsstack_copy_attr_all This get_nlinks parameter was never used by the only mainline user, ecryptfs; and it has never been used by unionfs or wrapfs either. Acked-by: Dustin Kirkland Acked-by: Tyler Hicks Signed-off-by: Erez Zadok Signed-off-by: Al Viro diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 2dda5ad..8f006a0 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -62,7 +62,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *lower_inode = ecryptfs_inode_to_lower(dentry->d_inode); - fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL); + fsstack_copy_attr_all(dentry->d_inode, lower_inode); } out: return rc; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 056fed62..429ca0b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -626,9 +626,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry); if (rc) goto out_lock; - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); if (new_dir != old_dir) - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dentry->d_parent); @@ -967,7 +967,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) rc = notify_change(lower_dentry, ia); mutex_unlock(&lower_dentry->d_inode->i_mutex); out: - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 101fe4c..567bc4b 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -189,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); diff --git a/fs/stack.c b/fs/stack.c index 67716f6..0e20e43 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -14,11 +14,8 @@ void fsstack_copy_inode_size(struct inode *dst, const struct inode *src) } EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); -/* copy all attributes; get_nlinks is optional way to override the i_nlink - * copying - */ -void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, - int (*get_nlinks)(struct inode *)) +/* copy all attributes */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) { dest->i_mode = src->i_mode; dest->i_uid = src->i_uid; @@ -29,14 +26,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - - /* - * Update the nlinks AFTER updating the above fields, because the - * get_links callback may depend on them. - */ - if (!get_nlinks) - dest->i_nlink = src->i_nlink; - else - dest->i_nlink = (*get_nlinks)(dest); + dest->i_nlink = src->i_nlink; } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h index bb516ce..aa60311 100644 --- a/include/linux/fs_stack.h +++ b/include/linux/fs_stack.h @@ -8,9 +8,7 @@ #include /* externs for fs/stack.c */ -extern void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, - int (*get_nlinks)(struct inode *)); - +extern void fsstack_copy_attr_all(struct inode *dest, const struct inode *src); extern void fsstack_copy_inode_size(struct inode *dst, const struct inode *src); /* inlines */ -- cgit v0.10.2 From 1b8ab8159ef8f818f870a1d2e3b6953d80eefd3f Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Thu, 3 Dec 2009 21:56:09 -0500 Subject: VFS/fsstack: handle 32-bit smp + preempt + large files in fsstack_copy_inode_size Copy the inode size and blocks from one inode to another correctly on 32-bit systems with CONFIG_SMP, CONFIG_PREEMPT, or CONFIG_LBDAF. Use proper inode spinlocks only when i_size/i_blocks cannot fit in one 32-bit word. Signed-off-by: Hugh Dickins Signed-off-by: Erez Zadok Signed-off-by: Al Viro diff --git a/fs/stack.c b/fs/stack.c index 0e20e43..4a6f7f4 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -7,10 +7,58 @@ * This function cannot be inlined since i_size_{read,write} is rather * heavy-weight on 32-bit systems */ -void fsstack_copy_inode_size(struct inode *dst, const struct inode *src) +void fsstack_copy_inode_size(struct inode *dst, struct inode *src) { - i_size_write(dst, i_size_read((struct inode *)src)); - dst->i_blocks = src->i_blocks; + loff_t i_size; + blkcnt_t i_blocks; + + /* + * i_size_read() includes its own seqlocking and protection from + * preemption (see include/linux/fs.h): we need nothing extra for + * that here, and prefer to avoid nesting locks than attempt to keep + * i_size and i_blocks in sync together. + */ + i_size = i_size_read(src); + + /* + * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to + * keep the two halves of i_blocks in sync despite SMP or PREEMPT - + * though stat's generic_fillattr() doesn't bother, and we won't be + * applying quotas (where i_blocks does become important) at the + * upper level. + * + * We don't actually know what locking is used at the lower level; + * but if it's a filesystem that supports quotas, it will be using + * i_lock as in inode_add_bytes(). tmpfs uses other locking, and + * its 32-bit is (just) able to exceed 2TB i_size with the aid of + * holes; but its i_blocks cannot carry into the upper long without + * almost 2TB swap - let's ignore that case. + */ + if (sizeof(i_blocks) > sizeof(long)) + spin_lock(&src->i_lock); + i_blocks = src->i_blocks; + if (sizeof(i_blocks) > sizeof(long)) + spin_unlock(&src->i_lock); + + /* + * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * fsstack_copy_inode_size() to hold some lock around + * i_size_write(), otherwise i_size_read() may spin forever (see + * include/linux/fs.h). We don't necessarily hold i_mutex when this + * is called, so take i_lock for that case. + * + * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock + * for that case too, and do both at once by combining the tests. + * + * There is none of this locking overhead in the 64-bit case. + */ + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_lock(&dst->i_lock); + i_size_write(dst, i_size); + dst->i_blocks = i_blocks; + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_unlock(&dst->i_lock); } EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h index aa60311..da317c7 100644 --- a/include/linux/fs_stack.h +++ b/include/linux/fs_stack.h @@ -9,7 +9,7 @@ /* externs for fs/stack.c */ extern void fsstack_copy_attr_all(struct inode *dest, const struct inode *src); -extern void fsstack_copy_inode_size(struct inode *dst, const struct inode *src); +extern void fsstack_copy_inode_size(struct inode *dst, struct inode *src); /* inlines */ static inline void fsstack_copy_attr_atime(struct inode *dest, -- cgit v0.10.2 From 76b7e0058d09f8104387980a690001681c04cc0a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Dec 2009 14:24:20 +0100 Subject: fix up O_SYNC comments Proper Posix O_SYNC handling only made it into 2.6.33, not 2.6.32. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h index 21b1117..70145cb 100644 --- a/arch/alpha/include/asm/fcntl.h +++ b/arch/alpha/include/asm/fcntl.h @@ -16,7 +16,7 @@ #define O_NOATIME 04000000 #define O_CLOEXEC 010000000 /* set close_on_exec */ /* - * Before Linux 2.6.32 only O_DSYNC semantics were implemented, but using + * Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using * the O_SYNC flag. We continue to use the existing numerical value * for O_DSYNC semantics now, but using the correct symbolic name for it. * This new value is used to request true Posix O_SYNC semantics. It is diff --git a/arch/mips/include/asm/fcntl.h b/arch/mips/include/asm/fcntl.h index 7c6681a..e482fe9 100644 --- a/arch/mips/include/asm/fcntl.h +++ b/arch/mips/include/asm/fcntl.h @@ -19,7 +19,7 @@ #define FASYNC 0x1000 /* fcntl, for BSD compatibility */ #define O_LARGEFILE 0x2000 /* allow large file opens */ /* - * Before Linux 2.6.32 only O_DSYNC semantics were implemented, but using + * Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using * the O_SYNC flag. We continue to use the existing numerical value * for O_DSYNC semantics now, but using the correct symbolic name for it. * This new value is used to request true Posix O_SYNC semantics. It is diff --git a/arch/sparc/include/asm/fcntl.h b/arch/sparc/include/asm/fcntl.h index 3b9cfb3..38f37b33 100644 --- a/arch/sparc/include/asm/fcntl.h +++ b/arch/sparc/include/asm/fcntl.h @@ -19,7 +19,7 @@ #define O_NOATIME 0x200000 #define O_CLOEXEC 0x400000 /* - * Before Linux 2.6.32 only O_DSYNC semantics were implemented, but using + * Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using * the O_SYNC flag. We continue to use the existing numerical value * for O_DSYNC semantics now, but using the correct symbolic name for it. * This new value is used to request true Posix O_SYNC semantics. It is diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h index 681ddf3..fcd268c 100644 --- a/include/asm-generic/fcntl.h +++ b/include/asm-generic/fcntl.h @@ -51,7 +51,7 @@ #endif /* - * Before Linux 2.6.32 only O_DSYNC semantics were implemented, but using + * Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using * the O_SYNC flag. We continue to use the existing numerical value * for O_DSYNC semantics now, but using the correct symbolic name for it. * This new value is used to request true Posix O_SYNC semantics. It is -- cgit v0.10.2 From 7a0ad10c367ab57c899d340372f37880cbe6ab52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Dec 2009 14:24:40 +0100 Subject: fold do_sync_file_range into sys_sync_file_range We recently go rid of all callers of do_sync_file_range as they're better served with vfs_fsync or the filemap_write_and_wait. Now that do_sync_file_range is down to a single caller fold it into it so that people don't start using it again accidentally. While at it also switch it from using __filemap_fdatawrite_range(..., WB_SYNC_ALL) to the more clear filemap_fdatawrite_range(). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/sync.c b/fs/sync.c index 36752a6..418727a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -355,6 +355,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, { int ret; struct file *file; + struct address_space *mapping; loff_t endbyte; /* inclusive */ int fput_needed; umode_t i_mode; @@ -405,7 +406,28 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, !S_ISLNK(i_mode)) goto out_put; - ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags); + mapping = file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out_put; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = filemap_fdatawait_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = filemap_fdatawrite_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) + ret = filemap_fdatawait_range(mapping, offset, endbyte); + out_put: fput_light(file, fput_needed); out: @@ -437,38 +459,3 @@ asmlinkage long SyS_sync_file_range2(long fd, long flags, } SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); #endif - -/* - * `endbyte' is inclusive - */ -int do_sync_mapping_range(struct address_space *mapping, loff_t offset, - loff_t endbyte, unsigned int flags) -{ - int ret; - - if (!mapping) { - ret = -EINVAL; - goto out; - } - - ret = 0; - if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WRITE) { - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_ALL); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); - } -out: - return ret; -} -EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff --git a/include/linux/fs.h b/include/linux/fs.h index 66bc0a5..77a9750 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1095,10 +1095,6 @@ struct file_lock { extern void send_sigio(struct fown_struct *fown, int fd, int band); -/* fs/sync.c */ -extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, - loff_t endbyte, unsigned int flags); - #ifdef CONFIG_FILE_LOCKING extern int fcntl_getlk(struct file *, struct flock __user *); extern int fcntl_setlk(unsigned int, struct file *, unsigned int, -- cgit v0.10.2 From eaff8079d4f1016a12e34ab323737314f24127dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Dec 2009 14:25:01 +0100 Subject: kill I_LOCK After I_SYNC was split from I_LOCK the leftover is always used together with I_NEW and thus superflous. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3ff32fa..6e220f4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -125,7 +125,7 @@ static struct inode *gfs2_iget_skip(struct super_block *sb, * directory entry when gfs2_inode_lookup() is invoked. Part of the code * segment inside gfs2_inode_lookup code needs to get moved around. * - * Clean up I_LOCK and I_NEW as well. + * Clears I_NEW as well. **/ void gfs2_set_iop(struct inode *inode) diff --git a/fs/inode.c b/fs/inode.c index 06c1f02..03dfeb2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -113,7 +113,7 @@ static void wake_up_inode(struct inode *inode) * Prevent speculative execution through spin_unlock(&inode_lock); */ smp_mb(); - wake_up_bit(&inode->i_state, __I_LOCK); + wake_up_bit(&inode->i_state, __I_NEW); } /** @@ -690,17 +690,17 @@ void unlock_new_inode(struct inode *inode) } #endif /* - * This is special! We do not need the spinlock when clearing I_LOCK, + * This is special! We do not need the spinlock when clearing I_NEW, * because we're guaranteed that nobody else tries to do anything about * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_LOCK). + * there can be no old holders that haven't tested I_NEW). * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_LOCK after the other inode initialisation has + * see the clearing of I_NEW after the other inode initialisation has * completed. */ smp_mb(); - WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); - inode->i_state &= ~(I_LOCK|I_NEW); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; wake_up_inode(inode); } EXPORT_SYMBOL(unlock_new_inode); @@ -731,7 +731,7 @@ static struct inode *get_new_inode(struct super_block *sb, goto set_failed; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -778,7 +778,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, if (!old) { inode->i_ino = ino; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -1083,7 +1083,7 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1120,7 +1120,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; @@ -1510,7 +1510,7 @@ EXPORT_SYMBOL(inode_wait); * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * - * It doesn't matter if I_LOCK is not set initially, a call to + * It doesn't matter if I_NEW is not set initially, a call to * wake_up_inode() after removing from the hash list will DTRT. * * This is called with inode_lock held. @@ -1518,8 +1518,8 @@ EXPORT_SYMBOL(inode_wait); static void __wait_on_freeing_inode(struct inode *inode) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); - wq = bit_waitqueue(&inode->i_state, __I_LOCK); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode_lock); schedule(); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f26e4d0..d945ea7 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1292,7 +1292,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ /* * I believe this code is no longer needed. Splitting I_LOCK - * into two bits, I_LOCK and I_SYNC should prevent this + * into two bits, I_NEW and I_SYNC should prevent this * deadlock as well. But since I don't have a JFS testload * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9938034..dc2505a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -530,7 +530,7 @@ err_corrupt_attr: * the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * i_flags is set to 0 and we have no business touching it. Only an ioctl() * is allowed to write to them. We should of course be honouring them but @@ -1207,7 +1207,7 @@ err_out: * necessary fields in @vi as well as initializing the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will @@ -1474,7 +1474,7 @@ err_out: * normal directory inodes. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 39849f8..16a6444 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -45,7 +45,7 @@ * * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> - * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not + * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not * set as well. However, UBIFS disables readahead. */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 1d5b298..2259460 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -794,7 +794,7 @@ xfs_setup_inode( struct inode *inode = &ip->i_vnode; inode->i_ino = ip->i_ino; - inode->i_state = I_NEW|I_LOCK; + inode->i_state = I_NEW; inode_add_to_lists(ip->i_mount->m_super, inode); inode->i_mode = ip->i_d.di_mode; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0de36c2..fa402a6 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -91,7 +91,7 @@ xfs_inode_alloc( ip->i_new_size = 0; /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW|I_LOCK; + VFS_I(ip)->i_state = I_NEW; return ip; } @@ -217,7 +217,7 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); goto out_error; } - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 77a9750..cca1919 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1587,7 +1587,7 @@ struct super_operations { * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * - * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync(). i_atime is the usual cause. @@ -1596,8 +1596,14 @@ struct super_operations { * don't have to write inode on fdatasync() when only * mtime has changed in it. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both - * are cleared by unlock_new_inode(), called from iget(). + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. @@ -1611,20 +1617,11 @@ struct super_operations { * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, - * if appropriate. I_LOCK is used for waiting. + * if appropriate. I_NEW is used for waiting. * - * I_LOCK Serves as both a mutex and completion notification. - * New inodes set I_LOCK. If two processes both create - * the same inode, one of them will release its inode and - * wait for I_LOCK to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_LOCK, without I_LOCK actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_SYNC Similar to I_LOCK, but limited in scope to writeback - * of inode dirty data. Having a separate lock for this - * purpose reduces latency and prevents some filesystem- - * specific deadlocks. + * I_SYNC Synchonized write of dirty inode data. The bits is + * set during data writeback, and cleared with a wakeup + * on the bit address once it is done. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on @@ -1633,13 +1630,12 @@ struct super_operations { #define I_DIRTY_SYNC 1 #define I_DIRTY_DATASYNC 2 #define I_DIRTY_PAGES 4 -#define I_NEW 8 +#define __I_NEW 3 +#define I_NEW (1 << __I_NEW) #define I_WILL_FREE 16 #define I_FREEING 32 #define I_CLEAR 64 -#define __I_LOCK 7 -#define I_LOCK (1 << __I_LOCK) -#define __I_SYNC 8 +#define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 705f01f..c18c008 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -79,8 +79,7 @@ void wakeup_flusher_threads(long nr_pages); static inline void wait_on_inode(struct inode *inode) { might_sleep(); - wait_on_bit(&inode->i_state, __I_LOCK, inode_wait, - TASK_UNINTERRUPTIBLE); + wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); } static inline void inode_sync_wait(struct inode *inode) { -- cgit v0.10.2