From 81fca444001e5a41ab80ce8cf9a5734c00ec6546 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:47:47 +0200 Subject: fs: move permission check back into __lookup_hash The caller that didn't need it is gone. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/namei.c b/fs/namei.c index 24896e8..f1ef97d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { + struct inode *inode = base->d_inode; struct dentry *dentry; - struct inode *inode; int err; - inode = base->d_inode; + err = exec_permission(inode); + if (err) + return ERR_PTR(err); /* * See if the low-level filesystem might want @@ -1161,11 +1163,6 @@ out: */ static struct dentry *lookup_hash(struct nameidata *nd) { - int err; - - err = exec_permission(nd->path.dentry->d_inode); - if (err) - return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); } @@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = exec_permission(base->d_inode); - if (err) - return ERR_PTR(err); return __lookup_hash(&this, base, NULL); } -- cgit v0.10.2 From c37650161a53c01ddd88587675f9a4adc909a73e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:48:20 +0200 Subject: fs: add sync_inode_metadata Add a new helper to write out the inode using the writeback code, that is including the correct dirty bit and list manipulation. A few of filesystems already opencode this, and a lot of others should be using it instead of using write_inode_now which also writes out the data. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index 97dae29..c62d300 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -882,12 +882,8 @@ static struct inode *pohmelfs_alloc_inode(struct super_block *sb) static int pohmelfs_fsync(struct file *file, int datasync) { struct inode *inode = file->f_mapping->host; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - return sync_inode(inode, &wbc); + return sync_inode_metadata(inode, 1); } ssize_t pohmelfs_write(struct file *file, const char __user *buf, diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 68cb23e..b905c79 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync) { int ret; struct inode *inode = filp->f_mapping->host; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata-only; caller takes care of data */ - }; struct super_block *sb; if (!(inode->i_state & I_DIRTY)) @@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) return 0; - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 1); /* This is a good place to write the sb */ /* TODO: Sechedule an sb-sync on create */ diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 7641098..2709b34 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) if (IS_DIRSYNC(dir)) { err = write_one_page(page, 1); if (!err) - err = ext2_sync_inode(dir); + err = sync_inode_metadata(dir, 1); } else { unlock_page(page); } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 416daa6..6346a2a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); -extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 533699c..40ad210 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); - ext2_sync_inode (inode); + sync_inode_metadata(inode, 1); } else { mark_inode_dirty(inode); } @@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_sync_inode(struct inode *inode) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - return sync_inode(inode, &wbc); -} - int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 8c29ae1..f84700b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) { - error = ext2_sync_inode (inode); + error = sync_inode_metadata(inode, 1); /* In case sync failed due to ENOSPC the inode was actually * written (only some dirty data were not) so we just proceed * as if nothing happened and cleanup the unused block */ diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ab38fef..29e3f40 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1198,3 +1198,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); + +/** + * sync_inode - write an inode to disk + * @inode: the inode to sync + * @wait: wait for I/O to complete. + * + * Write an inode to disk and adjust it's dirty state after completion. + * + * Note: only writes the actual inode, no associated data or other metadata. + */ +int sync_inode_metadata(struct inode *inode, int wait) +{ + struct writeback_control wbc = { + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .nr_to_write = 0, /* metadata-only */ + }; + + return sync_inode(inode, &wbc); +} +EXPORT_SYMBOL(sync_inode_metadata); diff --git a/fs/libfs.c b/fs/libfs.c index 62baa03..2dbf487 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -892,10 +892,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent); */ int generic_file_fsync(struct file *file, int datasync) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata-only; caller takes care of data */ - }; struct inode *inode = file->f_mapping->host; int err; int ret; @@ -906,7 +902,7 @@ int generic_file_fsync(struct file *file, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) return ret; - err = sync_inode(inode, &wbc); + err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; return ret; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 661a6cf..184938f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp) { struct inode *inode = fhp->fh_dentry->d_inode; const struct export_operations *export_ops = inode->i_sb->s_export_op; - int error = 0; if (!EX_ISSYNC(fhp->fh_export)) return 0; - if (export_ops->commit_metadata) { - error = export_ops->commit_metadata(inode); - } else { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata only */ - }; - - error = sync_inode(inode, &wbc); - } - - return error; + if (export_ops->commit_metadata) + return export_ops->commit_metadata(inode); + return sync_inode_metadata(inode, 1); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f34ff6..0b03f49 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1734,6 +1734,7 @@ static inline void file_accessed(struct file *file) } int sync_inode(struct inode *inode, struct writeback_control *wbc); +int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; -- cgit v0.10.2 From 56b0dacfa2b8416815a2f2a5f4f51e46be4cf14c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:48:55 +0200 Subject: fs: mark destroy_inode static Hugetlbfs used to need it, but after the destroy_inode and evict_inode changes it's not required anymore. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 8646433..3368abd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -235,7 +235,7 @@ void __destroy_inode(struct inode *inode) } EXPORT_SYMBOL(__destroy_inode); -void destroy_inode(struct inode *inode) +static void destroy_inode(struct inode *inode) { __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b03f49..0a5d836 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2187,7 +2187,6 @@ extern void unlock_new_inode(struct inode *); extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void end_writeback(struct inode *); -extern void destroy_inode(struct inode *); extern void __destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern int should_remove_suid(struct dentry *); -- cgit v0.10.2 From ebdec241d509cf69f6ebf1ecdc036359d3dbe154 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:47:23 +0200 Subject: fs: kill block_prepare_write __block_write_begin and block_prepare_write are identical except for slightly different calling conventions. Convert all callers to the __block_write_begin calling conventions and drop block_prepare_write. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/buffer.c b/fs/buffer.c index 7f0b9b0..a7b8f3c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1834,9 +1834,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) } EXPORT_SYMBOL(page_zero_new_buffers); -int block_prepare_write(struct page *page, unsigned from, unsigned to, +int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; struct inode *inode = page->mapping->host; unsigned block_start, block_end; sector_t block; @@ -1916,7 +1918,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to, } return err; } -EXPORT_SYMBOL(block_prepare_write); +EXPORT_SYMBOL(__block_write_begin); static int __block_commit_write(struct inode *inode, struct page *page, unsigned from, unsigned to) @@ -1953,15 +1955,6 @@ static int __block_commit_write(struct inode *inode, struct page *page, return 0; } -int __block_write_begin(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block) -{ - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - - return block_prepare_write(page, start, start + len, get_block); -} -EXPORT_SYMBOL(__block_write_begin); - /* * block_write_begin takes care of the basic task of block allocation and * bringing partial write blocks uptodate first. @@ -2379,7 +2372,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, else end = PAGE_CACHE_SIZE; - ret = block_prepare_write(page, 0, end, get_block); + ret = __block_write_begin(page, 0, end, get_block); if (!ret) ret = block_commit_write(page, 0, end); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5e0faf4..ad05353 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1696,8 +1696,8 @@ static int ext3_journalled_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext3_get_block); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); if (ret != 0) { ext3_journal_stop(handle); goto out_unlock; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b8debe..49635ef 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1538,10 +1538,10 @@ static int do_journal_get_write_access(handle_t *handle, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* - * __block_prepare_write() could have dirtied some buffers. Clean + * __block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_prepare_write() isn't a real problem here as we clear + * by __block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ @@ -2550,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, if (buffer_delay(bh)) return 0; /* Not sure this could or should happen */ /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? + * XXX: __block_write_begin() unmaps passed block, is it OK? */ ret = ext4_da_reserve_space(inode, iblock); if (ret) @@ -2583,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * This function is used as a standard get_block_t calback function * when there is no desire to allocate any blocks. It is used as a - * callback function for block_prepare_write() and block_write_full_page(). + * callback function for block_write_begin() and block_write_full_page(). * These functions should only try to map a single block at a time. * * Since this function doesn't do block allocations even if the caller @@ -2743,7 +2742,7 @@ static int ext4_writepage(struct page *page, * all are mapped and non delay. We don't want to * do block allocation here. */ - ret = block_prepare_write(page, 0, len, + ret = __block_write_begin(page, 0, len, noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 6b24afb..4f36f88 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, struct gfs2_alloc *al = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); - unsigned to = from + len; struct page *page; gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); @@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, } prepare_write: - error = block_prepare_write(page, from, to, gfs2_block_map); + error = __block_write_begin(page, from, len, gfs2_block_map); out: if (error == 0) return 0; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 0534510..48a274f 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) int error; if (!page_has_buffers(page)) { - error = block_prepare_write(page, from, to, gfs2_block_map); + error = __block_write_begin(page, from, to - from, gfs2_block_map); if (unlikely(error)) return error; @@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) next += bh->b_size; if (buffer_mapped(bh)) { if (end) { - error = block_prepare_write(page, start, end, + error = __block_write_begin(page, start, end - start, gfs2_block_map); if (unlikely(error)) return error; @@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) } while (next < to); if (end) { - error = block_prepare_write(page, start, end, gfs2_block_map); + error = __block_write_begin(page, start, end - start, gfs2_block_map); if (unlikely(error)) return error; empty_write_end(page, start, end); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5cfeee1..f1e962c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New - * allows block_prepare_write() to zero. + * allows __block_write_begin() to zero. * * If we see this on a sparse file system, then a truncate has * raced us and removed the cluster. In this case, we clear @@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } -/* - * This is called from ocfs2_write_zero_page() which has handled it's - * own cluster locking and has ensured allocation exists for those - * blocks to be written. - */ -int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - int ret; - - ret = block_prepare_write(page, from, to, ocfs2_get_block); - - return ret; -} - /* Taken from ext3. We don't necessarily need the full blown * functionality yet, but IMHO it's better to cut and paste the whole * thing so we can avoid introducing our own bugs (and easily pick up @@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page, } /* - * Some of this taken from block_prepare_write(). We already have our + * Some of this taken from __block_write_begin(). We already have our * mapping by now though, and the entire write will be allocating or * it won't, so not much need to use BH_New. * diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 7606f66..76bfdfd 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,9 +22,6 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, - unsigned from, unsigned to); - handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1ca6867..77b4c04 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -796,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, block_end = block_start + (1 << inode->i_blkbits); /* - * block_start is block-aligned. Bump it by one to - * force ocfs2_{prepare,commit}_write() to zero the + * block_start is block-aligned. Bump it by one to force + * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = ocfs2_prepare_write_nolock(inode, page, - block_start + 1, - block_start + 1); + ret = __block_write_begin(page, block_start + 1, 0, + ocfs2_get_block); if (ret < 0) { mlog_errno(ret); goto out_unlock; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index caa7583..4dcb880 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -22,8 +22,6 @@ int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); void reiserfs_evict_inode(struct inode *inode) { @@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, ** but tail is still sitting in a direct item, and we can't write to ** it. So, look through this page, and check all the mapped buffers ** to make sure they have valid block numbers. Any that don't need -** to be unmapped, so that block_prepare_write will correctly call +** to be unmapped, so that __block_write_begin will correctly call ** reiserfs_get_block to convert the tail into an unformatted node */ static inline void fix_tail_page_for_writing(struct page *page) @@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block, } /* special version of get_block that is only used by grab_tail_page right -** now. It is sent to block_prepare_write, and when you try to get a +** now. It is sent to __block_write_begin, and when you try to get a ** block past the end of the file (or a block from a hole) it returns -** -ENOENT instead of a valid buffer. block_prepare_write expects to +** -ENOENT instead of a valid buffer. __block_write_begin expects to ** be able to do i/o on the buffers returned, unless an error value ** is also returned. ** -** So, this allows block_prepare_write to be used for reading a single block +** So, this allows __block_write_begin to be used for reading a single block ** in a page. Where it does not produce a valid page for holes, or past the ** end of the file. This turns out to be exactly what we need for reading ** tails for conversion. @@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode, ** ** We must fix the tail page for writing because it might have buffers ** that are mapped, but have a block number of 0. This indicates tail - ** data that has been read directly into the page, and block_prepare_write - ** won't trigger a get_block in this case. + ** data that has been read directly into the page, and + ** __block_write_begin won't trigger a get_block in this case. */ fix_tail_page_for_writing(tail_page); - retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); + retval = __reiserfs_write_begin(tail_page, tail_start, + tail_end - tail_start); if (retval) goto unlock; @@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode, /* start within the page of the last block in the file */ start = (offset / blocksize) * blocksize; - error = block_prepare_write(page, start, offset, + error = __block_write_begin(page, start, offset - start, reiserfs_get_block_create_0); if (error) goto unlock; @@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file, return ret; } -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to) +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) { struct inode *inode = page->mapping->host; int ret; @@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, th->t_refcount++; } - ret = block_prepare_write(page, from, to, reiserfs_get_block); + ret = __block_write_begin(page, from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* this gets a little ugly. If reiserfs_get_block returned an diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 5cbb81e..adf22b4 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); /* ** reiserfs_unpack ** Function try to convert tail from direct item into indirect. @@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we unpack by finding the page with the tail, and calling - ** reiserfs_prepare_write on that page. This will force a + ** __reiserfs_write_begin on that page. This will force a ** reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT; @@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) if (!page) { goto out; } - retval = reiserfs_prepare_write(NULL, page, write_from, write_from); + retval = __reiserfs_write_begin(page, write_from, 0); if (retval) goto out_unlock; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8c4cf27..f7415de 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -418,8 +418,6 @@ static inline __u32 xattr_hash(const char *msg, int len) int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); static void update_ctime(struct inode *inode) { @@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, rxh->h_hash = cpu_to_le32(xahash); } - err = reiserfs_prepare_write(NULL, page, page_offset, - page_offset + chunk + skip); + err = __reiserfs_write_begin(page, page_offset, chunk + skip); if (!err) { if (buffer) memcpy(data + skip, buffer + buffer_pos, chunk); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index ab31ce5..cf80878 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -576,7 +576,7 @@ xfs_max_file_offset( /* Figure out maximum filesize, on Linux this can depend on * the filesystem blocksize (on 32 bit platforms). - * __block_prepare_write does this in an [unsigned] long... + * __block_write_begin does this in an [unsigned] long... * page->index << (PAGE_CACHE_SHIFT - bbits) * So, for page sized blocks (4K on 32 bit platforms), * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index dd1b25b..68d1fe7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -212,7 +212,6 @@ int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); -int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t *, loff_t *); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 91a4177..5ca47e5 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2072,6 +2072,8 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); + /* namei.c */ void set_de_name_and_namelen(struct reiserfs_dir_entry *de); int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, -- cgit v0.10.2 From fde214d414218fb6cace35708730986bcc94fb53 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Oct 2010 11:37:37 +0200 Subject: isofs: Fix isofs_get_blocks for 8TB files Currently isofs_get_blocks() was limited to handle only 4TB files on 32-bit architectures because of unnecessary use of iblock variable which was signed long. Just remove the variable. The error messages that were using this variable should have rather used b_off anyway because that is the block we are currently mapping. Signed-off-by: Jan Kara Signed-off-by: Al Viro diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 09ff41a..60c2b944 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -962,25 +962,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) * or getblk() if they are not. Returns the number of blocks inserted * (-ve == error.) */ -int isofs_get_blocks(struct inode *inode, sector_t iblock_s, +int isofs_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head **bh, unsigned long nblocks) { - unsigned long b_off; + unsigned long b_off = iblock; unsigned offset, sect_size; unsigned int firstext; unsigned long nextblk, nextoff; - long iblock = (long)iblock_s; int section, rv, error; struct iso_inode_info *ei = ISOFS_I(inode); error = -EIO; rv = 0; - if (iblock < 0 || iblock != iblock_s) { + if (iblock != b_off) { printk(KERN_DEBUG "%s: block number too large\n", __func__); goto abort; } - b_off = iblock; offset = 0; firstext = ei->i_first_extent; @@ -998,8 +996,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, * I/O errors. */ if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { - printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", - __func__, iblock, (unsigned long) inode->i_size); + printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", + __func__, b_off, + (unsigned long long)inode->i_size); goto abort; } @@ -1025,9 +1024,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, if (++section > 100) { printk(KERN_DEBUG "%s: More than 100 file sections ?!?" " aborting...\n", __func__); - printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " + printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " "nextblk=%lu nextoff=%lu\n", __func__, - iblock, firstext, (unsigned) sect_size, + b_off, firstext, (unsigned) sect_size, nextblk, nextoff); goto abort; } -- cgit v0.10.2 From 7e360c38abe2c70eae3ba5a8a17f17671d8b77c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2010 09:32:55 +0200 Subject: fs: allow for more than 2^31 files Andrew, Could you please review this patch, you probably are the right guy to take it, because it crosses fs and net trees. Note : /proc/sys/fs/file-nr is a read-only file, so this patch doesnt depend on previous patch (sysctl: fix min/max handling in __do_proc_doulongvec_minmax()) Thanks ! [PATCH V4] fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Signed-off-by: Al Viro diff --git a/fs/file_table.c b/fs/file_table.c index a04bdd8..c3dee38 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -60,7 +60,7 @@ static inline void file_free(struct file *f) /* * Return the total number of open files in the system */ -static int get_nr_files(void) +static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } @@ -68,7 +68,7 @@ static int get_nr_files(void) /* * Return the maximum number of open files in the system */ -int get_max_files(void) +unsigned long get_max_files(void) { return files_stat.max_files; } @@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(ctl_table *table, int write, @@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *get_empty_filp(void) { const struct cred *cred = current_cred(); - static int old_max; + static long old_max; struct file * f; /* @@ -140,8 +140,7 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", - get_max_files()); + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } goto fail; @@ -487,7 +486,7 @@ retry: void __init files_init(unsigned long mempages) { - int n; + unsigned long n; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); @@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages) */ n = (mempages * (PAGE_SIZE / 1024)) / 10; - files_stat.max_files = n; - if (files_stat.max_files < NR_FILE) - files_stat.max_files = NR_FILE; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); files_defer_init(); lg_lock_init(files_lglock); percpu_counter_init(&nr_files, 0); diff --git a/include/linux/fs.h b/include/linux/fs.h index 0a5d836..0cd6821 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,9 +34,9 @@ /* And dynamically-tunable limits and defaults: */ struct files_stat_struct { - int nr_files; /* read only */ - int nr_free_files; /* read only */ - int max_files; /* tunable */ + unsigned long nr_files; /* read only */ + unsigned long nr_free_files; /* read only */ + unsigned long max_files; /* tunable */ }; struct inodes_stat_t { @@ -400,7 +400,7 @@ extern void __init inode_init_early(void); extern void __init files_init(unsigned long); extern struct files_stat_struct files_stat; -extern int get_max_files(void); +extern unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c22..694b140 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777..3c95304 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); -- cgit v0.10.2 From 0e45b67d5aeb3dcfb6b149cf61c30b9a8e503f74 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:05:27 +0200 Subject: affs: testing the wrong variable The intent was to verify that bh = affs_bread_ino(...) returned a valid pointer. We checked "ext_bh" earlier in the function and it's valid here. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro diff --git a/fs/affs/file.c b/fs/affs/file.c index c4a9875..0a90dcd 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -894,9 +894,9 @@ affs_truncate(struct inode *inode) if (AFFS_SB(sb)->s_flags & SF_OFS) { struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); u32 tmp; - if (IS_ERR(ext_bh)) { + if (IS_ERR(bh)) { affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", - ext, PTR_ERR(ext_bh)); + ext, PTR_ERR(bh)); return; } tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); -- cgit v0.10.2 From d9d1dc802ffae507956ceb350eb3f4a995734f1e Mon Sep 17 00:00:00 2001 From: Valerie Aurora Date: Mon, 30 Aug 2010 17:23:12 -0400 Subject: Documentation: Fix trivial typo in filesystems/sharedsubtree.txt Documentation: Fix trivial typo in filesystems/sharedsubtree.txt This typo is easy to ignore unless you have spent a great deal of time thinking about how to eliminate duplicate dentries in unions. Signed-off-by: Valerie Aurora Signed-off-by: Al Viro diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index fc0e39a..4ede421 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -62,10 +62,10 @@ replicas continue to be exactly same. # mount /dev/sd0 /tmp/a #ls /tmp/a - t1 t2 t2 + t1 t2 t3 #ls /mnt/a - t1 t2 t2 + t1 t2 t3 Note that the mount has propagated to the mount at /mnt as well. -- cgit v0.10.2 From ba10f486658c0ca1bc84c936f6a996e40d071453 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 19 Oct 2010 14:27:59 -0700 Subject: hostfs: fix UML crash: remove f_spare from hostfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 365b1818 ("add f_flags to struct statfs(64)") resized f_spare within struct statfs which caused a UML crash. There is no need to copy f_spare. Signed-off-by: Richard Weinberger Reported-by: Toralf Förster Tested-by: Toralf Förster Cc: Christoph Hellwig Cc: Al Viro Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Al Viro diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 7c232c1..bf15a43 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out); + void *fsid_out, int fsid_size, long *namelen_out); #endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f7dc9b5..cd7c939 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) err = do_statfs(dentry->d_sb->s_fs_info, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), - &sf->f_namelen, sf->f_spare); + &sf->f_namelen); if (err) return err; sf->f_blocks = f_blocks; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 6777aa0..8d02683 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -364,8 +364,7 @@ int rename_file(char *from, char *to) int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out) + void *fsid_out, int fsid_size, long *namelen_out) { struct statfs64 buf; int err; @@ -384,10 +383,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out, sizeof(buf.f_fsid) > fsid_size ? fsid_size : sizeof(buf.f_fsid)); *namelen_out = buf.f_namelen; - spare_out[0] = buf.f_spare[0]; - spare_out[1] = buf.f_spare[1]; - spare_out[2] = buf.f_spare[2]; - spare_out[3] = buf.f_spare[3]; - spare_out[4] = buf.f_spare[4]; + return 0; } -- cgit v0.10.2 From 4a3956c790290efeb647bbb0c3a90476bb57800e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 1 Oct 2010 14:20:22 -0700 Subject: vfs: introduce FMODE_UNSIGNED_OFFSET for allowing negative f_pos Now, rw_verify_area() checsk f_pos is negative or not. And if negative, returns -EINVAL. But, some special files as /dev/(k)mem and /proc//mem etc.. has negative offsets. And we can't do any access via read/write to the file(device). So introduce FMODE_UNSIGNED_OFFSET to allow negative file offsets. Signed-off-by: Wu Fengguang Signed-off-by: KAMEZAWA Hiroyuki Cc: Al Viro Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Al Viro diff --git a/drivers/char/mem.c b/drivers/char/mem.c index e985b1c..1256454 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -876,6 +876,10 @@ static int memory_open(struct inode *inode, struct file *filp) if (dev->dev_info) filp->f_mapping->backing_dev_info = dev->dev_info; + /* Is /dev/mem or /dev/kmem ? */ + if (dev->dev_info == &directly_mappable_cdev_bdi) + filp->f_mode |= FMODE_UNSIGNED_OFFSET; + if (dev->fops->open) return dev->fops->open(inode, filp); diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5d5f5..fb2a5ab 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { file->private_data = (void*)((long)current->self_exec_id); + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index e757ef2..9cd9d14 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); +static int +__negative_fpos_check(struct file *file, loff_t pos, size_t count) +{ + /* + * pos or pos+count is negative here, check overflow. + * too big "count" will be caught in rw_verify_area(). + */ + if ((pos < 0) && (pos + count < pos)) + return -EOVERFLOW; + if (file->f_mode & FMODE_UNSIGNED_OFFSET) + return 0; + return -EINVAL; +} + /** * generic_file_llseek_unlocked - lockless generic llseek implementation * @file: file structure to seek on @@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) break; } - if (offset < 0 || offset > inode->i_sb->s_maxbytes) + if (offset < 0 && __negative_fpos_check(file, offset, 0)) + return -EINVAL; + if (offset > inode->i_sb->s_maxbytes) return -EINVAL; /* Special lock needed here? */ @@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) offset += file->f_pos; } retval = -EINVAL; - if (offset >= 0) { + if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -221,6 +237,7 @@ bad: } #endif + /* * rw_verify_area doesn't like huge counts. We limit * them to something that fits in "int" so that others @@ -238,8 +255,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count if (unlikely((ssize_t) count < 0)) return retval; pos = *ppos; - if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) - return retval; + if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { + retval = __negative_fpos_check(file, pos, count); + if (retval) + return retval; + } if (unlikely(inode->i_flock && mandatory_lock(inode))) { retval = locks_mandatory_area( diff --git a/include/linux/fs.h b/include/linux/fs.h index 0cd6821..7fc126d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -92,6 +92,9 @@ struct inodes_stat_t { /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)0x1000) +/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) -- cgit v0.10.2 From 3072b90c4726396e38160cc1f7c93191e2f4a566 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:49:17 +0200 Subject: hfs: use sync_dirty_buffer Use sync_dirty_buffer instead of the incorrect opencoding it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 4f55651..1efcbc7 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -254,17 +254,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb) sb->s_dirt = 1; } -static inline void hfs_buffer_sync(struct buffer_head *bh) -{ - while (buffer_locked(bh)) { - wait_on_buffer(bh); - } - if (buffer_dirty(bh)) { - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - } -} - #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ sector_t __block; \ diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 86428f5..1563d5c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb) mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); - hfs_buffer_sync(HFS_SB(sb)->mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->mdb_bh); } return 0; @@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb) HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); - hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { -- cgit v0.10.2 From bb1e5f8c022ae2f6bf1ec0c6b861db3c35976377 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 15 Oct 2010 16:32:27 -0700 Subject: fs: move exportfs since it is not a networking filesystem Move the EXPORTFS kconfig symbol out of the NETWORK_FILESYSTEMS block since it provides a library function that can be (and is) used by other (non-network) filesystems. This also eliminates a kconfig dependency warning: warning: (XFS_FS && BLOCK || NFSD && NETWORK_FILESYSTEMS && INET && FILE_LOCKING && BKL) selects EXPORTFS which has unmet direct dependencies (NETWORK_FILESYSTEMS) Signed-off-by: Randy Dunlap Cc: Dave Chinner Cc: Christoph Hellwig Cc: Alex Elder Cc: xfs-masters@oss.sgi.com Signed-off-by: Al Viro diff --git a/fs/Kconfig b/fs/Kconfig index 65781de..b5e582b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +config EXPORTFS + tristate + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -222,9 +225,6 @@ config LOCKD_V4 depends on FILE_LOCKING default y -config EXPORTFS - tristate - config NFS_ACL_SUPPORT tristate select FS_POSIX_ACL -- cgit v0.10.2 From 8358e7d71e712d3bd4e20ecf23e6fd7480c83684 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 16 Oct 2010 12:40:33 +0900 Subject: fs/buffer.c: remove duplicated assignment on b_private bh->b_private is initialized within init_buffer(), thus the assignment should be redundant. Remove it. Signed-off-by: Namhyung Kim Signed-off-by: Al Viro diff --git a/fs/buffer.c b/fs/buffer.c index a7b8f3c..74566c6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -905,7 +905,6 @@ try_again: bh->b_state = 0; atomic_set(&bh->b_count, 0); - bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ -- cgit v0.10.2 From e1455d1bdccbe056ba53479741b1452106ce59aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:46:53 +0200 Subject: update block_device_operations documentation Updated Documentation/filesystems/Locking to match the code. Signed-off-by: Christoph Hellwig diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2db4283..8a817f6 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -349,21 +349,36 @@ call this method upon the IO completion. --------------------------- block_device_operations ----------------------- prototypes: - int (*open) (struct inode *, struct file *); - int (*release) (struct inode *, struct file *); - int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + int (*open) (struct block_device *, fmode_t); + int (*release) (struct gendisk *, fmode_t); + int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); + void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); + int (*getgeo)(struct block_device *, struct hd_geometry *); + void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: - BKL bd_sem -open: yes yes -release: yes yes -ioctl: yes no + BKL bd_mutex +open: no yes +release: no yes +ioctl: no no +compat_ioctl: no no +direct_access: no no media_changed: no no +unlock_native_capacity: no no revalidate_disk: no no +getgeo: no no +swap_slot_free_notify: no no (see below) + +media_changed, unlock_native_capacity and revalidate_disk are called only from +check_disk_change(). + +swap_slot_free_notify is called with swap_lock and sometimes the page lock +held. -The last two are called only from check_disk_change(). --------------------------- file_operations ------------------------------- prototypes: -- cgit v0.10.2 From 306fb0979443419288594446a348155a8027dcf2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 23 Aug 2010 10:47:55 -0400 Subject: aio: bump i_count instead of using igrab The aio batching code is using igrab to get an extra reference on the inode so it can safely batch. igrab will go ahead and take the global inode spinlock, which can be a bottleneck on large machines doing lots of AIO. In this case, igrab isn't required because we already have a reference on the file handle. It is safe to just bump the i_count directly on the inode. Benchmarking shows this patch brings IOP/s on tons of flash up by about 2.5X. Signed-off-by: Chris Mason diff --git a/fs/aio.c b/fs/aio.c index 250b0a7..9e319a0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1543,7 +1543,20 @@ static void aio_batch_add(struct address_space *mapping, } abe = mempool_alloc(abe_pool, GFP_KERNEL); - BUG_ON(!igrab(mapping->host)); + + /* + * we should be using igrab here, but + * we don't want to hammer on the global + * inode spinlock just to take an extra + * reference on a file that we must already + * have a reference to. + * + * When we're called, we always have a reference + * on the file, so we must always have a reference + * on the inode, so igrab must always just + * bump the count and move on. + */ + atomic_inc(&mapping->host->i_count); abe->mapping = mapping; hlist_add_head(&abe->list, &batch_hash[bucket]); return; -- cgit v0.10.2 From a3314a0ed389f51a51695120b429eccd45b3a165 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 11 Oct 2010 22:38:00 +0900 Subject: lockdep: fixup checking of dir inode annotation Since inode->i_mode shares its bits for S_IFMT, S_ISDIR should be used to distinguish whether it is a dir or not. Signed-off-by: Namhyung Kim Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 3368abd..4f0a67c5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -663,7 +663,7 @@ EXPORT_SYMBOL(new_inode); void unlock_new_inode(struct inode *inode) { #ifdef CONFIG_DEBUG_LOCK_ALLOC - if (inode->i_mode & S_IFDIR) { + if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ -- cgit v0.10.2 From 309f77ad9bea057d55b04580b5a711e9e3727e83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 25 Oct 2010 15:01:12 +0900 Subject: fs/buffer.c: call __block_write_begin() if we have page If we have the appropriate page already, call __block_write_begin() directly instead of releasing and regrabbing it inside of block_write_begin(). Signed-off-by: Namhyung Kim Signed-off-by: Al Viro diff --git a/fs/buffer.c b/fs/buffer.c index 74566c6..d895d9f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2458,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping, *fsdata = NULL; if (page_has_buffers(page)) { - unlock_page(page); - page_cache_release(page); - *pagep = NULL; - return block_write_begin(mapping, pos, len, flags, pagep, - get_block); + ret = __block_write_begin(page, pos, len, get_block); + if (unlikely(ret)) + goto out_release; + return ret; } if (PageMappedToDisk(page)) -- cgit v0.10.2 From 8e3b9a072d071700e83e88b0bf59115c59042885 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 09:45:22 -0400 Subject: ext2_remount: don't bother with invalidate_inodes() It's pointless - we *do* have busy inodes (root directory, for one), so that call will fail and attempt to change XIP flag will be ignored. Signed-off-by: Al Viro diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 85df87d..09013206 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1221,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) } es = sbi->s_es; - if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != - (old_mount_opt & EXT2_MOUNT_XIP)) && - invalidate_inodes(sb)) { + if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " "xip flag with busy inodes while remounting"); sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; -- cgit v0.10.2 From 9dcefee508d547eed88f3c578dc92819bdeaa952 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 09:46:50 -0400 Subject: gfs2: invalidate_inodes() is no-op there In fill_super() we hadn't MS_ACTIVE set yet, so there won't be any inodes with zero i_count sitting around. In put_super() we already have MS_ACTIVE removed *and* we had called invalidate_inodes() since then. So again there won't be any inodes with zero i_count... Signed-off-by: Al Viro diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index aeafc23..cade1ac 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1219,7 +1219,6 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - invalidate_inodes(sb); gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); fail_sys: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 047d117..2b2c499 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -857,7 +857,6 @@ restart: gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ - invalidate_inodes(sdp->sd_vfs); gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); -- cgit v0.10.2 From 70fd136ecc44ca8c364f4412f447de1d940c3639 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 09:50:11 -0400 Subject: ntfs: don't call invalidate_inodes() We are in fill_super(); again, no inodes with zero i_count could be around until we set MS_ACTIVE. Signed-off-by: Al Viro diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 19c5180..d4dec2d 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3021,21 +3021,6 @@ iput_tmp_ino_err_out_now: if (vol->mft_ino && vol->mft_ino != tmp_ino) iput(vol->mft_ino); vol->mft_ino = NULL; - /* - * This is needed to get ntfs_clear_extent_inode() called for each - * inode we have ever called ntfs_iget()/iput() on, otherwise we A) - * leak resources and B) a subsequent mount fails automatically due to - * ntfs_iget() never calling down into our ntfs_read_locked_inode() - * method again... FIXME: Do we need to do this twice now because of - * attribute inodes? I think not, so leave as is for now... (AIA) - */ - if (invalidate_inodes(sb)) { - ntfs_error(sb, "Busy inodes left. This is most likely a NTFS " - "driver bug."); - /* Copied from fs/super.c. I just love this message. (-; */ - printk("NTFS: Busy inodes after umount. Self-destruct in 5 " - "seconds. Have a nice day...\n"); - } /* Errors at this stage are irrelevant. */ err_out_now: sb->s_fs_info = NULL; -- cgit v0.10.2 From 61ebdb4254e3ecb59022d2c730b57b04d0eeecc6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 10:48:15 -0400 Subject: smbfs never retains inodes with zero refcount in the first place Signed-off-by: Al Viro diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 8fc5e50..f6e9ee5 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -229,7 +229,6 @@ smb_invalidate_inodes(struct smb_sb_info *server) { VERBOSE("\n"); shrink_dcache_sb(SB_of(server)); - invalidate_inodes(SB_of(server)); } /* -- cgit v0.10.2 From a8dade34e3df581bc36ca2afe6e27055e178801c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 11:13:10 -0400 Subject: unexport invalidate_inodes Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 4f0a67c5..db7c74c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -417,7 +417,6 @@ int invalidate_inodes(struct super_block *sb) return busy; } -EXPORT_SYMBOL(invalidate_inodes); static int can_unuse(struct inode *inode) { diff --git a/fs/internal.h b/fs/internal.h index a6910e9..f6dce46 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -101,3 +101,8 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); + +/* + * inode.c + */ +extern int invalidate_inodes(struct super_block *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7fc126d..c3f6daf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2082,7 +2082,6 @@ extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif -extern int invalidate_inodes(struct super_block *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -- cgit v0.10.2 From 1d3382cbf02986e4833849f528d451367ea0b4cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Oct 2010 15:19:20 -0400 Subject: new helper: inode_unhashed() note: for race-free uses you inode_lock held Signed-off-by: Al Viro diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c038644..f6f2a0d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3849,7 +3849,7 @@ again: p = &root->inode_tree.rb_node; parent = NULL; - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) return; spin_lock(&root->inode_lock); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 29e3f40..39f44f2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -962,7 +962,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * dirty list. Add blockdev inodes as well. */ if (!S_ISBLK(inode->i_mode)) { - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) goto out; } if (inode->i_state & I_FREEING) diff --git a/fs/inode.c b/fs/inode.c index db7c74c..4440cf1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1094,7 +1094,7 @@ int insert_inode_locked(struct inode *inode) __iget(old); spin_unlock(&inode_lock); wait_on_inode(old); - if (unlikely(!hlist_unhashed(&old->i_hash))) { + if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } @@ -1133,7 +1133,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, __iget(old); spin_unlock(&inode_lock); wait_on_inode(old); - if (unlikely(!hlist_unhashed(&old->i_hash))) { + if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } @@ -1186,7 +1186,7 @@ EXPORT_SYMBOL(generic_delete_inode); */ int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || hlist_unhashed(&inode->i_hash); + return !inode->i_nlink || inode_unhashed(inode); } EXPORT_SYMBOL_GPL(generic_drop_inode); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index f7415de..5d04a78 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -422,7 +422,7 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { struct timespec now = current_fs_time(inode->i_sb); - if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink || + if (inode_unhashed(inode) || !inode->i_nlink || timespec_equal(&inode->i_ctime, &now)) return; diff --git a/include/linux/fs.h b/include/linux/fs.h index c3f6daf..78043da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -786,6 +786,11 @@ struct inode { void *i_private; /* fs or device private pointer */ }; +static inline int inode_unhashed(struct inode *inode) +{ + return hlist_unhashed(&inode->i_hash); +} + /* * inode->i_mutex nesting subclasses for the lock validator: * diff --git a/mm/shmem.c b/mm/shmem.c index 080b09a..27a5812 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2146,7 +2146,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, if (*len < 3) return 255; - if (hlist_unhashed(&inode->i_hash)) { + if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try @@ -2154,7 +2154,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); -- cgit v0.10.2 From 756acc2d61712a8cafe2aa6ad626c60a185d3645 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Oct 2010 15:23:40 -0400 Subject: list.h: new helper - hlist_add_fake() Make node look as if it was on hlist, with hlist_del() working correctly. Usable without any locking... Convert a couple of places where we want to do that to inode->i_hash. Signed-off-by: Al Viro diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7844928..8afd7e8 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -211,7 +211,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ - inode->i_hash.pprev = &inode->i_hash.next; + hlist_add_fake(&inode->i_hash); mark_inode_dirty(inode); out: diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index f8332dc..3a09423 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ - ip->i_hash.pprev = &ip->i_hash.next; + hlist_add_fake(&ip->i_hash); return (ip); } diff --git a/include/linux/list.h b/include/linux/list.h index 88a0006..9a5f8a7 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -636,6 +636,12 @@ static inline void hlist_add_after(struct hlist_node *n, next->next->pprev = &next->next; } +/* after that we'll appear to be on some hlist and hlist_del will work */ +static inline void hlist_add_fake(struct hlist_node *n) +{ + n->pprev = &n->next; +} + /* * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. -- cgit v0.10.2 From 89b0fc38cca4e6c92a90b58960881ffc5dddd89c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Oct 2010 15:26:21 -0400 Subject: switch hfs to hlist_add_fake() Signed-off-by: Al Viro diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 1efcbc7..c8cffb8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -147,8 +147,6 @@ struct hfs_sb_info { u16 blockoffset; int fs_div; - - struct hlist_head rsrc_inodes; }; #define HFS_FLG_BITMAP_DIRTY 0 diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 397b7ad..dffb4e9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, HFS_I(inode)->rsrc_inode = dir; HFS_I(dir)->rsrc_inode = inode; igrab(dir); - hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes); + hlist_add_fake(&inode->i_hash); mark_inode_dirty(inode); out: d_add(dentry, inode); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 3325416..6ee1586 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -382,7 +382,6 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; - INIT_HLIST_HEAD(&sbi->rsrc_inodes); res = -EINVAL; if (!parse_options((char *)data, sbi)) { -- cgit v0.10.2 From be1a16a0ae29a7c90081a657b64aa51cb1a65a27 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 5 Oct 2010 12:31:09 +0200 Subject: vfs: fix infinite loop caused by clone_mnt race If clone_mnt() happens while mnt_make_readonly() is running, the cloned mount might have MNT_WRITE_HOLD flag set, which results in mnt_want_write() spinning forever on this mount. Needs CAP_SYS_ADMIN to trigger deliberately and unlikely to happen accidentally. But if it does happen it can hang the machine. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro diff --git a/fs/namespace.c b/fs/namespace.c index 7ca5182..8a415c9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, goto out_free; } - mnt->mnt_flags = old->mnt_flags; + mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; atomic_inc(&sb->s_active); mnt->mnt_sb = sb; mnt->mnt_root = dget(root); -- cgit v0.10.2 From cffbc8aa334f55c9ed42d25202eb3ebf3a97c195 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 23 Oct 2010 05:03:02 -0400 Subject: fs: Convert nr_inodes and nr_unused to per-cpu counters The number of inodes allocated does not need to be tied to the addition or removal of an inode to/from a list. If we are not tied to a list lock, we could update the counters when inodes are initialised or destroyed, but to do that we need to convert the counters to be per-cpu (i.e. independent of a lock). This means that we have the freedom to change the list/locking implementation without needing to care about the counters. Based on a patch originally from Eric Dumazet. [AV: cleaned up a bit, fixed build breakage on weird configs Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 39f44f2..f04d04a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -723,7 +723,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) wb->last_old_flush = jiffies; nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + get_nr_dirty_inodes(); if (nr_pages) { struct wb_writeback_work work = { @@ -1090,8 +1090,7 @@ void writeback_inodes_sb(struct super_block *sb) WARN_ON(!rwsem_is_locked(&sb->s_umount)); - work.nr_pages = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes(); bdi_queue_work(sb->s_bdi, &work); wait_for_completion(&done); diff --git a/fs/inode.c b/fs/inode.c index 4440cf1..0d5aecc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -103,8 +103,41 @@ static DECLARE_RWSEM(iprune_sem); */ struct inodes_stat_t inodes_stat; +static struct percpu_counter nr_inodes __cacheline_aligned_in_smp; +static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp; + static struct kmem_cache *inode_cachep __read_mostly; +static inline int get_nr_inodes(void) +{ + return percpu_counter_sum_positive(&nr_inodes); +} + +static inline int get_nr_inodes_unused(void) +{ + return percpu_counter_sum_positive(&nr_inodes_unused); +} + +int get_nr_dirty_inodes(void) +{ + int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + return nr_dirty > 0 ? nr_dirty : 0; + +} + +/* + * Handle nr_inode sysctl + */ +#ifdef CONFIG_SYSCTL +int proc_nr_inodes(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + inodes_stat.nr_inodes = get_nr_inodes(); + inodes_stat.nr_unused = get_nr_inodes_unused(); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + static void wake_up_inode(struct inode *inode) { /* @@ -192,6 +225,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif + percpu_counter_inc(&nr_inodes); + return 0; out: return -ENOMEM; @@ -232,6 +267,7 @@ void __destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif + percpu_counter_dec(&nr_inodes); } EXPORT_SYMBOL(__destroy_inode); @@ -286,7 +322,7 @@ void __iget(struct inode *inode) if (!(inode->i_state & (I_DIRTY|I_SYNC))) list_move(&inode->i_list, &inode_in_use); - inodes_stat.nr_unused--; + percpu_counter_dec(&nr_inodes_unused); } void end_writeback(struct inode *inode) @@ -327,8 +363,6 @@ static void evict(struct inode *inode) */ static void dispose_list(struct list_head *head) { - int nr_disposed = 0; - while (!list_empty(head)) { struct inode *inode; @@ -344,11 +378,7 @@ static void dispose_list(struct list_head *head) wake_up_inode(inode); destroy_inode(inode); - nr_disposed++; } - spin_lock(&inode_lock); - inodes_stat.nr_inodes -= nr_disposed; - spin_unlock(&inode_lock); } /* @@ -357,7 +387,7 @@ static void dispose_list(struct list_head *head) static int invalidate_list(struct list_head *head, struct list_head *dispose) { struct list_head *next; - int busy = 0, count = 0; + int busy = 0; next = head->next; for (;;) { @@ -383,13 +413,11 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) list_move(&inode->i_list, dispose); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - count++; + percpu_counter_dec(&nr_inodes_unused); continue; } busy = 1; } - /* only unused inodes may be cached with i_count zero */ - inodes_stat.nr_unused -= count; return busy; } @@ -447,7 +475,6 @@ static int can_unuse(struct inode *inode) static void prune_icache(int nr_to_scan) { LIST_HEAD(freeable); - int nr_pruned = 0; int nr_scanned; unsigned long reap = 0; @@ -483,9 +510,8 @@ static void prune_icache(int nr_to_scan) list_move(&inode->i_list, &freeable); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - nr_pruned++; + percpu_counter_dec(&nr_inodes_unused); } - inodes_stat.nr_unused -= nr_pruned; if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else @@ -517,7 +543,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) return -1; prune_icache(nr); } - return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure; } static struct shrinker icache_shrinker = { @@ -594,7 +620,6 @@ static inline void __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, struct inode *inode) { - inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); list_add(&inode->i_sb_list, &sb->s_inodes); if (head) @@ -1214,7 +1239,7 @@ static void iput_final(struct inode *inode) if (!drop) { if (!(inode->i_state & (I_DIRTY|I_SYNC))) list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; + percpu_counter_inc(&nr_inodes_unused); if (sb->s_flags & MS_ACTIVE) { spin_unlock(&inode_lock); return; @@ -1226,14 +1251,13 @@ static void iput_final(struct inode *inode) spin_lock(&inode_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; - inodes_stat.nr_unused--; + percpu_counter_dec(&nr_inodes_unused); hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; spin_unlock(&inode_lock); evict(inode); spin_lock(&inode_lock); @@ -1502,6 +1526,8 @@ void __init inode_init(void) SLAB_MEM_SPREAD), init_once); register_shrinker(&icache_shrinker); + percpu_counter_init(&nr_inodes, 0); + percpu_counter_init(&nr_inodes_unused, 0); /* Hash may have been set up in inode_init_early */ if (!hashdist) diff --git a/fs/internal.h b/fs/internal.h index f6dce46..4cc67eb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -105,4 +105,5 @@ extern void release_open_intent(struct nameidata *); /* * inode.c */ +extern int get_nr_dirty_inodes(void); extern int invalidate_inodes(struct super_block *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 78043da..a3937a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2486,7 +2486,8 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); - +int proc_nr_inodes(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 694b140..99a510c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,14 +1340,14 @@ static struct ctl_table fs_table[] = { .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "file-nr", -- cgit v0.10.2 From 9e38d86ff2d8a8db99570e982230861046df32b5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 23 Oct 2010 06:55:17 -0400 Subject: fs: Implement lazy LRU updates for inodes Convert the inode LRU to use lazy updates to reduce lock and cacheline traffic. We avoid moving inodes around in the LRU list during iget/iput operations so these frequent operations don't need to access the LRUs. Instead, we defer the refcount checks to reclaim-time and use a per-inode state flag, I_REFERENCED, to tell reclaim that iget has touched the inode in the past. This means that only reclaim should be touching the LRU with any frequency, hence significantly reducing lock acquisitions and the amount contention on LRU updates. This also removes the inode_in_use list, which means we now only have one list for tracking the inode LRU status. This makes it much simpler to split out the LRU list operations under it's own lock. Signed-off-by: Nick Piggin Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f04d04a..e8f6529 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -408,16 +408,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completion. */ redirty_tail(inode); - } else if (atomic_read(&inode->i_count)) { - /* - * The inode is clean, inuse - */ - list_move(&inode->i_list, &inode_in_use); } else { /* - * The inode is clean, unused + * The inode is clean. At this point we either have + * a reference to the inode or it's on it's way out. + * No need to add it back to the LRU. */ - list_move(&inode->i_list, &inode_unused); + list_del_init(&inode->i_list); } } inode_sync_complete(inode); diff --git a/fs/inode.c b/fs/inode.c index 0d5aecc..3bdc76f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly; * allowing for low-overhead inode sync() operations. */ -LIST_HEAD(inode_in_use); -LIST_HEAD(inode_unused); +static LIST_HEAD(inode_unused); static struct hlist_head *inode_hashtable __read_mostly; /* @@ -291,6 +290,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_list); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); @@ -317,12 +317,23 @@ static void init_once(void *foo) */ void __iget(struct inode *inode) { - if (atomic_inc_return(&inode->i_count) != 1) - return; + atomic_inc(&inode->i_count); +} - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_in_use); - percpu_counter_dec(&nr_inodes_unused); +static void inode_lru_list_add(struct inode *inode) +{ + if (list_empty(&inode->i_list)) { + list_add(&inode->i_list, &inode_unused); + percpu_counter_inc(&nr_inodes_unused); + } +} + +static void inode_lru_list_del(struct inode *inode) +{ + if (!list_empty(&inode->i_list)) { + list_del_init(&inode->i_list); + percpu_counter_dec(&nr_inodes_unused); + } } void end_writeback(struct inode *inode) @@ -367,7 +378,7 @@ static void dispose_list(struct list_head *head) struct inode *inode; inode = list_first_entry(head, struct inode, i_list); - list_del(&inode->i_list); + list_del_init(&inode->i_list); evict(inode); @@ -413,7 +424,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) list_move(&inode->i_list, dispose); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - percpu_counter_dec(&nr_inodes_unused); + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + percpu_counter_dec(&nr_inodes_unused); continue; } busy = 1; @@ -448,7 +460,7 @@ int invalidate_inodes(struct super_block *sb) static int can_unuse(struct inode *inode) { - if (inode->i_state) + if (inode->i_state & ~I_REFERENCED) return 0; if (inode_has_buffers(inode)) return 0; @@ -460,17 +472,20 @@ static int can_unuse(struct inode *inode) } /* - * Scan `goal' inodes on the unused list for freeable ones. They are moved to - * a temporary list and then are freed outside inode_lock by dispose_list(). + * Scan `goal' inodes on the unused list for freeable ones. They are moved to a + * temporary list and then are freed outside inode_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their - * pagecache removed. We expect the final iput() on that inode to add it to - * the front of the inode_unused list. So look for it there and if the - * inode is still freeable, proceed. The right inode is found 99.9% of the - * time in testing on a 4-way. + * pagecache removed. If the inode has metadata buffers attached to + * mapping->private_list then try to remove them. * - * If the inode has metadata buffers attached to mapping->private_list then - * try to remove them. + * If the inode has the I_REFERENCED flag set, then it means that it has been + * used recently - the flag is set in iput_final(). When we encounter such an + * inode, clear the flag and move it to the back of the LRU so it gets another + * pass through the LRU before it gets reclaimed. This is necessary because of + * the fact we are doing lazy LRU updates to minimise lock contention so the + * LRU does not have strict ordering. Hence we don't want to reclaim inodes + * with this flag set because they are the inodes that are out of order. */ static void prune_icache(int nr_to_scan) { @@ -488,8 +503,21 @@ static void prune_icache(int nr_to_scan) inode = list_entry(inode_unused.prev, struct inode, i_list); - if (inode->i_state || atomic_read(&inode->i_count)) { + /* + * Referenced or dirty inodes are still in use. Give them + * another pass through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_list); + percpu_counter_dec(&nr_inodes_unused); + continue; + } + + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { list_move(&inode->i_list, &inode_unused); + inode->i_state &= ~I_REFERENCED; continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { @@ -620,7 +648,6 @@ static inline void __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, struct inode *inode) { - list_add(&inode->i_list, &inode_in_use); list_add(&inode->i_sb_list, &sb->s_inodes); if (head) hlist_add_head(&inode->i_hash, head); @@ -1237,10 +1264,11 @@ static void iput_final(struct inode *inode) drop = generic_drop_inode(inode); if (!drop) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_unused); - percpu_counter_inc(&nr_inodes_unused); if (sb->s_flags & MS_ACTIVE) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) { + inode_lru_list_add(inode); + } spin_unlock(&inode_lock); return; } @@ -1251,13 +1279,19 @@ static void iput_final(struct inode *inode) spin_lock(&inode_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; - percpu_counter_dec(&nr_inodes_unused); hlist_del_init(&inode->i_hash); } - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + + /* + * After we delete the inode from the LRU here, we avoid moving dirty + * inodes back onto the LRU now because I_FREEING is set and hence + * writeback_single_inode() won't move the inode around. + */ + inode_lru_list_del(inode); + + list_del_init(&inode->i_sb_list); spin_unlock(&inode_lock); evict(inode); spin_lock(&inode_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index a3937a8..876275f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1641,16 +1641,17 @@ struct super_operations { * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ -#define I_DIRTY_SYNC 1 -#define I_DIRTY_DATASYNC 2 -#define I_DIRTY_PAGES 4 +#define I_DIRTY_SYNC (1 << 0) +#define I_DIRTY_DATASYNC (1 << 1) +#define I_DIRTY_PAGES (1 << 2) #define __I_NEW 3 #define I_NEW (1 << __I_NEW) -#define I_WILL_FREE 16 -#define I_FREEING 32 -#define I_CLEAR 64 +#define I_WILL_FREE (1 << 4) +#define I_FREEING (1 << 5) +#define I_CLEAR (1 << 6) #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) +#define I_REFERENCED (1 << 8) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d64..242b6f8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -10,8 +10,6 @@ struct backing_dev_info; extern spinlock_t inode_lock; -extern struct list_head inode_in_use; -extern struct list_head inode_unused; /* * fs/fs-writeback.c -- cgit v0.10.2 From 4c51acbc66f754e536e1c9e3331656b69bce86d0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 23 Oct 2010 06:58:09 -0400 Subject: fs: Factor inode hash operations into functions Before replacing the inode hash locking with a more scalable mechanism, factor the removal of the inode from the hashes rather than open coding it in several places. Based on a patch originally from Nick Piggin. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 3bdc76f..5e5bafe 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -336,6 +336,58 @@ static void inode_lru_list_del(struct inode *inode) } } +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. + */ +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} +EXPORT_SYMBOL(__insert_inode_hash); + +/** + * __remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +static void __remove_inode_hash(struct inode *inode) +{ + hlist_del_init(&inode->i_hash); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} +EXPORT_SYMBOL(remove_inode_hash); + void end_writeback(struct inode *inode) { might_sleep(); @@ -383,7 +435,7 @@ static void dispose_list(struct list_head *head) evict(inode); spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); + __remove_inode_hash(inode); list_del_init(&inode->i_sb_list); spin_unlock(&inode_lock); @@ -634,16 +686,6 @@ repeat: return node ? inode : NULL; } -static unsigned long hash(struct super_block *sb, unsigned long hashval) -{ - unsigned long tmp; - - tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / - L1_CACHE_BYTES; - tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); - return tmp & I_HASHMASK; -} - static inline void __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, struct inode *inode) @@ -1194,36 +1236,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, } EXPORT_SYMBOL(insert_inode_locked4); -/** - * __insert_inode_hash - hash an inode - * @inode: unhashed inode - * @hashval: unsigned long value used to locate this object in the - * inode_hashtable. - * - * Add an inode to the inode hash for this superblock. - */ -void __insert_inode_hash(struct inode *inode, unsigned long hashval) -{ - struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); - spin_lock(&inode_lock); - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); -} -EXPORT_SYMBOL(__insert_inode_hash); - -/** - * remove_inode_hash - remove an inode from the hash - * @inode: inode to unhash - * - * Remove an inode from the superblock. - */ -void remove_inode_hash(struct inode *inode) -{ - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); -} -EXPORT_SYMBOL(remove_inode_hash); int generic_delete_inode(struct inode *inode) { @@ -1279,7 +1291,7 @@ static void iput_final(struct inode *inode) spin_lock(&inode_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; - hlist_del_init(&inode->i_hash); + __remove_inode_hash(inode); } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; @@ -1294,9 +1306,7 @@ static void iput_final(struct inode *inode) list_del_init(&inode->i_sb_list); spin_unlock(&inode_lock); evict(inode); - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); + remove_inode_hash(inode); wake_up_inode(inode); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); destroy_inode(inode); -- cgit v0.10.2 From ad5e195ac9fdf4e2b28b8cf14937e5b9384dac2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 07:00:16 -0400 Subject: fs: Stop abusing find_inode_fast in iunique Stop abusing find_inode_fast for iunique and opencode the inode hash walk. Introduce a new iunique_lock to protect the iunique counters once inode_lock is removed. Based on a patch originally from Nick Piggin. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 5e5bafe..a8035e8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -884,6 +884,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb, return inode; } +/* + * search the inode cache for a matching inode number. + * If we find one, then the inode number we are trying to + * allocate is not unique and so we should not use it. + * + * Returns 1 if the inode number is unique, 0 if it is not. + */ +static int test_inode_iunique(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *b = inode_hashtable + hash(sb, ino); + struct hlist_node *node; + struct inode *inode; + + hlist_for_each_entry(inode, node, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) + return 0; + } + + return 1; +} + /** * iunique - get a unique inode number * @sb: superblock @@ -905,19 +926,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ + static DEFINE_SPINLOCK(iunique_lock); static unsigned int counter; - struct inode *inode; - struct hlist_head *head; ino_t res; spin_lock(&inode_lock); + spin_lock(&iunique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; - head = inode_hashtable + hash(sb, res); - inode = find_inode_fast(sb, head, res); - } while (inode != NULL); + } while (!test_inode_iunique(sb, res)); + spin_unlock(&iunique_lock); spin_unlock(&inode_lock); return res; -- cgit v0.10.2 From f7899bd5472e8e99741369b4a32eca44e5282a85 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 07:09:06 -0400 Subject: fs: move i_count increments into find_inode/find_inode_fast Now that iunique is not abusing find_inode anymore we can move the i_ref increment back to where it belongs. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index a8035e8..78c41c6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -634,9 +634,6 @@ static struct shrinker icache_shrinker = { static void __wait_on_freeing_inode(struct inode *inode); /* * Called with the inode lock held. - * NOTE: we are not increasing the inode-refcount, you must call __iget() - * by hand after calling find_inode now! This simplifies iunique and won't - * add any additional branch in the common code. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, @@ -656,9 +653,10 @@ repeat: __wait_on_freeing_inode(inode); goto repeat; } - break; + __iget(inode); + return inode; } - return node ? inode : NULL; + return NULL; } /* @@ -681,9 +679,10 @@ repeat: __wait_on_freeing_inode(inode); goto repeat; } - break; + __iget(inode); + return inode; } - return node ? inode : NULL; + return NULL; } static inline void @@ -828,7 +827,6 @@ static struct inode *get_new_inode(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - __iget(old); spin_unlock(&inode_lock); destroy_inode(inode); inode = old; @@ -875,7 +873,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - __iget(old); spin_unlock(&inode_lock); destroy_inode(inode); inode = old; @@ -989,7 +986,6 @@ static struct inode *ifind(struct super_block *sb, spin_lock(&inode_lock); inode = find_inode(sb, head, test, data); if (inode) { - __iget(inode); spin_unlock(&inode_lock); if (likely(wait)) wait_on_inode(inode); @@ -1022,7 +1018,6 @@ static struct inode *ifind_fast(struct super_block *sb, spin_lock(&inode_lock); inode = find_inode_fast(sb, head, ino); if (inode) { - __iget(inode); spin_unlock(&inode_lock); wait_on_inode(inode); return inode; -- cgit v0.10.2 From 646ec4615cd05972581c9c5342ed7a1e77df17bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 07:15:32 -0400 Subject: fs: remove inode_add_to_list/__inode_add_to_list Split up inode_add_to_list/__inode_add_to_list. Locking for the two lists will be split soon so these helpers really don't buy us much anymore. The __ prefixes for the sb list helpers will go away soon, but until inode_lock is gone we'll need them to distinguish between the locked and unlocked variants. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 78c41c6..430d70f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -336,6 +336,28 @@ static void inode_lru_list_del(struct inode *inode) } } +static inline void __inode_sb_list_add(struct inode *inode) +{ + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); +} + +/** + * inode_sb_list_add - add inode to the superblock list of inodes + * @inode: inode to add + */ +void inode_sb_list_add(struct inode *inode) +{ + spin_lock(&inode_lock); + __inode_sb_list_add(inode); + spin_unlock(&inode_lock); +} +EXPORT_SYMBOL_GPL(inode_sb_list_add); + +static inline void __inode_sb_list_del(struct inode *inode) +{ + list_del_init(&inode->i_sb_list); +} + static unsigned long hash(struct super_block *sb, unsigned long hashval) { unsigned long tmp; @@ -356,9 +378,10 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { - struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); + struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); - hlist_add_head(&inode->i_hash, head); + hlist_add_head(&inode->i_hash, b); spin_unlock(&inode_lock); } EXPORT_SYMBOL(__insert_inode_hash); @@ -436,7 +459,7 @@ static void dispose_list(struct list_head *head) spin_lock(&inode_lock); __remove_inode_hash(inode); - list_del_init(&inode->i_sb_list); + __inode_sb_list_del(inode); spin_unlock(&inode_lock); wake_up_inode(inode); @@ -685,37 +708,6 @@ repeat: return NULL; } -static inline void -__inode_add_to_lists(struct super_block *sb, struct hlist_head *head, - struct inode *inode) -{ - list_add(&inode->i_sb_list, &sb->s_inodes); - if (head) - hlist_add_head(&inode->i_hash, head); -} - -/** - * inode_add_to_lists - add a new inode to relevant lists - * @sb: superblock inode belongs to - * @inode: inode to mark in use - * - * When an inode is allocated it needs to be accounted for, added to the in use - * list, the owning superblock and the inode hash. This needs to be done under - * the inode_lock, so export a function to do this rather than the inode lock - * itself. We calculate the hash list to add to here so it is all internal - * which requires the caller to have already set up the inode number in the - * inode to add. - */ -void inode_add_to_lists(struct super_block *sb, struct inode *inode) -{ - struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); - - spin_lock(&inode_lock); - __inode_add_to_lists(sb, head, inode); - spin_unlock(&inode_lock); -} -EXPORT_SYMBOL_GPL(inode_add_to_lists); - /** * new_inode - obtain an inode * @sb: superblock @@ -743,7 +735,7 @@ struct inode *new_inode(struct super_block *sb) inode = alloc_inode(sb); if (inode) { spin_lock(&inode_lock); - __inode_add_to_lists(sb, NULL, inode); + __inode_sb_list_add(inode); inode->i_ino = ++last_ino; inode->i_state = 0; spin_unlock(&inode_lock); @@ -812,7 +804,8 @@ static struct inode *get_new_inode(struct super_block *sb, if (set(inode, data)) goto set_failed; - __inode_add_to_lists(sb, head, inode); + hlist_add_head(&inode->i_hash, head); + __inode_sb_list_add(inode); inode->i_state = I_NEW; spin_unlock(&inode_lock); @@ -858,7 +851,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb, old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; - __inode_add_to_lists(sb, head, inode); + hlist_add_head(&inode->i_hash, head); + __inode_sb_list_add(inode); inode->i_state = I_NEW; spin_unlock(&inode_lock); @@ -1318,7 +1312,7 @@ static void iput_final(struct inode *inode) */ inode_lru_list_del(inode); - list_del_init(&inode->i_sb_list); + __inode_sb_list_del(inode); spin_unlock(&inode_lock); evict(inode); remove_inode_hash(inode); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index ec858e0..71d83c9 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -760,7 +760,9 @@ xfs_setup_inode( inode->i_ino = ip->i_ino; inode->i_state = I_NEW; - inode_add_to_lists(ip->i_mount->m_super, inode); + + inode_sb_list_add(inode); + insert_inode_hash(inode); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; diff --git a/include/linux/fs.h b/include/linux/fs.h index 876275f..d43e8b6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2171,7 +2171,6 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); -extern void inode_add_to_lists(struct super_block *, struct inode *); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); @@ -2202,9 +2201,11 @@ extern int file_remove_suid(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); extern void remove_inode_hash(struct inode *); -static inline void insert_inode_hash(struct inode *inode) { +static inline void insert_inode_hash(struct inode *inode) +{ __insert_inode_hash(inode, inode->i_ino); } +extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK extern void submit_bio(int, struct bio *); -- cgit v0.10.2 From 7de9c6ee3ecffd99e1628e81a5ea5468f7581a1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Oct 2010 11:11:40 -0400 Subject: new helper: ihold() Clones an existing reference to inode; caller must already hold one. Signed-off-by: Al Viro diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9e670d5..ef5905f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1789,9 +1789,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, kfree(st); } else { /* Caching disabled. No need to get upto date stat info. - * This dentry will be released immediately. So, just i_count++ + * This dentry will be released immediately. So, just hold the + * inode */ - atomic_inc(&old_dentry->d_inode->i_count); + ihold(old_dentry->d_inode); } dentry->d_op = old_dentry->d_op; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 3a0fdec..5d82890 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); inode->i_nlink = 2; - atomic_inc(&inode->i_count); + ihold(inode); } affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 0d38c09..5439e1b 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir, if (ret < 0) goto link_error; - atomic_inc(&vnode->vfs_inode.i_count); + ihold(&vnode->vfs_inode); d_instantiate(dentry, &vnode->vfs_inode); key_put(key); _leave(" = 0"); diff --git a/fs/aio.c b/fs/aio.c index 9e319a0..8c8f6c5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1553,10 +1553,9 @@ static void aio_batch_add(struct address_space *mapping, * * When we're called, we always have a reference * on the file, so we must always have a reference - * on the inode, so igrab must always just - * bump the count and move on. + * on the inode, so ihold() is safe here. */ - atomic_inc(&mapping->host->i_count); + ihold(mapping->host); abe->mapping = mapping; hlist_add_head(&abe->list, &batch_hash[bucket]); return; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e4b75d6..9c8e87b 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -111,10 +111,9 @@ struct file *anon_inode_getfile(const char *name, path.mnt = mntget(anon_inode_mnt); /* * We know the anon_inode inode count is always greater than zero, - * so we can avoid doing an igrab() and we can use an open-coded - * atomic_inc(). + * so ihold() is safe. */ - atomic_inc(&anon_inode_inode->i_count); + ihold(anon_inode_inode); path.dentry->d_op = &anon_inodefs_dentry_operations; d_instantiate(path.dentry, anon_inode_inode); diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d967e05..685ecff 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, inc_nlink(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(new, inode); mutex_unlock(&info->bfs_lock); return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index b737451..81972eb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -550,7 +550,7 @@ EXPORT_SYMBOL(bdget); */ struct block_device *bdgrab(struct block_device *bdev) { - atomic_inc(&bdev->bd_inode->i_count); + ihold(bdev->bd_inode); return bdev; } @@ -580,7 +580,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - atomic_inc(&bdev->bd_inode->i_count); + ihold(bdev->bd_inode); spin_unlock(&bdev_lock); return bdev; } @@ -591,12 +591,12 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); if (!inode->i_bdev) { /* - * We take an additional bd_inode->i_count for inode, + * We take an additional reference to bd_inode, * and it's released in clear_inode() of inode. * So, we can access it via ->i_mapping always * without igrab(). */ - atomic_inc(&bdev->bd_inode->i_count); + ihold(bdev->bd_inode); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f6f2a0d..64f99cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4758,7 +4758,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_set_trans_block_group(trans, dir); - atomic_inc(&inode->i_count); + ihold(inode); err = btrfs_add_nondir(trans, dentry, inode, 1, index); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 96fbeab..5d8b355 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -276,7 +276,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, } coda_dir_update_mtime(dir_inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(de, inode); inc_nlink(inode); return 0; diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index b7dd0c2..264e95d 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); return exofs_add_nondir(dentry, inode); } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 71efb0e..f8aecd2 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); err = ext2_add_link(dentry, inode); if (!err) { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 2b35ddb..bce9dce 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2260,7 +2260,7 @@ retry: inode->i_ctime = CURRENT_TIME_SEC; inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); err = ext3_add_entry(handle, dentry, inode); if (!err) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 314c0d3..bd39885 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2312,7 +2312,7 @@ retry: inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); - atomic_inc(&inode->i_count); + ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 48a274f..12cbea7 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -255,7 +255,7 @@ out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); if (!error) { - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); mark_inode_dirty(inode); } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d236d85..e318bbc 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -286,7 +286,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); - atomic_inc(&inode->i_count); + ihold(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); sbi->file_count++; diff --git a/fs/inode.c b/fs/inode.c index 430d70f..05ea293 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -320,6 +320,15 @@ void __iget(struct inode *inode) atomic_inc(&inode->i_count); } +/* + * get additional reference to inode; caller must already hold one. + */ +void ihold(struct inode *inode) +{ + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL(ihold); + static void inode_lru_list_add(struct inode *inode) { if (list_empty(&inode->i_list)) { diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index ed78a3cf3..79121aa 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - atomic_inc(&old_dentry->d_inode->i_count); + ihold(old_dentry->d_inode); } return ret; } @@ -864,7 +864,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); /* Might as well let the VFS know */ d_instantiate(new_dentry, old_dentry->d_inode); - atomic_inc(&old_dentry->d_inode->i_count); + ihold(old_dentry->d_inode); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index d945ea7..9466957 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1279,7 +1279,7 @@ int txCommit(tid_t tid, /* transaction identifier */ * lazy commit thread finishes processing */ if (tblk->xflag & COMMIT_DELETE) { - atomic_inc(&tblk->u.ip->i_count); + ihold(tblk->u.ip); /* * Avoid a rare deadlock * diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a9cf8e8..231ca4a 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry, ip->i_ctime = CURRENT_TIME; dir->i_ctime = dir->i_mtime = CURRENT_TIME; mark_inode_dirty(dir); - atomic_inc(&ip->i_count); + ihold(ip); iplist[0] = ip; iplist[1] = dir; diff --git a/fs/libfs.c b/fs/libfs.c index 2dbf487..304a513 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -255,7 +255,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); dget(dentry); d_instantiate(dentry, inode); return 0; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 1eb4e89..409dfd6 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir, return -EMLINK; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - atomic_inc(&inode->i_count); + ihold(inode); inode->i_nlink++; mark_inode_dirty_sync(inode); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index f3f3578..c0d35a3 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); return add_nondir(dentry, inode); } diff --git a/fs/namei.c b/fs/namei.c index f1ef97d..f7dbc06 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2285,7 +2285,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) goto slashes; inode = dentry->d_inode; if (inode) - atomic_inc(&inode->i_count); + ihold(inode); error = mnt_want_write(nd.path.mnt); if (error) goto exit2; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e257172..0fac7fe 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1580,7 +1580,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { - atomic_inc(&inode->i_count); + ihold(inode); d_add(dentry, inode); } return error; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index a70e446..ac7b814 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i iput(inode); return -ENOMEM; } - /* Circumvent igrab(): we know the inode is not being freed */ - atomic_inc(&inode->i_count); + ihold(inode); /* * Ensure that this dentry is invisible to d_find_alias(). * Otherwise, it may be spliced into the tree by diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 185d160..6e9557e 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -207,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); err = nilfs_add_nondir(dentry, inode); if (!err) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d4dec2d..d3fbe57 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2911,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto unl_upcase_iput_tmp_ino_err_out_now; } if ((sb->s_root = d_alloc_root(vol->root_ino))) { - /* We increment i_count simulating an ntfs_iget(). */ - atomic_inc(&vol->root_ino->i_count); + /* We grab a reference, simulating an ntfs_iget(). */ + ihold(vol->root_ino); ntfs_debug("Exiting, status successful."); /* Release the default upcase if it has no users. */ mutex_lock(&ntfs_lock); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e7bde21..ff5744e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -742,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_commit; } - atomic_inc(&inode->i_count); + ihold(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ee78d4a..ba5f51e 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME_SEC; reiserfs_update_sd(&th, inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_write_unlock(dir->i_sb); diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 33e047b..11e7f7d 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); return add_nondir(dentry, inode); } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 87ebcce..14f64b6 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, lock_2_inodes(dir, inode); inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); inode->i_ctime = ubifs_current_time(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index bf5fc67..6d8dc02 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); inode->i_ctime = current_fs_time(inode->i_sb); mark_inode_dirty(inode); - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); unlock_kernel(); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index b056f02..12f39b9 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + ihold(inode); error = ufs_add_nondir(dentry, inode); unlock_kernel(); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 71d83c9..96107ef 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -317,7 +317,7 @@ xfs_vn_link( if (unlikely(error)) return -error; - atomic_inc(&inode->i_count); + ihold(inode); d_instantiate(dentry, inode); return 0; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fac5229..fb2ca2e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -500,7 +500,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *); #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ - atomic_inc(&(VFS_I(ip)->i_count)); \ + ihold(VFS_I(ip)); \ trace_xfs_ihold(ip, _THIS_IP_); \ } while (0) diff --git a/include/linux/fs.h b/include/linux/fs.h index d43e8b6..bd6ae6c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2171,6 +2171,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); +extern void ihold(struct inode * inode); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e1e7b96..80b35ff 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -769,7 +769,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) inode = dentry->d_inode; if (inode) - atomic_inc(&inode->i_count); + ihold(inode); err = mnt_want_write(ipc_ns->mq_mnt); if (err) goto out_err; diff --git a/kernel/futex.c b/kernel/futex.c index a118bf1..6c683b3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); + ihold(key->shared.inode); break; case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); diff --git a/mm/shmem.c b/mm/shmem.c index 27a5812..d4e2852 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1903,7 +1903,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); /* New dentry reference */ + ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: diff --git a/net/socket.c b/net/socket.c index abf3e25..d223725 100644 --- a/net/socket.c +++ b/net/socket.c @@ -377,7 +377,7 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags) &socket_file_ops); if (unlikely(!file)) { /* drop dentry, keep inode */ - atomic_inc(&path.dentry->d_inode->i_count); + ihold(path.dentry->d_inode); path_put(&path); put_unused_fd(fd); return -ENFILE; -- cgit v0.10.2 From f991bd2e14210fb93d722cb23e54991de20e8a3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Oct 2010 11:18:01 -0400 Subject: fs: introduce a per-cpu last_ino allocator new_inode() dirties a contended cache line to get increasing inode numbers. This limits performance on workloads that cause significant parallel inode allocation. Solve this problem by using a per_cpu variable fed by the shared last_ino in batches of 1024 allocations. This reduces contention on the shared last_ino, and give same spreading ino numbers than before (i.e. same wraparound after 2^32 allocations). Signed-off-by: Eric Dumazet Signed-off-by: Nick Piggin Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 05ea293..46a3e12 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -717,6 +717,43 @@ repeat: return NULL; } +/* + * Each cpu owns a range of LAST_INO_BATCH numbers. + * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, + * to renew the exhausted range. + * + * This does not significantly increase overflow rate because every CPU can + * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is + * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the + * 2^32 range, and is a worst-case. Even a 50% wastage would only increase + * overflow rate by 2x, which does not seem too significant. + * + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ +#define LAST_INO_BATCH 1024 +static DEFINE_PER_CPU(unsigned int, last_ino); + +static unsigned int get_next_ino(void) +{ + unsigned int *p = &get_cpu_var(last_ino); + unsigned int res = *p; + +#ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic_t shared_last_ino; + int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); + + res = next - LAST_INO_BATCH; + } +#endif + + *p = ++res; + put_cpu_var(last_ino); + return res; +} + /** * new_inode - obtain an inode * @sb: superblock @@ -731,12 +768,6 @@ repeat: */ struct inode *new_inode(struct super_block *sb) { - /* - * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW - * error if st_ino won't fit in target struct field. Use 32bit counter - * here to attempt to avoid that. - */ - static unsigned int last_ino; struct inode *inode; spin_lock_prefetch(&inode_lock); @@ -745,7 +776,7 @@ struct inode *new_inode(struct super_block *sb) if (inode) { spin_lock(&inode_lock); __inode_sb_list_add(inode); - inode->i_ino = ++last_ino; + inode->i_ino = get_next_ino(); inode->i_state = 0; spin_unlock(&inode_lock); } -- cgit v0.10.2 From 85fe4025c616a7c0ed07bc2fc8c5371b07f3888c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 11:19:54 -0400 Subject: fs: do not assign default i_ino in new_inode Instead of always assigning an increasing inode number in new_inode move the call to assign it into those callers that actually need it. For now callers that need it is estimated conservatively, that is the call is added to all filesystems that do not assign an i_ino by themselves. For a few more filesystems we can avoid assigning any inode number given that they aren't user visible, and for others it could be done lazily when an inode number is actually needed, but that's left for later patches. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Signed-off-by: Al Viro diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index d13e726..12d5bf7 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -57,6 +57,7 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry, goto bail; } + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_private = data; diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index a0e6613..7e433d7 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -58,6 +58,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry, goto bail; } + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = 0; inode->i_gid = 0; diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index af2497a..0a53500 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -146,6 +146,7 @@ static struct inode *ibmasmfs_make_inode(struct super_block *sb, int mode) struct inode *ret = new_inode(sb); if (ret) { + ret->i_ino = get_next_ino(); ret->i_mode = mode; ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; } diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 95f711b..449de59 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -28,6 +28,7 @@ static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode) struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; } diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 095fa53..e2f63c0 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -276,6 +276,7 @@ static struct inode *usbfs_get_inode (struct super_block *sb, int mode, dev_t de struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c index e4f5950..e093fd8 100644 --- a/drivers/usb/gadget/f_fs.c +++ b/drivers/usb/gadget/f_fs.c @@ -980,6 +980,7 @@ ffs_sb_make_inode(struct super_block *sb, void *data, if (likely(inode)) { struct timespec current_time = CURRENT_TIME; + inode->i_ino = usbfs_get_inode(); inode->i_mode = perms->mode; inode->i_uid = perms->uid; inode->i_gid = perms->gid; diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index d1d72d9..ba145e7 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -1991,6 +1991,7 @@ gadgetfs_make_inode (struct super_block *sb, struct inode *inode = new_inode (sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = default_uid; inode->i_gid = default_gid; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 9c8e87b..5365527 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -193,6 +193,7 @@ static struct inode *anon_inode_mkinode(void) if (!inode) return ERR_PTR(-ENOMEM); + inode->i_ino = get_next_ino(); inode->i_fop = &anon_inode_fops; inode->i_mapping->a_ops = &anon_aops; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 821b2b9..ac87e49 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, inode->i_gid = sb->s_root->d_inode->i_gid; } inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = get_next_ino(); if (S_ISDIR(inf->mode)) { inode->i_nlink = 2; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 139fc80..29990f0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) struct inode * inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index cf78d44..253476d 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; inode->i_op = &configfs_inode_operations; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 30a87b3..a4ed838 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 19aa0d4..42f77b1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2373,6 +2373,7 @@ static int ext4_mb_init_backend(struct super_block *sb) printk(KERN_ERR "EXT4-fs: can't get new inode\n"); goto err_freesgi; } + sbi->s_buddy_cache->i_ino = get_next_ino(); EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 79d1b4e..8c04eac 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) struct inode *ip = NULL; if ((ip = new_inode(sbp))) { + ip->i_ino = get_next_ino(); vxfs_iinit(ip, vip); ip->i_mapping->a_ops = &vxfs_aops; } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 7367e17..4eba076 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -222,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (!inode) return NULL; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 113eba3..8d0607b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -455,6 +455,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; diff --git a/fs/inode.c b/fs/inode.c index 46a3e12..2cd2e48 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -735,7 +735,7 @@ repeat: #define LAST_INO_BATCH 1024 static DEFINE_PER_CPU(unsigned int, last_ino); -static unsigned int get_next_ino(void) +unsigned int get_next_ino(void) { unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; @@ -753,6 +753,7 @@ static unsigned int get_next_ino(void) put_cpu_var(last_ino); return res; } +EXPORT_SYMBOL(get_next_ino); /** * new_inode - obtain an inode @@ -776,7 +777,6 @@ struct inode *new_inode(struct super_block *sb) if (inode) { spin_lock(&inode_lock); __inode_sb_list_add(inode); - inode->i_ino = get_next_ino(); inode->i_state = 0; spin_unlock(&inode_lock); } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index a7ebd9d..75e115f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { ip = DLMFS_I(inode); + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); @@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, if (!inode) return NULL; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/fs/pipe.c b/fs/pipe.c index 37eb1eb..d2d7566 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void) if (!inode) goto fail_inode; + inode->i_ino = get_next_ino(); + pipe = alloc_pipe_info(inode); if (!pipe) goto fail_iput; diff --git a/fs/proc/base.c b/fs/proc/base.c index fb2a5ab..9883f1e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1603,6 +1603,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); + inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_op = &proc_def_inode_operations; @@ -2549,6 +2550,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, /* Initialize the inode */ ei = PROC_I(inode); + inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 2fc5255..b652cb0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (!inode) goto out; + inode->i_ino = get_next_ino(); + sysctl_head_get(head); ei = PROC_I(inode); ei->sysctl = head; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a5ebae7..67fadb1 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, struct inode * inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ba53128..63fd2c0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1580,6 +1580,7 @@ xfs_mapping_buftarg( XFS_BUFTARG_NAME(btp)); return ENOMEM; } + inode->i_ino = get_next_ino(); inode->i_mode = S_IFBLK; inode->i_bdev = bdev; inode->i_rdev = bdev->bd_dev; diff --git a/include/linux/fs.h b/include/linux/fs.h index bd6ae6c..4a573cf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2191,6 +2191,7 @@ extern struct inode * iget_locked(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); extern void unlock_new_inode(struct inode *); +extern unsigned int get_next_ino(void); extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 80b35ff..3a61ffe 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -116,6 +116,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b69b8d..9270d53 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -777,6 +777,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/mm/shmem.c b/mm/shmem.c index d4e2852..f6d350e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; diff --git a/net/socket.c b/net/socket.c index d223725..5cac1c7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -480,6 +480,7 @@ static struct socket *sock_alloc(void) sock = SOCKET_I(inode); kmemcheck_annotate_bitfield(sock, type); + inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 52f2524..7df92d2 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -445,6 +445,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) struct inode *inode = new_inode(sb); if (!inode) return NULL; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch(mode & S_IFMT) { diff --git a/security/inode.c b/security/inode.c index 8883986..cb8f47c 100644 --- a/security/inode.c +++ b/security/inode.c @@ -61,6 +61,7 @@ static struct inode *get_inode(struct super_block *sb, int mode, dev_t dev) struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 87e0556..55a755c 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -978,6 +978,7 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode) struct inode *ret = new_inode(sb); if (ret) { + ret->i_ino = get_next_ino(); ret->i_mode = mode; ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; } -- cgit v0.10.2 From be148247cfbe2422f5709e77d9c3e10b8a6394da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:21 -0400 Subject: fs: take dcache_lock inside __d_path All callers take dcache_lock just around the call to __d_path, so take the lock into it in preparation of getting rid of dcache_lock. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index 83293be..54f93f5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1994,7 +1994,7 @@ global_root: * Returns a pointer into the buffer or an error code if the * path was too long. * - * "buflen" should be positive. Caller holds the dcache_lock. + * "buflen" should be positive. * * If path is not reachable from the supplied root, then the value of * root is changed (without modifying refcounts). @@ -2006,10 +2006,12 @@ char *__d_path(const struct path *path, struct path *root, int error; prepend(&res, &buflen, "\0", 1); + spin_lock(&dcache_lock); error = prepend_path(path, root, &res, &buflen); + spin_unlock(&dcache_lock); + if (error) return ERR_PTR(error); - return res; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 0e7cb13..05d6b0e 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, if (size) { char *p; - spin_lock(&dcache_lock); p = __d_path(path, root, buf, size); - spin_unlock(&dcache_lock); res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); diff --git a/security/apparmor/path.c b/security/apparmor/path.c index 8239605..36cc0cc 100644 --- a/security/apparmor/path.c +++ b/security/apparmor/path.c @@ -72,10 +72,8 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, path_get(&root); } - spin_lock(&dcache_lock); tmp = root; res = __d_path(path, &tmp, buf, buflen); - spin_unlock(&dcache_lock); *name = res; /* handle error conditions - and still allow a partial path to diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index ed8ccd6..1d0bf8f 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -127,10 +127,8 @@ char *tomoyo_realpath_from_path(struct path *path) /* If we don't have a vfsmount, we can't calculate. */ if (!path->mnt) break; - spin_lock(&dcache_lock); /* go to whatever namespace root we are under */ pos = __d_path(path, &ns_root, buf, buf_len); - spin_unlock(&dcache_lock); /* Prepend "/proc" prefix if using internal proc vfs mount. */ if (!IS_ERR(pos) && (path->mnt->mnt_flags & MNT_INTERNAL) && (path->mnt->mnt_sb->s_magic == PROC_SUPER_MAGIC)) { -- cgit v0.10.2 From 9c82ab9c9e16cb9edf17bd0d31f3d6904afce04f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:22 -0400 Subject: fs: simplify __d_free Remove d_callback and always call __d_free with a RCU head. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index 54f93f5..02875395 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -67,20 +67,16 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static void __d_free(struct dentry *dentry) +static void __d_free(struct rcu_head *head) { + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + WARN_ON(!list_empty(&dentry->d_alias)); if (dname_external(dentry)) kfree(dentry->d_name.name); kmem_cache_free(dentry_cache, dentry); } -static void d_callback(struct rcu_head *head) -{ - struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu); - __d_free(dentry); -} - /* * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry * inside dcache_lock. @@ -91,9 +87,9 @@ static void d_free(struct dentry *dentry) dentry->d_op->d_release(dentry); /* if dentry was never inserted into hash, immediate free is OK */ if (hlist_unhashed(&dentry->d_hash)) - __d_free(dentry); + __d_free(&dentry->d_u.d_rcu); else - call_rcu(&dentry->d_u.d_rcu, d_callback); + call_rcu(&dentry->d_u.d_rcu, __d_free); } /* -- cgit v0.10.2 From 312d3ca856d369bb04d0443846b85b4cdde6fa8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:23 -0400 Subject: fs: use percpu counter for nr_dentry and nr_dentry_unused The nr_dentry stat is a globally touched cacheline and atomic operation twice over the lifetime of a dentry. It is used for the benfit of userspace only. Turn it into a per-cpu counter and always decrement it in d_free instead of doing various batching operations to reduce lock hold times in the callers. Based on an earlier patch from Nick Piggin . Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index 02875395..c37a656 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -67,6 +67,19 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; +static struct percpu_counter nr_dentry __cacheline_aligned_in_smp; +static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp; + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry); + dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); @@ -78,13 +91,14 @@ static void __d_free(struct rcu_head *head) } /* - * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry - * inside dcache_lock. + * no dcache_lock, please. */ static void d_free(struct dentry *dentry) { + percpu_counter_dec(&nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); + /* if dentry was never inserted into hash, immediate free is OK */ if (hlist_unhashed(&dentry->d_hash)) __d_free(&dentry->d_u.d_rcu); @@ -125,14 +139,14 @@ static void dentry_lru_add(struct dentry *dentry) { list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; + percpu_counter_inc(&nr_dentry_unused); } static void dentry_lru_add_tail(struct dentry *dentry) { list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; + percpu_counter_inc(&nr_dentry_unused); } static void dentry_lru_del(struct dentry *dentry) @@ -140,7 +154,7 @@ static void dentry_lru_del(struct dentry *dentry) if (!list_empty(&dentry->d_lru)) { list_del(&dentry->d_lru); dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + percpu_counter_dec(&nr_dentry_unused); } } @@ -149,7 +163,7 @@ static void dentry_lru_del_init(struct dentry *dentry) if (likely(!list_empty(&dentry->d_lru))) { list_del_init(&dentry->d_lru); dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + percpu_counter_dec(&nr_dentry_unused); } } @@ -168,7 +182,6 @@ static struct dentry *d_kill(struct dentry *dentry) struct dentry *parent; list_del(&dentry->d_u.d_child); - dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); if (IS_ROOT(dentry)) @@ -314,7 +327,6 @@ int d_invalidate(struct dentry * dentry) EXPORT_SYMBOL(d_invalidate); /* This should be called _only_ with dcache_lock held */ - static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); @@ -534,7 +546,7 @@ static void prune_dcache(int count) { struct super_block *sb, *p = NULL; int w_count; - int unused = dentry_stat.nr_unused; + int unused = percpu_counter_sum_positive(&nr_dentry_unused); int prune_ratio; int pruned; @@ -699,20 +711,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) * otherwise we ascend to the parent and move to the * next sibling if there is one */ if (!parent) - goto out; - + return; dentry = parent; - } while (list_empty(&dentry->d_subdirs)); dentry = list_entry(dentry->d_subdirs.next, struct dentry, d_u.d_child); } -out: - /* several dentries were freed, need to correct nr_dentry */ - spin_lock(&dcache_lock); - dentry_stat.nr_dentry -= detached; - spin_unlock(&dcache_lock); } /* @@ -896,12 +901,16 @@ EXPORT_SYMBOL(shrink_dcache_parent); */ static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { + int nr_unused; + if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; prune_dcache(nr); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + + nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); + return (nr_unused / 100) * sysctl_vfs_cache_pressure; } static struct shrinker dcache_shrinker = { @@ -968,9 +977,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) spin_lock(&dcache_lock); if (parent) list_add(&dentry->d_u.d_child, &parent->d_subdirs); - dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); + percpu_counter_inc(&nr_dentry); + return dentry; } EXPORT_SYMBOL(d_alloc); @@ -2417,6 +2427,9 @@ static void __init dcache_init(void) { int loop; + percpu_counter_init(&nr_dentry, 0); + percpu_counter_init(&nr_dentry_unused, 0); + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature diff --git a/include/linux/fs.h b/include/linux/fs.h index 4a573cf..d580599 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2490,6 +2490,8 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int proc_nr_dentry(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a510c..8b77ff5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1377,7 +1377,7 @@ static struct ctl_table fs_table[] = { .data = &dentry_stat, .maxlen = 6*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_dentry, }, { .procname = "overflowuid", -- cgit v0.10.2 From 265ac90230257e9c035e4b0c63a0c11c5336e93c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 10 Oct 2010 05:36:24 -0400 Subject: fs: improve DCACHE_REFERENCED usage dentry referenced bit is only set when installing the dentry back onto the LRU. However with lazy LRU, the dentry can already be on the LRU list at dput time, thus missing out on setting the referenced bit. Fix this. Signed-off-by: Nick Piggin Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index c37a656..1a976d4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,13 +246,16 @@ repeat: if (dentry->d_op->d_delete(dentry)) goto unhash_it; } + /* Unreachable? Get rid of it */ if (d_unhashed(dentry)) goto kill_it; - if (list_empty(&dentry->d_lru)) { - dentry->d_flags |= DCACHE_REFERENCED; + + /* Otherwise leave it cached and ensure it's on the LRU */ + dentry->d_flags |= DCACHE_REFERENCED; + if (list_empty(&dentry->d_lru)) dentry_lru_add(dentry); - } + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); return; -- cgit v0.10.2 From 3049cfe24ef3872ba74f90630356722cf988b80d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:25 -0400 Subject: fs: split __shrink_dcache_sb Currently __shrink_dcache_sb has an extremly awkward calling convention because it tries to please very different callers. Split out the main loop into a shrink_dentry_list helper, which gets called directly from shrink_dcache_sb for the cases where all dentries need to be pruned, or from __shrink_dcache_sb for pruning only a certain number of dentries. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index 1a976d4..e987ad5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -459,66 +459,20 @@ static void prune_one_dentry(struct dentry * dentry) } } -/* - * Shrink the dentry LRU on a given superblock. - * @sb : superblock to shrink dentry LRU. - * @count: If count is NULL, we prune all dentries on superblock. - * @flags: If flags is non-zero, we need to do special processing based on - * which flags are set. This means we don't need to maintain multiple - * similar copies of this loop. - */ -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +static void shrink_dentry_list(struct list_head *list) { - LIST_HEAD(referenced); - LIST_HEAD(tmp); struct dentry *dentry; - int cnt = 0; - BUG_ON(!sb); - BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); - spin_lock(&dcache_lock); - if (count != NULL) - /* called from prune_dcache() and shrink_dcache_parent() */ - cnt = *count; -restart: - if (count == NULL) - list_splice_init(&sb->s_dentry_lru, &tmp); - else { - while (!list_empty(&sb->s_dentry_lru)) { - dentry = list_entry(sb->s_dentry_lru.prev, - struct dentry, d_lru); - BUG_ON(dentry->d_sb != sb); - - spin_lock(&dentry->d_lock); - /* - * If we are honouring the DCACHE_REFERENCED flag and - * the dentry has this flag set, don't free it. Clear - * the flag and put it back on the LRU. - */ - if ((flags & DCACHE_REFERENCED) - && (dentry->d_flags & DCACHE_REFERENCED)) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - } else { - list_move_tail(&dentry->d_lru, &tmp); - spin_unlock(&dentry->d_lock); - cnt--; - if (!cnt) - break; - } - cond_resched_lock(&dcache_lock); - } - } - while (!list_empty(&tmp)) { - dentry = list_entry(tmp.prev, struct dentry, d_lru); + while (!list_empty(list)) { + dentry = list_entry(list->prev, struct dentry, d_lru); dentry_lru_del_init(dentry); - spin_lock(&dentry->d_lock); + /* * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. */ + spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count)) { spin_unlock(&dentry->d_lock); continue; @@ -527,13 +481,60 @@ restart: /* dentry->d_lock was dropped in prune_one_dentry() */ cond_resched_lock(&dcache_lock); } - if (count == NULL && !list_empty(&sb->s_dentry_lru)) - goto restart; - if (count != NULL) - *count = cnt; +} + +/** + * __shrink_dcache_sb - shrink the dentry LRU on a given superblock + * @sb: superblock to shrink dentry LRU. + * @count: number of entries to prune + * @flags: flags to control the dentry processing + * + * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + */ +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +{ + /* called from prune_dcache() and shrink_dcache_parent() */ + struct dentry *dentry; + LIST_HEAD(referenced); + LIST_HEAD(tmp); + int cnt = *count; + + spin_lock(&dcache_lock); + while (!list_empty(&sb->s_dentry_lru)) { + dentry = list_entry(sb->s_dentry_lru.prev, + struct dentry, d_lru); + BUG_ON(dentry->d_sb != sb); + + /* + * If we are honouring the DCACHE_REFERENCED flag and the + * dentry has this flag set, don't free it. Clear the flag + * and put it back on the LRU. + */ + if (flags & DCACHE_REFERENCED) { + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move(&dentry->d_lru, &referenced); + spin_unlock(&dentry->d_lock); + cond_resched_lock(&dcache_lock); + continue; + } + spin_unlock(&dentry->d_lock); + } + + list_move_tail(&dentry->d_lru, &tmp); + if (!--cnt) + break; + cond_resched_lock(&dcache_lock); + } + + *count = cnt; + shrink_dentry_list(&tmp); + if (!list_empty(&referenced)) list_splice(&referenced, &sb->s_dentry_lru); spin_unlock(&dcache_lock); + } /** @@ -619,13 +620,19 @@ static void prune_dcache(int count) * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * - * Shrink the dcache for the specified super block. This - * is used to free the dcache before unmounting a file - * system + * Shrink the dcache for the specified super block. This is used to free + * the dcache before unmounting a file system. */ -void shrink_dcache_sb(struct super_block * sb) +void shrink_dcache_sb(struct super_block *sb) { - __shrink_dcache_sb(sb, NULL, 0); + LIST_HEAD(tmp); + + spin_lock(&dcache_lock); + while (!list_empty(&sb->s_dentry_lru)) { + list_splice_init(&sb->s_dentry_lru, &tmp); + shrink_dentry_list(&tmp); + } + spin_unlock(&dcache_lock); } EXPORT_SYMBOL(shrink_dcache_sb); -- cgit v0.10.2 From a4633357ac610cd2f8740e28a31fc148a7960421 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:26 -0400 Subject: fs: clean up dentry lru modification Always do a list_del_init on the LRU to make sure the list_empty invariant for not beeing on the LRU always holds true, and fold dentry_lru_del_init into dentry_lru_del. Replace the dentry_lru_add_tail primitive with a dentry_lru_move_tail operations that simpler when the dentry already is one the list, which is always is. Move the list_empty into dentry_lru_add to fit the scheme of the other lru helpers, and simplify locking once we move to a separate LRU lock. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index e987ad5..cc2b938 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -133,37 +133,34 @@ static void dentry_iput(struct dentry * dentry) } /* - * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. + * dentry_lru_(add|del|move_tail) must be called with dcache_lock held. */ static void dentry_lru_add(struct dentry *dentry) { - list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - percpu_counter_inc(&nr_dentry_unused); -} - -static void dentry_lru_add_tail(struct dentry *dentry) -{ - list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - percpu_counter_inc(&nr_dentry_unused); + if (list_empty(&dentry->d_lru)) { + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + percpu_counter_inc(&nr_dentry_unused); + } } static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); + list_del_init(&dentry->d_lru); dentry->d_sb->s_nr_dentry_unused--; percpu_counter_dec(&nr_dentry_unused); } } -static void dentry_lru_del_init(struct dentry *dentry) +static void dentry_lru_move_tail(struct dentry *dentry) { - if (likely(!list_empty(&dentry->d_lru))) { - list_del_init(&dentry->d_lru); - dentry->d_sb->s_nr_dentry_unused--; - percpu_counter_dec(&nr_dentry_unused); + if (list_empty(&dentry->d_lru)) { + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + percpu_counter_inc(&nr_dentry_unused); + } else { + list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); } } @@ -253,8 +250,7 @@ repeat: /* Otherwise leave it cached and ensure it's on the LRU */ dentry->d_flags |= DCACHE_REFERENCED; - if (list_empty(&dentry->d_lru)) - dentry_lru_add(dentry); + dentry_lru_add(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); @@ -333,7 +329,7 @@ EXPORT_SYMBOL(d_invalidate); static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); return dentry; } @@ -452,7 +448,7 @@ static void prune_one_dentry(struct dentry * dentry) if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); dentry = d_kill(dentry); spin_lock(&dcache_lock); @@ -465,7 +461,7 @@ static void shrink_dentry_list(struct list_head *list) while (!list_empty(list)) { dentry = list_entry(list->prev, struct dentry, d_lru); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); /* * We found an inuse dentry which was not removed from @@ -650,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* detach this root from the system */ spin_lock(&dcache_lock); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); spin_unlock(&dcache_lock); @@ -664,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_lock(&dcache_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { - dentry_lru_del_init(loop); + dentry_lru_del(loop); __d_drop(loop); cond_resched_lock(&dcache_lock); } @@ -841,14 +837,15 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - dentry_lru_del_init(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - dentry_lru_add_tail(dentry); + dentry_lru_move_tail(dentry); found++; + } else { + dentry_lru_del(dentry); } /* -- cgit v0.10.2 From 3825bdb7ed920845961f32f364454bee5f469abb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:27 -0400 Subject: fs: use RCU read side protection in d_validate d_validate does a purely read lookup in the dentry hash, so use RCU read side locking instead of dcache_lock. Split out from a larget patch by Nick Piggin . Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index cc2b938..23702a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1491,33 +1491,26 @@ out: * This is used by ncpfs in its readdir implementation. * Zero is returned in the dentry is invalid. */ - -int d_validate(struct dentry *dentry, struct dentry *dparent) +int d_validate(struct dentry *dentry, struct dentry *parent) { - struct hlist_head *base; - struct hlist_node *lhp; + struct hlist_head *head = d_hash(parent, dentry->d_name.hash); + struct hlist_node *node; + struct dentry *d; /* Check whether the ptr might be valid at all.. */ if (!kmem_ptr_validate(dentry_cache, dentry)) - goto out; - - if (dentry->d_parent != dparent) - goto out; + return 0; + if (dentry->d_parent != parent) + return 0; - spin_lock(&dcache_lock); - base = d_hash(dparent, dentry->d_name.hash); - hlist_for_each(lhp,base) { - /* hlist_for_each_entry_rcu() not required for d_hash list - * as it is parsed under dcache_lock - */ - if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); - spin_unlock(&dcache_lock); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, node, head, d_hash) { + if (d == dentry) { + dget(dentry); return 1; } } - spin_unlock(&dcache_lock); -out: + rcu_read_unlock(); return 0; } EXPORT_SYMBOL(d_validate); -- cgit v0.10.2 From 0461ee2616252f1f6cec628990fa913a4282dcf7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Oct 2010 11:56:37 -0400 Subject: exportfs: use dget_parent Use dget_parent instead of opencoding it. This simplifies the code, but more importanly prepares for the more complicated locking for a parent dget in the dcache scale patch series. Signed-off-by: Al Viro diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index e9e1759..51b3040 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -74,21 +74,20 @@ static struct dentry * find_disconnected_root(struct dentry *dentry) { dget(dentry); - spin_lock(&dentry->d_lock); - while (!IS_ROOT(dentry) && - (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { - struct dentry *parent = dentry->d_parent; - dget(parent); - spin_unlock(&dentry->d_lock); + while (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + + if (!(parent->d_flags & DCACHE_DISCONNECTED)) { + dput(parent); + break; + } + dput(dentry); dentry = parent; - spin_lock(&dentry->d_lock); } - spin_unlock(&dentry->d_lock); return dentry; } - /* * Make sure target_dir is fully connected to the dentry tree. * -- cgit v0.10.2 From be9eee2e8b87e335531a3ae13abb8d26e834c438 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:29 -0400 Subject: smbfs: use dget_parent Use dget_parent instead of opencoding it. This simplifies the code, but more importanly prepares for the more complicated locking for a parent dget in the dcache scale patch series. Note that the d_time assignment in smb_renew_times moves out of d_lock, but it's a single atomic 32-bit value, and that's what other sites setting it do already. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 00a70ca..f678d42 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -406,21 +406,15 @@ void smb_renew_times(struct dentry * dentry) { dget(dentry); - spin_lock(&dentry->d_lock); - for (;;) { - struct dentry *parent; + dentry->d_time = jiffies; - dentry->d_time = jiffies; - if (IS_ROOT(dentry)) - break; - parent = dentry->d_parent; - dget(parent); - spin_unlock(&dentry->d_lock); + while (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); dput(dentry); dentry = parent; - spin_lock(&dentry->d_lock); + + dentry->d_time = jiffies; } - spin_unlock(&dentry->d_lock); dput(dentry); } diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 71c29b6..3dcf638 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -332,16 +332,15 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf, * and store it in reversed order [see reverse_string()] */ dget(entry); - spin_lock(&entry->d_lock); while (!IS_ROOT(entry)) { struct dentry *parent; if (maxlen < (3<d_lock); dput(entry); return -ENAMETOOLONG; } + spin_lock(&entry->d_lock); len = server->ops->convert(path, maxlen-2, entry->d_name.name, entry->d_name.len, server->local_nls, server->remote_nls); @@ -359,15 +358,12 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf, } *path++ = '\\'; maxlen -= len+1; - - parent = entry->d_parent; - dget(parent); spin_unlock(&entry->d_lock); + + parent = dget_parent(entry); dput(entry); entry = parent; - spin_lock(&entry->d_lock); } - spin_unlock(&entry->d_lock); dput(entry); reverse_string(buf, path-buf); -- cgit v0.10.2 From 4d4eb36679adbdd75495e1bbfe7ac40e4ae41dea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:30 -0400 Subject: fsnotify: use dget_parent Use dget_parent instead of opencoding it. This simplifies the code, but more importanly prepares for the more complicated locking for a parent dget in the dcache scale patch series. It means we do grab a reference to the parent now if need to be watched, but not with the specified mask. If this turns out to be a problem we'll have to revisit it, but for now let's keep as much as possible dcache internals inside dcache.[ch]. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3680242..4498a20 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -88,8 +88,6 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; - bool send = false; - bool should_update_children = false; if (!dentry) dentry = path->dentry; @@ -97,29 +95,12 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) return; - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; + parent = dget_parent(dentry); p_inode = parent->d_inode; - if (fsnotify_inode_watches_children(p_inode)) { - if (p_inode->i_fsnotify_mask & mask) { - dget(parent); - send = true; - } - } else { - /* - * The parent doesn't care about events on it's children but - * at least one child thought it did. We need to run all the - * children and update their d_flags to let them know p_inode - * doesn't care about them any more. - */ - dget(parent); - should_update_children = true; - } - - spin_unlock(&dentry->d_lock); - - if (send) { + if (unlikely(!fsnotify_inode_watches_children(p_inode))) + __fsnotify_update_child_dentry_flags(p_inode); + else if (p_inode->i_fsnotify_mask & mask) { /* we are notifying a parent so come up with the new mask which * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; @@ -130,13 +111,9 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) else fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); - dput(parent); } - if (unlikely(should_update_children)) { - __fsnotify_update_child_dentry_flags(p_inode); - dput(parent); - } + dput(parent); } EXPORT_SYMBOL_GPL(__fsnotify_parent); -- cgit v0.10.2 From 99a38919241fd051b8d93b2e4d0c05ef0556d795 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 19:07:20 +0200 Subject: fs: fix buffer invalidation in invalidate_list We must not call invalidate_inode_buffers in invalidate_list unless the inode can be reclaimed. If we remove the buffer association of a busy inode fsync won't find the buffers anymore. As invalidate_inode_buffers is called from various others sources than umount this actually does matter in practice. While at it change the loop to a more natural form and remove the WARN_ON for I_NEW, wich we already tested a few lines above. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 2cd2e48..4bedac3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -28,7 +28,6 @@ /* * This is needed for the following functions: * - inode_has_buffers - * - invalidate_inode_buffers * - invalidate_bdev * * FIXME: remove all knowledge of the buffer layer from this file @@ -503,16 +502,15 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) inode = list_entry(tmp, struct inode, i_sb_list); if (inode->i_state & I_NEW) continue; - invalidate_inode_buffers(inode); - if (!atomic_read(&inode->i_count)) { - list_move(&inode->i_list, dispose); - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - percpu_counter_dec(&nr_inodes_unused); + if (atomic_read(&inode->i_count)) { + busy = 1; continue; } - busy = 1; + + list_move(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + percpu_counter_dec(&nr_inodes_unused); } return busy; } -- cgit v0.10.2 From a5491e0c7bb7387e3e6ff9994d6dc2efc78af56c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Oct 2010 11:49:26 +1100 Subject: fs: switch bdev inode bdi's correctly bdev inodes can remain dirty even after their last close. Hence the BDI associated with the bdev->inode gets modified duringthe last close to point to the default BDI. However, the bdev inode still needs to be moved to the dirty lists of the new BDI, otherwise it will corrupt the writeback list is was left on. Add a new function bdev_inode_switch_bdi() to move all the bdi state from the old bdi to the new one safely. This is only a temporary measure until the bdev inode<->bdi lifecycle problems are sorted out. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/block_dev.c b/fs/block_dev.c index 81972eb..4e847e5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode) EXPORT_SYMBOL(I_BDEV); +/* + * move the inode from it's current bdi to the a new bdi. if the inode is dirty + * we need to move it onto the dirty list of @dst so that the inode is always + * on the right list. + */ +static void bdev_inode_switch_bdi(struct inode *inode, + struct backing_dev_info *dst) +{ + spin_lock(&inode_lock); + inode->i_data.backing_dev_info = dst; + if (inode->i_state & I_DIRTY) + list_move(&inode->i_list, &dst->wb.b_dirty); + spin_unlock(&inode_lock); +} + static sector_t max_block(struct block_device *bdev) { sector_t retval = ~((sector_t)0); @@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) bdi = &default_backing_dev_info; - bdev->bd_inode->i_data.backing_dev_info = bdi; + bdev_inode_switch_bdi(bdev->bd_inode, bdi); } if (bdev->bd_invalidated) rescan_partitions(disk, bdev); @@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; bdev->bd_contains = whole; - bdev->bd_inode->i_data.backing_dev_info = - whole->bd_inode->i_data.backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, + whole->bd_inode->i_data.backing_dev_info); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; -- cgit v0.10.2 From 7ccf19a8042e343f8159f8a5fdd6a9422aa90c78 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 21 Oct 2010 11:49:30 +1100 Subject: fs: inode split IO and LRU lists The use of the same inode list structure (inode->i_list) for two different list constructs with different lifecycles and purposes makes it impossible to separate the locking of the different operations. Therefore, to enable the separation of the locking of the writeback and reclaim lists, split the inode->i_list into two separate lists dedicated to their specific tracking functions. Signed-off-by: Nick Piggin Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/block_dev.c b/fs/block_dev.c index 4e847e5..dea3b62 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -59,7 +59,7 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_lock(&inode_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) - list_move(&inode->i_list, &dst->wb.b_dirty); + list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode_lock); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e8f6529..7a24cc9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -79,6 +79,11 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) return sb->s_bdi; } +static inline struct inode *wb_inode(struct list_head *head) +{ + return list_entry(head, struct inode, i_wb_list); +} + static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -172,11 +177,11 @@ static void redirty_tail(struct inode *inode) if (!list_empty(&wb->b_dirty)) { struct inode *tail; - tail = list_entry(wb->b_dirty.next, struct inode, i_list); + tail = wb_inode(wb->b_dirty.next); if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_list, &wb->b_dirty); + list_move(&inode->i_wb_list, &wb->b_dirty); } /* @@ -186,7 +191,7 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - list_move(&inode->i_list, &wb->b_more_io); + list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -227,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; while (!list_empty(delaying_queue)) { - inode = list_entry(delaying_queue->prev, struct inode, i_list); + inode = wb_inode(delaying_queue->prev); if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; - list_move(&inode->i_list, &tmp); + list_move(&inode->i_wb_list, &tmp); } /* just one sb in list, splice to dispatch_queue and we're done */ @@ -245,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue, /* Move inodes from one superblock together */ while (!list_empty(&tmp)) { - inode = list_entry(tmp.prev, struct inode, i_list); - sb = inode->i_sb; + sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { - inode = list_entry(pos, struct inode, i_list); + inode = wb_inode(pos); if (inode->i_sb == sb) - list_move(&inode->i_list, dispatch_queue); + list_move(&inode->i_wb_list, dispatch_queue); } } } @@ -414,7 +418,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * a reference to the inode or it's on it's way out. * No need to add it back to the LRU. */ - list_del_init(&inode->i_list); + list_del_init(&inode->i_wb_list); } } inode_sync_complete(inode); @@ -462,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, { while (!list_empty(&wb->b_io)) { long pages_skipped; - struct inode *inode = list_entry(wb->b_io.prev, - struct inode, i_list); + struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { if (only_this_sb) { @@ -533,8 +536,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, queue_io(wb, wbc->older_than_this); while (!list_empty(&wb->b_io)) { - struct inode *inode = list_entry(wb->b_io.prev, - struct inode, i_list); + struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!pin_sb_for_writeback(sb)) { @@ -672,8 +674,7 @@ static long wb_writeback(struct bdi_writeback *wb, */ spin_lock(&inode_lock); if (!list_empty(&wb->b_more_io)) { - inode = list_entry(wb->b_more_io.prev, - struct inode, i_list); + inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } @@ -987,7 +988,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_list, &bdi->wb.b_dirty); + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); } } out: diff --git a/fs/inode.c b/fs/inode.c index 4bedac3..09e2d7a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -71,7 +71,7 @@ static unsigned int i_hash_shift __read_mostly; * allowing for low-overhead inode sync() operations. */ -static LIST_HEAD(inode_unused); +static LIST_HEAD(inode_lru); static struct hlist_head *inode_hashtable __read_mostly; /* @@ -271,6 +271,7 @@ EXPORT_SYMBOL(__destroy_inode); static void destroy_inode(struct inode *inode) { + BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); @@ -289,7 +290,8 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); - INIT_LIST_HEAD(&inode->i_list); + INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_lru); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); @@ -330,16 +332,16 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { - if (list_empty(&inode->i_list)) { - list_add(&inode->i_list, &inode_unused); + if (list_empty(&inode->i_lru)) { + list_add(&inode->i_lru, &inode_lru); percpu_counter_inc(&nr_inodes_unused); } } static void inode_lru_list_del(struct inode *inode) { - if (!list_empty(&inode->i_list)) { - list_del_init(&inode->i_list); + if (!list_empty(&inode->i_lru)) { + list_del_init(&inode->i_lru); percpu_counter_dec(&nr_inodes_unused); } } @@ -460,8 +462,8 @@ static void dispose_list(struct list_head *head) while (!list_empty(head)) { struct inode *inode; - inode = list_first_entry(head, struct inode, i_list); - list_del_init(&inode->i_list); + inode = list_first_entry(head, struct inode, i_lru); + list_del_init(&inode->i_lru); evict(inode); @@ -507,8 +509,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) continue; } - list_move(&inode->i_list, dispose); inode->i_state |= I_FREEING; + + /* + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. + */ + list_move(&inode->i_lru, dispose); + list_del_init(&inode->i_wb_list); if (!(inode->i_state & (I_DIRTY | I_SYNC))) percpu_counter_dec(&nr_inodes_unused); } @@ -580,10 +588,10 @@ static void prune_icache(int nr_to_scan) for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; - if (list_empty(&inode_unused)) + if (list_empty(&inode_lru)) break; - inode = list_entry(inode_unused.prev, struct inode, i_list); + inode = list_entry(inode_lru.prev, struct inode, i_lru); /* * Referenced or dirty inodes are still in use. Give them @@ -591,14 +599,14 @@ static void prune_icache(int nr_to_scan) */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_list); + list_del_init(&inode->i_lru); percpu_counter_dec(&nr_inodes_unused); continue; } /* recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { - list_move(&inode->i_list, &inode_unused); + list_move(&inode->i_lru, &inode_lru); inode->i_state &= ~I_REFERENCED; continue; } @@ -611,15 +619,21 @@ static void prune_icache(int nr_to_scan) iput(inode); spin_lock(&inode_lock); - if (inode != list_entry(inode_unused.next, - struct inode, i_list)) + if (inode != list_entry(inode_lru.next, + struct inode, i_lru)) continue; /* wrong inode or list_empty */ if (!can_unuse(inode)) continue; } - list_move(&inode->i_list, &freeable); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + + /* + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. + */ + list_move(&inode->i_lru, &freeable); + list_del_init(&inode->i_wb_list); percpu_counter_dec(&nr_inodes_unused); } if (current_is_kswapd()) @@ -1340,15 +1354,16 @@ static void iput_final(struct inode *inode) inode->i_state &= ~I_WILL_FREE; __remove_inode_hash(inode); } + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; /* - * After we delete the inode from the LRU here, we avoid moving dirty - * inodes back onto the LRU now because I_FREEING is set and hence - * writeback_single_inode() won't move the inode around. + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. */ inode_lru_list_del(inode); + list_del_init(&inode->i_wb_list); __inode_sb_list_del(inode); spin_unlock(&inode_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index d580599..f300a65 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -723,7 +723,8 @@ struct posix_acl; struct inode { struct hlist_node i_hash; - struct list_head i_list; /* backing dev IO list */ + struct list_head i_wb_list; /* backing dev IO list */ + struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_dentry; unsigned long i_ino; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 65d4204..15d5097 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_wb = nr_dirty = nr_io = nr_more_io = 0; spin_lock(&inode_lock); - list_for_each_entry(inode, &wb->b_dirty, i_list) + list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_list) + list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_list) + list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; spin_unlock(&inode_lock); -- cgit v0.10.2 From d895a1c96af8c2a0f6a5e0119695a7c6b92df8db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Oct 2010 19:40:24 +0200 Subject: fs: do not drop inode_lock in dispose_list Despite the comment above it we can not safely drop the lock here. invalidate_list is called from many other places that just umount. Also switch to proper list macros now that we never drop the lock. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 09e2d7a..1bf2be4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -482,26 +482,10 @@ static void dispose_list(struct list_head *head) */ static int invalidate_list(struct list_head *head, struct list_head *dispose) { - struct list_head *next; + struct inode *inode, *next; int busy = 0; - next = head->next; - for (;;) { - struct list_head *tmp = next; - struct inode *inode; - - /* - * We can reschedule here without worrying about the list's - * consistency because the per-sb list of inodes must not - * change during umount anymore, and because iprune_sem keeps - * shrink_icache_memory() away. - */ - cond_resched_lock(&inode_lock); - - next = next->next; - if (tmp == head) - break; - inode = list_entry(tmp, struct inode, i_sb_list); + list_for_each_entry_safe(inode, next, head, i_sb_list) { if (inode->i_state & I_NEW) continue; if (atomic_read(&inode->i_count)) { -- cgit v0.10.2 From a031878670ac8fe466859d4c1506bd91ae48678c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Oct 2010 19:40:33 +0200 Subject: fs: fold invalidate_list into invalidate_inodes Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index 1bf2be4..f6b3b52 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -477,15 +477,24 @@ static void dispose_list(struct list_head *head) } } -/* - * Invalidate all inodes for a device. +/** + * invalidate_inodes - attempt to free all inodes on a superblock + * @sb: superblock to operate on + * + * Attempts to free all inodes for a given superblock. If there were any + * busy inodes return a non-zero value, else zero. */ -static int invalidate_list(struct list_head *head, struct list_head *dispose) +int invalidate_inodes(struct super_block *sb) { - struct inode *inode, *next; int busy = 0; + struct inode *inode, *next; + LIST_HEAD(dispose); - list_for_each_entry_safe(inode, next, head, i_sb_list) { + down_write(&iprune_sem); + + spin_lock(&inode_lock); + fsnotify_unmount_inodes(&sb->s_inodes); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & I_NEW) continue; if (atomic_read(&inode->i_count)) { @@ -499,34 +508,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) * Move the inode off the IO lists and LRU once I_FREEING is * set so that it won't get moved back on there if it is dirty. */ - list_move(&inode->i_lru, dispose); + list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); if (!(inode->i_state & (I_DIRTY | I_SYNC))) percpu_counter_dec(&nr_inodes_unused); } - return busy; -} - -/** - * invalidate_inodes - discard the inodes on a device - * @sb: superblock - * - * Discard all of the inodes for a given superblock. If the discard - * fails because there are busy inodes then a non zero value is returned. - * If the discard is successful all the inodes have been discarded. - */ -int invalidate_inodes(struct super_block *sb) -{ - int busy; - LIST_HEAD(throw_away); - - down_write(&iprune_sem); - spin_lock(&inode_lock); - fsnotify_unmount_inodes(&sb->s_inodes); - busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); - dispose_list(&throw_away); + dispose_list(&dispose); up_write(&iprune_sem); return busy; -- cgit v0.10.2 From 9843b76aae80293f5b5a0e275360627508595ce5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Oct 2010 19:40:46 +0200 Subject: fs: skip I_FREEING inodes in writeback_sb_inodes Skip I_FREEING inodes just like I_WILL_FREE and I_NEW when walking the writeback lists. Currenly this can't happen, but once we move from inode_lock to more fine grained locking we can have an inode that's still on the writeback lists but has I_FREEING set, and we absolutely need to skip it here, just like we do for all other inode list walks. Based on a patch from Dave Chinner. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7a24cc9..f6af81a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -487,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, return 0; } - if (inode->i_state & (I_NEW | I_WILL_FREE)) { + /* + * Don't bother with new inodes or inodes beeing freed, first + * kind does not need peridic writeout yet, and for the latter + * kind writeout is handled by the freer. + */ + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { requeue_io(inode); continue; } + /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -498,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, if (inode_dirtied_after(inode, wbc->wb_start)) return 1; - BUG_ON(inode->i_state & I_FREEING); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); -- cgit v0.10.2 From 63997e98a3be68d7cec806d22bf9b02b2e1daabb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Oct 2010 20:49:35 -0400 Subject: split invalidate_inodes() Pull removal of fsnotify marks into generic_shutdown_super(). Split umount-time work into a new function - evict_inodes(). Make sure that invalidate_inodes() will be able to cope with I_FREEING once we change locking in iput(). Signed-off-by: Al Viro diff --git a/fs/inode.c b/fs/inode.c index f6b3b52..a6d6068 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -478,6 +478,49 @@ static void dispose_list(struct list_head *head) } /** + * evict_inodes - evict all evictable inodes for a superblock + * @sb: superblock to operate on + * + * Make sure that no inodes with zero refcount are retained. This is + * called by superblock shutdown after having MS_ACTIVE flag removed, + * so any inode reaching zero refcount during or after that call will + * be immediately evicted. + */ +void evict_inodes(struct super_block *sb) +{ + struct inode *inode, *next; + LIST_HEAD(dispose); + + down_write(&iprune_sem); + + spin_lock(&inode_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + if (atomic_read(&inode->i_count)) + continue; + + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + WARN_ON(1); + continue; + } + + inode->i_state |= I_FREEING; + + /* + * Move the inode off the IO lists and LRU once I_FREEING is + * set so that it won't get moved back on there if it is dirty. + */ + list_move(&inode->i_lru, &dispose); + list_del_init(&inode->i_wb_list); + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + percpu_counter_dec(&nr_inodes_unused); + } + spin_unlock(&inode_lock); + + dispose_list(&dispose); + up_write(&iprune_sem); +} + +/** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on * @@ -493,9 +536,8 @@ int invalidate_inodes(struct super_block *sb) down_write(&iprune_sem); spin_lock(&inode_lock); - fsnotify_unmount_inodes(&sb->s_inodes); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - if (inode->i_state & I_NEW) + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; if (atomic_read(&inode->i_count)) { busy = 1; diff --git a/fs/internal.h b/fs/internal.h index 4cc67eb..ebad3b9 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -106,4 +106,5 @@ extern void release_open_intent(struct nameidata *); * inode.c */ extern int get_nr_dirty_inodes(void); +extern int evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 33297c0..21ed106 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -240,6 +240,7 @@ void fsnotify_unmount_inodes(struct list_head *list) { struct inode *inode, *next_i, *need_iput = NULL; + spin_lock(&inode_lock); list_for_each_entry_safe(inode, next_i, list, i_sb_list) { struct inode *need_iput_tmp; @@ -297,4 +298,5 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_lock(&inode_lock); } + spin_unlock(&inode_lock); } diff --git a/fs/super.c b/fs/super.c index 8819e3a..b9c9869 100644 --- a/fs/super.c +++ b/fs/super.c @@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb) get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; - /* bad name - it should be evict_inodes() */ - invalidate_inodes(sb); + fsnotify_unmount_inodes(&sb->s_inodes); + + evict_inodes(sb); if (sop->put_super) sop->put_super(sb); - /* Forget any remaining inodes */ - if (invalidate_inodes(sb)) { + if (!list_empty(&sb->s_inodes)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); -- cgit v0.10.2