From f2353d7bd4221dfe0e230c5be49038d4a57d97e9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Mar 2016 10:42:01 -0700 Subject: f2fs: give RO message when recovering superblock When one of superblocks is missing, f2fs recovers it with the valid one. But, even if f2fs is mounted as RO, we'd better notify that too. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 006f87d..ffe4616 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1298,6 +1298,9 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) struct buffer_head *bh; int err; + if (f2fs_readonly(sbi->sb) || bdev_read_only(sbi->sb->s_bdev)) + return -EROFS; + /* write back-up superblock first */ bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); if (!bh) @@ -1565,7 +1568,7 @@ try_onemore: kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { + if (recovery) { err = f2fs_commit_super(sbi, true); f2fs_msg(sb, KERN_INFO, "Try to recover %dth superblock, ret: %ld", -- cgit v0.10.2 From df728b0f6954c38545f4bf12dedeeb9e07469a94 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Mar 2016 17:05:27 -0700 Subject: f2fs: recover superblock at RW remounts This patch adds a sbi flag, SBI_NEED_SB_WRITE, which indicates it needs to recover superblock when (re)mounting as RW. This is set only when f2fs is mounted as RO. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7a4558d..7cb360e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -672,6 +672,7 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ }; enum { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ffe4616..f5fbbfd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -796,6 +796,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) set_sbi_flag(sbi, SBI_IS_DIRTY); } + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + sync_filesystem(sb); sbi->mount_opt.opt = 0; @@ -852,8 +861,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -998,11 +1008,12 @@ static int __f2fs_commit_super(struct buffer_head *bh, return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); } -static inline bool sanity_check_area_boundary(struct super_block *sb, +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, struct buffer_head *bh) { struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); @@ -1081,6 +1092,7 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, segment0_blkaddr) >> log_blocks_per_seg); if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); res = "internally"; } else { err = __f2fs_commit_super(bh, NULL); @@ -1098,11 +1110,12 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, return false; } -static int sanity_check_raw_super(struct super_block *sb, +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -1169,7 +1182,7 @@ static int sanity_check_raw_super(struct super_block *sb, } /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ - if (sanity_check_area_boundary(sb, bh)) + if (sanity_check_area_boundary(sbi, bh)) return 1; return 0; @@ -1239,10 +1252,11 @@ static void init_sb_info(struct f2fs_sb_info *sbi) * to get the first valid one. If any one of them is broken, we pass * them recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, int *valid_super_block, int *recovery) { + struct super_block *sb = sbi->sb; int block; struct buffer_head *bh; struct f2fs_super_block *super; @@ -1262,7 +1276,7 @@ static int read_raw_super_block(struct super_block *sb, } /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, bh)) { + if (sanity_check_raw_super(sbi, bh)) { f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); @@ -1298,8 +1312,11 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) struct buffer_head *bh; int err; - if (f2fs_readonly(sbi->sb) || bdev_read_only(sbi->sb->s_bdev)) + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); return -EROFS; + } /* write back-up superblock first */ bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); @@ -1343,6 +1360,8 @@ try_onemore: if (!sbi) return -ENOMEM; + sbi->sb = sb; + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -1358,7 +1377,7 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &valid_super_block, + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, &recovery); if (err) goto free_sbi; @@ -1393,7 +1412,6 @@ try_onemore: memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; sbi->raw_super = raw_super; sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); -- cgit v0.10.2 From 6781eabba1bdb133eb9125c4acf6704ccbe4df02 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Mar 2016 16:12:58 -0700 Subject: f2fs: give -EINVAL for norecovery and rw mount Once detecting something to recover, f2fs should stop mounting, given norecovery and rw mount options. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7cb360e..e1c07b6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1907,7 +1907,7 @@ void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 011942f..2c87c12 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -551,12 +551,13 @@ next: return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; block_t blkaddr; int err; + int ret = 0; bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -573,11 +574,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); - if (err) + if (err || list_empty(&inode_list)) goto out; - if (list_empty(&inode_list)) + if (check_only) { + ret = 1; goto out; + } need_writecp = true; @@ -625,5 +628,5 @@ out: } else { mutex_unlock(&sbi->cp_mutex); } - return err; + return ret ? ret: err; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f5fbbfd..8f9648f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1562,14 +1562,24 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%ld", err); goto free_kobj; } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); + goto free_kobj; + } } + /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); -- cgit v0.10.2 From faa0e55bba54b91c464850dc28376056d26fa310 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Mar 2016 10:29:39 -0700 Subject: f2fs: treat as a normal umount when remounting ro When user remounts f2fs as read-only, we can mark the checkpoint as umount. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f9648f..19a85cf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -791,11 +791,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; - if (*flags & MS_RDONLY) { - set_opt(sbi, FASTBOOT); - set_sbi_flag(sbi, SBI_IS_DIRTY); - } - /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -805,8 +800,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sync_filesystem(sb); - sbi->mount_opt.opt = 0; default_options(sbi); @@ -838,7 +831,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -848,6 +840,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. -- cgit v0.10.2 From 8c11a53fc2557eb247e3eaa4d554d45c1b55fc98 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 18 Mar 2016 09:46:10 -0700 Subject: f2fs: show current mount status This patch remains the current mount status to f2fs status info. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f4a61a5..818064b 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -216,8 +216,9 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n", - si->sbi->sb->s_bdev, i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", -- cgit v0.10.2 From 675f10bde6cc3874632a8f684df2a8a2a8ace76e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Feb 2016 18:29:18 +0800 Subject: f2fs: fix to convert inline directory correctly With below serials, we will lose parts of dirents: 1) mount f2fs with inline_dentry option 2) echo 1 > /sys/fs/f2fs/sdX/dir_level 3) mkdir dir 4) touch 180 files named [1-180] in dir 5) touch 181 in dir 6) echo 3 > /proc/sys/vm/drop_caches 7) ll dir ls: cannot access 2: No such file or directory ls: cannot access 4: No such file or directory ls: cannot access 5: No such file or directory ls: cannot access 6: No such file or directory ls: cannot access 8: No such file or directory ls: cannot access 9: No such file or directory ... total 360 drwxr-xr-x 2 root root 4096 Feb 19 15:12 ./ drwxr-xr-x 3 root root 4096 Feb 19 15:11 ../ -rw-r--r-- 1 root root 0 Feb 19 15:12 1 -rw-r--r-- 1 root root 0 Feb 19 15:12 10 -rw-r--r-- 1 root root 0 Feb 19 15:12 100 -????????? ? ? ? ? ? 101 -????????? ? ? ? ? ? 102 -????????? ? ? ? ? ? 103 ... The reason is: when doing the inline dir conversion, we didn't consider that directory has hierarchical hash structure which can be configured through sysfs interface 'dir_level'. By default, dir_level of directory inode is 0, it means we have one bucket in hash table located in first level, all dirents will be hashed in this bucket, so it has no problem for us to do the duplication simply between inline dentry page and converted normal dentry page. However, if we configured dir_level with the value N (greater than 0), it will expand the bucket number of first level hash table by 2^N - 1, it hashs dirents into different buckets according their hash value, if we still move all dirents to first bucket, it makes incorrent locating for inline dirents, the result is, although we can iterate all dirents through ->readdir, we can't stat some of them in ->lookup which based on hash table searching. This patch fixes this issue by rehashing dirents into correct position when converting inline directory. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index af81957..e90380d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -509,11 +515,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). - */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -526,28 +528,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct fscrypt_name fname; - struct qstr new_name; - int slots, err; - - err = fscrypt_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -556,10 +541,8 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -573,10 +556,8 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -596,7 +577,7 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -606,7 +587,7 @@ add_dentry: } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -628,7 +609,34 @@ fail: } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: + + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + struct qstr new_name; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + new_name.name = fname_name(&fname); + new_name.len = fname_len(&fname); + + err = -EAGAIN; + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode); + fscrypt_free_filename(&fname); f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; @@ -792,10 +800,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e1c07b6..3f15513 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1711,7 +1711,7 @@ struct dentry *f2fs_get_parent(struct dentry *child); */ extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - +unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, @@ -1732,6 +1732,8 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + struct inode *, nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a2fbe6f..7720565 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -355,7 +355,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. */ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -416,6 +416,98 @@ out: return err; } +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + if (unlikely(!de->name_len)) + d.max = -1; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = kmalloc(sizeof(struct f2fs_inline_dentry), + GFP_F2FS_ZERO); + if (!backup_dentry) + return -ENOMEM; + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + update_inode(dir, ipage); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + i_size_write(dir, MAX_INLINE_DATA); + update_inode(dir, ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index b90e9bd..4c02c65 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -508,4 +508,6 @@ enum { F2FS_FT_MAX }; +#define S_SHIFT 12 + #endif /* _LINUX_F2FS_FS_H */ -- cgit v0.10.2 From 4a6de50d5408cdc699588119e2338e580adc2b73 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Mar 2016 11:25:31 -0700 Subject: f2fs: use PGP_LOCK to check its truncation Previously, after trylock_page is succeeded, it doesn't check its mapping. In order to fix that, we can just give PGP_LOCK to pagecache_get_page. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1a33de9..ade221c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1202,13 +1202,10 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0); + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; - if (!trylock_page(page)) - goto release_out; - if (!PageUptodate(page)) goto page_out; @@ -1223,9 +1220,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) else set_page_dirty(page); page_out: - unlock_page(page); -release_out: - f2fs_put_page(page, 0); + f2fs_put_page(page, 1); iput_out: iput(inode); } -- cgit v0.10.2 From ff37355886ac2082cee594aa949c08e2cfb33aa0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 Mar 2016 16:13:45 -0700 Subject: f2fs: add BUG_ON to avoid unnecessary flow This patch adds BUG_ON instead of retrying loop. In the case of node pages, we already got this inode page, but unlocked it. By the fact that we don't truncate any node pages in operations, the page's mapping should be unchangeable. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ade221c..095fc2c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -832,7 +832,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); -restart: + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -896,10 +896,7 @@ skip_partial: if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } + BUG_ON(page->mapping != NODE_MAPPING(sbi)); f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); -- cgit v0.10.2 From de5307e46d28aa26ffd6f2048398b6303e134a67 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Apr 2016 11:51:51 -0700 Subject: f2fs: fix dropping inmemory pages in a wrong time When one reader closes its file while the other writer is doing atomic writes, f2fs_release_file drops atomic data resulting in an empty commit. This patch fixes this wrong commit problem by checking openess of the file. Process0 Process1 open file start atomic write write data read data close file f2fs_release_file() clear atomic data commit atomic write Reported-by: Miao Xie Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 90d1157..e10eb61 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1254,6 +1254,14 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { + /* + * f2fs_relase_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. + */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); -- cgit v0.10.2 From 26dc3d4424e9f4764d633bd8f9309a01e6f364dd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Apr 2016 13:15:10 -0700 Subject: f2fs: unset atomic/volatile flag in f2fs_release_file The atomic/volatile operation should be done in pair of start and commit ioctl. For example, if a killed process remains open-ended atomic operation, we should drop its flag as well as its atomic data. Otherwise, if sqlite initiates another operation which doesn't require atomic writes, it will lose every data, since f2fs still treats with them as atomic writes; nobody will trigger its commit. Reported-by: Miao Xie Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e10eb61..ed389f6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1266,6 +1266,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); @@ -1449,10 +1450,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); - } if (f2fs_is_volatile_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 540669d..299c784 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -239,6 +239,8 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); -- cgit v0.10.2 From 4da7bf5a4345f3ce9699476a8022f66cfb4a8ce9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 6 Apr 2016 11:27:03 -0700 Subject: f2fs: remove redundant condition check This patch resolves the redundant condition check reported by David. Reported-by: David Binderman Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0955312..b92782f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -211,7 +211,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5dafb9c..c29bcf4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1496,7 +1496,7 @@ restart: } else { /* hole case */ err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err || (!err && dn.data_blkaddr == NULL_ADDR)) { + if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); f2fs_lock_op(sbi); locked = true; -- cgit v0.10.2 From 58457f1c355545c468b8aed5c431d8a6bb71d35d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2016 11:52:30 -0700 Subject: f2fs: give -E2BIG for no space in xattr This patch returns -E2BIG if there is no space to add an xattr entry. This should fix generic/026 in xfstests as well. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 06a72dc..1529712 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -500,7 +500,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } -- cgit v0.10.2 From 63c52d7878903a014fa4c9075afd051b1e77597b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2016 14:11:03 -0700 Subject: f2fs: don't invalidate atomic page if successful If we committed atomic write successfully, we don't need to invalidate pages. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 299c784..770cdc9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -223,9 +223,10 @@ static int __revoke_inmem_pages(struct inode *inode, f2fs_put_dnode(&dn); } next: - ClearPageUptodate(page); + /* we don't need to invalidate this in the sccessful status */ + if (drop || recover) + ClearPageUptodate(page); set_page_private(page, 0); - ClearPageUptodate(page); f2fs_put_page(page, 1); list_del(&cur->list); -- cgit v0.10.2 From c27753d675fccd3b15a78621a2a66616507693d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2016 14:36:11 -0700 Subject: f2fs: flush dirty pages before starting atomic writes If somebody wrote some data before atomic writes, we should flush them in order to handle atomic data in a right period. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ed389f6..7de90e6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1369,7 +1369,16 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; + if (!get_dirty_pages(inode)) + return 0; + + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Unexpected flush for atomic writes: ino=%lu, npages=%u", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + return ret; } static int f2fs_ioc_commit_atomic_write(struct file *filp) -- cgit v0.10.2 From eca76e783cf5970db36edfda7e66487d897ea222 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 13 Apr 2016 16:14:38 -0700 Subject: f2fs: avoid needless lock for node pages when fsyncing a file When fsync is called, sync_node_pages finds a proper direct node pages to flush. But, it locks unrelated direct node pages together unnecessarily. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 095fc2c..cccee50 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1272,10 +1272,14 @@ next_step: * we should not skip writing node pages. */ lock_node: - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) + if (ino) { + if (ino_of_node(page) == ino) + lock_page(page); + else + continue; + } else if (!trylock_page(page)) { continue; + } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { continue_unlock: -- cgit v0.10.2 From e6e5f5610d585551785ec654b6db9277b19a0664 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 14 Apr 2016 16:48:52 -0700 Subject: f2fs: avoid writing 0'th page in volatile writes The first page of volatile writes usually contains a sort of header information which will be used for recovery. (e.g., journal header of sqlite) If this is written without other journal data, user needs to handle the stale journal information. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c29bcf4..e54489b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1177,8 +1177,10 @@ write: goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; /* Dentry blocks are controlled by checkpoint */ -- cgit v0.10.2 From 5268137564920843e581304d9bfb06fb9502cf24 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 13 Apr 2016 16:24:44 -0700 Subject: f2fs: split sync_node_pages with fsync_node_pages This patch splits the existing sync_node_pages into (f)sync_node_pages. The fsync_node_pages is used for f2fs_sync_file only. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b92782f..bf040b5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -892,7 +892,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, 0, &wbc); + err = sync_node_pages(sbi, &wbc); if (err) { f2fs_unlock_all(sbi); goto out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f15513..269abe5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1784,7 +1784,8 @@ void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7de90e6..3d53ee0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -256,7 +256,7 @@ go_write: goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); + fsync_node_pages(sbi, ino, &wbc); /* if cp_error was enabled, we should avoid infinite loop */ if (unlikely(f2fs_cp_error(sbi))) { diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0051a9..e820465 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -841,7 +841,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; - sync_node_pages(sbi, 0, &wbc); + sync_node_pages(sbi, &wbc); } else { f2fs_submit_merged_bio(sbi, DATA, WRITE); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cccee50..675b730 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1222,12 +1222,84 @@ iput_out: iput(inode); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, +int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, struct writeback_control *wbc) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 2 : 0; + int nwritten = 0; + + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + set_fsync_mark(page, 1); + if (IS_INODE(page)) + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + nwritten++; + + if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + unlock_page(page); + + if (--wbc->nr_to_write == 0) + break; + } + pagevec_release(&pvec); + cond_resched(); + + if (wbc->nr_to_write == 0) + break; + } + return nwritten; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = 0; int nwritten = 0; pagevec_init(&pvec, 0); @@ -1266,28 +1338,15 @@ next_step: if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ lock_node: - if (ino) { - if (ino_of_node(page) == ino) - lock_page(page); - else - continue; - } else if (!trylock_page(page)) { + if (!trylock_page(page)) continue; - } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { continue_unlock: unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ @@ -1295,7 +1354,7 @@ continue_unlock: } /* flush inline_data */ - if (!ino && is_inline_node(page)) { + if (is_inline_node(page)) { clear_inline_node(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); @@ -1308,17 +1367,8 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); @@ -1466,7 +1516,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + sync_node_pages(sbi, wbc); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; -- cgit v0.10.2 From c267ec1526da2d3b12c80a89ebd8eb9b6a01d636 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Apr 2016 09:25:04 -0700 Subject: f2fs: report unwritten status in fsync_node_pages The fsync_node_pages should return pass or failure so that user could know fsync is completed or not. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3d53ee0..60fd64c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -256,7 +256,9 @@ go_write: goto out; } sync_nodes: - fsync_node_pages(sbi, ino, &wbc); + ret = fsync_node_pages(sbi, ino, &wbc); + if (ret) + goto out; /* if cp_error was enabled, we should avoid infinite loop */ if (unlikely(f2fs_cp_error(sbi))) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 675b730..8a1e211 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1227,7 +1227,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, { pgoff_t index, end; struct pagevec pvec; - int nwritten = 0; + int ret = 0; pagevec_init(&pvec, 0); index = 0; @@ -1278,21 +1278,20 @@ continue_unlock: if (IS_INODE(page)) set_dentry_mark(page, need_dentry_mark(sbi, ino)); - nwritten++; - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { unlock_page(page); - - if (--wbc->nr_to_write == 0) break; + } } pagevec_release(&pvec); cond_resched(); - if (wbc->nr_to_write == 0) + if (ret) break; } - return nwritten; + return ret ? -EIO: 0; } int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) -- cgit v0.10.2 From 608514deba38c8611ad330d6a3c8e2b9a1f68e4b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Apr 2016 09:43:17 -0700 Subject: f2fs: set fsync mark only for the last dnode In order to give atomic writes, we should consider power failure during sync_node_pages in fsync. So, this patch marks fsync flag only in the last dnode block. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 269abe5..ca828b0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -159,7 +159,6 @@ struct fsync_inode_entry { struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; #define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) @@ -1784,7 +1783,8 @@ void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); void sync_inode_page(struct dnode_of_data *); -int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *, + bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 60fd64c..dc47d5c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -182,7 +182,8 @@ static void try_to_fix_pino(struct inode *inode) } } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; struct f2fs_inode_info *fi = F2FS_I(inode); @@ -256,7 +257,7 @@ go_write: goto out; } sync_nodes: - ret = fsync_node_pages(sbi, ino, &wbc); + ret = fsync_node_pages(sbi, ino, &wbc, atomic); if (ret) goto out; @@ -290,6 +291,11 @@ out: return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -1407,7 +1413,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: mnt_drop_write_file(filp); return ret; @@ -1465,7 +1471,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } mnt_drop_write_file(filp); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8a1e211..de070a5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1222,13 +1222,81 @@ iput_out: iput(inode); } +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index, end; + struct pagevec pvec; + struct page *last_page = NULL; + + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) + struct writeback_control *wbc, bool atomic) { pgoff_t index, end; struct pagevec pvec; int ret = 0; + struct page *last_page = NULL; + bool marked = false; + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: pagevec_init(&pvec, 0); index = 0; end = ULONG_MAX; @@ -1245,6 +1313,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, struct page *page = pvec.pages[i]; if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); pagevec_release(&pvec); return -EIO; } @@ -1264,33 +1333,54 @@ continue_unlock: if (ino_of_node(page) != ino) goto continue_unlock; - if (!PageDirty(page)) { + if (!PageDirty(page) && page != last_page) { /* someone wrote it for us */ goto continue_unlock; } f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) + set_dentry_mark(page, need_dentry_mark(sbi, ino)); + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); if (ret) { unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; break; } } pagevec_release(&pvec); cond_resched(); - if (ret) + if (ret || marked) break; } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } return ret ? -EIO: 0; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2c87c12..a646d3b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -257,11 +257,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -521,7 +518,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (entry->last_inode == blkaddr) + if (IS_INODE(page)) recover_inode(entry->inode, page); if (entry->last_dentry == blkaddr) { err = recover_dentry(entry->inode, page); -- cgit v0.10.2 From 6bfc49197eba070b799ab4ca8755d3a9fd1700da Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 Apr 2016 17:07:44 -0400 Subject: f2fs: issue cache flush on direct IO Under direct IO path with O_(D)SYNC, it needs to set proper APPEND or UPDATE flags, so taht f2fs_sync_file can make its data safe. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e54489b..38ce5d6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -673,6 +673,9 @@ next_block: err = reserve_new_block(&dn); } else { err = __allocate_data_block(&dn); + if (!err) + set_inode_flag(F2FS_I(inode), + FI_APPEND_WRITE); } if (err) goto sync_out; @@ -1685,8 +1688,12 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); - if (err < 0 && iov_iter_rw(iter) == WRITE) - f2fs_write_failed(mapping, offset + count); + if (iov_iter_rw(iter) == WRITE) { + if (err > 0) + set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + else if (err < 0) + f2fs_write_failed(mapping, offset + count); + } trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); -- cgit v0.10.2 From ae9b9a9d3c480376b8225c212735fb298d7c799f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 24 Apr 2016 21:54:51 +0800 Subject: MAINTAINERS: update my email address I've changed employer, update my email address to the new one. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/MAINTAINERS b/MAINTAINERS index 1d5b4be..a121f46 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4707,7 +4707,7 @@ F: include/linux/fscache*.h F2FS FILE SYSTEM M: Jaegeuk Kim M: Changman Lee -R: Chao Yu +R: Chao Yu L: linux-f2fs-devel@lists.sourceforge.net W: http://en.wikipedia.org/wiki/F2FS T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git -- cgit v0.10.2 From a4a13f582c6d36b78b1c0459ee0b28f17bb2fb06 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2016 22:22:20 +0800 Subject: f2fs: be aware of invalid filename length The filename length in dirent of may become zero-sized after random junk data injection, once encounter such dirent, find_target_dentry or f2fs_add_inline_entries will run into an infinite loop. So let f2fs being aware of that to avoid deadloop. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e90380d..3b1c14e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -101,11 +101,6 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } @@ -130,6 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, de = &d->dentry[bit_pos]; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -147,10 +147,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 7720565..e61084c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -303,11 +303,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(sbi, d.max < 0); return de; } @@ -437,6 +432,12 @@ static int f2fs_add_inline_entries(struct inode *dir, } de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + new_name.name = d.filename[bit_pos]; new_name.len = de->name_len; @@ -448,9 +449,6 @@ static int f2fs_add_inline_entries(struct inode *dir, if (err) goto punch_dentry_pages; - if (unlikely(!de->name_len)) - d.max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } return 0; -- cgit v0.10.2 From da011cc0da8cf4a60ddf4d2ae8b42902a3d71e5f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2016 21:40:15 +0800 Subject: f2fs: move node pages only in victim section during GC For foreground GC, we cache node blocks in victim section and set them dirty, then we call sync_node_pages to flush these node pages, but meanwhile, those node pages which does not locate in victim section will be flushed together, so more bandwidth and continuous free space would be occupied. So for this condition, it's better to leave those unrelated node page in cache for further write hit, and let CP or VM to flush them afterward. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ca828b0..6a48241 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1783,6 +1783,7 @@ void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); void sync_inode_page(struct dnode_of_data *); +void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e820465..19e9faf 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -465,15 +465,7 @@ next_step: continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE, true); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } @@ -834,18 +826,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_put_page(sum_page, 0); } - if (gc_type == FG_GC) { - if (type == SUM_TYPE_NODE) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, &wbc); - } else { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - } - } + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); blk_finish_plug(&plug); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index de070a5..f80cfb6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1222,6 +1222,37 @@ iput_out: iput(inode); } +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; -- cgit v0.10.2 From fe216c7a0ff993cdac885109d8544ba02e6f9127 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 27 Apr 2016 20:32:37 +0800 Subject: f2fs: fix to return 0 if err == -ENOENT in f2fs_readdir Commit 57b62d29ad5b384775974973087d47755a8c6fcc ("f2fs: fix to report error in f2fs_readdir") causes f2fs_readdir to return -ENOENT when get_lock_data_page returns -ENOENT. However, the original logic is to continue when get_lock_data_page returns -ENOENT, but it forgets to reset err to 0. This will cause getdents64 incorretly return -ENOENT when lastdirent is NULL in getdents64. This will lead to a wrong return value for syscall caller. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3b1c14e..dbfc1d1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -888,6 +888,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } + err = 0; out: fscrypt_fname_free_buffer(&fstr); return err; -- cgit v0.10.2 From 23dc974eed576b2464b222a892272073adf6a92c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Apr 2016 20:09:15 +0800 Subject: f2fs: fix to clear private data in page Private data in page should be removed during ->releasepage or ->invalidatepage, otherwise garbage data would be remained in that page. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 38ce5d6..888f178 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1723,6 +1723,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return; + set_page_private(page, 0); ClearPagePrivate(page); } @@ -1736,6 +1737,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } -- cgit v0.10.2 From c81ced05b2c7fc3c1d6db753a55b55733ffb1524 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Apr 2016 20:13:36 +0800 Subject: f2fs: fix to clear page private flag Commit 28bc106b2346 ("f2fs: support revoking atomic written pages") forgot to clear page private flag correctly, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 770cdc9..2e6f537 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -227,6 +227,7 @@ next: if (drop || recover) ClearPageUptodate(page); set_page_private(page, 0); + ClearPagePrivate(page); f2fs_put_page(page, 1); list_del(&cur->list); -- cgit v0.10.2 From 3f8ab270855b0b461995da5dc48dce9461c85d94 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Apr 2016 20:13:37 +0800 Subject: f2fs: factor out fsync inode entry operations Factor out fsync inode entry operations into {add,del}_fsync_inode. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a646d3b..58275a2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -67,6 +67,28 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } +static struct fsync_inode_entry *add_fsync_inode(struct list_head *head, + struct inode *inode) +{ + struct fsync_inode_entry *entry; + + entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + if (!entry) + return NULL; + + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + static int recover_dentry(struct inode *inode, struct page *ipage) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); @@ -198,6 +220,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; + struct inode *inode; struct page *page = NULL; block_t blkaddr; int err = 0; @@ -233,27 +256,27 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + inode = f2fs_iget(sbi->sb, ino_of_node(page)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); + + /* add this fsync inode to the list */ + entry = add_fsync_inode(head, inode); + if (!entry) { + err = -ENOMEM; + iput(inode); + break; + } } entry->blkaddr = blkaddr; @@ -274,11 +297,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -533,11 +553,8 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) break; } - if (entry->blkaddr == blkaddr) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + if (entry->blkaddr == blkaddr) + del_fsync_inode(entry); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); -- cgit v0.10.2 From b7a15f3dbe3f3d009360273b62a633e47a1a65d6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 27 Apr 2016 16:07:56 -0700 Subject: f2fs: introduce macros for proc entries This adds macros to be used multiple proc entries. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19a85cf..b006de6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -745,19 +745,22 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) -{ - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); -} - -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .owner = THIS_MODULE, \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ -- cgit v0.10.2 From f00d6fa727a4c4977241bd21910cafc139f77989 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 27 Apr 2016 16:29:05 -0700 Subject: f2fs: add proc entry to show valid block bitmap This patch adds a new proc entry to show segment information in more detail. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b006de6..90d4b86 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -539,6 +539,7 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -745,6 +746,30 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, "%x ", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + #define F2FS_PROC_FILE_DEF(_name) \ static int _name##_open_fs(struct inode *inode, struct file *file) \ { \ @@ -760,6 +785,7 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ }; F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); static void default_options(struct f2fs_sb_info *sbi) { @@ -1541,9 +1567,12 @@ try_onemore: if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1619,6 +1648,7 @@ free_kobj: free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); -- cgit v0.10.2 From 0414b004a894746921bbc05f05dced1e7907b092 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Apr 2016 15:16:42 -0700 Subject: f2fs: introduce f2fs_kmalloc to wrap kmalloc This patch adds f2fs_kmalloc. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c8f25f7..d757d79 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -115,7 +115,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * + f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * sizeof(struct f2fs_acl_entry), GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +175,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index dbfc1d1..6bd0595 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -805,7 +805,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); + de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS); if (!de_name.name) return false; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6a48241..879e4d7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1644,6 +1644,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static inline void *f2fs_kmalloc(size_t size, gfp_t flags) +{ + return kmalloc(size, flags); +} + static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) { void *ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 19e9faf..38d56f6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,7 +96,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e61084c..33830b2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -465,7 +465,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *backup_dentry; int err; - backup_dentry = kmalloc(sizeof(struct f2fs_inline_dentry), + backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); if (!backup_dentry) return -ENOMEM; -- cgit v0.10.2 From 300e129c15f0ed2f94482900a4cb65b28eb09d94 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Apr 2016 16:11:53 -0700 Subject: f2fs: use f2fs_grab_cache_page instead of grab_cache_page This patch converts grab_cache_page to f2fs_grab_cache_page. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bf040b5..9596d61 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -34,7 +34,7 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -64,7 +64,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, if (unlikely(!is_meta)) fio.rw &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -186,7 +186,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 33830b2..8a51955 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -161,7 +161,7 @@ int f2fs_convert_inline_inode(struct inode *inode) if (!f2fs_has_inline_data(inode)) return 0; - page = grab_cache_page(inode->i_mapping, 0); + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -358,7 +358,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f80cfb6..af010f5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -995,7 +995,7 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1087,7 +1087,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (apage) return; - apage = grab_cache_page(NODE_MAPPING(sbi), nid); + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; @@ -1128,7 +1128,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, return ERR_PTR(-ENOENT); f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -2012,7 +2012,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) return -ENOMEM; -- cgit v0.10.2 From 73faec4d99358b79815866dd660ae2f9f6f9110a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Apr 2016 15:34:32 -0700 Subject: f2fs: add mount option to select fault injection ratio This patch adds a mount option to select fault ratio. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1f8982a..378c221 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -94,3 +94,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 879e4d7..0cf2a26 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -56,6 +56,7 @@ #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 90d4b86..2234879 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,10 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION +u32 f2fs_fault_rate = 0; +#endif + /* f2fs-wide shrinker description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -68,6 +72,7 @@ enum { Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, + Opt_fault_injection, Opt_err, }; @@ -93,6 +98,7 @@ static match_table_t f2fs_tokens = { {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, + {Opt_fault_injection, "fault_injection=%u"}, {Opt_err, NULL}, }; @@ -300,6 +306,9 @@ static int parse_options(struct super_block *sb, char *options) char *p, *name; int arg = 0; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_fault_rate = 0; +#endif if (!options) return 0; @@ -433,6 +442,16 @@ static int parse_options(struct super_block *sb, char *options) case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_fault_rate = arg; +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", -- cgit v0.10.2 From 2c63fead9e372b3b65d1883bb174df6c9820f1dd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Apr 2016 15:49:56 -0700 Subject: f2fs: inject kmalloc failure This patch injects kmalloc failure given a fault injection rate. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0cf2a26..cf03c57 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -37,6 +37,31 @@ } while (0) #endif +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_MAX, +}; + +extern u32 f2fs_fault_rate; +extern atomic_t f2fs_ops; +extern char *fault_name[FAULT_MAX]; + +static inline bool time_to_inject(int type) +{ + atomic_inc(&f2fs_ops); + if (f2fs_fault_rate && (atomic_read(&f2fs_ops) >= f2fs_fault_rate)) { + atomic_set(&f2fs_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + /* * For mount options */ @@ -1647,6 +1672,10 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) static inline void *f2fs_kmalloc(size_t size, gfp_t flags) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_KMALLOC)) + return NULL; +#endif return kmalloc(size, flags); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2234879..9a56f54 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -41,6 +41,11 @@ static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION u32 f2fs_fault_rate = 0; +atomic_t f2fs_ops; + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", +}; #endif /* f2fs-wide shrinker description */ @@ -447,6 +452,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_fault_rate = arg; + atomic_set(&f2fs_ops, 0); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); -- cgit v0.10.2 From c41f3cc3ae34acdbcec328084b4b74f686c02f0d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Apr 2016 16:17:09 -0700 Subject: f2fs: inject page allocation failures This patch adds page allocation failures. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cf03c57..d550a95 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -40,6 +40,7 @@ #ifdef CONFIG_F2FS_FAULT_INJECTION enum { FAULT_KMALLOC, + FAULT_PAGE_ALLOC, FAULT_MAX, }; @@ -1296,6 +1297,14 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9a56f54..986d0da 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -45,6 +45,7 @@ atomic_t f2fs_ops; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", }; #endif -- cgit v0.10.2 From cb78942b821380913e6810375c9ce72858e64c4f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Apr 2016 16:29:22 -0700 Subject: f2fs: inject ENOSPC failures This patch injects ENOSPC failures. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9596d61..79da86d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -474,6 +474,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 6bd0595..50f42be 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -537,6 +537,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, } start: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d550a95..8ba2f92 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -41,6 +41,10 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, FAULT_MAX, }; @@ -1087,6 +1091,12 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, block_t valid_block_count; spin_lock(&sbi->stat_lock); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_BLOCK)) { + spin_unlock(&sbi->stat_lock); + return false; + } +#endif valid_block_count = sbi->total_valid_block_count + (block_t)count; if (unlikely(valid_block_count > sbi->user_block_count)) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index af010f5..78b98db 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1838,6 +1838,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 986d0da..87654f4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -46,6 +46,10 @@ atomic_t f2fs_ops; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", }; #endif -- cgit v0.10.2 From 221149c00e64c202e6e172a9c4efad142a6b610d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 2 May 2016 12:34:48 -0700 Subject: f2fs: revisit error handling flows This patch fixes a couple of bugs regarding to orphan inodes when handling errors. This tries to - call alloc_nid_done with add_orphan_inode in handle_failed_inode - let truncate blocks in f2fs_evict_inode - not make a bad inode due to i_mode change Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 50f42be..5373f33 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -391,9 +391,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); @@ -437,13 +442,12 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, return page; put_error: - f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + /* truncate empty dir pages */ truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_inode(inode); - remove_inode_page(inode); + + clear_nlink(inode); + update_inode(inode, page); + f2fs_put_page(page, 1); return ERR_PTR(err); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cb269c4..f4ac851 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -368,10 +368,7 @@ no_delete: if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) add_ino_entry(sbi, inode->i_ino, UPDATE_INO); if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); + alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(fi, FI_FREE_NID); } @@ -397,37 +394,32 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. valid block count, node block or nid). Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. - * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { add_orphan_inode(sbi, inode->i_ino); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(F2FS_I(inode), FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ -- cgit v0.10.2 From 74ef924167ad9d2897ffc0ef83c7ebf322fcc535 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 2 May 2016 22:09:56 -0700 Subject: f2fs: fix leak of orphan inode objects When unmounting filesystem, we should release all the ino entries. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 79da86d..432a6ba 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -449,12 +449,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_ino_entry(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -1106,7 +1106,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, discard_blk); - release_ino_entry(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) return -EIO; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8ba2f92..ea146a5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1911,7 +1911,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 87654f4..8a28f79 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -599,7 +599,7 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_ino_entry(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); -- cgit v0.10.2 From 4c0c294934b055aca765737c3b06698f51e9124d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 3 May 2016 09:22:18 -0700 Subject: f2fs: retry to truncate blocks in -ENOMEM case This patch modifies to retry truncating node blocks in -ENOMEM case. Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f4ac851..baf3a2a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -344,7 +344,7 @@ void f2fs_evict_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); set_inode_flag(fi, FI_NO_ALLOC); i_size_write(inode, 0); - +retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode, true); @@ -354,6 +354,12 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); -- cgit v0.10.2 From ae8d1db34ff57b11feceeaaa26940015b14e30fb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 May 2016 23:17:00 +0800 Subject: f2fs: remove unneeded readahead in find_fsync_dnodes In find_fsync_dnodes, get_tmp_page will read dnode page synchronously, previously, ra_meta_page did the same work, which is redundant, remove it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 58275a2..29a37aa 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -229,8 +229,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; -- cgit v0.10.2 From e3bc808ca869f0070d438257e4085602636498da Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 May 2016 23:19:46 +0800 Subject: f2fs: remove unneeded memset when updating xattr Each of fields in struct f2fs_xattr_entry will be assigned later, so previously we don't need to memset the struct. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1529712..55c69d3 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -528,7 +528,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. */ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); -- cgit v0.10.2 From bd933d4faedf3a79034e67e8032d153bf1afcd83 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 May 2016 23:19:47 +0800 Subject: f2fs: reuse get_extent_info Reuse get_extent_info for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c859bb0..5bfcdb9 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -196,8 +196,7 @@ bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) if (!i_ext || !i_ext->len) return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ea146a5..ccf8bf4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -428,11 +428,11 @@ struct f2fs_inode_info { }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, -- cgit v0.10.2 From f51b4ce6c1759a64d68d314545854dc226ab69a0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 May 2016 23:19:48 +0800 Subject: f2fs: shrink size of struct seg_entry Restructure struct seg_entry to eliminate holes in it, after that, in 32-bits machine, it reduces size from 32 bytes to 24 bytes; in 64-bits machine, it reduces size from 56 bytes to 40 bytes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 975c33d..7a756ff 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -158,16 +158,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; -- cgit v0.10.2 From 29234b1d6d2d4ecbd8f85831285c20329617859f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 4 May 2016 19:48:53 -0700 Subject: f2fs: don't worry about inode leak in evict_inode Even if an inode failed to release its blocks, it should be kept in an orphan inode list, so it will be released later. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index baf3a2a..689d691 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -377,20 +377,8 @@ no_delete: alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(fi, FI_FREE_NID); } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } - } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); -- cgit v0.10.2 From fb58ae22067e0595d974e3d856522c1ed6d2d7bf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 4 May 2016 09:58:10 -0700 Subject: f2fs: remove an obsolete variable This patch removes an obsolete variable used in add_free_nid. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 78b98db..264abe0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1690,7 +1690,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1704,8 +1703,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - allocated = true; - if (allocated) return 0; } -- cgit v0.10.2 From 43473f96453f0b075c480a26ec4fc846d5fb3bd4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 May 2016 19:13:02 +0800 Subject: f2fs: fix incorrect mapping in ->bmap Currently, generic_block_bmap is used in f2fs_bmap, its semantics is when the mapping is been found, return position of target physical block, otherwise return zero. But, previously, when there is no mapping info for specified logical block, f2fs_bmap will map target physical block to a uninitialized variable, which should be wrong. Fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 888f178..96b0353 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -648,6 +648,8 @@ next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; if (err == -ENOENT) { err = 0; if (map->m_next_pgofs) @@ -683,17 +685,18 @@ next_block: map->m_flags = F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; + } if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; } if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; + blkaddr != NEW_ADDR) goto sync_out; - } } } -- cgit v0.10.2 From 09210c973af30320edc03a6325422cdd0f03b580 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 May 2016 19:13:03 +0800 Subject: f2fs: avoid panic when truncating to max filesize The following panic occurs when truncating inode which has inline xattr to max filesize. [] get_dnode_of_data+0x4e/0x580 [f2fs] [] ? read_node_page+0x51/0x90 [f2fs] [] ? get_node_page.part.34+0xb9/0x170 [f2fs] [] truncate_blocks+0x131/0x3f0 [f2fs] [] f2fs_truncate+0x73/0x100 [f2fs] [] f2fs_setattr+0x62/0x2a0 [f2fs] [] notify_change+0x158/0x300 [] do_truncate+0x6b/0xa0 [] ? __sb_start_write+0x49/0x100 [] do_sys_ftruncate.constprop.12+0x118/0x170 [] SyS_ftruncate+0xe/0x10 [] tracesys+0xe1/0xe6 [] get_node_path+0x210/0x220 [f2fs] --[ end trace 5fea664dfbcc6625 ]--- The reason is truncate_blocks tries to truncate all node and data blocks start from specified block offset with value of (max filesize / block size), but actually, our valid max block offset is (max filesize / block size) - 1, so f2fs detects such invalid block offset with BUG_ON in truncation path. This patch lets f2fs skip truncating data which is exceeding max filesize. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dc47d5c..dd50f30 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -563,6 +563,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -604,7 +607,7 @@ free_next: out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); -- cgit v0.10.2 From b5a7aef1ef436ec005fef0efe31a676ec5f4ab31 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 4 May 2016 22:05:01 -0700 Subject: fscrypto/f2fs: allow fs-specific key prefix for fs encryption This patch allows fscrypto to handle a second key prefix given by filesystem. The main reason is to provide backward compatibility, since previously f2fs used "f2fs:" as a crypto prefix instead of "fscrypt:". Later, ext4 should also provide key_prefix() to give "ext4:". One concern decribed by Ted would be kinda double check overhead of prefixes. In x86, for example, validate_user_key consumes 8 ms after boot-up, which turns out derive_key_aes() consumed most of the time to load specific crypto module. After such the cold miss, it shows almost zero latencies, which treats as a negligible overhead. Note that request_key() detects wrong prefix in prior to derive_key_aes() even. Cc: Ted Tso Cc: stable@vger.kernel.org # v4.6 Signed-off-by: Jaegeuk Kim diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 06f5aa4..1ac263e 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -78,6 +78,67 @@ out: return res; } +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key *keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -91,12 +152,7 @@ static void put_crypt_info(struct fscrypt_info *ci) int get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; - u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct fscrypt_key *master_key; struct fscrypt_context ctx; - const struct user_key_payload *ukp; struct crypto_skcipher *ctfm; const char *cipher_str; u8 raw_key[FS_MAX_KEY_SIZE]; @@ -167,48 +223,24 @@ retry: memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; } - memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX, - FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } - down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - up_read(&keyring_key->sem); - goto out; - } - master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); - res = -ENOKEY; - up_read(&keyring_key->sem); + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { goto out; } - res = derive_key_aes(ctx.nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ccf8bf4..dbd277e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -711,6 +711,10 @@ enum { MAX_TIME, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -718,6 +722,10 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8a28f79..28c8992 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -964,6 +964,12 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -980,6 +986,7 @@ static unsigned f2fs_max_namelen(struct inode *inode) static struct fscrypt_operations f2fs_cryptops = { .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, @@ -1305,6 +1312,12 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif } /* diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index 6027f6b..cfa6cde 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -175,6 +175,7 @@ struct fscrypt_name { */ struct fscrypt_operations { int (*get_context)(struct inode *, void *, size_t); + int (*key_prefix)(struct inode *, u8 **); int (*prepare_context)(struct inode *); int (*set_context)(struct inode *, const void *, size_t, void *); int (*dummy_context)(struct inode *); -- cgit v0.10.2 From f61cce5b81f91ba336184008b24baec84afbb3dd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 May 2016 16:15:05 +0800 Subject: f2fs: fix inode cache leak When testing f2fs with inline_dentry option, generic/342 reports: VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day... After rmmod f2fs module, kenrel shows following dmesg: ============================================================================= BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown() ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080 CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276 c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463 616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065 Call Trace: [] dump_stack+0x5f/0x8f [] slab_err+0x76/0x80 [] ? __kmem_cache_shutdown+0x100/0x2f0 [] ? __kmem_cache_shutdown+0x100/0x2f0 [] __kmem_cache_shutdown+0x125/0x2f0 [] kmem_cache_destroy+0x158/0x1f0 [] ? mutex_unlock+0xd/0x10 [] exit_f2fs_fs+0x4b/0x5a8 [f2fs] [] SyS_delete_module+0x16c/0x1d0 [] ? do_fast_syscall_32+0x30/0x1c0 [] ? __this_cpu_preempt_check+0xf/0x20 [] ? trace_hardirqs_on_caller+0xdd/0x210 [] ? trace_hardirqs_off+0xb/0x10 [] do_fast_syscall_32+0xa1/0x1c0 [] sysenter_past_esp+0x45/0x74 INFO: Object 0xd1e6d9e0 @offset=6624 kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7 c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14 f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266 Call Trace: [] dump_stack+0x5f/0x8f [] kmem_cache_destroy+0x1e7/0x1f0 [] exit_f2fs_fs+0x4b/0x5a8 [f2fs] [] SyS_delete_module+0x16c/0x1d0 [] ? do_fast_syscall_32+0x30/0x1c0 [] ? __this_cpu_preempt_check+0xf/0x20 [] ? trace_hardirqs_on_caller+0xdd/0x210 [] ? trace_hardirqs_off+0xb/0x10 [] do_fast_syscall_32+0xa1/0x1c0 [] sysenter_past_esp+0x45/0x74 The reason is: in recovery flow, we use delayed iput mechanism for directory which has recovered dentry block. It means the reference of inode will be held until last dirty dentry page being writebacked. But when we mount f2fs with inline_dentry option, during recovery, dirent may only be recovered into dir inode page rather than dentry page, so there are no chance for us to release inode reference in ->writepage when writebacking last dentry page. We can call paired iget/iput explicityly for inline_dentry case, but for non-inline_dentry case, iput will call writeback_single_inode to write all data pages synchronously, but during recovery, ->writepages of f2fs skips writing all pages, result in losing dirent. This patch fixes this issue by obsoleting old mechanism, and introduce a new dir_list to hold all directory inodes which has recovered datas until finishing recovery. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 432a6ba..6402e93 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -794,19 +794,9 @@ void update_dirty_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - spin_lock(&sbi->inode_lock[DIR_INODE]); - __add_dirty_inode(inode, DIR_INODE); - spin_unlock(&sbi->inode_lock[DIR_INODE]); -} - void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && @@ -816,12 +806,6 @@ void remove_dirty_inode(struct inode *inode) spin_lock(&sbi->inode_lock[type]); __remove_dirty_inode(inode, type); spin_unlock(&sbi->inode_lock[type]); - - /* Only from the recovery routine */ - if (is_inode_flag_set(fi, FI_DELAY_IPUT)) { - clear_inode_flag(fi, FI_DELAY_IPUT); - iput(inode); - } } int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dbd277e..052f5a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1488,7 +1488,6 @@ enum { FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1928,7 +1927,6 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); void remove_dirty_inode(struct inode *); int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 29a37aa..2b25329 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -89,7 +89,8 @@ static void del_fsync_inode(struct fsync_inode_entry *entry) kmem_cache_free(fsync_entry_slab, entry); } -static int recover_dentry(struct inode *inode, struct page *ipage) +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); @@ -97,18 +98,29 @@ static int recover_dentry(struct inode *inode, struct page *ipage) struct qstr name; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } + + entry = add_fsync_inode(dir_list, dir); + if (!entry) { + err = -ENOMEM; + iput(dir); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); + dir = entry->inode; + + if (file_enc_name(inode)) return 0; - } name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; @@ -116,7 +128,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage) if (unlikely(name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: de = f2fs_find_entry(dir, &name, &page); @@ -142,23 +154,12 @@ retry: goto retry; } err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); - } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); - } goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", @@ -501,7 +502,8 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; @@ -528,7 +530,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto next; /* @@ -539,7 +541,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) if (IS_INODE(page)) recover_inode(entry->inode, page); if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, page); + err = recover_dentry(entry->inode, page, dir_list); if (err) { f2fs_put_page(page, 1); break; @@ -567,6 +569,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + struct list_head dir_list; block_t blkaddr; int err; int ret = 0; @@ -578,6 +581,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) return -ENOMEM; INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); @@ -597,12 +601,11 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list); + err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); - kmem_cache_destroy(fsync_entry_slab); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), @@ -640,5 +643,8 @@ out: } else { mutex_unlock(&sbi->cp_mutex); } + + destroy_fsync_dnodes(&dir_list); + kmem_cache_destroy(fsync_entry_slab); return ret ? ret: err; } -- cgit v0.10.2 From e12dd7bd874cb1c8658d5e8c1eab5f91a71939dc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2016 15:30:38 -0700 Subject: f2fs: fallocate data blocks in single locked node page This patch is to improve the expand_inode speed in fallocate by allocating data blocks as many as possible in single locked node page. In SSD, # time fallocate -l 500G $MNT/testfile Before : 1m 33.410 s After : 24.758 s Reported-by: Stephen Bates Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dd50f30..fb8cc27 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1158,10 +1158,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, (len + offset)); if (ret) @@ -1173,43 +1174,35 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); - pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - - off_start = offset & (PAGE_SIZE - 1); + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; off_end = (offset + len) & (PAGE_SIZE - 1); - f2fs_lock_op(sbi); + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; - for (index = pg_start; index <= pg_end; index++) { - struct dnode_of_data dn; + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; - if (index == pg_end && !off_end) - goto noalloc; + if (!map.m_len) + return ret; - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = f2fs_reserve_block(&dn, index); - if (ret) - break; -noalloc: - if (pg_start == pg_end) - new_size = offset + len; - else if (index == pg_start && off_start) - new_size = (loff_t)(index + 1) << PAGE_SHIFT; - else if (index == pg_end) - new_size = ((loff_t)index << PAGE_SHIFT) + - off_end; - else - new_size += PAGE_SIZE; + last_off = map.m_lblk + map.m_len - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len: + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) < new_size) { + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); update_inode_page(inode); } - f2fs_unlock_op(sbi); return ret; } -- cgit v0.10.2 From 79344efb93a26378a91193bed133cee42162cd81 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2016 16:19:43 -0700 Subject: f2fs: read node blocks ahead when truncating blocks This patch enables reading node blocks in advance when truncating large data blocks. > time rm $MNT/testfile (500GB) after drop_cachees Before : 9.422 s After : 4.821 s Reported-by: Stephen Bates Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fb8cc27..3146129 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -584,7 +584,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 264abe0..cda7c7c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -407,6 +407,29 @@ cache: up_write(&nm_i->nat_tree_lock); } +/* + * readahead MAX_RA_NODE number of node pages. + */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) { const long direct_index = ADDRS_PER_INODE(dn->inode); @@ -707,6 +730,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -784,6 +809,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -1095,29 +1122,6 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -/* - * readahead MAX_RA_NODE number of node pages. - */ -static void ra_node_pages(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - int i, end; - nid_t nid; - - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start; i < end; i++) { - nid = get_nid(parent, i, false); - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); -} - static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, struct page *parent, int start) { @@ -1141,7 +1145,7 @@ repeat: } if (parent) - ra_node_pages(parent, start + 1); + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); -- cgit v0.10.2 From 0080c5076409d211fbe28d6f07966f7d39e58bad Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 7 May 2016 08:52:57 -0700 Subject: f2fs: do not preallocate block unaligned to 4KB Previously f2fs_preallocate_blocks() tries to allocate unaligned blocks. In f2fs_write_begin(), however, prepare_write_begin() does not skip its allocation due to (len != 4KB). So, it needs locking node page twice unexpectedly. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 96b0353..369d953 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -582,8 +582,8 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; ssize_t ret = 0; - map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos); - map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from)); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from)); map.m_next_pgofs = NULL; if (f2fs_encrypted_inode(inode)) -- cgit v0.10.2 From 7fb17fe44b70c854d380686d8b7e63217f9f1dcf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 May 2016 19:56:32 +0800 Subject: f2fs: use mnt_{want,drop}_write_file in ioctl In interfaces of ioctl, mnt_{want,drop}_write_file should be used for: - get exclusion against file system freezing which may used by lvm snapshot. - do telling filesystem that a write is about to be performed on it, and make sure that the writes are permitted. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3146129..2f8f225 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1307,20 +1307,16 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) unsigned int oldflags; int ret; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + ret = mnt_want_write_file(filp); if (ret) return ret; - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - - if (get_user(flags, (int __user *)arg)) { - ret = -EFAULT; - goto out; - } - flags = f2fs_mask_flags(inode->i_mode, flags); inode_lock(inode); @@ -1363,18 +1359,22 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (f2fs_is_atomic_file(inode)) - return 0; + goto out; ret = f2fs_convert_inline_inode(inode); if (ret) - return ret; + goto out; set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - return 0; + goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", @@ -1382,6 +1382,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); +out: + mnt_drop_write_file(filp); return ret; } @@ -1393,13 +1395,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - ret = mnt_want_write_file(filp); if (ret) return ret; + if (f2fs_is_volatile_file(inode)) + goto err_out; + if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); @@ -1423,32 +1425,48 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (f2fs_is_volatile_file(inode)) - return 0; + goto out; ret = f2fs_convert_inline_inode(inode); if (ret) - return ret; + goto out; set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1481,6 +1499,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1488,6 +1507,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); @@ -1509,10 +1532,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } f2fs_update_time(sbi, REQ_TIME); - return 0; +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1533,9 +1559,14 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; @@ -1560,13 +1591,21 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); + int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(inode, &policy); + ret = fscrypt_process_policy(inode, &policy); + + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) @@ -1623,6 +1662,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1633,20 +1673,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1654,7 +1704,14 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - return f2fs_sync_fs(sbi->sb, 1); + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = f2fs_sync_fs(sbi->sb, 1); + + mnt_drop_write_file(filp); + return ret; } static int f2fs_defragment_range(struct f2fs_sb_info *sbi, -- cgit v0.10.2 From 0fac558b96584799876498248020dc49a98bd131 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 May 2016 19:56:33 +0800 Subject: f2fs: make atomic/volatile operation exclusive atomic/volatile ioctl interfaces are exposed to user like other file operation interface, it needs to make them getting exclusion against to each other to avoid potential conflict among these operations in concurrent scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2f8f225..f67ca62 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1363,6 +1363,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) return ret; + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) goto out; @@ -1383,6 +1385,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1399,6 +1402,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; + inode_lock(inode); + if (f2fs_is_volatile_file(inode)) goto err_out; @@ -1413,6 +1418,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1429,6 +1435,8 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) return ret; + inode_lock(inode); + if (f2fs_is_volatile_file(inode)) goto out; @@ -1439,6 +1447,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1455,6 +1464,8 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) if (ret) return ret; + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) goto out; @@ -1465,6 +1476,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) ret = punch_hole(inode, 0, F2FS_BLKSIZE); out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1481,6 +1493,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { @@ -1488,6 +1502,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } + inode_unlock(inode); + mnt_drop_write_file(filp); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; -- cgit v0.10.2 From 46008c6d42328710f9beaf5c2b47dc92b1cc1a75 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 May 2016 19:56:30 +0800 Subject: f2fs: support in batch multi blocks preallocation This patch introduces reserve_new_blocks to make preallocation of multi blocks as in batch operation, so it can avoid lots of redundant operation, result in better performance. In virtual machine, with rotational device: time fallocate -l 32G /mnt/f2fs/file Before: real 0m4.584s user 0m0.000s sys 0m4.580s After: real 0m0.292s user 0m0.000s sys 0m0.272s In x86, with SSD: time fallocate -l 500G $MNT/testfile Before : 24.758 s After : 1.604 s Signed-off-by: Chao Yu [Jaegeuk Kim: fix bugs and add performance numbers measured in x86.] Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 369d953..b61e2d4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -278,6 +278,16 @@ alloc_new: trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -286,19 +296,9 @@ alloc_new: */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE, true); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - if (set_page_dirty(node_page)) + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) dn->node_changed = true; } @@ -309,24 +309,53 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) f2fs_update_extent_cache(dn); } -int reserve_new_block(struct dnode_of_data *dn) +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + if (!count) + return 0; + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); + + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -545,6 +574,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; @@ -553,7 +583,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -621,8 +651,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; block_t blkaddr; @@ -632,6 +664,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; @@ -659,6 +692,8 @@ next_dnode: goto unlock_out; } + prealloc = 0; + ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: @@ -671,17 +706,20 @@ next_block: goto sync_out; } if (flag == F2FS_GET_BLOCK_PRE_AIO) { - if (blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } } else { err = __allocate_data_block(&dn); - if (!err) + if (!err) { set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + allocated = true; + } } if (err) goto sync_out; - allocated = true; map->m_flags = F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { @@ -700,6 +738,9 @@ next_block: } } + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) @@ -711,32 +752,49 @@ next_block: } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || - flag == F2FS_GET_BLOCK_PRE_DIO || - flag == F2FS_GET_BLOCK_PRE_AIO) { + flag == F2FS_GET_BLOCK_PRE_DIO) { ofs++; map->m_len++; } else { goto sync_out; } +skip: dn.ofs_in_node++; pgofs++; - if (map->m_len < maxblocks) { - if (dn.ofs_in_node < end_offset) - goto next_block; + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; - if (create) { - f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - allocated = false; - goto next_dnode; + dn.ofs_in_node = end_offset; + } + + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; + + if (allocated) + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); } + allocated = false; + goto next_dnode; sync_out: if (allocated) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 052f5a8..1401c96 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1094,7 +1094,7 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) } static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { block_t valid_block_count; @@ -1106,14 +1106,19 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, } #endif valid_block_count = - sbi->total_valid_block_count + (block_t)count; + sbi->total_valid_block_count + (block_t)(*count); if (unlikely(valid_block_count > sbi->user_block_count)) { - spin_unlock(&sbi->stat_lock); - return false; + *count = sbi->user_block_count - sbi->total_valid_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + return false; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; + /* *count can be recalculated */ + inode->i_blocks += *count; + sbi->total_valid_block_count = + sbi->total_valid_block_count + (block_t)(*count); + sbi->alloc_valid_block_count += (block_t)(*count); spin_unlock(&sbi->stat_lock); return true; } @@ -1945,6 +1950,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 0f56584..497e6e8 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -694,28 +694,32 @@ TRACE_EVENT(f2fs_direct_IO_exit, __entry->ret) ); -TRACE_EVENT(f2fs_reserve_new_block, +TRACE_EVENT(f2fs_reserve_new_blocks, - TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node), + TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, + blkcnt_t count), - TP_ARGS(inode, nid, ofs_in_node), + TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) + __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; + __entry->count = count; ), - TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u", + TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry), (unsigned int)__entry->nid, - __entry->ofs_in_node) + __entry->ofs_in_node, + (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_page_bio, -- cgit v0.10.2 From 6e9619499f53b22ead972e476c0e8341c997d929 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 May 2016 19:56:31 +0800 Subject: f2fs: support in batch fzero in dnode page This patch tries to speedup fzero_range by making space preallocation and address removal of blocks in one dnode page as in batch operation. In virtual machine, with zram driver: dd if=/dev/zero of=/mnt/f2fs/file bs=1M count=4096 time xfs_io -f /mnt/f2fs/file -c "fzero 0 4096M" Before: real 0m3.276s user 0m0.008s sys 0m3.260s After: real 0m1.568s user 0m0.000s sys 0m1.564s Signed-off-by: Chao Yu [Jaegeuk Kim: consider ENOSPC case] Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f67ca62..a9cb314 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -997,6 +997,49 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. + */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); + + return ret; +} + static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, int mode) { @@ -1047,35 +1090,32 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); + + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { -- cgit v0.10.2 From 652be55162dcf3eb4440a4f7536ffedbe0352dcf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 10 May 2016 19:13:50 -0700 Subject: f2fs: show # of orphan inodes This adds debug information for # of orphan inodes. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 818064b..37615b2 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -58,6 +58,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -192,7 +193,7 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree); @@ -238,6 +239,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1401c96..ec978b44 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1997,7 +1997,7 @@ struct f2fs_stat_info { int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; + int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; -- cgit v0.10.2 From 3b9b10f9ce61b574c63d71ff3eeec9cf5dbe763f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 May 2016 09:13:13 -0700 Subject: f2fs: avoid f2fs_bug_on during recovery We don't need to use f2fs_bug_on() to treat with any error case when allocating a block during recovery. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2b25329..6303b2a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -460,8 +460,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, */ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } -- cgit v0.10.2 From ab47036d8f7227361cad7894adee8e66ab6f95b2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 11 May 2016 19:48:44 +0800 Subject: f2fs: fix deadlock when flush inline data Below backtrace info was reported by Yunlei He: Call Trace: [] schedule+0x35/0x80 [] rwsem_down_read_failed+0xed/0x130 [] call_rwsem_down_read_failed+0x18/0x [] down_read+0x20/0x30 [] f2fs_evict_inode+0x242/0x3a0 [f2fs] [] evict+0xc7/0x1a0 [] iput+0x196/0x200 [] __dentry_kill+0x179/0x1e0 [] dput+0x199/0x1f0 [] __fput+0x18b/0x220 [] ____fput+0xe/0x10 [] task_work_run+0x77/0x90 [] exit_to_usermode_loop+0x73/0xa2 [] do_syscall_64+0xfa/0x110 [] entry_SYSCALL64_slow_path+0x25/0x25 Call Trace: [] schedule+0x35/0x80 [] __wait_on_freeing_inode+0xa3/0xd0 [] ? autoremove_wake_function+0x40/0x4 [] find_inode_fast+0x7d/0xb0 [] ilookup+0x6a/0xd0 [] sync_node_pages+0x210/0x650 [f2fs] [] ? do_fsync+0x70/0x70 [] block_operations+0x9e/0xf0 [f2fs] [] ? bio_endio+0x55/0x60 [] write_checkpoint+0x92/0xba0 [f2fs] [] ? mempool_free_slab+0x17/0x20 [] ? mempool_free+0x2b/0x80 [] ? do_fsync+0x70/0x70 [] f2fs_sync_fs+0x63/0xd0 [f2fs] [] ? ext4_sync_fs+0xbf/0x190 [] sync_fs_one_sb+0x20/0x30 [] iterate_supers+0xb9/0x110 [] sys_sync+0x55/0x90 [] do_syscall_64+0x69/0x110 [] entry_SYSCALL64_slow_path+0x25/0x25 With following excuting serials, we will set inline_node in inode page after inode was unlinked, result in a deadloop described as below: 1. open file 2. write file 3. unlink file 4. write file 5. close file Thread A Thread B - dput - iput_final - inode->i_state |= I_FREEING - evict - f2fs_evict_inode - f2fs_sync_fs - write_checkpoint - block_operations - f2fs_lock_all (down_write(cp_rwsem)) - f2fs_lock_op (down_read(cp_rwsem)) - sync_node_pages - ilookup - find_inode_fast - __wait_on_freeing_inode (wait on I_FREEING clear) Here, we change to set inline_node flag only for linked inode for fixing. Reported-by: Yunlei He Signed-off-by: Chao Yu Tested-by: Jaegeuk Kim Cc: stable@vger.kernel.org # v4.6 Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b61e2d4..1013836 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1546,7 +1546,8 @@ restart: if (pos + len <= MAX_INLINE_DATA) { read_inline_data(page, ipage); set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_inline_node(ipage); + if (inode->i_nlink) + set_inline_node(ipage); } else { err = f2fs_convert_inline_page(&dn, page); if (err) -- cgit v0.10.2 From 99e3e858a486ccef93da0d4b67a71ed1c171b2fe Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 11 May 2016 17:08:14 +0800 Subject: f2fs: correct return value type of f2fs_fill_super Signed-off-by: Sheng Yong Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 28c8992..0e54e84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1417,7 +1417,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; @@ -1643,7 +1643,7 @@ try_onemore: if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); goto free_kobj; } } else { @@ -1676,7 +1676,7 @@ try_onemore: if (recovery) { err = f2fs_commit_super(sbi, true); f2fs_msg(sb, KERN_INFO, - "Try to recover %dth superblock, ret: %ld", + "Try to recover %dth superblock, ret: %d", sbi->valid_super_block ? 1 : 2, err); } -- cgit v0.10.2 From e4103849ba784368e802eec4f15832ac54eb7d39 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 14 May 2016 19:03:52 +0800 Subject: f2fs: fix i_current_depth during inline dentry conversion With below steps, we will see that dentry page becoming unaccessable later. This is because we forget updating i_current_depth in inode during inline dentry conversion, after that, once we failed at somewhere, it will leave i_current_depth as 0 in non-inline directory. Then, during ->lookup, the current_depth value makes all dentry pages in first level invisible. Fix it. 1) mount f2fs with inline_dentry option 2) mkdir dir 3) touch 180 files named [0-179] in dir 4) touch 180 in dir (fail after inline dir conversion) 5) ll dir ls: cannot access /mnt/f2fs/dir/0: No such file or directory ls: cannot access /mnt/f2fs/dir/1: No such file or directory ls: cannot access /mnt/f2fs/dir/2: No such file or directory ls: cannot access /mnt/f2fs/dir/3: No such file or directory ls: cannot access /mnt/f2fs/dir/4: No such file or directory drwxr-xr-x 2 root root 4096 may 13 21:47 ./ drwxr-xr-x 3 root root 4096 may 13 21:46 ../ -????????? ? ? ? ? ? 0 -????????? ? ? ? ? ? 1 -????????? ? ? ? ? ? 10 -????????? ? ? ? ? ? 100 -????????? ? ? ? ? ? 101 -????????? ? ? ? ? ? 102 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 8a51955..60ba7ac 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -400,6 +400,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, stat_dec_inline_dir(dir); clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + F2FS_I(dir)->i_current_depth = 1; if (i_size_read(dir) < PAGE_SIZE) { i_size_write(dir, PAGE_SIZE); set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); -- cgit v0.10.2 From 8975bdf48220cca703ad4586528e04b76b40ea1c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 14 May 2016 19:03:53 +0800 Subject: f2fs: fix incorrect error path handling in f2fs_move_rehashed_dirents Fix two bugs in error path of f2fs_move_rehashed_dirents: - release dir's inode page if fail to call kmalloc - recover i_current_depth if fail to converting Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 60ba7ac..a4bb155 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -464,12 +464,15 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct f2fs_inline_dentry *backup_dentry; + struct f2fs_inode_info *fi = F2FS_I(dir); int err; backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); - if (!backup_dentry) + if (!backup_dentry) { + f2fs_put_page(ipage, 1); return -ENOMEM; + } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); truncate_inline_inode(ipage, 0); @@ -483,13 +486,14 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, lock_page(ipage); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + clear_inode_flag(fi, FI_INLINE_DENTRY); update_inode(dir, ipage); kfree(backup_dentry); return 0; recover: lock_page(ipage); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + fi->i_current_depth = 0; i_size_write(dir, MAX_INLINE_DATA); update_inode(dir, ipage); f2fs_put_page(ipage, 1); -- cgit v0.10.2 From b951a4ec165af4973b2bd9c80fb5845fbd840435 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 13 May 2016 14:57:43 +0800 Subject: f2fs: no need inc dirty pages under inode lock No need inc dirty pages under inode lock Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6402e93..3da6499 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -787,9 +787,9 @@ void update_dirty_page(struct inode *inode, struct page *page) spin_lock(&sbi->inode_lock[type]); __add_dirty_inode(inode, type); - inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); + inode_inc_dirty_pages(inode); SetPagePrivate(page); f2fs_trace_pid(page); } -- cgit v0.10.2 From 087968974fcd9e8c910bba73f3393a49b7380ca6 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 16 May 2016 12:38:50 +0800 Subject: f2fs: add fault injection to sysfs This patch introduces a new struct f2fs_fault_info and a global f2fs_fault to save fault injection status. Fault injection entries are created in /sys/fs/f2fs/fault_injection/ during initializing f2fs module. Signed-off-by: Sheng Yong Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec978b44..1351178 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -48,15 +48,36 @@ enum { FAULT_MAX, }; -extern u32 f2fs_fault_rate; -extern atomic_t f2fs_ops; +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern struct f2fs_fault_info f2fs_fault; extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type))) static inline bool time_to_inject(int type) { - atomic_inc(&f2fs_ops); - if (f2fs_fault_rate && (atomic_read(&f2fs_ops) >= f2fs_fault_rate)) { - atomic_set(&f2fs_ops, 0); + if (!f2fs_fault.inject_rate) + return false; + if (type == FAULT_KMALLOC && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_BLOCK && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) + return false; + + atomic_inc(&f2fs_fault.inject_ops); + if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) { + atomic_set(&f2fs_fault.inject_ops, 0); printk("%sF2FS-fs : inject %s in %pF\n", KERN_INFO, fault_name[type], diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0e54e84..9df6d72 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -40,8 +40,7 @@ static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION -u32 f2fs_fault_rate = 0; -atomic_t f2fs_ops; +struct f2fs_fault_info f2fs_fault; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", @@ -51,6 +50,17 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", }; + +static void f2fs_build_fault_attr(unsigned int rate) +{ + if (rate) { + atomic_set(&f2fs_fault.inject_ops, 0); + f2fs_fault.inject_rate = rate; + f2fs_fault.inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info)); + } +} #endif /* f2fs-wide shrinker description */ @@ -118,6 +128,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -139,6 +153,11 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&f2fs_fault; +#endif return NULL; } @@ -188,6 +207,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -253,6 +276,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -289,6 +316,22 @@ static struct kobj_type f2fs_ktype = { .release = f2fs_sb_release, }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +/* sysfs for f2fs fault injection */ +static struct kobject f2fs_fault_inject; + +static struct attribute *f2fs_fault_attrs[] = { + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), + NULL +}; + +static struct kobj_type f2fs_fault_ktype = { + .default_attrs = f2fs_fault_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; +#endif + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -317,8 +360,9 @@ static int parse_options(struct super_block *sb, char *options) int arg = 0; #ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_fault_rate = 0; + f2fs_build_fault_attr(0); #endif + if (!options) return 0; @@ -456,8 +500,7 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_fault_rate = arg; - atomic_set(&f2fs_ops, 0); + f2fs_build_fault_attr(arg); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -1797,6 +1840,16 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_fault_inject.kset = f2fs_kset; + f2fs_build_fault_attr(0); + err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype, + NULL, "fault_injection"); + if (err) { + f2fs_fault_inject.kset = NULL; + goto free_kset; + } +#endif err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_kset; @@ -1815,6 +1868,10 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_kset: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (f2fs_fault_inject.kset) + kobject_put(&f2fs_fault_inject); +#endif kset_unregister(f2fs_kset); free_extent_cache: destroy_extent_cache(); @@ -1841,6 +1898,9 @@ static void __exit exit_f2fs_fs(void) destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); +#ifdef CONFIG_F2FS_FAULT_INJECTION + kobject_put(&f2fs_fault_inject); +#endif kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } -- cgit v0.10.2 From 10aa97c379cdd1e9f537a00ef2d787989759269d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 16 May 2016 10:33:40 -0700 Subject: f2fs: manipulate dirty file inodes when DATA_FLUSH is set It needs to maintain dirty file inodes only if DATA_FLUSH is set. Otherwise, let's avoid its overhead. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3da6499..d04113b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -785,9 +785,11 @@ void update_dirty_page(struct inode *inode, struct page *page) !S_ISLNK(inode->i_mode)) return; - spin_lock(&sbi->inode_lock[type]); - __add_dirty_inode(inode, type); - spin_unlock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) { + spin_lock(&sbi->inode_lock[type]); + __add_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); + } inode_inc_dirty_pages(inode); SetPagePrivate(page); @@ -803,6 +805,9 @@ void remove_dirty_inode(struct inode *inode) !S_ISLNK(inode->i_mode)) return; + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) + return; + spin_lock(&sbi->inode_lock[type]); __remove_dirty_inode(inode, type); spin_unlock(&sbi->inode_lock[type]); -- cgit v0.10.2 From f573018491fd823e909d587cfe16758f3dd9e6d6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 17 May 2016 16:23:36 -0700 Subject: f2fs: use bio count instead of F2FS_WRITEBACK page count This can reduce page counting overhead. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d04113b..447e2a9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -914,7 +914,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; io_schedule_timeout(5*HZ); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1013836..faef666 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -71,10 +71,9 @@ static void f2fs_write_end_io(struct bio *bio) f2fs_stop_checkpoint(sbi); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -98,6 +97,14 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, + struct bio *bio) +{ + if (!is_read_io(rw)) + atomic_inc(&sbi->nr_wb_bios); + submit_bio(rw, bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -110,7 +117,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, io->bio); + __submit_bio(io->sbi, fio->rw, io->bio); io->bio = NULL; } @@ -228,7 +235,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return -EFAULT; } - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, fio->rw, bio); return 0; } @@ -248,9 +255,6 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); @@ -1047,7 +1051,7 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio); bio = NULL; } if (bio == NULL) { @@ -1090,7 +1094,7 @@ set_error_page: goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio); bio = NULL; } unlock_page(page); @@ -1100,7 +1104,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio); return 0; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 37615b2..a188973 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -48,7 +48,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -299,8 +299,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); + seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1351178..bc45a2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -650,7 +650,6 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, @@ -813,6 +812,7 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ + atomic_t nr_wb_bios; /* # of writeback bios */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ struct f2fs_mount_info mount_opt; /* mount options */ @@ -2017,7 +2017,7 @@ struct f2fs_stat_info { int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; + int bg_gc, inmem_pages, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9df6d72..0e17560 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -649,8 +649,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - if (get_pages(sbi, F2FS_WRITEBACK)) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_bios(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); -- cgit v0.10.2 From 523be8a6b3418eb7e0f0f042fe0490345eb5d516 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 May 2016 12:36:58 -0700 Subject: f2fs: use percpu_counter for page counters This patch substitutes percpu_counter for atomic_counter when counting various types of pages. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a188973..d89a425 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -144,6 +144,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ -299,15 +300,15 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - datas: %4d in files:%4d\n", + seq_printf(s, " - datas: %4lld in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bc45a2c..c2d0b24 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -813,7 +813,9 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ atomic_t nr_wb_bios; /* # of writeback bios */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; struct f2fs_mount_info mount_opt; /* mount options */ @@ -1158,7 +1160,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); set_sbi_flag(sbi, SBI_IS_DIRTY); } @@ -1171,7 +1173,7 @@ static inline void inode_inc_dirty_pages(struct inode *inode) static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1185,9 +1187,9 @@ static inline void inode_dec_dirty_pages(struct inode *inode) F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } static inline int get_dirty_pages(struct inode *inode) @@ -1198,8 +1200,10 @@ static inline int get_dirty_pages(struct inode *inode) static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -2013,11 +2017,11 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_meta; - int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages; + unsigned int ndirty_dirs, ndirty_files; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_bios; + int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0e17560..009a9f4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -606,6 +606,14 @@ static void f2fs_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -666,6 +674,8 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } @@ -1324,7 +1334,6 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1344,9 +1353,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; @@ -1362,6 +1368,18 @@ static void init_sb_info(struct f2fs_sb_info *sbi) #endif } +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + return 0; +} + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1552,6 +1570,10 @@ try_onemore: init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1754,6 +1776,7 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: kfree(raw_super); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 497e6e8..3a09bb4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1275,14 +1275,14 @@ TRACE_EVENT(f2fs_destroy_extent_tree, DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, - TP_PROTO(struct super_block *sb, int type, int count), + TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count), TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) - __field(int, count) + __field(s64, count) ), TP_fast_assign( @@ -1291,7 +1291,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, __entry->count = count; ), - TP_printk("dev = (%d,%d), %s, dirty count = %d", + TP_printk("dev = (%d,%d), %s, dirty count = %lld", show_dev(__entry), show_file_type(__entry->type), __entry->count) @@ -1299,14 +1299,14 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, - TP_PROTO(struct super_block *sb, int type, int count), + TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count) ); DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, - TP_PROTO(struct super_block *sb, int type, int count), + TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count) ); -- cgit v0.10.2 From 1beba1b3a953107c3ff5448ab4e4297db4619c76 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 May 2016 12:47:11 -0700 Subject: f2fs: use percpu_counter for # of dirty pages in inode This patch adds percpu_counter for # of dirty pages in inode. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c2d0b24..3f1c710 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -436,7 +436,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -1166,7 +1166,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1182,7 +1182,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1192,9 +1192,9 @@ static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a9cb314..69dd7c9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1419,7 +1419,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%u", + "Unexpected flush for atomic writes: ino=%lu, npages=%lld", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 009a9f4..0ff862b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -526,9 +526,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -603,6 +607,7 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } -- cgit v0.10.2 From 41382ec43255b502321c3c27f1347efeb3279290 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 16 May 2016 11:06:50 -0700 Subject: f2fs: use percpu_counter for alloc_valid_block_count This patch uses percpu_count for sbi->alloc_valid_block_count. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 447e2a9..cf79598 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1079,7 +1079,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f1c710..c8833c8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -808,7 +808,6 @@ struct f2fs_sb_info { block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ @@ -816,6 +815,8 @@ struct f2fs_sb_info { /* # of pages, see count_type */ struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -1141,8 +1142,9 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, inode->i_blocks += *count; sbi->total_valid_block_count = sbi->total_valid_block_count + (block_t)(*count); - sbi->alloc_valid_block_count += (block_t)(*count); spin_unlock(&sbi->stat_lock); + + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); return true; } @@ -1292,11 +1294,11 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, if (inode) inode->i_blocks++; - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 6303b2a..f89b70e 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0ff862b..43e3e9c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -617,6 +617,7 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); } static void f2fs_put_super(struct super_block *sb) @@ -1382,7 +1383,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) if (err) return err; } - return 0; + + return percpu_counter_init(&sbi->alloc_valid_block_count, 0, + GFP_KERNEL); } /* @@ -1601,7 +1604,7 @@ try_onemore: sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); spin_lock_init(&sbi->inode_lock[i]); -- cgit v0.10.2 From 513c5f3735a9bd0bd8b58b6cdafbad5ef19f2159 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 16 May 2016 11:42:32 -0700 Subject: f2fs: use percpu_counter for total_valid_inode_count This patch uses percpu_counter to avoid stat_lock. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c8833c8..02f0656 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -801,7 +801,6 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ @@ -818,6 +817,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* valid inode count */ + struct percpu_counter total_valid_inode_count; + struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ @@ -1325,23 +1327,17 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 43e3e9c..173cf3a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -618,6 +618,7 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); } static void f2fs_put_super(struct super_block *sb) @@ -1384,7 +1385,11 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) return err; } - return percpu_counter_init(&sbi->alloc_valid_block_count, 0, + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); } @@ -1598,8 +1603,8 @@ try_onemore: sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); -- cgit v0.10.2 From b8bef79df709533416a278c4c3aef3577cefa61a Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 18 May 2016 08:02:25 +0800 Subject: f2fs: make exit_f2fs_fs more clear init_f2fs_fs does: 1) f2fs_build_trace_ios 2) init_inodecache 3) create_node_manager_caches 4) create_segment_manager_caches 5) create_checkpoint_caches 6) create_extent_cache 7) kset_create_and_add 8) kobject_init_and_add 9) register_shrinker 10) register_filesystem 11) f2fs_create_root_stats 12) proc_mkdir exit_f2fs_fs should do cleanup in the reverse order to make the code more clear. Signed-off-by: Tiezhu Yang Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 173cf3a..74cc852 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1926,17 +1926,17 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); + unregister_shrinker(&f2fs_shrinker_info); +#ifdef CONFIG_F2FS_FAULT_INJECTION + kobject_put(&f2fs_fault_inject); +#endif + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); -#ifdef CONFIG_F2FS_FAULT_INJECTION - kobject_put(&f2fs_fault_inject); -#endif - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } -- cgit v0.10.2 From 975756c41332bc5e523e9f843271ed5ab6aaaaaa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 May 2016 11:57:21 -0700 Subject: f2fs: avoid ENOSPC fault in the recovery process This patch avoids impossible error injection, ENOSPC, during recovery process. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f89b70e..3d7216d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -470,6 +470,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); } -- cgit v0.10.2 From 38f91ca8c0ea69f707c26f592dcc70f937088bcc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 18 May 2016 14:07:56 -0700 Subject: f2fs: flush pending bios right away when error occurs Given errors, this patch flushes pending bios as soon as possible. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cf79598..3891600 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -91,7 +99,7 @@ repeat: * meta page. */ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index faef666..105dd3d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -68,7 +68,7 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_error)) { set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 02f0656..916e7c2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1698,12 +1698,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); } -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; -} - static inline bool is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -1937,6 +1931,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 69dd7c9..4c21254 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1571,21 +1571,21 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: ret = -EINVAL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 689d691..2e68ada 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -283,7 +283,7 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } return 0; } -- cgit v0.10.2 From 0f3311a8c266b9f4fae4e5cdfcd9a86970e2b9bd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 21 May 2016 00:11:09 +0800 Subject: f2fs: fix to update dirty page count correctly Once we failed to merge inline data into inode page during flushing inline inode, we will skip invoking inode_dec_dirty_pages, which makes dirty page count incorrect, result in panic in ->evict_inode, Fix it. ------------[ cut here ]------------ kernel BUG at /home/yuchao/git/devf2fs/inode.c:336! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 3 PID: 10004 Comm: umount Tainted: G O 4.6.0-rc5+ #17 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 task: f0c33000 ti: c5212000 task.ti: c5212000 EIP: 0060:[] EFLAGS: 00010202 CPU: 3 EIP is at f2fs_evict_inode+0x85/0x490 [f2fs] EAX: 00000001 EBX: c4529ea0 ECX: 00000001 EDX: 00000000 ESI: c0131000 EDI: f89dd0a0 EBP: c5213e9c ESP: c5213e78 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 80050033 CR2: b75878c0 CR3: 1a36a700 CR4: 000406f0 Stack: c4529ea0 c4529ef4 c5213e8c c176d45c c4529ef4 00000000 c4529ea0 c4529fac f89dd0a0 c5213eb0 c1204a68 c5213ed8 c452a2b4 c6680930 c5213ec0 c1204b64 c6680d44 c6680620 c5213eec c120588d ee84b000 ee84b5c0 c5214000 ee84b5e0 Call Trace: [] ? _raw_spin_unlock+0x2c/0x50 [] evict+0xa8/0x170 [] dispose_list+0x34/0x50 [] evict_inodes+0x10d/0x130 [] generic_shutdown_super+0x41/0xe0 [] ? unregister_shrinker+0x40/0x50 [] ? unregister_shrinker+0x40/0x50 [] kill_block_super+0x22/0x70 [] kill_f2fs_super+0x1e/0x20 [f2fs] [] deactivate_locked_super+0x3d/0x70 [] deactivate_super+0x43/0x60 [] cleanup_mnt+0x39/0x80 [] __cleanup_mnt+0x10/0x20 [] task_work_run+0x71/0x90 [] exit_to_usermode_loop+0x72/0x9e [] do_fast_syscall_32+0x19c/0x1c0 [] sysenter_past_esp+0x45/0x74 EIP: [] f2fs_evict_inode+0x85/0x490 [f2fs] SS:ESP 0068:c5213e78 ---[ end trace d30536330b7fdc58 ]--- Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cda7c7c..1f21aae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1197,6 +1197,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct page *page; + int ret; /* should flush inline_data before evict_inode */ inode = ilookup(sbi->sb, ino); @@ -1216,9 +1217,9 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!clear_page_dirty_for_io(page)) goto page_out; - if (!f2fs_write_inline_data(inode, page)) - inode_dec_dirty_pages(inode); - else + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + if (ret) set_page_dirty(page); page_out: f2fs_put_page(page, 1); -- cgit v0.10.2