From 744602cf45ce35758b8637f76bc263c871abc6ea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Jan 2014 09:42:16 +0900 Subject: f2fs: update_inode_page should be done all the time In order to make fs consistency, update_inode_page should not be failed all the time. Otherwise, it is possible to lose some metadata in the inode like a link count. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2261ccd..20c3c64 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -55,8 +55,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) if (unlikely(err)) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + f2fs_stop_checkpoint(sbi); } end_page_writeback(page); dec_page_count(sbi, F2FS_WRITEBACK); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc3c558..0f0ad3a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1023,6 +1023,12 @@ static inline int f2fs_readonly(struct super_block *sb) return sb->s_flags & MS_RDONLY; } +static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1048,7 +1054,7 @@ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); void update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); +void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4d67ed7..08d69c9 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -212,24 +212,29 @@ void update_inode(struct inode *inode, struct page *node_page) clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - +retry: node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); - + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi); + } + return; + } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return 0; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ret; if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -243,13 +248,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * during the urgent cleaning time when runing out of free sections. */ f2fs_lock_op(sbi); - ret = update_inode_page(inode); + update_inode_page(inode); f2fs_unlock_op(sbi); if (wbc) f2fs_balance_fs(sbi); - return ret; + return 0; } /* -- cgit v0.10.2 From 5e443818fa0b2a2845561ee25bec181424fb2889 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jan 2014 12:22:14 +0900 Subject: f2fs: handle dirty segments inside refresh_sit_entry This patch cleans up the refresh_sit_entry to handle locate_dirty_segments. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0f0ad3a..3f223aa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1136,6 +1136,7 @@ void destroy_node_manager_caches(void); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7caac5f..fba510b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -434,12 +434,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -static void refresh_sit_entry(struct f2fs_sb_info *sbi, - block_t old_blkaddr, block_t new_blkaddr) +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) { - update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); + update_sit_entry(sbi, new, 1); + if (GET_SEGNO(sbi, old) != NULL_SEGNO) + update_sit_entry(sbi, old, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); } void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) @@ -881,17 +883,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -992,9 +992,7 @@ void recover_data_page(struct f2fs_sb_info *sbi, __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); @@ -1045,9 +1043,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); -- cgit v0.10.2 From abb2366c82c3d2dac3d7e9a74332137da8fc9399 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jan 2014 12:25:06 +0900 Subject: f2fs: fix to recover xattr node block If a new xattr node page was allocated and its inode is fsynced, we should recover the xattr node page during the roll-forward process after power-cut. But, previously, f2fs didn't handle that case, resulting in kernel panic as follows reported by Tom Li. BUG: unable to handle kernel paging request at ffffc9001c861a98 IP: [] check_index_in_prev_nodes+0x86/0x2d0 [f2fs] Call Trace: [] ? printk+0x48/0x4a [] recover_fsync_data+0xdca/0xf50 [f2fs] [] f2fs_fill_super+0x92e/0x970 [f2fs] [] mount_bdev+0x1b8/0x200 [] ? f2fs_remount+0x130/0x130 [f2fs] [] f2fs_mount+0x10/0x20 [f2fs] [] mount_fs+0x3e/0x1b0 [] ? __alloc_percpu+0xb/0x10 [] vfs_kern_mount+0x6f/0x120 [] do_mount+0x259/0xa90 [] ? memdup_user+0x3d/0x80 [] ? strndup_user+0x53/0x70 [] SyS_mount+0x89/0xd0 [] system_call_fastpath+0x16/0x1b This patch adds a recovery function of xattr node pages. Reported-by: Tom Li Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f223aa..55288d2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1121,6 +1121,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b0649b7..82f4753 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1535,6 +1535,46 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } +bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid = nid_of_node(page); + struct node_info ni; + + if (ofs_of_node(page) != XATTR_NODE_OFFSET) + return false; + + /* 1: invalidate the previous xattr nid */ + if (!prev_xnid) + goto recover_xnid; + + /* Deallocate node address */ + get_node_info(sbi, prev_xnid, &ni); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode); + set_node_addr(sbi, &ni, NULL_ADDR); + +recover_xnid: + /* 2: allocate new xattr nid */ + if (unlikely(!inc_valid_node_count(sbi, inode))) + f2fs_bug_on(1); + + remove_free_nid(NM_I(sbi), new_xnid); + get_node_info(sbi, new_xnid, &ni); + ni.ino = inode->i_ino; + set_node_addr(sbi, &ni, NEW_ADDR); + F2FS_I(inode)->i_xattr_nid = new_xnid; + + /* 3: update xattr blkaddr */ + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); + set_node_addr(sbi, &ni, blkaddr); + + update_inode_page(inode); + return true; +} + int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *src, *dst; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 976a7a9..f1b0b89 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -301,6 +301,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (recover_inline_data(inode, page)) goto out; + if (recover_xattr_data(inode, page, blkaddr)) + goto out; + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) end = start + ADDRS_PER_INODE(fi); -- cgit v0.10.2 From 1b1f559fc362f96869b7e04ef9825b1039b9a67d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Feb 2014 10:50:22 +0900 Subject: f2fs: remove the ugly pointer conversion This patch modifies the use of bi_private to remove pointer chasing for sbi. Previously, we had a bi_private structure, but it needs memory allocation. So this patch uses bi_private by the sbi pointer and adds a completion pointer into the sbi. This can achieve no memory allocation and nice use of the bi_private. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 20c3c64..d175ae3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) static void f2fs_write_end_io(struct bio *bio, int err) { - struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; int i; @@ -61,8 +61,10 @@ static void f2fs_write_end_io(struct bio *bio, int err) dec_page_count(sbi, F2FS_WRITEBACK); } - if (bio->bi_private) - complete(bio->bi_private); + if (sbi->wait_io) { + complete(sbi->wait_io); + sbi->wait_io = NULL; + } if (!get_pages(sbi, F2FS_WRITEBACK) && !list_empty(&sbi->cp_wait.task_list)) @@ -85,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + bio->bi_private = sbi; return bio; } @@ -112,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) */ if (fio->type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); - io->bio->bi_private = &wait; + io->sbi->wait_io = &wait; submit_bio(rw, io->bio); wait_for_completion(&wait); } else { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 55288d2..aeff132 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -398,6 +398,7 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ -- cgit v0.10.2 From 924a2ddbd0c2829ebca9ac899522cbb16a9b6d8c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Feb 2014 17:24:51 +0900 Subject: f2fs: fix the potential mismatch between dir's i_size and i_blocks This is the erroneous scenario. i_size on-disk i_size i_blocks __f2fs_add_link() 4096 4096 2 get_new_data_page 8192 4096 3 -ENOSPC = init_inode_metadata checkpoint - 4096 3 POR and reboot __f2fs_add_link() 4096 4096 3 page = get_new_data_page (page->index = 1 by NEW_ADDR) add a dentry to the page successfully f2fs_rmdir() f2fs_empty_dir() 4096 4096 3 f2fs_unlink() goes, since there is no valid dentry due to i_size = 4096. But, still there is one dentry in page->index = 1. So this patch moves the code to write dir->i_size into on-disk i_size in order to sync dir's i_size, on-disk i_size, and its i_blocks. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2b7c255..bfcb4ae 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -395,9 +395,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) - update_inode_page(dir); - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -511,7 +508,10 @@ add_dentry: update_parent_metadata(dir, inode, current_depth); fail: - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode_page(dir); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; -- cgit v0.10.2 From 491c0854b41380f48e422c00ae7e25ae4d02cecc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 4 Feb 2014 13:01:10 +0900 Subject: f2fs: clean up with a macro This patch adds GET_BLKOFF_FROM_SEG0 to clean up some codes. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f1b0b89..bda04a0 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -218,8 +218,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & - (sbi->blocks_per_seg - 1); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); struct f2fs_summary sum; nid_t ino, nid; void *kaddr; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fba510b..e87946a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -405,7 +405,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); @@ -987,8 +987,7 @@ void recover_data_page(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); @@ -1026,8 +1025,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ @@ -1035,8 +1033,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = next_segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); /* rewrite node page */ set_page_writeback(page); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5731682..4024546 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -57,6 +57,9 @@ ((blk_addr) - SM_I(sbi)->seg0_blkaddr) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + #define GET_SEGNO(sbi, blk_addr) \ (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ -- cgit v0.10.2 From f6517cfc84246b2606fd631730846c648ee0d455 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jan 2014 14:54:07 +0900 Subject: f2fs: fix a build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch modifies flow a little bit to avoid the following build warnings. src/fs/f2fs/recovery.c: In function ‘check_index_in_prev_nodes’: src/fs/f2fs/recovery.c:288:51: warning: ‘sum...ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] src/fs/f2fs/recovery.c:260:23: warning: ‘sum.nid’ may be used uninitialized in this function [-Wmaybe-uninitialized] Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index bda04a0..72adbbf 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -219,11 +219,11 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; struct f2fs_summary sum; + struct page *sum_page, *node_page; nid_t ino, nid; - void *kaddr; struct inode *inode; - struct page *node_page; unsigned int offset; block_t bidx; int i; @@ -237,18 +237,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; - break; + goto got_it; } } - if (i > CURSEG_COLD_DATA) { - struct page *sum_page = get_sum_page(sbi, segno); - struct f2fs_summary_block *sum_node; - kaddr = page_address(sum_page); - sum_node = (struct f2fs_summary_block *)kaddr; - sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); if (dn->inode->i_ino == nid) { -- cgit v0.10.2 From bd859c6598dd2b73c517b3a36ecc5dd387eb1eb2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 5 Feb 2014 11:16:39 +0900 Subject: f2fs: fix to truncate dentry pages in the error case When a new directory is allocated, if an error is occurred, we should truncate preallocated dentry pages too. This bug was reported by Andrey Tsyvarev after a while as follows. mkdir()-> f2fs_add_link()-> init_inode_metadata()-> f2fs_init_acl()-> f2fs_get_acl()-> f2fs_getxattr()-> read_all_xattrs() fails. Also there was a BUG_ON triggered after the fault in mkdir()-> f2fs_add_link()-> init_inode_metadata()-> remove_inode_page() -> f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); But, previous patch wasn't perfect to resolve that bug, so the following bug report was also submitted. kernel BUG at fs/f2fs/inode.c:274! Call Trace: [] evict+0xa3/0x1a0 [] iput+0xf5/0x180 [] f2fs_mkdir+0xf3/0x150 [f2fs] [] vfs_mkdir+0xb7/0x160 [] SyS_mkdir+0x5f/0xc0 [] system_call_fastpath+0x16/0x1b Finally, this patch resolves all the issues like below. If an error is occurred after make_empty_dir(), 1. truncate_inode_pages() The make_bad_inode() prior to iput() will change i_mode to S_IFREG, which means that f2fs will not decrement fi->dirty_dents during f2fs_evict_inode. But, by calling it here, we can do that. 2. truncate_blocks() Preallocated dentry pages are trucated here to sync i_blocks. 3. remove_dirty_dir_inode() Remove this directory inode from the list. Reported-and-Tested-by: Andrey Tsyvarev Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bfcb4ae..5bbf94c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -372,6 +372,10 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + truncate_inode_pages(&inode->i_data, 0); + truncate_blocks(inode, 0); + remove_dirty_dir_inode(inode); error: remove_inode_page(inode); return ERR_PTR(err); -- cgit v0.10.2 From 203681f65b07055259bd475a6281136615b4e9a4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 5 Feb 2014 13:03:57 +0900 Subject: f2fs: fix f2fs_write_meta_page at no checkpoint status If f2fs entered errorneous checkpoint status, it should skip writing meta pages instead of redirtying the pages out. Otherwise, it cannot unmount the partition even though f2fs is under read-only status. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 293d048..8f5dff1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -81,17 +81,18 @@ static int f2fs_write_meta_page(struct page *page, struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - /* Should not write any meta pages, if any IO error was occurred */ - if (unlikely(sbi->por_doing || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + if (unlikely(sbi->por_doing)) goto redirty_out; - if (wbc->for_reclaim) goto redirty_out; - wait_on_page_writeback(page); + /* Should not write any meta pages, if any IO error was occurred */ + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto no_write; + wait_on_page_writeback(page); write_meta_page(sbi, page); +no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; @@ -148,10 +149,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + lock_page(page); - f2fs_bug_on(page->mapping != mapping); - f2fs_bug_on(!PageDirty(page)); - clear_page_dirty_for_io(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + if (f2fs_write_meta_page(page, &wbc)) { unlock_page(page); break; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea0371e..b0f5762 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -701,6 +701,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi) gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; -- cgit v0.10.2 From 1fe54f9dd3acfaa3ed4e1d1e3278fd0f1d1e98cd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Feb 2014 10:00:06 +0900 Subject: f2fs: clean up redundant function call This patch integrates inode_[inc|dec]_dirty_dents with inc_page_count to remove redundant calls. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8f5dff1..427dd55 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -508,7 +508,6 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) if (__add_dirty_inode(inode, new)) kmem_cache_free(inode_entry_slab, new); - inc_page_count(sbi, F2FS_DIRTY_DENTS); inode_inc_dirty_dents(inode); SetPagePrivate(page); spin_unlock(&sbi->dir_inode_lock); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d175ae3..b401be7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -799,10 +799,7 @@ static int f2fs_write_data_page(struct page *page, */ offset = i_size & (PAGE_CACHE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) { - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } + inode_dec_dirty_dents(inode); goto out; } @@ -815,7 +812,6 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); err = do_write_data_page(page, &fio); } else { @@ -1033,11 +1029,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (PageDirty(page)) inode_dec_dirty_dents(inode); - } ClearPagePrivate(page); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5bbf94c..d5a2c9e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -532,7 +532,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, unsigned int bit_pos; struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; @@ -555,6 +554,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + if (S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); @@ -577,7 +578,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index aeff132..4841d12 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -662,6 +662,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_dents(struct inode *inode) { + inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_inc(&F2FS_I(inode)->dirty_dents); } @@ -672,6 +673,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_dents(struct inode *inode) { + if (!S_ISDIR(inode->i_mode)) + return; + + dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_dec(&F2FS_I(inode)->dirty_dents); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0f5762..b161db4 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) set_page_dirty(page); set_cold_data(page); } else { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page) && - S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (clear_page_dirty_for_io(page)) inode_dec_dirty_dents(inode); - } set_cold_data(page); do_write_data_page(page, &fio); clear_cold_data(page); -- cgit v0.10.2 From 3375f696bd9cfdfd385e2460a9cf021d8ef01eab Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Jan 2014 10:29:26 +0800 Subject: f2fs: use inode mutex to keep atomicity of f2fs_falloc Previously without protection of inode mutex, f2fs_falloc and other data correlated operations will interfere with each other. So let's use inode mutex to keep atomicity of f2fs_falloc. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0dfcef5..00f937e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -560,6 +560,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) ret = punch_hole(inode, offset, len); else @@ -569,6 +571,9 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + + mutex_unlock(&inode->i_mutex); + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } -- cgit v0.10.2 From 662befda25fb16d7164633c39e9e20aeac5107d9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Feb 2014 16:11:53 +0800 Subject: f2fs: introduce ra_meta_pages to readahead CP/NAT/SIT pages This patch help us to cleanup the readahead code by merging ra_{sit,nat}_pages function into ra_meta_pages. Additionally the new function is used to readahead cp block in recover_orphan_inodes. Change log from v1: o fix a deadloop bug pointed by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 427dd55..deb6035 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -75,6 +75,82 @@ out: return page; } +inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +{ + switch (type) { + case META_NAT: + return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; + case META_SIT: + return SIT_BLK_CNT(sbi); + case META_CP: + return 0; + default: + BUG(); + } +} + +/* + * Readahead CP/NAT/SIT pages + */ +int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +{ + block_t prev_blk_addr = 0; + struct page *page; + int blkno = start; + int max_blks = get_max_meta_blks(sbi, type); + + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; nrpages-- > 0; blkno++) { + block_t blk_addr; + + switch (type) { + case META_NAT: + /* get nat block addr */ + if (unlikely(blkno >= max_blks)) + blkno = 0; + blk_addr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + /* get sit block addr */ + if (unlikely(blkno >= max_blks)) + goto out; + blk_addr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + if (blkno != start && prev_blk_addr + 1 != blk_addr) + goto out; + prev_blk_addr = blk_addr; + break; + case META_CP: + /* get cp block addr */ + blk_addr = blkno; + break; + default: + BUG(); + } + + page = grab_cache_page(META_MAPPING(sbi), blk_addr); + if (!page) + continue; + if (PageUptodate(page)) { + mark_page_accessed(page); + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + mark_page_accessed(page); + f2fs_put_page(page, 0); + } +out: + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { @@ -298,6 +374,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1; orphan_blkaddr = __start_sum_addr(sbi) - 1; + ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + for (i = 0; i < orphan_blkaddr; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4841d12..817eccc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -88,6 +88,15 @@ enum { SIT_BITMAP }; +/* + * For CP/NAT/SIT readahead + */ +enum { + META_CP, + META_NAT, + META_SIT +}; + /* for the list of orphan inodes */ struct orphan_inode_entry { struct list_head list; /* list head */ @@ -1176,6 +1185,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 82f4753..7689f91 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -82,42 +82,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -/* - * Readahead NAT pages - */ -static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - pgoff_t index; - int i; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (unlikely(nid >= nm_i->max_nid)) - nid = 0; - index = current_nat_addr(sbi, nid); - - page = grab_cache_page(mapping, index); - if (!page) - continue; - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - f2fs_submit_page_mbio(sbi, page, index, &fio); - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - f2fs_submit_merged_bio(sbi, META, READ); -} - static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -1413,7 +1377,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_nat_pages(sbi, nid); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); while (1) { struct page *page = get_current_nat_page(sbi, nid); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e87946a..fbb41ba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1576,47 +1576,6 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } -static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct page *page; - block_t blk_addr, prev_blk_addr = 0; - int sit_blk_cnt = SIT_BLK_CNT(sbi); - int blkno = start; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { - - blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - - if (blkno != start && prev_blk_addr + 1 != blk_addr) - break; - prev_blk_addr = blk_addr; -repeat: - page = grab_cache_page(mapping, blk_addr); - if (!page) { - cond_resched(); - goto repeat; - } - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - - f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); - - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - - f2fs_submit_merged_bio(sbi, META, READ); - return blkno - start; -} - static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); @@ -1628,7 +1587,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); do { - readed = ra_sit_pages(sbi, start_blk, nrpages); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; -- cgit v0.10.2 From 942e0be6219cc80384eb961feb963cab275bcbbf Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Thu, 13 Feb 2014 15:12:29 +0900 Subject: f2fs: show counts of checkpoint in status This patch shows the counts of checkpoint in f2fs' status. Signed-off-by: Changman Lee Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index deb6035..757b77b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -914,6 +914,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + stat_inc_cp_count(sbi->stat_info); trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 3de9d20..46a12e4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -236,6 +236,7 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d\n", si->data_segs); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 817eccc..91f4c5e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1255,7 +1255,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count; + int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int tot_blks, data_blks, node_blks; int curseg[NR_CURSEG_TYPE]; @@ -1272,6 +1272,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) return (struct f2fs_stat_info *)sbi->stat_info; } +#define stat_inc_cp_count(si) ((si)->cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) @@ -1326,6 +1327,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else +#define stat_inc_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) #define stat_inc_dirty_dir(sbi) -- cgit v0.10.2 From b63da15e8b475245026bdf2096853683f189706b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Feb 2014 12:44:20 +0900 Subject: f2fs: fix the calculation of max_nids Total nids that f2fs can use should not include 0, nid for node inode, and nid for meta inode. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7689f91..d452185 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1811,7 +1811,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; -- cgit v0.10.2 From 8618b881e9fd0e1817ff5cb7befadd3240d54830 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Feb 2014 19:29:27 +0900 Subject: f2fs: fix not to write data pages on the page reclaiming path Even if f2fs_write_data_page is called by the page reclaiming path, we should not write the page to provide enough free segments for the worst case scenario. Otherwise, f2fs can face with no free segment while gc is conducted, resulting in: ------------[ cut here ]------------ kernel BUG at /home/zeus/f2fs_test/src/fs/f2fs/segment.c:565! RIP: 0010:[] [] new_curseg+0x331/0x340 [f2fs] Call Trace: allocate_segment_by_default+0x204/0x280 [f2fs] allocate_data_block+0x108/0x210 [f2fs] write_data_page+0x8a/0xc0 [f2fs] do_write_data_page+0xe1/0x2a0 [f2fs] move_data_page+0x8a/0xf0 [f2fs] f2fs_gc+0x446/0x970 [f2fs] f2fs_balance_fs+0xb6/0xd0 [f2fs] f2fs_write_begin+0x50/0x350 [f2fs] ? unlock_page+0x27/0x30 ? unlock_page+0x27/0x30 generic_file_buffered_write+0x10a/0x280 ? file_update_time+0xa3/0xf0 __generic_file_aio_write+0x1c8/0x3d0 ? generic_file_aio_write+0x52/0xb0 ? generic_file_aio_write+0x52/0xb0 generic_file_aio_write+0x65/0xb0 do_sync_write+0x5a/0x90 vfs_write+0xc5/0x1f0 SyS_write+0x55/0xa0 system_call_fastpath+0x16/0x1b Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b401be7..93d80ea 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -805,38 +805,30 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (unlikely(sbi->por_doing)) { - err = AOP_WRITEPAGE_ACTIVATE; + if (unlikely(sbi->por_doing)) goto redirty_out; - } /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { inode_dec_dirty_dents(inode); err = do_write_data_page(page, &fio); - } else { - f2fs_lock_op(sbi); - - if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { - err = f2fs_write_inline_data(inode, page, offset); - f2fs_unlock_op(sbi); - goto out; - } else { - err = do_write_data_page(page, &fio); - } + goto done; + } - f2fs_unlock_op(sbi); + if (!wbc->for_reclaim) need_balance_fs = true; - } - if (err == -ENOENT) - goto out; - else if (err) + else if (has_not_enough_free_secs(sbi, 0)) goto redirty_out; - if (wbc->for_reclaim) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - need_balance_fs = false; - } + f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) + err = f2fs_write_inline_data(inode, page, offset); + else + err = do_write_data_page(page, &fio); + f2fs_unlock_op(sbi); +done: + if (err && err != -ENOENT) + goto redirty_out; clear_cold_data(page); out: @@ -848,7 +840,7 @@ out: redirty_out: wbc->pages_skipped++; set_page_dirty(page); - return err; + return AOP_WRITEPAGE_ACTIVATE; } #define MAX_DESIRED_PAGES_WP 4096 -- cgit v0.10.2 From 6437d1b0adb46f29aafcbf10950a89211028ca09 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Feb 2014 18:23:32 +0900 Subject: f2fs: fix to do build_stat prior to the recovery procedure At the end of the recovery procedure, write_checkpoint is called and updates the cp count which is managed by f2fs stat. But, previously build_stat() is called after the recovery procedure, which results in: BUG: unable to handle kernel NULL pointer dereference at 000000000000012c IP: [] write_checkpoint+0x720/0xbc0 [f2fs] Call Trace: [] ? mark_held_locks+0x74/0x140 [] ? __init_waitqueue_head+0x60/0x60 [] recover_fsync_data+0x656/0xf20 [f2fs] [] ? security_d_instantiate+0x1b/0x30 [] f2fs_fill_super+0x94d/0xa00 [f2fs] [] mount_bdev+0x1a5/0x1f0 [] ? __get_free_pages+0xe/0x40 [] ? f2fs_remount+0x130/0x130 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x43/0x1b0 [] vfs_kern_mount+0x74/0x160 [] ? __get_fs_type+0x51/0x60 [] do_mount+0x237/0xb50 [] ? copy_mount_options+0x3a/0x170 So, this patche changes the order of recovery_fsync_data() and f2fs_build_stats(). Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a85f83..475560e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -989,28 +989,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { - err = recover_fsync_data(sbi); - if (err) - f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); - } - - /* - * If filesystem is not mounted as read-only then - * do start the gc_thread. - */ - if (!(sb->s_flags & MS_RDONLY)) { - /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); - if (err) - goto free_gc; - } - err = f2fs_build_stats(sbi); if (err) - goto free_gc; + goto free_root_inode; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -1032,17 +1013,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, "%s", sb->s_id); if (err) - goto fail; + goto free_proc; + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } + + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto free_kobj; + } return 0; -fail: + +free_kobj: + kobject_del(&sbi->s_kobj); +free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); -free_gc: - stop_gc_thread(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; -- cgit v0.10.2 From fffc2a00fc01b781c1e3b9541e3e0f270c50ce90 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Feb 2014 13:17:22 +0900 Subject: f2fs: fix to mark the checkpointed nat entry correctly The nat cache entry maintains a status whether it is checkpointed or not. So, if a new cache entry is loaded from the last checkpoint, nat_entry->checkpointed should be true. If the cache entry is modified as being dirty, nat_entry->checkpoint should be false. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d452185..a070b14 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -128,6 +128,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); + new->checkpointed = true; list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -149,7 +150,6 @@ retry: nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); nat_set_ino(e, le32_to_cpu(ne->ino)); nat_set_version(e, ne->version); - e->checkpointed = true; } write_unlock(&nm_i->nat_tree_lock); } @@ -169,7 +169,6 @@ retry: goto retry; } e->ni = *ni; - e->checkpointed = true; f2fs_bug_on(ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* @@ -181,9 +180,6 @@ retry: f2fs_bug_on(ni->blk_addr != NULL_ADDR); } - if (new_blkaddr == NEW_ADDR) - e->checkpointed = false; - /* sanity check */ f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && @@ -1787,7 +1783,6 @@ flush_now: } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); - ne->checkpointed = true; write_unlock(&nm_i->nat_tree_lock); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c4c7988..4dea719 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -58,9 +58,15 @@ struct nat_entry { #define nat_set_version(nat, v) (nat->ni.version = v) #define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); + do { \ + ne->checkpointed = false; \ + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ + } while (0); #define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); + do { \ + ne->checkpointed = true; \ + list_move_tail(&ne->list, &nm_i->nat_entries); \ + } while (0); #define inc_node_version(version) (++version) static inline void node_info_from_raw_nat(struct node_info *ni, -- cgit v0.10.2 From f978f5a0616d18f303d9c8f51c293a03bc09dbaf Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 21 Feb 2014 18:08:29 +0800 Subject: f2fs: introduce help macro on_build_free_nids() Introduce help macro on_build_free_nids() which just uses build_lock to judge whether the building free nid is going, so that we can remove the on_build_free_nids field from f2fs_sb_info. Signed-off-by: Gu Zheng [Jaegeuk Kim: remove an unnecessary white line removal] Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 91f4c5e..c56e67b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -417,7 +417,6 @@ struct f2fs_sb_info { struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ - bool on_build_free_nids; /* build_free_nids is doing */ wait_queue_head_t cp_wait; /* for orphan inode management */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a070b14..431bcb4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -21,6 +21,8 @@ #include "segment.h" #include +#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) + static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -1422,7 +1424,7 @@ retry: spin_lock(&nm_i->free_nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !sbi->on_build_free_nids) { + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { f2fs_bug_on(list_empty(&nm_i->free_nid_list)); list_for_each(this, &nm_i->free_nid_list) { i = list_entry(this, struct free_nid, list); @@ -1441,9 +1443,7 @@ retry: /* Let's scan nat pages and its caches to get free nids */ mutex_lock(&nm_i->build_lock); - sbi->on_build_free_nids = true; build_free_nids(sbi); - sbi->on_build_free_nids = false; mutex_unlock(&nm_i->build_lock); goto retry; } -- cgit v0.10.2 From 8a7ed66aaf8ee56b0a6beee4d02e10af5a9e38b2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Feb 2014 14:29:35 +0900 Subject: f2fs: introduce a radix_tree for the free_nid list This patch introduces a radix tree for the list of free_nids, which enhances the performance on free nid management. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c56e67b..11fd8be 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -247,6 +247,7 @@ struct f2fs_nm_info { struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* a list for free nids */ spinlock_t free_nid_list_lock; /* protect free nid list */ unsigned int fcnt; /* the number of free node id */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 431bcb4..1f9cf21 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1269,21 +1269,17 @@ const struct address_space_operations f2fs_node_aops = { .releasepage = f2fs_release_node_page, }; -static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) { - struct list_head *this; - struct free_nid *i; - list_for_each(this, head) { - i = list_entry(this, struct free_nid, list); - if (i->nid == n) - return i; - } - return NULL; + return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct free_nid *i) +static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, + struct free_nid *i) { list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); kmem_cache_free(free_nid_slab, i); } @@ -1304,7 +1300,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) /* do not add allocated nids */ read_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + if (ne && + (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; read_unlock(&nm_i->nat_tree_lock); if (allocated) @@ -1316,7 +1313,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) i->state = NID_NEW; spin_lock(&nm_i->free_nid_list_lock); - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); return 0; @@ -1331,9 +1328,9 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1457,9 +1454,9 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); } @@ -1475,10 +1472,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) return; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); } else { i->state = NID_NEW; nm_i->fcnt++; @@ -1812,6 +1809,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->fcnt = 0; nm_i->nat_cnt = 0; + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -1865,7 +1863,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { f2fs_bug_on(i->state == NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; } f2fs_bug_on(nm_i->fcnt); -- cgit v0.10.2 From 8b8343fa9d503894ece57acbe46cb36883646685 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Feb 2014 13:00:13 +0900 Subject: f2fs: implement a lock-free stat_show The stat_show is just to show the current status of f2fs. So, we can remove all the there-in locks. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 46a12e4..b7111c4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; - struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; int ndirty = 0; @@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - mutex_lock(&sit_i->sentry_lock); for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - hblks_per_sec); @@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - mutex_unlock(&sit_i->sentry_lock); dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11fd8be..4beedcc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -704,11 +704,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { - block_t ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_block_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_block_count; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) @@ -804,11 +800,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_node_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) @@ -829,11 +821,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_inode_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_inode_count; } static inline void f2fs_put_page(struct page *page, int unlock) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 4024546..c3d5e36 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -380,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, static inline block_t written_block_count(struct f2fs_sb_info *sbi) { - struct sit_info *sit_i = SIT_I(sbi); - block_t vblocks; - - mutex_lock(&sit_i->sentry_lock); - vblocks = sit_i->written_valid_blocks; - mutex_unlock(&sit_i->sentry_lock); - - return vblocks; + return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_segs; - - read_lock(&free_i->segmap_lock); - free_segs = free_i->free_segments; - read_unlock(&free_i->segmap_lock); - - return free_segs; + return FREE_I(sbi)->free_segments; } static inline int reserved_segments(struct f2fs_sb_info *sbi) @@ -409,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi) static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_secs; - - read_lock(&free_i->segmap_lock); - free_secs = free_i->free_sections; - read_unlock(&free_i->segmap_lock); - - return free_secs; + return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) -- cgit v0.10.2 From 5d0c667121bfc8be76d1580f485bddbe73465d1a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Feb 2014 13:57:53 +0900 Subject: f2fs: remove costly bit operations for f2fs_find_entry It turns out that a bit operation like find_next_bit is not always fast enough for f2fs_find_entry. Instead, it is pretty much simple and fast to traverse each dentries. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d5a2c9e..c3ea8f8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -93,16 +93,20 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; - unsigned long bit_pos, end_pos, next_pos; + unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - int slots; + int max_len = 0; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, 0); while (bit_pos < NR_DENTRY_IN_BLOCK) { de = &dentry_blk->dentry[bit_pos]; - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - + if (!test_bit_le(bit_pos, &dentry_blk->dentry_bitmap)) { + if (bit_pos == 0) + max_len = 1; + else if (!test_bit_le(bit_pos - 1, &dentry_blk->dentry_bitmap)) + max_len++; + bit_pos++; + continue; + } if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { @@ -110,20 +114,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, goto found; } } - next_pos = bit_pos + slots; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, next_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - end_pos = NR_DENTRY_IN_BLOCK; - else - end_pos = bit_pos; - if (*max_slots < end_pos - next_pos) - *max_slots = end_pos - next_pos; + if (max_len > *max_slots) { + *max_slots = max_len; + max_len = 0; + } + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; kunmap(dentry_page); found: + if (max_len > *max_slots) + *max_slots = max_len; return de; } -- cgit v0.10.2 From 3843154598a00408f4214a68bd536fdf27b1df10 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Feb 2014 18:20:00 +0900 Subject: f2fs: introduce large directory support This patch introduces an i_dir_level field to support large directory. Previously, f2fs maintains multi-level hash tables to find a dentry quickly from a bunch of chiild dentries in a directory, and the hash tables consist of the following tree structure as below. In Documentation/filesystems/f2fs.txt, ---------------------- A : bucket B : block N : MAX_DIR_HASH_DEPTH ---------------------- level #0 | A(2B) | level #1 | A(2B) - A(2B) | level #2 | A(2B) - A(2B) - A(2B) - A(2B) . | . . . . level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) . | . . . . level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B) But, if we can guess that a directory will handle a number of child files, we don't need to traverse the tree from level #0 to #N all the time. Since the lower level tables contain relatively small number of dentries, the miss ratio of the target dentry is likely to be high. In order to avoid that, we can configure the hash tables sparsely from level #0 like this. level #0 | A(2B) - A(2B) - A(2B) - A(2B) level #1 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) . | . . . . level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) . | . . . . level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B) With this structure, we can skip the ineffective tree searches in lower level hash tables. This patch adds just a facility for this by introducing i_dir_level in f2fs_inode. Signed-off-by: Jaegeuk Kim diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b8d2849..8eb06b0 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -444,9 +444,11 @@ The number of blocks and buckets are determined by, # of blocks in level #n = | `- 4, Otherwise - ,- 2^n, if n < MAX_DIR_HASH_DEPTH / 2, + ,- 2^ (n + dir_level), + | if n < MAX_DIR_HASH_DEPTH / 2, # of buckets in level #n = | - `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), Otherwise + `- 2^((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1), + Otherwise When F2FS finds a file name in a directory, at first a hash value of the file name is calculated. Then, F2FS scans the hash table in level #0 to find the diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c3ea8f8..582fa00f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode) >> PAGE_CACHE_SHIFT; } -static unsigned int dir_buckets(unsigned int level) +static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level < MAX_DIR_HASH_DEPTH / 2) - return 1 << level; + return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); + return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1); } static unsigned int bucket_blocks(unsigned int level) @@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -static unsigned long dir_block_index(unsigned int level, unsigned int idx) +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) - bidx += dir_buckets(i) * bucket_blocks(i); + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); bidx += idx * bucket_blocks(level); return bidx; } @@ -143,10 +144,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { @@ -467,10 +469,11 @@ start: if (level == current_depth) ++current_depth; - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4beedcc..a826916 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -200,6 +200,7 @@ struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ unsigned int i_current_depth; /* use only in directory structure */ unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 08d69c9..d518e37 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode) fi->flags = 0; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; get_extent_info(&fi->ext, ri->i_ext); get_inline_info(fi, ri); @@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index da74d87..df53e17 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -183,7 +183,7 @@ struct f2fs_inode { __le32 i_pino; /* parent inode number */ __le32 i_namelen; /* file name length */ __u8 i_name[F2FS_NAME_LEN]; /* file name for SPOR */ - __u8 i_reserved2; /* for backward compatibility */ + __u8 i_dir_level; /* dentry_level for large dir */ struct f2fs_extent i_ext; /* caching a largest extent */ -- cgit v0.10.2 From ab9fa662e4867455f44f4de96d29a7f09cf292c6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Feb 2014 20:09:05 +0900 Subject: f2fs: add an sysfs entry to control the directory level This patch adds an sysfs entry to control dir_level used by the large directory. The description of this entry is: dir_level This parameter controls the directory level to support large directory. If a directory has a number of files, it can reduce the file lookup latency by increasing this dir_level value. Otherwise, it needs to decrease this value to reduce the space overhead. The default value is 0. Signed-off-by: Jaegeuk Kim diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 8eb06b0..803784e 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -195,6 +195,13 @@ Files in /sys/fs/f2fs/ cleaning operations. The default value is 4096 which covers 8GB block address range. + dir_level This parameter controls the directory level to + support large directory. If a directory has a + number of files, it can reduce the file lookup + latency by increasing this dir_level value. + Otherwise, it needs to decrease this value to + reduce the space overhead. The default value is 0. + ================================================================================ USAGE ================================================================================ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a826916..6f88191 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -196,6 +196,8 @@ struct extent_info { #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define DEF_DIR_LEVEL 0 + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ @@ -447,6 +449,7 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ int active_logs; /* # of active logs */ + int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 475560e..1bd9153 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -184,6 +184,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -196,6 +197,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), NULL, }; @@ -359,6 +361,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) if (test_opt(F2FS_SB(sb), INLINE_XATTR)) set_inode_flag(fi, FI_INLINE_XATTR); + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -785,6 +790,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + + sbi->dir_level = DEF_DIR_LEVEL; } /* -- cgit v0.10.2 From 81c1a0f13e6306a76fc3743b8504085d96659a5f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Feb 2014 19:12:24 +0800 Subject: f2fs: readahead contiguous SSA blocks for f2fs_gc If there are multi segments in one section, we will read those SSA blocks which have contiguous address one by one in f2fs_gc. It may lost performance, let's read ahead SSA blocks by merge multi read request. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 757b77b..c8516ee 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -82,6 +82,7 @@ inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; case META_SIT: return SIT_BLK_CNT(sbi); + case META_SSA: case META_CP: return 0; default: @@ -90,7 +91,7 @@ inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) } /* - * Readahead CP/NAT/SIT pages + * Readahead CP/NAT/SIT/SSA pages */ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) { @@ -125,8 +126,9 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) goto out; prev_blk_addr = blk_addr; break; + case META_SSA: case META_CP: - /* get cp block addr */ + /* get ssa/cp block addr */ blk_addr = blkno; break; default: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f88191..bd6666e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -89,12 +89,13 @@ enum { }; /* - * For CP/NAT/SIT readahead + * For CP/NAT/SIT/SSA readahead */ enum { META_CP, META_NAT, - META_SIT + META_SIT, + META_SSA }; /* for the list of orphan inodes */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b161db4..d94acbc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -708,6 +708,11 @@ gc_more: goto stop; ret = 0; + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); -- cgit v0.10.2 From 695fd1ed3bcaae9fc032cbe47f0fe9a934bf1717 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Feb 2014 19:52:21 +0800 Subject: f2fs: use existing macro to clean up some codes This patch use existing macro F2FS_INODE/NEXT_FREE_BLKADDR to clean up some codes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bd6666e..8b2cb82 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1000,8 +1000,7 @@ static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) static inline void *inline_xattr_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1021,8 +1020,7 @@ static inline int f2fs_has_inline_data(struct inode *inode) static inline void *inline_data_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[1]); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 72adbbf..aef7768 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -136,7 +136,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ page = alloc_page(GFP_F2FS_ZERO); -- cgit v0.10.2 From 9cf3c3898a274ca637b88ad01b0830550ee2d318 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Feb 2014 10:12:05 +0800 Subject: f2fs: fix dirty page accounting when redirty We should de-account dirty counters for page when redirty in ->writepage(). Wu Fengguang described in 'commit 971767caf632190f77a40b4011c19948232eed75': "writeback: fix dirtied pages accounting on redirty De-account the accumulative dirty counters on page redirty. Page redirties (very common in ext4) will introduce mismatch between counters (a) and (b) a) NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied b) NR_WRITTEN, BDI_WRITTEN This will introduce systematic errors in balanced_rate and result in dirty page position errors (ie. the dirty pages are no longer balanced around the global/bdi setpoints)." Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c8516ee..f069249 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -178,6 +178,7 @@ no_write: redirty_out: dec_page_count(sbi, F2FS_DIRTY_META); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 93d80ea..101b4cd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -839,6 +839,7 @@ out: redirty_out: wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1f9cf21..8c14110 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1193,6 +1193,7 @@ static int f2fs_write_node_page(struct page *page, redirty_out: dec_page_count(sbi, F2FS_DIRTY_NODES); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } -- cgit v0.10.2 From c81bf1c84f6924678d088d68e131ff1f4d2d9002 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Mar 2014 11:28:40 +0900 Subject: f2fs: fix to write node pages with WRITE_SYNC This patch fixes performance regression of dbench reported by Alex . This issue was revealed by Phoronix tests results: http://www.phoronix.com/scan.php?page=article&item=linux_314_ssdfs&num=2 It turns out that we need to assign WRITE_SYNC to the node writes, if fsync is triggered. The performance numbers are like below, which is measured by Alex. 1. 355MB/s ext4 2. 225MB/s f2fs : WRITE for node writes 3. 525MB/s f2fs : WRITE_SYNC for node writes Reported-And-Tested-by: Alex . Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 00f937e..a4cc1d6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -115,7 +115,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0; bool need_cp = false; struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; -- cgit v0.10.2 From 20f70751c6b4ac5055be9a0d8a3d3189a81afc5a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 5 Mar 2014 10:48:25 +0900 Subject: f2fs: fix wrong kernel coding style This patch includes a simple fix to adjust coding style. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 582fa00f..f3a80ce 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -96,18 +96,19 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); + const void *dentry_bits = &dentry_blk->dentry_bitmap; int max_len = 0; while (bit_pos < NR_DENTRY_IN_BLOCK) { - de = &dentry_blk->dentry[bit_pos]; - if (!test_bit_le(bit_pos, &dentry_blk->dentry_bitmap)) { + if (!test_bit_le(bit_pos, dentry_bits)) { if (bit_pos == 0) max_len = 1; - else if (!test_bit_le(bit_pos - 1, &dentry_blk->dentry_bitmap)) + else if (!test_bit_le(bit_pos - 1, dentry_bits)) max_len++; bit_pos++; continue; } + de = &dentry_blk->dentry[bit_pos]; if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { -- cgit v0.10.2 From b6ce391e615175029cb8496f03afc9905e0957cc Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:24 +0800 Subject: f2fs: update start nid only once each circle Integrated a couple of minor changes for better readability suggested by Chao Yu. Signed-off-by: Gu Zheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8c14110..77b6189 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1875,11 +1875,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; - for (idx = 0; idx < found; idx++) { - struct nat_entry *e = natvec[idx]; - nid = nat_get_nid(e) + 1; - __del_from_nat_cache(nm_i, e); - } + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) + __del_from_nat_cache(nm_i, natvec[idx]); } f2fs_bug_on(nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); -- cgit v0.10.2 From e8512d2e0c4eb38cd78b1499bb08d7d8eea6c723 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:28 +0800 Subject: f2fs: remove the unused ctor argument of f2fs_kmem_cache_create() Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f069249..911b6f9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -939,11 +939,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi) int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry), NULL); + sizeof(struct orphan_inode_entry)); if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry), NULL); + sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8b2cb82..f845e92 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -852,9 +852,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn) } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, - size_t size, void (*ctor)(void *)) + size_t size) { - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d94acbc..b90dbe5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -742,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry), NULL); + sizeof(struct inode_entry)); if (!winode_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 77b6189..12c9ded 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1890,12 +1890,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", - sizeof(struct nat_entry), NULL); + sizeof(struct nat_entry)); if (!nat_entry_slab) return -ENOMEM; free_nid_slab = f2fs_kmem_cache_create("free_nid", - sizeof(struct free_nid), NULL); + sizeof(struct free_nid)); if (!free_nid_slab) { kmem_cache_destroy(nat_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index aef7768..03b28ec 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -436,7 +436,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry), NULL); + sizeof(struct fsync_inode_entry)); if (!fsync_entry_slab) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fbb41ba..199c964 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1878,7 +1878,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) int __init create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("discard_entry", - sizeof(struct discard_entry), NULL); + sizeof(struct discard_entry)); if (!discard_entry_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1bd9153..72df734 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1089,7 +1089,7 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info), NULL); + sizeof(struct f2fs_inode_info)); if (!f2fs_inode_cachep) return -ENOMEM; return 0; -- cgit v0.10.2 From 46c04366bbfd112a74dcfebbe41c9bf3f496ea75 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:33 +0800 Subject: f2fs: format segment_info's show for better legibility The original segment_info's show is a bit out-of-format: [root@guz Demoes]# cat /proc/fs/f2fs/loop0/segment_info 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...... 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 [root@guz Demoes]# so we fix it here for better legibility. [root@guz Demoes]# cat /proc/fs/f2fs/loop0/segment_info 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...... 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 [root@guz Demoes]# Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 72df734..6e4851c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -546,11 +546,12 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) for (i = 0; i < total_segs; i++) { seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); - if (i != 0 && (i % 10) == 0) - seq_puts(seq, "\n"); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); else - seq_puts(seq, " "); + seq_putc(seq, ' '); } + return 0; } -- cgit v0.10.2 From d653788a43475eb3cdfcfaa60fb53451878944cf Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:36 +0800 Subject: f2fs: optimize restore_node_summary slightly Previously, we ra_sum_pages to pre-read contiguous pages as more as possible, and if we fail to alloc more pages, an ENOMEM error will be reported upstream, even though we have alloced some pages yet. In fact, we can use the available pages to do the job partly, and continue the rest in the following circle. Only reporting ENOMEM upstream if we really can not alloc any available page. And another fix is ignoring dealing with the following pages if an EIO occurs when reading page from page_list. Signed-off-by: Gu Zheng Reviewed-by: Chao Yu [Jaegeuk Kim: modify the flow for better neat code] Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 12c9ded..c415cec 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1588,15 +1588,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, for (; page_idx < start + nrpages; page_idx++) { /* alloc temporal page for read node summary info*/ page = alloc_page(GFP_F2FS_ZERO); - if (!page) { - struct page *tmp; - list_for_each_entry_safe(page, tmp, pages, lru) { - list_del(&page->lru); - unlock_page(page); - __free_pages(page, 0); - } - return -ENOMEM; - } + if (!page) + break; lock_page(page); page->index = page_idx; @@ -1607,7 +1600,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, f2fs_submit_page_mbio(sbi, page, page->index, &fio); f2fs_submit_merged_bio(sbi, META, READ); - return 0; + + return page_idx - start; } int restore_node_summary(struct f2fs_sb_info *sbi, @@ -1626,15 +1620,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi, addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { nrpages = min(last_offset - i, bio_blocks); /* read ahead node pages */ - err = ra_sum_pages(sbi, &page_list, addr, nrpages); - if (err) - return err; + nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages); + if (!nrpages) + return -ENOMEM; list_for_each_entry_safe(page, tmp, &page_list, lru) { + if (err) + goto skip; lock_page(page); if (unlikely(!PageUptodate(page))) { @@ -1646,9 +1642,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry->ofs_in_node = 0; sum_entry++; } - - list_del(&page->lru); unlock_page(page); +skip: + list_del(&page->lru); __free_pages(page, 0); } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 199c964..b3f8431 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1160,9 +1160,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - if (restore_node_summary(sbi, segno, sum)) { + int err; + + err = restore_node_summary(sbi, segno, sum); + if (err) { f2fs_put_page(new, 1); - return -EINVAL; + return err; } } } -- cgit v0.10.2 From 28cdce0459ccea71ea734d7903d39910d1c6a05d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Mar 2014 13:37:38 +0800 Subject: f2fs: recover inline xattr data in roll-forward process Previously we do not recover inline xattr data of inode after power-cut, so inline xattr data may be lost. We should recover the data during the roll-forward process. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c415cec..e72b258 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1493,6 +1493,37 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } +void recover_inline_xattr(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + if (!is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + return; + + if (!IS_INODE(page)) + return; + + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) + return; + + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + dst_addr = inline_xattr_addr(ipage); + src_addr = inline_xattr_addr(page); + inline_size = inline_xattr_size(inode); + + memcpy(dst_addr, src_addr, inline_size); + + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); +} + bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -1500,6 +1531,8 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) nid_t new_xnid = nid_of_node(page); struct node_info ni; + recover_inline_xattr(inode, page); + if (ofs_of_node(page) != XATTR_NODE_OFFSET) return false; -- cgit v0.10.2 From 987c7c31123fd36c1f792ff53ff131378475f5c8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Mar 2014 15:59:03 +0800 Subject: f2fs: introduce f2fs_has_inline_xattr for better readability This patch introduces a help function f2fs_has_inline_xattr for better readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f845e92..1f87a04 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -991,9 +991,14 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, ri->i_inline |= F2FS_INLINE_DATA; } +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); +} + static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) { - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(&fi->vfs_inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } @@ -1007,7 +1012,7 @@ static inline void *inline_xattr_addr(struct page *page) static inline int inline_xattr_size(struct inode *inode) { - if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(inode)) return F2FS_INLINE_XATTR_ADDRS << 2; else return 0; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e72b258..c618fad 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1501,7 +1501,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - if (!is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + if (!f2fs_has_inline_xattr(inode)) return; if (!IS_INODE(page)) -- cgit v0.10.2 From 910bb12d29cc64144506333bfeaeeee9715c3872 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Mar 2014 17:08:36 +0800 Subject: f2fs: check upper bound of ino value in f2fs_nfs_get_inode Upper bound checking of ino should be added to f2fs_nfs_get_inode, so unneeded process before do_read_inode in f2fs_iget could be avoided when ino is invalid. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6e4851c..3a51d7a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -644,6 +644,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, if (unlikely(ino < F2FS_ROOT_INO(sbi))) return ERR_PTR(-ESTALE); + if (unlikely(ino >= NM_I(sbi)->max_nid)) + return ERR_PTR(-ESTALE); /* * f2fs_iget isn't quite right if the inode is currently unallocated! -- cgit v0.10.2 From 90aa6dc9b9dd9b3ee832bb6e91e2d8ba8ebb93b6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Mar 2014 10:31:06 +0800 Subject: f2fs: print type for each segment in segment_info's show The original segment_info's show looks out-of-format: cat /proc/fs/f2fs/loop0/segment_info 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 512 512 512 512 512 512 512 512 0 0 512 348 0 263 0 0 512 0 0 512 512 512 512 0 512 512 512 512 512 512 512 512 512 511 328 512 512 512 512 512 512 512 512 512 512 512 512 512 0 0 175 Let's fix this and show type for each segment. cat /proc/fs/f2fs/loop0/segment_info format: segment_type|valid_blocks segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN) 0 2|0 1|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 10 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 20 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 30 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 40 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 50 3|0 3|0 3|0 3|0 3|0 3|0 3|0 0|0 3|0 3|0 60 3|0 3|0 3|0 3|0 3|0 3|0 3|0 3|0 3|0 3|512 70 3|512 3|512 3|512 3|512 3|512 3|512 3|512 3|0 3|0 3|512 80 3|0 3|0 3|0 3|0 3|0 3|512 3|0 3|0 3|512 3|512 90 3|512 0|512 3|274 0|512 0|512 0|512 0|512 0|512 0|512 3|512 100 3|512 0|512 3|511 0|328 3|512 0|512 0|512 3|512 0|512 0|512 110 0|512 0|512 0|512 0|512 0|512 0|512 0|512 5|0 4|0 3|512 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a51d7a..057a3ef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -544,8 +544,16 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) le32_to_cpu(sbi->raw_super->segment_count_main); int i; + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + for (i = 0; i < total_segs; i++) { - seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-5d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, 1)); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else -- cgit v0.10.2 From 4bc8e9bcf50103216a7a316ab66b9bb8e81baa27 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Mar 2014 16:35:06 +0800 Subject: f2fs: introduce f2fs_has_xattr_block for better readability This patch introduces a help function f2fs_has_xattr_block for better readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f87a04..292cc3c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -637,6 +637,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode) return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; } +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; +} + static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c618fad..3e36240 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -836,7 +836,7 @@ struct page *new_node_page(struct dnode_of_data *dn, SetPageUptodate(page); set_page_dirty(page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; @@ -1533,7 +1533,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_inline_xattr(inode, page); - if (ofs_of_node(page) != XATTR_NODE_OFFSET) + if (!f2fs_has_xattr_block(ofs_of_node(page))) return false; /* 1: invalidate the previous xattr nid */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4dea719..ee6d286 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -242,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) return false; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || -- cgit v0.10.2 From e4fc5fbfc9e285356be7e5208bb1a2fa377b2656 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Mar 2014 16:36:24 +0800 Subject: f2fs: avoid to return incorrect errno of read_normal_summaries We should return error number of read_normal_summaries instead of -EINVAL when read_normal_summaries failed. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3f8431..6c5a4f0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1186,6 +1186,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { int type = CURSEG_HOT_DATA; + int err; if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ @@ -1194,9 +1195,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) type = CURSEG_HOT_NODE; } - for (; type <= CURSEG_COLD_NODE; type++) - if (read_normal_summaries(sbi, type)) - return -EINVAL; + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + return 0; } -- cgit v0.10.2 From 04c0938844695ab97b79a477a9f57748fe97d2f5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Mar 2014 09:07:59 +0800 Subject: f2fs: fix incorrect parsing with option string Previously 'background_gc={on***,off***}' is being parsed as correct option, with this patch we cloud fix the trivial bug in mount process. Change log from v1: o need to check length of parameter suggested by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 057a3ef..dbe402b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -258,9 +258,9 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (!strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) set_opt(sbi, BG_GC); - else if (!strncmp(name, "off", 3)) + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) clear_opt(sbi, BG_GC); else { kfree(name); -- cgit v0.10.2 From f8b2c1f940dca2843fe13b55ba5868bac8040551 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 12:33:06 +0900 Subject: f2fs: introduce get_dirty_dents for readability The get_dirty_dents gives us the number of dirty dentry pages. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 911b6f9..4c0e98d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -619,7 +619,7 @@ void remove_dirty_dir_inode(struct inode *inode) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) { + if (get_dirty_dents(inode)) { spin_unlock(&sbi->dir_inode_lock); return; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 292cc3c..59fac1a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -704,6 +704,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_dirty_dents(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_dents); +} + static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d518e37..0d8e4a2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -273,7 +273,7 @@ void f2fs_evict_inode(struct inode *inode) inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; - f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents)); + f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) -- cgit v0.10.2 From 87d6f890944d092c4ef5b84053f0d0d5d8137b0b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 12:40:49 +0900 Subject: f2fs: avoid small data writes by skipping writepages This patch introduces nr_pages_to_skip(sbi, type) to determine writepages can be skipped. The dentry, node, and meta pages can be conrolled by F2FS without breaking the FS consistency. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4c0e98d..1f52b70 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -187,7 +187,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int nrpages = nr_pages_to_skip(sbi, META); long written; if (wbc->for_kupdate) @@ -682,7 +682,7 @@ retry: inode = igrab(entry->inode); spin_unlock(&sbi->dir_inode_lock); if (inode) { - filemap_flush(inode->i_mapping); + filemap_fdatawrite(inode->i_mapping); iput(inode); } else { /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 101b4cd..e3b7cfa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -868,6 +868,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!mapping->a_ops->writepage) return 0; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) + return 0; + if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { desired_nrtw = MAX_DESIRED_PAGES_WP; excess_nrtw = desired_nrtw - wbc->nr_to_write; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3e36240..cb514f1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1198,12 +1198,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -/* - * It is very important to gather dirty pages and write at once, so that we can - * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB) * 3 node types, is more reasonable. - */ -#define COLLECT_DIRTY_NODES 1536 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1214,7 +1208,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) return 0; /* if mounting is failed, skip writing node pages */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index c3d5e36..bbd9761 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -664,3 +664,22 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) struct request_queue *q = bdev_get_queue(bdev); return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); } + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 3 for three types of nodes, and + * max_bio_blocks for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 3 * sbi->blocks_per_seg; + else if (type == META) + return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + else + return 0; +} -- cgit v0.10.2 From d3baf95da5b0bce9fe980eeff6140817d63fabdf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 13:43:05 +0900 Subject: f2fs: increase pages_skipped when skipping writepages This patch increases pages_skipped when skipping writepages. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1f52b70..aef32f3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -190,12 +190,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, int nrpages = nr_pages_to_skip(sbi, META); long written; - if (wbc->for_kupdate) - return 0; - /* collect a number of dirty meta pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) - return 0; + if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nrpages) + goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); @@ -203,6 +200,10 @@ static int f2fs_write_meta_pages(struct address_space *mapping, mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write -= written; return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + return 0; } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e3b7cfa..3bc3809 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -870,7 +870,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) - return 0; + goto skip_write; if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { desired_nrtw = MAX_DESIRED_PAGES_WP; @@ -892,6 +892,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, wbc->nr_to_write -= excess_nrtw; return ret; + +skip_write: + wbc->pages_skipped += get_dirty_dents(inode); + return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cb514f1..7cc146b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1209,7 +1209,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, /* collect a number of dirty node pages and write together */ if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) - return 0; + goto skip_write; /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = 3 * max_hw_blocks(sbi); @@ -1218,6 +1218,10 @@ static int f2fs_write_node_pages(struct address_space *mapping, wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - wbc->nr_to_write); return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + return 0; } static int f2fs_set_node_page_dirty(struct page *page) -- cgit v0.10.2 From 50c8cdb35ad8016c52fb2326ef9d65542e3a3e1b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 13:47:11 +0900 Subject: f2fs: introduce nr_pages_to_write for segment alignment This patch introduces nr_pages_to_write to align page writes to the segment or other operational unit size, which can be tuned according to the system environment. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index aef32f3..2a00c94 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -187,18 +187,19 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - int nrpages = nr_pages_to_skip(sbi, META); - long written; + long diff, written; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nrpages) + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, nrpages); + diff = nr_pages_to_write(sbi, META, wbc); + written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); - wbc->nr_to_write -= written; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; skip_write: diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3bc3809..b0c923a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -844,8 +844,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -#define MAX_DESIRED_PAGES_WP 4096 - static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -862,7 +860,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); bool locked = false; int ret; - long excess_nrtw = 0, desired_nrtw; + long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -872,11 +870,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) goto skip_write; - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { - desired_nrtw = MAX_DESIRED_PAGES_WP; - excess_nrtw = desired_nrtw - wbc->nr_to_write; - wbc->nr_to_write = desired_nrtw; - } + diff = nr_pages_to_write(sbi, DATA, wbc); if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); @@ -890,7 +884,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, remove_dirty_dir_inode(inode); - wbc->nr_to_write -= excess_nrtw; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; skip_write: diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7cc146b..5e9c38e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1202,7 +1202,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - long nr_to_write = wbc->nr_to_write; + long diff; /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1211,12 +1211,10 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; - /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = 3 * max_hw_blocks(sbi); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - - wbc->nr_to_write); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; skip_write: diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index bbd9761..9fc46ee 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -683,3 +683,27 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else return 0; } + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + + if (type == DATA) + desired = 4096; + else if (type == NODE) + desired = 3 * max_hw_blocks(sbi); + else + desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} -- cgit v0.10.2 From 3cb5ad152b54430f3e5f338c15f8cd434e7160c8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 13:29:07 +0900 Subject: f2fs: call f2fs_wait_on_page_writeback instead of native function If a page is on writeback, f2fs can face with deadlock due to under writepages. This is caused by merging IOs inside f2fs, so if it comes to detect, let's throw merged IOs, which is implemented by f2fs_wait_on_page_writeback. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2a00c94..a80be51 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) { cond_resched(); goto repeat; } - /* We wait writeback only inside grab_meta_page() */ - wait_on_page_writeback(page); SetPageUptodate(page); return page; } @@ -168,7 +166,7 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) goto no_write; - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); no_write: dec_page_count(sbi, F2FS_DIRTY_META); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f3a80ce..7c9b17c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -253,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); @@ -352,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode, err = f2fs_init_security(inode, dir, name, page); if (err) goto put_error; - - wait_on_page_writeback(page); } else { page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(page)) return page; - wait_on_page_writeback(page); set_cold_node(inode, page); } @@ -494,7 +491,7 @@ start: ++level; goto start; add_dentry: - wait_on_page_writeback(dentry_page); + f2fs_wait_on_page_writeback(dentry_page, DATA); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { @@ -543,7 +540,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); dentry_blk = (struct f2fs_dentry_block *)kaddr; bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a4cc1d6..e755ee5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -245,7 +245,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_put_page(page, 1); return; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); f2fs_put_page(page, 1); @@ -422,7 +422,7 @@ static void fill_zero(struct inode *inode, pgoff_t index, f2fs_unlock_op(sbi); if (!IS_ERR(page)) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5e9c38e..9a6d8bb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -725,7 +725,7 @@ skip_partial: f2fs_put_page(page, 1); goto restart; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -814,7 +814,8 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + dn->nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -910,7 +911,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) struct page *page; int err; repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -1130,7 +1132,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); if (TestClearPageError(page)) ret = -EIO; } @@ -1163,7 +1165,7 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(sbi->por_doing)) goto redirty_out; - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); /* get old block addr of this node page */ nid = nid_of_node(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 03b28ec..bbef4ed 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -316,7 +316,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; } - wait_on_page_writeback(dn.node_page); + f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(ni.ino != ino_of_node(page)); -- cgit v0.10.2 From 40bb0058c871c6ddcd4aff9fe2f5224e59aba47b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 10:43:59 +0900 Subject: f2fs: avoid to drop nat entries due to the negative nr_shrink The try_to_free_nats should not receive the negative nr_shrink. Otherwise, it can drop all the nat entries by the while loop. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9a6d8bb..d27e65a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -208,7 +208,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) + if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD || nr_shrink <= 0) return 0; write_lock(&nm_i->nat_tree_lock); -- cgit v0.10.2 From cdfc41c134d48c1923066bcfa6630b94588ad6bc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 13:31:37 +0900 Subject: f2fs: throttle the memory footprint with a sysfs entry This patch introduces ram_thresh, a sysfs entry, which controls the memory footprint used by the free nid list and the nat cache. Previously, the free nid list was controlled by MAX_FREE_NIDS, while the nat cache was managed by NM_WOUT_THRESHOLD. However, this approach cannot be applied dynamically according to the system. So, this patch adds ram_thresh that users can specify the threshold, which is in order of 1 / 1024. For example, if the total ram size is 4GB and the value is set to 10 by default, f2fs tries to control the number of free nids and nat caches not to consume over 10 * (4GB / 1024) = 10MB. Signed-off-by: Jaegeuk Kim diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 32b0809..4ac2c25 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -55,3 +55,9 @@ Date: January 2014 Contact: "Jaegeuk Kim" Description: Controls the number of trials to find a victim segment. + +What: /sys/fs/f2fs//ram_thresh +Date: March 2014 +Contact: "Jaegeuk Kim" +Description: + Controls the memory footprint used by f2fs. diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 803784e..f415b9f 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -202,6 +202,10 @@ Files in /sys/fs/f2fs/ Otherwise, it needs to decrease this value to reduce the space overhead. The default value is 0. + ram_thresh This parameter controls the memory footprint used + by free nids and cached nat entries. By default, + 10 is set, which indicates 10 MB / 1 GB RAM. + ================================================================================ USAGE ================================================================================ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 59fac1a..05c6524 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -242,6 +242,7 @@ struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d27e65a..fec4967 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -26,6 +26,22 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type) +{ + struct sysinfo val; + unsigned long mem_size = 0; + + si_meminfo(&val); + if (type == FREE_NIDS) + mem_size = nm_i->fcnt * sizeof(struct free_nid); + else if (type == NAT_ENTRIES) + mem_size += nm_i->nat_cnt * sizeof(struct nat_entry); + mem_size >>= 12; + + /* give 50:50 memory for free nids and nat caches respectively */ + return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11)); +} + static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -208,7 +224,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD || nr_shrink <= 0) + if (available_free_memory(nm_i, NAT_ENTRIES) || nr_shrink <= 0) return 0; write_lock(&nm_i->nat_tree_lock); @@ -1288,7 +1304,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) struct nat_entry *ne; bool allocated = false; - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + if (!available_free_memory(nm_i, FREE_NIDS)) return -1; /* 0 nid should not be used */ @@ -1473,7 +1489,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { + if (!available_free_memory(nm_i, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); } else { i->state = NID_NEW; @@ -1836,6 +1852,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index ee6d286..c972154 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -17,15 +17,15 @@ /* # of pages to perform readahead before building free nids */ #define FREE_NID_PAGES 4 -/* maximum # of free node ids to produce during build_free_nids */ -#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) - /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* maximum cached nat entries to manage memory footprint */ #define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 10 + /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -77,6 +77,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } +enum nid_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES /* indicates the cached nat entry */ +}; + /* * For free nid mangement */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dbe402b..34c47b2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -74,6 +74,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ }; @@ -92,6 +93,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; return NULL; @@ -183,6 +186,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -198,6 +202,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_ipu_util), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), NULL, }; -- cgit v0.10.2 From a5f420101db326e27ef5c2ab737c8c1b0e3559e3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 13:45:52 +0900 Subject: f2fs: remove unnecessary threshold The NM_WOUT_THRESHOLD is now obsolete since f2fs starts to control on a basis of the memory footprint. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b7111c4..b52c12c 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -250,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs: %5d > %lu\n", - si->nats, NM_WOUT_THRESHOLD); - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", - si->sits, si->fnids); + seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", + si->nats, si->sits); + seq_printf(s, " - free_nids: %9d\n", + si->fnids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fec4967..daf644c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -224,7 +224,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (available_free_memory(nm_i, NAT_ENTRIES) || nr_shrink <= 0) + if (available_free_memory(nm_i, NAT_ENTRIES)) return 0; write_lock(&nm_i->nat_tree_lock); @@ -1830,9 +1830,6 @@ flush_now: if (!flushed) mutex_unlock(&curseg->curseg_mutex); f2fs_put_page(page, 1); - - /* 2) shrink nat caches if necessary */ - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); } static int init_node_manager(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c972154..c2015b7 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -20,9 +20,6 @@ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 -/* maximum cached nat entries to manage memory footprint */ -#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) - /* control the memory footprint threshold (10MB per 1GB ram) */ #define DEF_RAM_THRESHOLD 10 -- cgit v0.10.2 From 9179682591c231cc77840d4e75929cc1fa573365 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 13:40:09 +0900 Subject: f2fs: add missing documentation for dir_level This patch adds missing dir_level documentation. Signed-off-by: Jaegeuk Kim diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4ac2c25..62dd725 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -56,6 +56,12 @@ Contact: "Jaegeuk Kim" Description: Controls the number of trials to find a victim segment. +What: /sys/fs/f2fs//dir_level +Date: March 2014 +Contact: "Jaegeuk Kim" +Description: + Controls the directory level for large directory. + What: /sys/fs/f2fs//ram_thresh Date: March 2014 Contact: "Jaegeuk Kim" -- cgit v0.10.2 From 58c410351eba3d24f741c85a0eb9eaf15c94047d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 14:17:21 +0900 Subject: f2fs: change reclaim rate in percentage It is more reasonable to determine the reclaiming rate of prefree segments according to the volume size, which is set to 5% by default. For example, if the volume is 128GB, the prefree segments are reclaimed when the number reaches to 6.4GB. Signed-off-by: Jaegeuk Kim diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index f415b9f..2f6d021 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -169,9 +169,11 @@ Files in /sys/fs/f2fs/ reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree - segments is larger than this number, f2fs tries to - conduct checkpoint to reclaim the prefree segments - to free segments. By default, 100 segments, 200MB. + segments is larger than the number of segments + in the proportion to the percentage over total + volume size, f2fs tries to conduct checkpoint to + reclaim the prefree segments to free segments. + By default, 5% over total # of segments. max_small_discards This parameter controls the number of discard commands that consist small blocks less than 2MB. diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6c5a4f0..e7ff23a5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1758,7 +1758,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); - sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; sm_info->ipu_policy = F2FS_IPU_DISABLE; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9fc46ee..7091204 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -14,7 +14,7 @@ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) -#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -- cgit v0.10.2 From d928bfbfe77aa457b765c19e9db8cd4cc72b3c89 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Mar 2014 19:10:08 +0900 Subject: f2fs: introduce fi->i_sem to protect fi's info This patch introduces fi->i_sem to protect fi's info that includes xattr_ver, pino, i_nlink. This enables to remove i_mutex during f2fs_sync_file, resulting in performance improvement when a number of fsync calls are triggered from many concurrent threads. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c9b17c..972fd0e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -493,6 +493,7 @@ start: add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA); + down_write(&F2FS_I(inode)->i_sem); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -515,6 +516,8 @@ add_dentry: update_parent_metadata(dir, inode, current_depth); fail: + up_write(&F2FS_I(inode)->i_sem); + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { update_inode_page(dir); clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); @@ -559,6 +562,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + down_write(&F2FS_I(inode)->i_sem); + if (S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); @@ -569,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, drop_nlink(inode); i_size_write(inode, 0); } + up_write(&F2FS_I(inode)->i_sem); update_inode_page(inode); if (inode->i_nlink == 0) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 05c6524..469779a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -210,6 +210,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ + struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e755ee5..a9474cd 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -111,6 +111,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; + struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int ret = 0; bool need_cp = false; @@ -133,7 +134,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - mutex_lock(&inode->i_mutex); + down_read(&fi->i_sem); /* * Both of fdatasync() and fsync() are able to be recovered from @@ -150,21 +151,27 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) need_cp = true; + up_read(&fi->i_sem); + if (need_cp) { nid_t pino; - F2FS_I(inode)->xattr_ver = 0; - /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); + + down_write(&fi->i_sem); + F2FS_I(inode)->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { F2FS_I(inode)->i_pino = pino; file_got_pino(inode); + up_write(&fi->i_sem); mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; + } else { + up_write(&fi->i_sem); } } else { /* if there is no written node page, write its inode page */ @@ -180,7 +187,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); } out: - mutex_unlock(&inode->i_mutex); trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 397d459..0cea874 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -424,12 +424,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; + down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) @@ -459,7 +464,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 34c47b2..89ea046 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -360,6 +360,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); + init_rwsem(&fi->i_sem); set_inode_flag(fi, FI_NEW_INODE); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 89d0422..8419130 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -590,7 +590,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, f2fs_balance_fs(sbi); f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); + up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); return err; -- cgit v0.10.2 From 479f40c44ae30e02642ce0391be707a53852d545 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Mar 2014 21:52:53 +0900 Subject: f2fs: skip unnecessary node writes during fsync If multiple redundant fsync calls are triggered, we don't need to write its node pages with fsync mark continuously. So, this patch adds FI_NEED_FSYNC to track whether the latest node block is written with the fsync mark or not. If the mark was set, a new fsync doesn't need to write a node block. Otherwise, we should do a new node block with the mark for roll-forward recovery. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 469779a..f83433e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1126,6 +1126,7 @@ struct dnode_of_data; struct node_info; int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool fsync_mark_done(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a9474cd..6ba2668 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -176,6 +176,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + if (fsync_mark_done(sbi, inode->i_ino)) + goto out; mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index daf644c..eced8d7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -133,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } +bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool fsync_done = false; + + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + fsync_done = e->fsync_done; + read_unlock(&nm_i->nat_tree_lock); + return fsync_done; +} + static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; @@ -173,7 +187,7 @@ retry: } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, - block_t new_blkaddr) + block_t new_blkaddr, bool fsync_done) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -217,6 +231,11 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) + e->fsync_done = fsync_done; write_unlock(&nm_i->nat_tree_lock); } @@ -483,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn) /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode); - set_node_addr(sbi, &ni, NULL_ADDR); + set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); @@ -846,7 +865,7 @@ struct page *new_node_page(struct dnode_of_data *dn, f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); @@ -1202,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page, mutex_lock(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr); + set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); unlock_page(page); @@ -1503,7 +1522,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, block_t new_blkaddr) { rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr); + set_node_addr(sbi, ni, new_blkaddr, false); clear_node_page_dirty(page); } @@ -1559,7 +1578,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) f2fs_bug_on(ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode); - set_node_addr(sbi, &ni, NULL_ADDR); + set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: allocate new xattr nid */ @@ -1569,12 +1588,12 @@ recover_xnid: remove_free_nid(NM_I(sbi), new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR); + set_node_addr(sbi, &ni, NEW_ADDR, false); F2FS_I(inode)->i_xattr_nid = new_xnid; /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr); + set_node_addr(sbi, &ni, blkaddr, false); update_inode_page(inode); return true; @@ -1612,7 +1631,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); f2fs_put_page(ipage, 1); return 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c2015b7..5decc1a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -42,6 +42,7 @@ struct node_info { struct nat_entry { struct list_head list; /* for clean or dirty nat list */ bool checkpointed; /* whether it is checkpointed or not */ + bool fsync_done; /* whether the latest node has fsync mark */ struct node_info ni; /* in-memory node information */ }; -- cgit v0.10.2 From 808a1d7490671a74ffa077cf92779c7b0c9f66da Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Mar 2014 22:21:08 +0900 Subject: f2fs: avoid RECLAIM_FS-ON-W warning This patch should resolve the following possible bug. RECLAIM_FS-ON-W at: mark_held_locks+0xb9/0x140 lockdep_trace_alloc+0x85/0xf0 __kmalloc+0x53/0x1d0 read_all_xattrs+0x3d1/0x3f0 [f2fs] f2fs_getxattr+0x4f/0x100 [f2fs] f2fs_get_acl+0x4c/0x290 [f2fs] get_acl+0x4f/0x80 posix_acl_create+0x72/0x180 f2fs_init_acl+0x29/0xcc [f2fs] __f2fs_add_link+0x259/0x710 [f2fs] f2fs_create+0xad/0x1c0 [f2fs] vfs_create+0xed/0x150 do_last+0xd36/0xed0 path_openat+0xc5/0x680 do_filp_open+0x43/0xa0 do_sys_open+0x13c/0x230 SyS_creat+0x1e/0x20 system_call_fastpath+0x16/0x1b Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fa8da4c..a285715 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, retval); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8419130..0121e45 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_size = inline_xattr_size(inode); - txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) return NULL; -- cgit v0.10.2 From 3bb5e2c8fe2296ddd9d864dcfb5ee1b77135f3ec Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 1 Apr 2014 17:38:26 +0900 Subject: f2fs: return -EIO when node id is not matched During the cleaing of node segments, F2FS can get errored node blocks due to data race between node page lock and its valid bitmap operations. In that case, it needs to return an error to skip such the obsolete block copy. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index eced8d7..065cd99 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -958,7 +958,7 @@ repeat: goto got_it; lock_page(page); - if (unlikely(!PageUptodate(page))) { + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -967,7 +967,6 @@ repeat: goto repeat; } got_it: - f2fs_bug_on(nid != nid_of_node(page)); mark_page_accessed(page); return page; } -- cgit v0.10.2 From df0f8dc0e154de13e3a54846f384b674dd557c85 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 22 Mar 2014 14:57:23 +0800 Subject: f2fs: avoid unnecessary bio submit when wait page writeback This patch introduce is_merged_page() to check whether current page is merged in f2fs bio cache. When page is not in cache, we can avoid submitting bio cache, resulting in having more chance to merge pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b0c923a..598bfa6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -134,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { @@ -142,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); } /* @@ -180,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, verify_block_addr(sbi, blk_addr); - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); if (!is_read) inc_page_count(sbi, F2FS_WRITEBACK); @@ -204,7 +204,7 @@ alloc_new: io->last_block_in_bio = blk_addr; - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f83433e..1e3d869 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -394,7 +394,7 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ - struct mutex io_mutex; /* mutex for bio */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ }; struct f2fs_sb_info { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e7ff23a5..570ab9a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1046,12 +1046,38 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); } +static inline bool is_merged_page(struct f2fs_sb_info *sbi, + struct page *page, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct bio *bio = io->bio; + struct bio_vec *bvec; + int i; + + down_read(&io->io_rwsem); + if (!bio) + goto out; + + bio_for_each_segment_all(bvec, bio, i) { + if (page == bvec->bv_page) { + up_read(&io->io_rwsem); + return true; + } + } + +out: + up_read(&io->io_rwsem); + return false; +} + void f2fs_wait_on_page_writeback(struct page *page, enum page_type type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { - f2fs_submit_merged_bio(sbi, type, WRITE); + if (is_merged_page(sbi, page, type)) + f2fs_submit_merged_bio(sbi, type, WRITE); wait_on_page_writeback(page); } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 89ea046..9598340 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -920,11 +920,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - mutex_init(&sbi->read_io.io_mutex); + init_rwsem(&sbi->read_io.io_rwsem); sbi->read_io.sbi = sbi; sbi->read_io.bio = NULL; for (i = 0; i < NR_PAGE_TYPE; i++) { - mutex_init(&sbi->write_io[i].io_mutex); + init_rwsem(&sbi->write_io[i].io_rwsem); sbi->write_io[i].sbi = sbi; sbi->write_io[i].bio = NULL; } -- cgit v0.10.2 From 6e452d69d421e10e99446c6de16a19604149c40f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 22 Mar 2014 14:59:45 +0800 Subject: f2fs: avoid unneeded lookup when xattr name length is too long In f2fs_setxattr we have limit this attribute name length, so we should also check it in f2fs_getxattr to avoid useless lookup caused by invalid name length. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 0121e45..503c245 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; name_len = strlen(name); + if (name_len > F2FS_NAME_LEN) + return -ERANGE; base_addr = read_all_xattrs(inode, NULL); if (!base_addr) -- cgit v0.10.2 From cf0ee0f09bc09f54b9852dda1088b9cdcd4f8683 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Apr 2014 08:55:00 +0800 Subject: f2fs: avoid free slab cache under spinlock Move kmem_cache_free out of spinlock protection region for better performance. Change log from v1: o remove spinlock protection for kmem_cache_free in destroy_node_manager suggested by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a80be51..d877f46 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -347,10 +347,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); - kmem_cache_free(orphan_entry_slab, orphan); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - break; + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, orphan); + return; } } spin_unlock(&sbi->orphan_inode_lock); @@ -577,6 +578,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new; + int ret = 0; if (!S_ISDIR(inode->i_mode)) return; @@ -586,12 +588,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); - + ret = __add_dirty_inode(inode, new); inode_inc_dirty_dents(inode); SetPagePrivate(page); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void add_dirty_dir_inode(struct inode *inode) @@ -599,20 +602,22 @@ void add_dirty_dir_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + int ret = 0; new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); + ret = __add_dirty_inode(inode, new); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *this, *head; if (!S_ISDIR(inode->i_mode)) @@ -630,13 +635,15 @@ void remove_dirty_dir_inode(struct inode *inode) entry = list_entry(this, struct dir_inode_entry, list); if (entry->inode == inode) { list_del(&entry->list); - kmem_cache_free(inode_entry_slab, entry); stat_dec_dirty_dir(sbi); - break; + spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); + goto done; } } spin_unlock(&sbi->dir_inode_lock); +done: /* Only from the recovery routine */ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 065cd99..4b27e36 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1313,7 +1313,6 @@ static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, { list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); - kmem_cache_free(free_nid_slab, i); } static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) @@ -1360,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; + bool need_free = false; + spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + need_free = true; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } static void scan_nat_page(struct f2fs_nm_info *nm_i, @@ -1491,6 +1496,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(!i || i->state != NID_ALLOC); __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); + + kmem_cache_free(free_nid_slab, i); } /* @@ -1500,6 +1507,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + bool need_free = false; if (!nid) return; @@ -1509,11 +1517,15 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(!i || i->state != NID_ALLOC); if (!available_free_memory(nm_i, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); + need_free = true; } else { i->state = NID_NEW; nm_i->fcnt++; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1925,6 +1937,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) f2fs_bug_on(i->state == NID_ALLOC); __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->free_nid_list_lock); } f2fs_bug_on(nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); -- cgit v0.10.2 From 2d7b822ad9daf0ea903accacaa89340ddd3f201f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Mar 2014 11:33:17 +0800 Subject: f2fs: use list_for_each_entry{_safe} for simplyfying code This patch use list_for_each_entry{_safe} instead of list_for_each{_safe} for simplfying code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d877f46..4aa521a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -308,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head, *this; - struct orphan_inode_entry *new = NULL, *orphan = NULL; + struct list_head *head; + struct orphan_inode_entry *new, *orphan; new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); new->ino = ino; spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; - list_for_each(this, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { spin_unlock(&sbi->orphan_inode_lock); kmem_cache_free(orphan_entry_slab, new); @@ -326,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (orphan->ino > ino) break; - orphan = NULL; } - /* add new_oentry into list which is sorted by inode number */ - if (orphan) - list_add(&new->list, this->prev); - else - list_add_tail(&new->list, head); + /* add new orphan entry into list which is sorted by inode number */ + list_add_tail(&new->list, &orphan->list); spin_unlock(&sbi->orphan_inode_lock); } @@ -561,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + struct dir_inode_entry *entry; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) if (unlikely(entry->inode == inode)) return -EEXIST; - } + list_add_tail(&new->list, head); stat_inc_dirty_dir(sbi); return 0; @@ -618,7 +611,8 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *this, *head; + struct list_head *head; + struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; @@ -630,9 +624,7 @@ void remove_dirty_dir_inode(struct inode *inode) } head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode == inode) { list_del(&entry->list); stat_dec_dirty_dir(sbi); @@ -654,15 +646,14 @@ done: struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *head; + struct list_head *head; struct inode *inode = NULL; + struct dir_inode_entry *entry; spin_lock(&sbi->dir_inode_lock); head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode->i_ino == ino) { inode = entry->inode; break; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b27e36..a161e95 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1451,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; - struct list_head *this; retry: if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) return false; @@ -1461,11 +1460,9 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { f2fs_bug_on(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); + list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) break; - } f2fs_bug_on(i->state != NID_NEW); *nid = i->nid; @@ -1780,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct list_head *cur, *n; + struct nat_entry *ne, *cur; struct page *page = NULL; struct f2fs_nat_block *nat_blk = NULL; nid_t start_nid = 0, end_nid = 0; @@ -1792,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) mutex_lock(&curseg->curseg_mutex); /* 1) flush dirty nat caches */ - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { - struct nat_entry *ne; + list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; block_t new_blkaddr; - ne = list_entry(cur, struct nat_entry, list); - nid = nat_get_nid(ne); - if (nat_get_blkaddr(ne) == NEW_ADDR) continue; + + nid = nat_get_nid(ne); + if (flushed) goto to_nat_page; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index bbef4ed..b1ae89f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { - struct list_head *this; struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; - } + return NULL; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 570ab9a..cb49e639 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -340,8 +340,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->discard_list); - struct list_head *this, *next; - struct discard_entry *entry; + struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); @@ -370,8 +369,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ - list_for_each_safe(this, next, head) { - entry = list_entry(this, struct discard_entry, list); + list_for_each_entry_safe(entry, this, head, list) { f2fs_issue_discard(sbi, entry->blkaddr, entry->len); list_del(&entry->list); SM_I(sbi)->nr_discards -= entry->len; -- cgit v0.10.2 From d54c795b495b9bd417e482286c832c9a8eb210ae Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Mar 2014 15:30:40 +0800 Subject: f2fs: fix error path when fail to read inline data We should unlock page in ->readpage() path and also should unlock & release page in error path of ->write_begin() to avoid deadlock or memory leak. So let's add release code to fix the problem when we fail to read inline data. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 598bfa6..45abd60 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -942,13 +942,19 @@ inline_data: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_read_inline_data(inode, page); - else + if (err) { + page_cache_release(page); + return err; + } + } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) - return err; + if (err) + return err; + } + lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 31ee5b1..383db1f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + unlock_page(page); return PTR_ERR(ipage); + } zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); -- cgit v0.10.2 From ce23447fe5764391025a67c20c97eaf5c6ac1ec3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 2 Apr 2014 09:04:42 +0900 Subject: f2fs: fix to cover io->bio with io_rwsem In the f2fs_wait_on_page_writeback, io->bio should be covered by io_rwsem. Otherwise, the bio pointer can become a dangling pointer due to data races. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cb49e639..f799c6a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1049,15 +1049,14 @@ static inline bool is_merged_page(struct f2fs_sb_info *sbi, { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio *bio = io->bio; struct bio_vec *bvec; int i; down_read(&io->io_rwsem); - if (!bio) + if (!io->bio) goto out; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, io->bio, i) { if (page == bvec->bv_page) { up_read(&io->io_rwsem); return true; -- cgit v0.10.2 From 6b4afdd794783fe515b50838aa36591e3feea990 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 2 Apr 2014 15:34:36 +0900 Subject: f2fs: introduce f2fs_issue_flush to avoid redundant flush issue Some storage devices show relatively high latencies to complete cache_flush commands, even though their normal IO speed is prettry much high. In such the case, it needs to merge cache_flush commands as much as possible to avoid issuing them redundantly. So, this patch introduces a mount option, "-o flush_merge", to mitigate such the overhead. If this option is enabled by user, F2FS merges the cache_flush commands and then issues just one cache_flush on behalf of them. Once the single command is finished, F2FS sends a completion signal to all the pending threads. Note that, this option can be used under a workload consisting of very intensive concurrent fsync calls, while the storage handles cache_flush commands slowly. Signed-off-by: Jaegeuk Kim diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 2f6d021..25311e11 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -122,6 +122,10 @@ disable_ext_identify Disable the extension list configured by mkfs, so f2fs inline_xattr Enable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. +flush_merge Merge concurrent cache_flush commands as much as possible + to eliminate redundant command issues. If the underlying + device handles the cache_flush command relatively slowly, + recommend to enable this option. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1e3d869..2ecac83 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -40,6 +40,7 @@ #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -316,6 +317,12 @@ enum { NO_CHECK_TYPE }; +struct flush_cmd { + struct flush_cmd *next; + struct completion wait; + int ret; +}; + struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ @@ -344,6 +351,14 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ + + /* for flush command control */ + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct flush_cmd *issue_list; /* list for command issue */ + struct flush_cmd *dispatch_list; /* list for command dispatch */ + spinlock_t issue_lock; /* for issue list lock */ + struct flush_cmd *issue_tail; /* list tail of issue list */ }; /* @@ -1160,6 +1175,7 @@ void destroy_node_manager_caches(void); */ void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); +int f2fs_issue_flush(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6ba2668..302d552 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -186,7 +186,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_on_node_pages_writeback(sbi, inode->i_ino); if (ret) goto out; - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f799c6a..085f548 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *flush_cmd_slab; /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since @@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) f2fs_sync_fs(sbi->sb, true); } +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_sm_info *sm_i = SM_I(sbi); + wait_queue_head_t *q = &sm_i->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) { + sm_i->dispatch_list = sm_i->issue_list; + sm_i->issue_list = sm_i->issue_tail = NULL; + } + spin_unlock(&sm_i->issue_lock); + + if (sm_i->dispatch_list) { + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct flush_cmd *cmd, *next; + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + + for (cmd = sm_i->dispatch_list; cmd; cmd = next) { + cmd->ret = ret; + next = cmd->next; + complete(&cmd->wait); + } + sm_i->dispatch_list = NULL; + } + + wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + struct flush_cmd *cmd; + int ret; + + if (!test_opt(sbi, FLUSH_MERGE)) + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + + cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC); + cmd->next = NULL; + cmd->ret = 0; + init_completion(&cmd->wait); + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) + sm_i->issue_tail->next = cmd; + else + sm_i->issue_list = cmd; + sm_i->issue_tail = cmd; + spin_unlock(&sm_i->issue_lock); + + if (!sm_i->dispatch_list) + wake_up(&sm_i->flush_wait_queue); + + wait_for_completion(&cmd->wait); + ret = cmd->ret; + kmem_cache_free(flush_cmd_slab, cmd); + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -1763,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + dev_t dev = sbi->sb->s_bdev->bd_dev; struct f2fs_sm_info *sm_info; int err; @@ -1790,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; + if (test_opt(sbi, FLUSH_MERGE)) { + spin_lock_init(&sm_info->issue_lock); + init_waitqueue_head(&sm_info->flush_wait_queue); + + sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(sm_info->f2fs_issue_flush)) + return PTR_ERR(sm_info->f2fs_issue_flush); + } + err = build_sit_info(sbi); if (err) return err; @@ -1898,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; + if (sm_info->f2fs_issue_flush) + kthread_stop(sm_info->f2fs_issue_flush); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1912,10 +1994,17 @@ int __init create_segment_manager_caches(void) sizeof(struct discard_entry)); if (!discard_entry_slab) return -ENOMEM; + flush_cmd_slab = f2fs_kmem_cache_create("flush_command", + sizeof(struct flush_cmd)); + if (!flush_cmd_slab) { + kmem_cache_destroy(discard_entry_slab); + return -ENOMEM; + } return 0; } void destroy_segment_manager_caches(void) { kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(flush_cmd_slab); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9598340..d31b767 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -51,6 +51,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_inline_data, + Opt_flush_merge, Opt_err, }; @@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, + {Opt_flush_merge, "flush_merge"}, {Opt_err, NULL}, }; @@ -334,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_data: set_opt(sbi, INLINE_DATA); break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -537,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); + if (test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; -- cgit v0.10.2 From 3a8861e2715e3b985bfaac43bcdfcfebe9b423cb Mon Sep 17 00:00:00 2001 From: ZhangZhen Date: Fri, 4 Apr 2014 09:47:16 +0800 Subject: f2fs: check the acl's validity before setting Before setting the acl, call posix_acl_valid() to check if it is valid or not. Signed-off-by: zhangzhen Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a285715..e93e4ec 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; + if (acl) { + error = posix_acl_valid(acl); + if (error < 0) + return error; + } + switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; -- cgit v0.10.2 From 48b230a583965d33c32b4e3c29a1e5e15d7e55de Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Apr 2014 11:18:34 +0800 Subject: f2fs: fix wrong statistics of inline data If we remove a file that has inline data after mount, our statistics turns to inaccurate. cat /sys/kernel/debug/f2fs/status - Inline_data Inode: 4294967295 Let's add stat_inc_inline_inode() to stat inline info of the file when lookup. Change log from v1: o stat in f2fs_lookup() instead of in do_read_inode() for excluding wrong stat. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0cea874..a9409d1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + + stat_inc_inline_inode(inode); } return d_splice_alias(inode, dentry); -- cgit v0.10.2