From 744288c7216bf5876f271c32516dfb66dedb0448 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Sun, 6 Sep 2015 17:50:13 +0800
Subject: f2fs: trace extent info updates in batches

Rename trace_f2fs_update_extent_tree to trace_f2fs_update_extent_tree_range,
then expand it so that batched extent info updates can be traced.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 997ac86..1cd6c6c 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -409,6 +409,8 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 	if (!et)
 		return false;
 
+	trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+
 	write_lock(&et->lock);
 
 	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index a019465..6aa63d9 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -1132,17 +1132,19 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end,
 		__entry->len)
 );
 
-TRACE_EVENT(f2fs_update_extent_tree,
+TRACE_EVENT(f2fs_update_extent_tree_range,
 
-	TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr),
+	TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr,
+						unsigned int len),
 
-	TP_ARGS(inode, pgofs, blkaddr),
+	TP_ARGS(inode, pgofs, blkaddr, len),
 
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(ino_t, ino)
 		__field(unsigned int, pgofs)
 		__field(u32, blk)
+		__field(unsigned int, len)
 	),
 
 	TP_fast_assign(
@@ -1150,12 +1152,15 @@ TRACE_EVENT(f2fs_update_extent_tree,
 		__entry->ino = inode->i_ino;
 		__entry->pgofs = pgofs;
 		__entry->blk = blkaddr;
+		__entry->len = len;
 	),
 
-	TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, blkaddr = %u",
+	TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, "
+		"blkaddr = %u, len = %u",
 		show_dev_ino(__entry),
 		__entry->pgofs,
-		__entry->blk)
+		__entry->blk,
+		__entry->len)
 );
 
 TRACE_EVENT(f2fs_shrink_extent_tree,
--
cgit v0.10.2

From 538e17e7e96e14f76bb82dd83290a5315da70c3b Mon Sep 17 00:00:00 2001
From: Nicholas Krause
Date: Sun, 6 Sep 2015 08:28:46 -0400
Subject: f2fs: fix incorrect return statement in the function f2fs_ioc_release_volatile_write

The final statement of f2fs_ioc_release_volatile_write unconditionally
returned zero, even though the preceding call to punch_hole can fail.
Return the result of punch_hole directly, so that callers of
f2fs_ioc_release_volatile_write are signalled when punch_hole fails.

Signed-off-by: Nicholas Krause
Reviewed-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8120f86..b2fab9e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1437,8 +1437,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
 	if (!f2fs_is_first_block_written(inode))
 		return truncate_partial_data_page(inode, 0, true);
 
-	punch_hole(inode, 0, F2FS_BLKSIZE);
-	return 0;
+	return punch_hole(inode, 0, F2FS_BLKSIZE);
 }
 
 static int f2fs_ioc_abort_volatile_write(struct file *filp)
--
cgit v0.10.2

From 25b93346a6b5542fd7de50555f8f0ddfc56d7443 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Sat, 12 Sep 2015 11:13:04 -0700
Subject: f2fs: cover number of dirty node pages under node_write lock

This number is referenced by checkpoint under node_write lock.
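For illustration, a minimal sketch of the resulting order in
f2fs_write_node_page (abbreviated from the diff below; error paths omitted):
taking node_write before the truncated-page branch means the decrement of
F2FS_DIRTY_NODES can no longer race with a checkpoint that reads the counter
under the same lock.

	down_read(&sbi->node_write);		/* trylock when called for reclaim */
	get_node_info(sbi, nid, &ni);
	if (unlikely(ni.blk_addr == NULL_ADDR)) {
		/* page already truncated: the counter update is now covered */
		dec_page_count(sbi, F2FS_DIRTY_NODES);
		up_read(&sbi->node_write);
		unlock_page(page);
		return 0;
	}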
Reviewed-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27d1a74..4d9bedf 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1323,23 +1323,24 @@ static int f2fs_write_node_page(struct page *page,
 	nid = nid_of_node(page);
 	f2fs_bug_on(sbi, page->index != nid);
 
+	if (wbc->for_reclaim) {
+		if (!down_read_trylock(&sbi->node_write))
+			goto redirty_out;
+	} else {
+		down_read(&sbi->node_write);
+	}
+
 	get_node_info(sbi, nid, &ni);
 
 	/* This page is already truncated */
 	if (unlikely(ni.blk_addr == NULL_ADDR)) {
 		ClearPageUptodate(page);
 		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		up_read(&sbi->node_write);
 		unlock_page(page);
 		return 0;
 	}
 
-	if (wbc->for_reclaim) {
-		if (!down_read_trylock(&sbi->node_write))
-			goto redirty_out;
-	} else {
-		down_read(&sbi->node_write);
-	}
-
 	set_page_writeback(page);
 	fio.blk_addr = ni.blk_addr;
 	write_node_page(nid, &fio);
--
cgit v0.10.2

From c5cd29d21cebc2e1802a9eea4cd940ddf9825ce1 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Sat, 12 Sep 2015 11:25:30 -0700
Subject: f2fs: no need to lock for update_inode_page all the time

As the comment says, we don't need to call f2fs_lock_op in write_inode to
prevent dirty node pages from being produced all the time. That happens
only when there are not enough free sections, and we can avoid it by
calling f2fs_balance_fs beforehand.

Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 35aae65..97e20de 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -296,16 +296,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		return 0;
 
 	/*
-	 * We need to lock here to prevent from producing dirty node pages
+	 * We need to balance fs here to prevent from producing dirty node pages
 	 * during the urgent cleaning time when runing out of free sections.
 	 */
-	f2fs_lock_op(sbi);
 	update_inode_page(inode);
-	f2fs_unlock_op(sbi);
-
-	if (wbc)
-		f2fs_balance_fs(sbi);
+	f2fs_balance_fs(sbi);
 
 	return 0;
 }
--
cgit v0.10.2

From c998012b0bb93009d66349ea7e9747c968375e69 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Fri, 11 Sep 2015 14:39:02 +0800
Subject: f2fs: verify file type early in f2fs_fallocate

This patch verifies the file type early in f2fs_fallocate as a cleanup;
it also adds the verification that was missing on the expand_inode_data
path.
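A minimal sketch of the resulting shape (abbreviated from the diff below):

	static long f2fs_fallocate(struct file *file, int mode,
					loff_t offset, loff_t len)
	{
		struct inode *inode = file_inode(file);

		/* one early check now covers punch_hole, collapse_range,
		 * zero_range, insert_range and expand_inode_data alike */
		if (!S_ISREG(inode->i_mode))
			return -EINVAL;
		...
	}

One behavioral nuance worth noting: punch_hole used to return -EOPNOTSUPP
for non-regular files while the other paths returned -EINVAL; after the
consolidation, every mode returns -EINVAL.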
Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b2fab9e..a85bb14 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -765,9 +765,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	loff_t off_start, off_end;
 	int ret = 0;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EOPNOTSUPP;
-
 	if (f2fs_has_inline_data(inode)) {
 		ret = f2fs_convert_inline_inode(inode);
 		if (ret)
@@ -908,9 +905,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	loff_t new_size;
 	int ret;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-
 	if (offset + len >= i_size_read(inode))
 		return -EINVAL;
 
@@ -959,9 +953,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 	loff_t off_start, off_end;
 	int ret = 0;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-
 	ret = inode_newsize_ok(inode, (len + offset));
 	if (ret)
 		return ret;
@@ -1068,9 +1059,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	loff_t new_size;
 	int ret;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-
 	new_size = i_size_read(inode) + len;
 	if (new_size > inode->i_sb->s_maxbytes)
 		return -EFBIG;
@@ -1228,6 +1216,10 @@ static long f2fs_fallocate(struct file *file, int mode,
 	struct inode *inode = file_inode(file);
 	long ret = 0;
 
+	/* f2fs only support ->fallocate for regular file */
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
 	if (f2fs_encrypted_inode(inode) &&
 		(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
 		return -EOPNOTSUPP;
--
cgit v0.10.2

From 100136acfb4023ab7dc899192e95aca9aedfe98a Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Fri, 11 Sep 2015 14:43:02 +0800
Subject: f2fs: fix incorrect searching position when shrinking extent cache

When shrinking the extent cache, we have two steps in the flow:
1) shrink objects which are unreferenced by inodes;
2) shrink objects from the LRU list of the extent cache.

In step 1, if we haven't shrunk enough objects, we will try step 2, but
before that we didn't update the searching position, which may still
point at the last inode index in the global extent tree, resulting in a
failure to shrink objects by traversing all the inodes' extent trees.

In this patch, we reset the searching position to the beginning of the
global extent tree to fix this.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 1cd6c6c..a8b9aa2 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -652,6 +652,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	}
 	spin_unlock(&sbi->extent_lock);
 
+	/*
+	 * reset ino for searching victims from beginning of global extent tree.
+	 */
+	ino = F2FS_ROOT_INO(sbi);
+
 	while ((found = radix_tree_gang_lookup(root,
 				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
 		unsigned i;
--
cgit v0.10.2

From 9edcdabf36422d15d01db73b0fa5487e418beff6 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Fri, 11 Sep 2015 14:43:52 +0800
Subject: f2fs: fix overflow of size calculation

We have a potential overflow when calculating the size of an object: when
we left-shift an index by PAGE_CACHE_SHIFT bits and the index type has
only 32 bits on a 32-bit architecture, the left shift overflows, i.e.:

 pgoff_t index = 0xFFFFFFFF;
 loff_t size = index << PAGE_CACHE_SHIFT;
 size: 0xFFFFF000

So we should cast the index to a 64-bit type to avoid this issue.
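A standalone demonstration of the effect (a hypothetical user-space test,
not kernel code; it assumes PAGE_CACHE_SHIFT == 12 and models pgoff_t as a
32-bit unsigned int):

	#include <stdio.h>

	int main(void)
	{
		unsigned int index = 0xFFFFFFFFu;	/* 32-bit pgoff_t */

		/* the shift is evaluated in 32 bits; the high bits are
		 * lost before the widening assignment */
		long long bad = index << 12;

		/* widen first, then shift, as the patch does with (loff_t) */
		long long good = (long long)index << 12;

		printf("bad  = 0x%llx\n", bad);		/* 0xfffff000 */
		printf("good = 0x%llx\n", good);	/* 0xfffffffff000 */
		return 0;
	}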
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a82abe9..681373e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -447,9 +447,9 @@ repeat: lock_page(page); } got_it: - if (new_i_size && - i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + if (new_i_size && i_size_read(inode) < + ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { + i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } @@ -489,8 +489,9 @@ alloc: /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) + i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); /* direct IO doesn't use extent cache to maximize the performance */ f2fs_drop_largest_extent(dn->inode, fofs); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d013d84..ebfcc40 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -198,9 +198,9 @@ get_cache: si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -333,13 +333,13 @@ static int stat_show(struct seq_file *s, void *v) /* memory footprint */ update_mem_info(si->sbi); - seq_printf(s, "\nMemory: %u KB\n", + seq_printf(s, "\nMemory: %llu KB\n", (si->base_mem + si->cache_mem + si->page_mem) >> 10); - seq_printf(s, " - static: %u KB\n", + seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %u KB\n", + seq_printf(s, " - cached: %llu KB\n", si->cache_mem >> 10); - seq_printf(s, " - paged : %u KB\n", + seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f1a90ff..79c38ad 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1844,7 +1844,7 @@ struct f2fs_stat_info { unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; - unsigned base_mem, cache_mem, page_mem; + unsigned long long base_mem, cache_mem, page_mem; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a85bb14..27a789c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -74,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { + if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + i_size_read(inode)) { unsigned offset; offset = i_size_read(inode) & ~PAGE_CACHE_MASK; zero_user_segment(page, offset, PAGE_CACHE_SIZE); @@ -343,7 +344,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, 
LOOKUP_NODE_RA); if (err && err != -ENOENT) { @@ -802,8 +803,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi); - blk_start = pg_start << PAGE_CACHE_SHIFT; - blk_end = pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -994,7 +995,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, return ret; new_size = max_t(loff_t, new_size, - pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_CACHE_SHIFT); } for (index = pg_start; index < pg_end; index++) { @@ -1030,7 +1031,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_unlock_op(sbi); new_size = max_t(loff_t, new_size, - (index + 1) << PAGE_CACHE_SHIFT); + (loff_t)(index + 1) << PAGE_CACHE_SHIFT); } if (off_end) { @@ -1192,9 +1193,10 @@ noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) - new_size = (index + 1) << PAGE_CACHE_SHIFT; + new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; else if (index == pg_end) - new_size = (index << PAGE_CACHE_SHIFT) + off_end; + new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + + off_end; else new_size += PAGE_CACHE_SIZE; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index faec2ca..acc21f2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -570,7 +570,7 @@ out: /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), - MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); -- cgit v0.10.2 From 514053e4543b33bbd74d8900f368df663cb03325 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 15 Sep 2015 14:46:43 -0700 Subject: f2fs: declare f2fs_update_extent_tree_range as static This function should be static. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index a8b9aa2..63068b7 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -394,7 +394,7 @@ do_insert: return en; } -unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static unsigned int f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); -- cgit v0.10.2 From 973163fc0ce07761eae671e895dc6747a3410fe7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Sep 2015 16:51:51 +0800 Subject: f2fs: reorganize f2fs_map_blocks In this patch, we try to reorganize f2fs_map_blocks to make block mapping flow more clear by using following structure: /* check status of mapping */ if (unmapped) { /* blkaddr == NULL_ADDR || blkaddr == NEW_ADDR */ if (create) { /* write path, handle dio write case here */ alloc_and_map; } else { /* * handle read cases from all call paths: * 1. generic read; * 2. dio read; * 3. fiemap; * 4. bmap */ } } /* map buffer_header */ Besides, this patch handles the missing case correctly for dio write: When we fail in __allocate_data_blocks, then in f2fs_map_blocks, we will not allocate blocks correctly for preallocated blocks, but returning with an unmapped buffer head, which will result in failure of dio write. 
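As a quick reference, the post-rewrite decision for one block can be
tabulated like this (a slightly simplified sketch distilled from the diff
below, not authoritative):

	blkaddr      create  flag                   result
	-----------  ------  ---------------------  --------------------------------
	valid        any     any                    F2FS_MAP_MAPPED
	NULL or NEW  yes     any                    __allocate_data_block(), then
	                                            F2FS_MAP_NEW | F2FS_MAP_MAPPED
	NEW          no      F2FS_GET_BLOCK_FIEMAP  F2FS_MAP_UNWRITTEN | F2FS_MAP_MAPPED
	NEW          no      read/dio/bmap          unmapped (bmap: -ENOENT)
	NULL         no      any                    unmapped (bmap: -ENOENT)

The dio-write fix mentioned above is the NEW/create row: previously a
preallocated (NEW_ADDR) block reached through the dio path bailed out
before the create branch, leaving the buffer head unmapped.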
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 681373e..883b649 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -596,40 +596,36 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) { - err = -ENOENT; - goto put_out; - } else if (flag == F2FS_GET_BLOCK_READ || - flag == F2FS_GET_BLOCK_DIO) { - goto put_out; + + if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map->m_flags = F2FS_MAP_NEW; + } else { + if (flag != F2FS_GET_BLOCK_FIEMAP || + dn.data_blkaddr != NEW_ADDR) { + if (flag == F2FS_GET_BLOCK_BMAP) + err = -ENOENT; + goto put_out; + } + + /* + * preallocated unwritten block should be mapped + * for fiemap. + */ + if (dn.data_blkaddr == NEW_ADDR) + map->m_flags = F2FS_MAP_UNWRITTEN; } - /* - * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP), - * mark it as mapped and unwritten block. - */ } - if (dn.data_blkaddr != NULL_ADDR) { - map->m_flags = F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags |= F2FS_MAP_UNWRITTEN; - } else if (create) { - err = __allocate_data_block(&dn); - if (err) - goto put_out; - allocated = true; - map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - } else { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; - } + map->m_flags |= F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; + map->m_len = 1; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - map->m_len = 1; dn.ofs_in_node++; pgofs++; @@ -648,23 +644,31 @@ get_next: goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && - flag != F2FS_GET_BLOCK_FIEMAP) - goto put_out; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } if (maxblocks > map->m_len) { block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR && create) { - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + /* + * we only merge preallocated unwritten blocks + * for fiemap. + */ + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; + } } + /* Give more consecutive addresses for the readahead */ if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || -- cgit v0.10.2 From 569cf1876a32e574ba8a7fb825cd91bafd003882 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Sep 2015 13:38:23 -0700 Subject: f2fs crypto: allocate buffer for decrypting filename We got dentry pages from high_mem, and its address space directly goes into the decryption path via f2fs_fname_disk_to_usr. But, sg_init_one assumes the address is not from high_mem, so we can get this panic since it doesn't call kmap_high but kunmap_high is triggered at the end. kernel BUG at ../../../../../../kernel/mm/highmem.c:290! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM ... 
(kunmap_high+0xb0/0xb8) from [] (__kunmap_atomic+0xa0/0xa4) (__kunmap_atomic+0xa0/0xa4) from [] (blkcipher_walk_done+0x128/0x1ec) (blkcipher_walk_done+0x128/0x1ec) from [] (crypto_cbc_decrypt+0xc0/0x170) (crypto_cbc_decrypt+0xc0/0x170) from [] (crypto_cts_decrypt+0xc0/0x114) (crypto_cts_decrypt+0xc0/0x114) from [] (async_decrypt+0x40/0x48) (async_decrypt+0x40/0x48) from [] (f2fs_fname_disk_to_usr+0x124/0x304) (f2fs_fname_disk_to_usr+0x124/0x304) from [] (f2fs_fill_dentries+0xac/0x188) (f2fs_fill_dentries+0xac/0x188) from [] (f2fs_readdir+0x1f0/0x300) (f2fs_readdir+0x1f0/0x300) from [] (vfs_readdir+0x90/0xb4) (vfs_readdir+0x90/0xb4) from [] (SyS_getdents64+0x64/0xcc) (SyS_getdents64+0x64/0xcc) from [] (ret_fast_syscall+0x0/0x30) Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8f15fc1..6726c4a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -787,7 +787,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, else d_type = DT_UNKNOWN; - /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -795,12 +794,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; + de_name.name = kmalloc(de_name.len, GFP_NOFS); + if (!de_name.name) + return false; + + memcpy(de_name.name, d->filename[bit_pos], de_name.len); + ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, &de_name, fstr); - de_name = *fstr; - fstr->len = save_len; + kfree(de_name.name); if (ret < 0) return true; + + de_name = *fstr; + fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a680bf3..dfa01c8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -947,8 +947,13 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook /* Symlink is encrypted */ sd = (struct f2fs_encrypted_symlink_data *)caddr; - cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); + cstr.name = kmalloc(cstr.len, GFP_NOFS); + if (!cstr.name) { + res = -ENOMEM; + goto errout; + } + memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ if (cstr.name[0] == 0 && cstr.len == 0) { @@ -970,6 +975,8 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook if (res < 0) goto errout; + kfree(cstr.name); + paddr = pstr.name; /* Null-terminate the name */ @@ -979,6 +986,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook page_cache_release(cpage); return *cookie = paddr; errout: + kfree(cstr.name); f2fs_fname_crypto_free_buffer(&pstr); kunmap(cpage); page_cache_release(cpage); -- cgit v0.10.2 From a7230d16d5f8635dbc029af057969e46e3eb2246 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Sep 2015 14:06:54 -0700 Subject: f2fs: check end_io for metapages before making next checkpoint blocks This patch avoids to produce new checkpoint blocks before the previous meta pages were written completely. 
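The enforced ordering, as a sketch (names as in the diff below):

	/* meta pages of the previous checkpoint may still be in flight */
	wait_on_all_pages_writeback(sbi);	/* wait for their end_io */
	if (unlikely(f2fs_cp_error(sbi)))	/* end_io may have flagged a
						   failure; don't start writing */
		return;
	update_meta_page(sbi, ckpt, start_blk++); /* first block of the new CP */

Checking f2fs_cp_error after the wait guarantees that any I/O failure from
the previous round is observed before new checkpoint blocks are produced.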
Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c5a38e3..ff53405 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1000,6 +1000,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_addr(sbi); + /* need to wait for end_io results */ + wait_on_all_pages_writeback(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); -- cgit v0.10.2 From 41a099de3a8d2b570147445c5cddc59c0daf5e96 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Thu, 17 Sep 2015 18:24:17 +0800 Subject: f2fs: drop largest extent by range now we update extent by range, fofs may not be on the largest extent if the new extent overlaps with it. so add a new function to drop largest extent properly. Signed-off-by: Fan li Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 63068b7..2f013e2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -155,11 +155,12 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, return count - et->count; } -static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) +static void __drop_largest_extent(struct inode *inode, + pgoff_t fofs, unsigned int len) { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) largest->len = 0; } @@ -168,7 +169,7 @@ void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) if (!f2fs_may_extent_tree(inode)) return; - __drop_largest_extent(inode, fofs); + __drop_largest_extent(inode, fofs, 1); } void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) @@ -422,7 +423,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, dei.len = 0; /* we do not guarantee that the largest extent is cached all the time */ - __drop_largest_extent(inode, fofs); + __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, -- cgit v0.10.2 From 4d1fa815f26ce92dc4710d28f50071174a2340b7 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Thu, 17 Sep 2015 18:42:06 +0800 Subject: f2fs: optimize code of f2fs_update_extent_tree_range Fix 2 potential problems: 1. when largest extent needs to be invalidated, it will be reset in __drop_largest_extent, which makes __is_extent_same after always return false, and largest extent unchanged. Now we update it properly. 2. when extent is split and the latter part remains in tree, next_en should be the latter part instead of next extent of original extent. It will cause merge failure if there is in-place update, although there is not, I think this fix will still makes codes less ambiguous. This patch also simplifies codes of invalidating extents, and optimizes the procedues that split extent into two. There are a few modifications after last patch: 1. prev_en now is updated properly. 2. more codes and branches are simplified. 
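For reference, the four overlap cases, written in the |V: valid address,
X: will be invalidated| notation of the comments this patch removes,
together with the parts counter the new loop keeps:

	case 1: |VVVVVXXXXX|  right part dropped   -> parts = 1 (left kept)
	case 2: |XXXXXVVVVV|  left part dropped    -> parts = 1 (right kept)
	case 3: |XXXXXXXXXX|  whole extent dropped -> parts = 0 (node detached)
	case 4: |VVVXXXXVVV|  middle dropped       -> parts = 2 (split in two)

Kept pieces shorter than F2FS_MIN_EXTENT_LEN are discarded as well. Only
parts == 1 leaves the rb-tree unaltered, which is why insert_p and
insert_parent are invalidated whenever parts != 1.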
Signed-off-by: Fan li Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2f013e2..c9d1cfd 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -400,7 +400,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; - struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; @@ -422,148 +422,101 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, prev = et->largest; dei.len = 0; - /* we do not guarantee that the largest extent is cached all the time */ + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, &insert_p, &insert_parent); - if (!en) { - if (next_en) { - en = next_en; - f2fs_bug_on(sbi, en->ei.fofs <= pos); - pos = en->ei.fofs; - } else { - /* - * skip searching in the tree since there is no - * larger extent node in the cache. - */ - goto update_extent; - } - } + if (!en) + en = next_en; /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ - while (en) { - struct rb_node *node; + while (en && en->ei.fofs < end) { + unsigned int org_end; + int parts = 0; /* # of parts current extent split into */ - if (pos >= end) - break; + next_en = en1 = NULL; dei = en->ei; - en1 = en2 = NULL; + org_end = dei.fofs + dei.len; + f2fs_bug_on(sbi, pos >= org_end); - node = rb_next(&en->rb_node); + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + en->ei.len = pos - en->ei.fofs; + prev_en = en; + parts = 1; + } - /* - * 2.1 there are four cases when we invalidate blkaddr in extent - * node, |V: valid address, X: will be invalidated| - */ - /* case#1, invalidate right part of extent node |VVVVVXXXXX| */ - if (pos > dei.fofs && end >= dei.fofs + dei.len) { - en->ei.len = pos - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; + if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (parts) { + set_extent_info(&ei, end, + end - dei.fofs + dei.blk, + org_end - end); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + next_en = en1; + } else { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + next_en = en; } - - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; + parts++; } - /* case#2, invalidate left part of extent node |XXXXXVVVVV| */ - if (pos <= dei.fofs && end < dei.fofs + dei.len) { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; - } + if (!next_en) { + struct rb_node *node = rb_next(&en->rb_node); - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; + next_en = node ? + rb_entry(node, struct extent_node, rb_node) + : NULL; } - __detach_extent_node(sbi, et, en); - - /* - * if we remove node in rb-tree, our parent node pointer may - * point the wrong place, discard them. 
- */ - insert_p = NULL; - insert_parent = NULL; - - /* case#3, invalidate entire extent node |XXXXXXXXXX| */ - if (pos <= dei.fofs && end >= dei.fofs + dei.len) { - if (__is_extent_same(&dei, &et->largest)) - et->largest.len = 0; - goto update; + if (parts) { + if (en->ei.len > et->largest.len) + et->largest = en->ei; + } else { + __detach_extent_node(sbi, et, en); } /* - * case#4, invalidate data in the middle of extent node - * |VVVXXXXVVV| + * if original extent is split into zero or two parts, extent + * tree has been altered by deletion or insertion, therefore + * invalidate pointers regard to tree. */ - if (dei.len > F2FS_MIN_EXTENT_LEN) { - unsigned int endofs; - - /* insert left part of split extent into cache */ - if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - pos - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); - } - - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len; - if (endofs - end >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - endofs - end); - en2 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); - } + if (parts != 1) { + insert_p = NULL; + insert_parent = NULL; } -update: - /* 2.2 update in global extent list */ + + /* update in global extent list */ spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) + if (!parts && !list_empty(&en->list)) list_del(&en->list); if (en1) list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); - /* 2.3 release extent node */ - if (en) + /* release extent node */ + if (!parts) kmem_cache_free(extent_node_slab, en); -next: - en = node ? rb_entry(node, struct extent_node, rb_node) : NULL; - next_en = en; - if (en) - pos = en->ei.fofs; + + en = next_en; } -update_extent: /* 3. update extent in extent cache */ if (blkaddr) { struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en3 = __try_merge_extent_node(sbi, et, &ei, &den, + en1 = __try_merge_extent_node(sbi, et, &ei, &den, prev_en, next_en); - if (!en3) - en3 = __insert_extent_tree(sbi, et, &ei, + if (!en1) + en1 = __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ @@ -575,11 +528,11 @@ update_extent: } spin_lock(&sbi->extent_lock); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); + if (en1) { + if (list_empty(&en1->list)) + list_add_tail(&en1->list, &sbi->extent_list); else - list_move_tail(&en3->list, &sbi->extent_list); + list_move_tail(&en1->list, &sbi->extent_list); } if (den && !list_empty(&den->list)) list_del(&den->list); -- cgit v0.10.2 From ea58711e884c3006e6f61e44d47f0c2d9f0979e1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Sep 2015 20:22:44 +0800 Subject: f2fs: do in batches truncation in truncate_hole truncate_data_blocks_range can do in batches truncation which makes all changes in dnode page content, dnode page status, extent cache, block count updating together. But previously, truncate_hole() always truncates one block in dnode page at a time by invoking truncate_data_blocks_range(,1), which make thing slow. This patch changes truncate_hole() to do in batches truncation for all target blocks in one direct node inside truncate_data_blocks_range, which can make our punch hole operation in ->fallocate more efficent. 
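A worked example of the gain (hypothetical numbers; the real ADDRS_PER_PAGE
depends on the node type): punching pages 0..899 when each direct node
covers, say, 500 block addresses used to take 900 separate
get_dnode_of_data()/truncate_data_blocks_range(&dn, 1) round trips, one per
page. With the batched loop, the first pass computes
count = min(end_offset - dn.ofs_in_node, pg_end - pg_start) = 500 and
truncates all of them in one call, so the same range needs only two dnode
lookups.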
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 27a789c..6702157 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -739,23 +739,31 @@ static int fill_zero(struct inode *inode, pgoff_t index, int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) { - pgoff_t index; int err; - for (index = pg_start; index < pg_end; index++) { + while (pg_start < pg_end) { struct dnode_of_data dn; + pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { - if (err == -ENOENT) + if (err == -ENOENT) { + pg_start++; continue; + } return err; } - if (dn.data_blkaddr != NULL_ADDR) - truncate_data_blocks_range(&dn, 1); + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); + + f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); + + truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); + + pg_start += count; } return 0; } -- cgit v0.10.2 From f9811703fefbdb22abf723ec61368777cdad8508 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Sep 2015 20:17:52 +0800 Subject: f2fs: fix to handle io error in ->direct_IO Here is a oops reported as following message when testing generic/019 of xfstest: ------------[ cut here ]------------ kernel BUG at /home/yuchao/git/f2fs-dev/segment.c:882! invalid opcode: 0000 [#1] SMP Modules linked in: zram lz4_compress lz4_decompress f2fs(O) ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 nf_def CPU: 2 PID: 25441 Comm: fio Tainted: G O 4.3.0-rc1+ #6 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.61 05/16/2013 task: ffff8803f4e85580 ti: ffff8803fd61c000 task.ti: ffff8803fd61c000 RIP: 0010:[] [] new_curseg+0x321/0x330 [f2fs] RSP: 0018:ffff8803fd61f918 EFLAGS: 00010246 RAX: 00000000000007ed RBX: 0000000000000224 RCX: 000000000000001f RDX: 0000000000000800 RSI: ffffffffffffffff RDI: ffff8803f56f4300 RBP: ffff8803fd61f978 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000024 R11: ffff8800d23bbd78 R12: ffff8800d0ef0000 R13: 0000000000000224 R14: 0000000000000000 R15: 0000000000000001 FS: 00007f827ff85700(0000) GS:ffff88041ea80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffff600000 CR3: 00000003fef17000 CR4: 00000000001406e0 Stack: 000007ea00000002 0000000100000001 ffff8803f6456248 000007ed0000002b 0000000000000224 ffff880404d1aa20 ffff8803fd61f9c8 ffff8800d0ef0000 ffff8803f6456248 0000000000000001 00000000ffffffff ffffffffa078f358 Call Trace: [] allocate_segment_by_default+0x1a7/0x1f0 [f2fs] [] allocate_data_block+0x17c/0x360 [f2fs] [] __allocate_data_block+0x131/0x1d0 [f2fs] [] f2fs_direct_IO+0x4b5/0x580 [f2fs] [] generic_file_direct_write+0xae/0x160 [] __generic_file_write_iter+0xd5/0x1f0 [] generic_file_write_iter+0xf7/0x200 [] ? apparmor_file_permission+0x18/0x20 [] ? f2fs_fallocate+0x1190/0x1190 [f2fs] [] f2fs_file_write_iter+0x46/0x90 [f2fs] [] aio_run_iocb+0x1ee/0x290 [] ? mutex_lock+0x1e/0x50 [] ? aio_read_events+0x207/0x2b0 [] do_io_submit+0x373/0x630 [] ? 
SyS_io_getevents+0x56/0xb0 [] SyS_io_submit+0x10/0x20 [] entry_SYSCALL_64_fastpath+0x12/0x6a Code: 45 c8 48 8b 78 10 e8 9f 23 bf e0 41 8b 8c 24 cc 03 00 00 89 c7 31 d2 89 c6 89 d8 29 df f7 f1 29 d1 39 cf 0f 83 be fd ff ff eb RIP [] new_curseg+0x321/0x330 [f2fs] RSP ---[ end trace 2e577d7f711ddb86 ]--- The reason is that: in the test of generic/019, we will trigger a manmade IO error in block layer through debugfs, after that, prefree segment will no longer be freed, because we always skip doing gc or checkpoint when there occurs an IO error. Meanwhile fio with aio engine generated a large number of direct IOs, which continue allocating spaces in free segment until we run out of them, eventually, results in panic in new_curseg as no more free segment was found. So, this patch changes to return EIO in direct_IO for this condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 883b649..d265401 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -524,6 +524,9 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, while (dn.ofs_in_node < end_offset && len) { block_t blkaddr; + if (unlikely(f2fs_cp_error(sbi))) + goto sync_out; + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { if (__allocate_data_block(&dn)) @@ -566,6 +569,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; pgoff_t pgofs, end_offset; int err = 0, ofs = 1; @@ -599,6 +603,10 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto put_out; + } err = __allocate_data_block(&dn); if (err) goto put_out; @@ -652,6 +660,10 @@ get_next: if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } err = __allocate_data_block(&dn); if (err) goto sync_out; @@ -1556,10 +1568,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == WRITE) + if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + err = -EIO; + goto out; + } + } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); +out: if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); -- cgit v0.10.2 From 46c9e1413f3c4ecbbe775e545063e88feb1f9871 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Sep 2015 16:54:16 +0800 Subject: f2fs: use correct flag in f2fs_map_blocks() We introduce F2FS_GET_BLOCK_READ in commit e2b4e2bc8865 ("f2fs: fix incorrect mapping for bmap"), but forget to use this flag in the right place, fix it. 
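A sketch of why the old call was easy to miss (signature as used in the
diff below; whether behavior actually changes depends on the numeric value
of F2FS_GET_BLOCK_READ, which is not shown here):

	/* f2fs_map_blocks(inode, map, create, flag) */
	f2fs_map_blocks(inode, &map, 0, false);			/* boolean literal
								   silently becomes a flag */
	f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);	/* intent is explicit */

Because the flag parameter is an ordinary integer, a bool converts without
any compiler diagnostic; the named constant documents intent and stays
correct if the flag values are ever renumbered.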
Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d265401..bc04e92 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -920,7 +920,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
 
 			map.m_lblk = block_in_file;
 			map.m_len = last_block - block_in_file;
-			if (f2fs_map_blocks(inode, &map, 0, false))
+			if (f2fs_map_blocks(inode, &map, 0,
+						F2FS_GET_BLOCK_READ))
 				goto set_error_page;
 		}
 got_it:
--
cgit v0.10.2

From 9cd81ce3c2f01fc599b9156b94b868b4f578698c Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Fri, 18 Sep 2015 16:55:26 +0800
Subject: f2fs: disallow switching the extent_cache option dynamically

Switching the extent_cache option dynamically on remount may cause
consistency issues between the extent cache and dnode pages. Disallow it
in this patch to avoid that condition.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f794781..16442ec 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -742,6 +742,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	int err, active_logs;
 	bool need_restart_gc = false;
 	bool need_stop_gc = false;
+	bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
 
 	sync_filesystem(sb);
 
@@ -767,6 +768,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
 		goto skip;
 
+	/* disallow enable/disable extent_cache dynamically */
+	if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+		err = -EINVAL;
+		f2fs_msg(sbi->sb, KERN_WARNING,
+				"switch extent_cache option is not allowed");
+		goto restore_opts;
+	}
+
 	/*
	 * We stop the GC thread if FS is mounted as RO
	 * or if background_gc = off is passed in mount
--
cgit v0.10.2

From 545fe4210df5eb4097aa17c68f0f153db27bcf44 Mon Sep 17 00:00:00 2001
From: Nicholas Krause
Date: Mon, 21 Sep 2015 18:55:49 -0400
Subject: f2fs: fix error handling for calls to various functions in the function recover_inline_data

The calls to truncate_inline_inode and truncate_blocks in
recover_inline_data can fail, the former by returning false and the
latter by returning an error code, and recovery cannot continue after
either failure. Check their results and return false immediately so that
the caller of recover_inline_data is signalled about the failure.

Signed-off-by: Nicholas Krause
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3d143be..3b18e23 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -274,12 +274,14 @@ process_inline:
 	if (f2fs_has_inline_data(inode)) {
 		ipage = get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(sbi, IS_ERR(ipage));
-		truncate_inline_inode(ipage, 0);
+		if (!truncate_inline_inode(ipage, 0))
+			return false;
 		f2fs_clear_inline_inode(inode);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
 	} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
-		truncate_blocks(inode, 0, false);
+		if (truncate_blocks(inode, 0, false))
+			return false;
 		goto process_inline;
 	}
 	return false;
--
cgit v0.10.2

From 4abd3f5ac49314fe345566ec859b8e60993207bf Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Tue, 22 Sep 2015 21:07:47 +0800
Subject: f2fs: introduce __try_update_largest_extent

This patch adds a new helper __try_update_largest_extent for cleanup.
Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index c9d1cfd..a38ee9b 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -351,8 +351,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
 	}
 
 	if (en) {
-		if (en->ei.len > et->largest.len)
-			et->largest = en->ei;
+		__try_update_largest_extent(et, en);
 		et->cached_en = en;
 	}
 	return en;
@@ -389,8 +388,7 @@ do_insert:
 	if (!en)
 		return NULL;
 
-	if (en->ei.len > et->largest.len)
-		et->largest = en->ei;
+	__try_update_largest_extent(et, en);
 	et->cached_en = en;
 	return en;
 }
@@ -476,12 +474,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 				: NULL;
 		}
 
-		if (parts) {
-			if (en->ei.len > et->largest.len)
-				et->largest = en->ei;
-		} else {
+		if (parts)
+			__try_update_largest_extent(et, en);
+		else
 			__detach_extent_node(sbi, et, en);
-		}
 
 		/*
		 * if original extent is split into zero or two parts, extent
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 79c38ad..8d6681a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -492,6 +492,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
 	return __is_extent_mergeable(cur, front);
 }
 
+static inline void __try_update_largest_extent(struct extent_tree *et,
+						struct extent_node *en)
+{
+	if (en->ei.len > et->largest.len)
+		et->largest = en->ei;
+}
+
 struct f2fs_nm_info {
 	block_t nat_blkaddr;		/* base disk address of NAT */
 	nid_t max_nid;			/* maximum possible node ids */
--
cgit v0.10.2

From 1d7e10d58a1f62c8c44668cca1cff6a2dd57d7d9 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Wed, 23 Sep 2015 09:25:43 +0800
Subject: f2fs: fix incorrect bimodal calculation

In update_sit_info, we use div_u64 to handle the 'u64 divided by u64'
case, but div_u64 can only handle a 32-bit divisor, so our u64 divisor
passed to div_u64 is truncated, resulting in a wrong calculation when
showing the f2fs debug info, as below:

BDF: 464, avg. vblocks: 23509

(BDF should never exceed 100.)

So change it to use div64_u64 to handle this case correctly.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index ebfcc40..615a307 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 		}
 	}
 	dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
-	si->bimodal = div_u64(bimodal, dist);
+	si->bimodal = div64_u64(bimodal, dist);
 	if (si->dirty_count)
 		si->avg_vblocks = div_u64(total_vblocks, ndirty);
 	else
--
cgit v0.10.2

From 7223554133c3f72809ea6ddbf0d8464e7c70c1b1 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Fri, 25 Sep 2015 17:54:56 +0800
Subject: f2fs: remove unneeded f2fs_{,un}lock_op in do_recover_data()

Protecting the recovery flow with cp_rwsem is not needed, since we have
already prevented any checkpoint from being triggered by taking cp_mutex
earlier.
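A sketch of the locking argument (structure as described in the message,
abbreviated; caller names from the recovery path are simplified):

	/* recovery runs with sbi->cp_mutex already held:
	 *     -> do_recover_data()
	 *          f2fs_lock_op()       <- cp_rwsem read lock
	 *
	 * write_checkpoint() needs cp_mutex, so no checkpoint can run
	 * concurrently; the cp_rwsem read lock therefore excludes
	 * nothing that the mutex has not already excluded.
	 */

In other words, the lock/unlock pair was pure overhead on this path.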
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index acc21f2..c5daec5 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -383,15 +383,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, start = start_bidx_of_node(ofs_of_node(page), fi); end = start + ADDRS_PER_PAGE(page, fi); - f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) { - f2fs_unlock_op(sbi); + if (err) goto out; - } f2fs_wait_on_page_writeback(dn.node_page, NODE); @@ -456,7 +452,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, set_page_dirty(dn.node_page); err: f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); out: f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, recovered = %d blocks, err = %d", -- cgit v0.10.2 From 90b803e6fb6243922bff9ddd8a6205c17cb93b31 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Sep 2015 19:34:50 -0700 Subject: f2fs: do not skip dentry block writes Previously, we skip dentry block writes when wbc is SYNC_NONE with no memory pressure and the number of dirty pages is pretty small. But, we didn't skip for normal data writes, which gives us not much big impact on overall performance. Moreover, by skipping some data writes, kworker falls into infinite loop to try to write blocks, when many dir inodes have only one dentry block. So, this patch removes skipping data writes. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bc04e92..a903423 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1340,11 +1340,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; - if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && - get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && - available_free_memory(sbi, DIRTY_DENTS)) - goto skip_write; - /* during POR, we don't need to trigger writepage at all. 
 */
	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
		goto skip_write;

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4d9bedf..1fe49ca 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -52,11 +52,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
 							PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
-	} else if (type == DIRTY_DENTS) {
-		if (sbi->sb->s_bdi->wb.dirty_exceeded)
-			return false;
-		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
-		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else if (type == INO_ENTRIES) {
 		int i;
 
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7427e95..51c62ed 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -118,7 +118,6 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
 enum mem_type {
 	FREE_NIDS,	/* indicates the free nid list */
 	NAT_ENTRIES,	/* indicates the cached nat entry */
-	DIRTY_DENTS,	/* indicates dirty dentry pages */
 	INO_ENTRIES,	/* indicates inode entries */
 	EXTENT_CACHE,	/* indicates extent cache */
 	BASE_CHECK,	/* check kernel status */
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index b6e4ed1..a294da7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -697,9 +697,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 	if (sbi->sb->s_bdi->wb.dirty_exceeded)
 		return 0;
 
-	if (type == DATA)
-		return sbi->blocks_per_seg;
-	else if (type == NODE)
+	if (type == NODE)
 		return 3 * sbi->blocks_per_seg;
 	else if (type == META)
 		return MAX_BIO_BLOCKS(sbi);
--
cgit v0.10.2

From 345a6b2ee2987a11bc8e9c08ff2b68a973fd912c Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Mon, 28 Sep 2015 17:46:01 +0800
Subject: f2fs: fix to update {m,c}time correctly when truncating larger

This patch fixes the update of ctime and mtime when truncating to a
larger size in ->setattr. The bug is reported by xfstest generic/313 as
below:

generic/313 2s ... - output mismatch (see ./results/generic/313.out.bad)
    --- tests/generic/313.out	2015-08-04 15:28:53.430798882 +0800
    +++ results/generic/313.out.bad	2015-09-28 17:04:27.294278016 +0800
    @@ -1,2 +1,4 @@
     QA output created by 313
     Silence is golden
    +ctime not updated after truncate up
    +mtime not updated after truncate up
    ...
    (Run 'diff -u tests/generic/313.out tests/generic/313.out.bad' to see the entire diff)
Ran: generic/313
Failures: generic/313
Failed 1 of 1 tests

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6702157..a350c2a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -681,6 +681,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 			 * larger than i_size.
 			 */
 			truncate_setsize(inode, attr->ia_size);
+			inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		}
 	}
 
--
cgit v0.10.2

From 45fe8492ccbe561c4b8918c2d4c83a0501e50646 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Mon, 28 Sep 2015 17:42:24 +0800
Subject: f2fs: fix to correct freed section number during gc

This patch fixes the count of sections freed during garbage collection
when a foreground gc is triggered.

Besides, when a foreground gc is running on the currently selected
section, once we fail to gc one segment, it is better to abandon garbage
collecting the remaining segments in that section: we will select the
next victim for foreground gc anyway, so gc on the remaining segments of
the previous section would be pure overhead and would add long latency
for the caller.
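A worked example of the counting fix (hypothetical geometry with
segs_per_sec = 4): suppose foreground gc selects a section and cleans
segments 0-2 but fails on segment 3. The old loop summed per-segment
results into nfree, so has_not_enough_free_secs(sbi, nfree) was told that
3 units had been freed even though it reasons in sections and not a single
whole section was actually reclaimed. With this patch, sec_freed is
incremented only when all segs_per_sec segments succeed
(i == sbi->segs_per_sec), and the FG_GC loop breaks at the first failed
segment instead of paying for the rest of a section that can no longer be
freed as a whole.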
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 782b8e7..b6e03eb 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -802,7 +802,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) unsigned int segno = NULL_SEGNO; unsigned int i; int gc_type = BG_GC; - int nfree = 0; + int sec_freed = 0; int ret = -1; struct cp_control cpc; struct gc_inode_list gc_list = { @@ -817,7 +817,7 @@ gc_more: if (unlikely(f2fs_cp_error(sbi))) goto stop; - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { gc_type = FG_GC; if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) write_checkpoint(sbi, &cpc); @@ -832,13 +832,23 @@ gc_more: ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, META_SSA); - for (i = 0; i < sbi->segs_per_sec; i++) - nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * for FG_GC case, halt gcing left segments once failed one + * of segments in selected section to avoid long latency. + */ + if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && + gc_type == FG_GC) + break; + } + + if (i == sbi->segs_per_sec && gc_type == FG_GC) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (has_not_enough_free_secs(sbi, nfree)) + if (has_not_enough_free_secs(sbi, sec_freed)) goto gc_more; if (gc_type == FG_GC) -- cgit v0.10.2 From ab126cfc3090fa5af2a87cc0698f793aebbec7d0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 18 Sep 2015 17:33:00 -0700 Subject: f2fs: should get a victim from retrials If we do not call get_victim first, we cannot get a new victim for retrial path. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b6e03eb..343b096 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -799,8 +799,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno = NULL_SEGNO; - unsigned int i; + unsigned int segno, i; int gc_type = BG_GC; int sec_freed = 0; int ret = -1; @@ -812,6 +811,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi) cpc.reason = __get_cp_reason(sbi); gc_more: + segno = NULL_SEGNO; + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (unlikely(f2fs_cp_error(sbi))) -- cgit v0.10.2 From 39307a8e2459ecdee9f1bc0b8a5d7af4a6d8f754 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Sep 2015 13:50:47 -0700 Subject: f2fs: use vmalloc to handle -ENOMEM error This patch introduces f2fs_kvmalloc to avoid -ENOMEM during mount. Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8d6681a..6f27b92 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #ifdef CONFIG_F2FS_CHECK_FS @@ -1586,6 +1587,26 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) return S_ISREG(mode); } +static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 78e6d06..0ceff28 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "f2fs.h" @@ -1955,12 +1954,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry)); + sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -1982,8 +1982,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * - sizeof(struct sec_entry)); + sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } @@ -2028,12 +2028,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2174,7 +2174,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2196,7 +2196,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2301,7 +2301,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); mutex_lock(&dirty_i->seglist_lock); - kfree(dirty_i->dirty_segmap[dirty_type]); + kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } @@ -2309,7 +2309,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - kfree(dirty_i->victim_secmap); + kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -2348,8 +2348,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) if (!free_i) return; SM_I(sbi)->free_info = NULL; - kfree(free_i->free_segmap); - kfree(free_i->free_secmap); + kvfree(free_i->free_segmap); + kvfree(free_i->free_secmap); kfree(free_i); } @@ -2370,9 +2370,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) } kfree(sit_i->tmp_map); - vfree(sit_i->sentries); - vfree(sit_i->sec_entries); - kfree(sit_i->dirty_sentries_bitmap); + kvfree(sit_i->sentries); + 
kvfree(sit_i->sec_entries); + kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); -- cgit v0.10.2 From 5b7ee374144f8ef2db3e25d0d59a8ad83bb3cf33 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Sep 2015 17:38:48 +0800 Subject: f2fs: use atomic64_t for extent cache hit stat Our hit stat of extent cache will increase all the time until remount, and we use atomic_t type for the stat variable, so it may easily incur overflow when we query extent cache frequently in a long time running fs. So to avoid that, this patch uses atomic64_t for hit stat variables. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 615a307..478e5d5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,11 +33,11 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ - si->hit_largest = atomic_read(&sbi->read_hit_largest); - si->hit_cached = atomic_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree); + si->hit_largest = atomic64_read(&sbi->read_hit_largest); + si->hit_cached = atomic64_read(&sbi->read_hit_cached); + si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic_read(&sbi->total_hit_ext); + si->total_ext = atomic64_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -283,12 +283,12 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); seq_puts(s, "\nExtent Cache:\n"); - seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n", + seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, si->hit_rbtree); - seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n", + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", !si->total_ext ? 
0 : - (si->hit_total * 100) / si->total_ext, + div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", si->ext_tree, si->ext_node); @@ -378,10 +378,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic_set(&sbi->total_hit_ext, 0); - atomic_set(&sbi->read_hit_rbtree, 0); - atomic_set(&sbi->read_hit_largest, 0); - atomic_set(&sbi->read_hit_cached, 0); + atomic64_set(&sbi->total_hit_ext, 0); + atomic64_set(&sbi->read_hit_rbtree, 0); + atomic64_set(&sbi->read_hit_largest, 0); + atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f27b92..1fa0555 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -795,10 +795,10 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic_t total_hit_ext; /* # of lookup extent cache */ - atomic_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic_t read_hit_largest; /* # of hit largest extent node */ - atomic_t read_hit_cached; /* # of hit cached extent node */ + atomic64_t total_hit_ext; /* # of lookup extent cache */ + atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ + atomic64_t read_hit_largest; /* # of hit largest extent node */ + atomic64_t read_hit_cached; /* # of hit cached extent node */ atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -1848,7 +1848,8 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext; + unsigned long long hit_largest, hit_cached, hit_rbtree; + unsigned long long hit_total, total_ext; int ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; @@ -1885,10 +1886,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) -#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree)) -#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) +#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ -- cgit v0.10.2 From a43f7ec327b0d86cbb80d0841673038c0706e714 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:19:24 +0800 Subject: f2fs: fix to avoid redundant searching in dirty map during gc When doing gc, we search a victim in dirty map, starting from position of last victim, we will reset the current searching position until we touch the end of dirty map, and then search the whole diryt 
map. So sometimes we search the range [victim, last] twice, which is redundant; this patch avoids that. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 343b096..e5c255b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -257,6 +257,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; unsigned int secno, max_cost; + unsigned int last_segment = MAIN_SEGS(sbi); int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -277,9 +278,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned long cost; unsigned int segno; - segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); - if (segno >= MAIN_SEGS(sbi)) { + segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); + if (segno >= last_segment) { if (sbi->last_victim[p.gc_mode]) { + last_segment = sbi->last_victim[p.gc_mode]; sbi->last_victim[p.gc_mode] = 0; p.offset = 0; continue; -- cgit v0.10.2 From 3342bb303bf48dd8bb5ac94c3356489ff53e4d04 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:20:40 +0800 Subject: f2fs: skip searching dirty map if dirty segment is not exist When searching for a victim during gc, if there are no dirty segments in the filesystem, we still take the time to search the whole dirty segment map. That search is not needed, so it's better to skip it in this condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e5c255b..d844a80 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -268,6 +268,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = max_cost = get_max_cost(sbi, &p); + if (p.max_search == 0) + goto out; + if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -329,6 +332,7 @@ got_it: sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi)); } +out: mutex_unlock(&dirty_i->seglist_lock); return (p.min_segno == NULL_SEGNO) ? 0 : 1; -- cgit v0.10.2 From d530d4d8e237f4d12c93bb76df40b69b8b8a1dcd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:22:44 +0800 Subject: f2fs: support synchronous gc in ioctl This patch drops batch-mode gc triggered through ioctl, since the user can easily control gc by designing a loop around the ->ioctl. We support synchronous gc by forcing FG_GC in f2fs_gc, so with it, the user can make sure that all blocks gced in this round are persistent on the device by the time the ioctl returns.
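For reference, a minimal user-space sketch of driving the new interface (assumptions: the 0xf5 F2FS_IOCTL_MAGIC value from fs/f2fs/f2fs.h, a path on an f2fs mount passed as argv[1], and CAP_SYS_ADMIN; the retry loop is illustrative, not part of this patch):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define F2FS_IOCTL_MAGIC		0xf5
#define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)

int main(int argc, char **argv)
{
	__u32 sync = 1;	/* 0: trylock, background-style gc; 1: synchronous FG_GC */
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* each synchronous call frees at most one section; loop until the
	 * kernel reports -EAGAIN, i.e. no section could be freed this round */
	while (ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, &sync) == 0)
		;
	if (errno != EAGAIN)
		perror("F2FS_IOC_GARBAGE_COLLECT");
	close(fd);
	return 0;
}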
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1fa0555..1b2fc23 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1830,7 +1830,7 @@ int f2fs_release_page(struct page *, gfp_t); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); -int f2fs_gc(struct f2fs_sb_info *); +int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a350c2a..c269966 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1618,29 +1618,25 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - __u32 i, count; + __u32 sync; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (get_user(count, (__u32 __user *)arg)) + if (get_user(sync, (__u32 __user *)arg)) return -EFAULT; - if (!count || count > F2FS_BATCH_GC_MAX_NUM) - return -EINVAL; + if (f2fs_readonly(sbi->sb)) + return -EROFS; - for (i = 0; i < count; i++) { + if (!sync) { if (!mutex_trylock(&sbi->gc_mutex)) - break; - - if (f2fs_gc(sbi)) - break; + return -EBUSY; + } else { + mutex_lock(&sbi->gc_mutex); } - if (put_user(i, (__u32 __user *)arg)) - return -EFAULT; - - return 0; + return f2fs_gc(sbi, sync); } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d844a80..830d277 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi)) + if (f2fs_gc(sbi, false)) wait_ms = gc_th->no_gc_sleep_time; /* balancing f2fs's metadata periodically */ @@ -803,12 +803,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return nfree; } -int f2fs_gc(struct f2fs_sb_info *sbi) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { unsigned int segno, i; - int gc_type = BG_GC; + int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; - int ret = -1; + int ret = -EINVAL; struct cp_control cpc; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), @@ -855,15 +855,20 @@ gc_more: if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (has_not_enough_free_secs(sbi, sec_freed)) - goto gc_more; + if (!sync) { + if (has_not_enough_free_secs(sbi, sec_freed)) + goto gc_more; - if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + if (gc_type == FG_GC) + write_checkpoint(sbi, &cpc); + } stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); + + if (sync) + ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index c5a055b..b4a65be 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -19,12 +19,6 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ -/* - * with this macro, we can control the max time we do garbage collection, - * when user triggers batch mode gc by ioctl. - */ -#define F2FS_BATCH_GC_MAX_NUM 16 - /* Search max. 
number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0ceff28..6b8edf2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -295,7 +295,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) */ if (has_not_enough_free_secs(sbi, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi); + f2fs_gc(sbi, false); } } -- cgit v0.10.2 From 456b88e4d15de833165cce67c2cdf998139e2fc1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:24:19 +0800 Subject: f2fs: introduce a new ioctl F2FS_IOC_WRITE_CHECKPOINT This patch introduce a new ioctl for those users who want to trigger checkpoint from userspace through ioctl. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1b2fc23..18e5902 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -231,6 +231,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c269966..b3985a6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1639,6 +1639,27 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return f2fs_gc(sbi, sync); } +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct cp_control cpc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + cpc.reason = __get_cp_reason(sbi); + + mutex_lock(&sbi->gc_mutex); + write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + + return 0; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1670,6 +1691,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_WRITE_CHECKPOINT: + return f2fs_ioc_write_checkpoint(filp, arg); default: return -ENOTTY; } -- cgit v0.10.2 From 6aefd93b01379bf0b62f8b38dcf7a21397893833 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Oct 2015 11:02:54 -0700 Subject: f2fs: introduce background_gc=sync mount option This patch introduce background_gc=sync enabling synchronous cleaning in background. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e2d5105..b102b43 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,7 +102,8 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle. If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. + will be truned off. If background_gc=sync, it will turn + on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. 
disable_roll_forward Disable the roll-forward recovery routine diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 18e5902..00bd470 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -53,6 +53,7 @@ #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 830d277..e627c19 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, false)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) wait_ms = gc_th->no_gc_sleep_time; /* balancing f2fs's metadata periodically */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a294da7..e9afb58 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -137,10 +137,12 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. + * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, - FG_GC + FG_GC, + FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 16442ec..ba058d0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -292,11 +292,16 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 2 && !strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) { set_opt(sbi, BG_GC); - else if (strlen(name) == 3 && !strncmp(name, "off", 3)) + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { clear_opt(sbi, BG_GC); - else { + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { + set_opt(sbi, BG_GC); + set_opt(sbi, FORCE_FG_GC); + } else { kfree(name); return -EINVAL; } @@ -631,10 +636,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) - seq_printf(seq, ",background_gc=%s", "on"); - else + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { + if (test_opt(sbi, FORCE_FG_GC)) + seq_printf(seq, ",background_gc=%s", "sync"); + else + seq_printf(seq, ",background_gc=%s", "on"); + } else { seq_printf(seq, ",background_gc=%s", "off"); + } if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, DISCARD)) -- cgit v0.10.2 From 5c2674347466d5c2d5169214e95f4ad6dc09e9b6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Oct 2015 11:32:34 -0700 Subject: f2fs: add a tracepoint for background gc This patch introduces a tracepoint to monitor background gc behaviors. 
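Once the event is enabled through tracefs, each background GC pass emits one line shaped by the TP_printk format added below; for example (all values invented for illustration):

f2fs_background_gc: dev = (8,18), wait_ms = 30000, prefree = 6, free = 1520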
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e627c19..e7cec86 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -77,6 +77,9 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) wait_ms = gc_th->no_gc_sleep_time; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 6aa63d9..7de751d 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -514,6 +514,34 @@ TRACE_EVENT(f2fs_map_blocks, __entry->ret) ); +TRACE_EVENT(f2fs_background_gc, + + TP_PROTO(struct super_block *sb, long wait_ms, + unsigned int prefree, unsigned int free), + + TP_ARGS(sb, wait_ms, prefree, free), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(long, wait_ms) + __field(unsigned int, prefree) + __field(unsigned int, free) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait_ms = wait_ms; + __entry->prefree = prefree; + __entry->free = free; + ), + + TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", + show_dev(__entry), + __entry->wait_ms, + __entry->prefree, + __entry->free) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, -- cgit v0.10.2 From 60b99b486b568c13cbb7caa83cf8a12af7665f1e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Oct 2015 14:49:57 -0700 Subject: f2fs: introduce a periodic checkpoint flow This patch introduces a periodic checkpoint feature. Note that, this is not enforcing to conduct checkpoints very strictly in terms of trigger timing, instead just hope to help user experiences. The default value is 60 seconds. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2c4cc42..e066281d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -80,3 +80,9 @@ Date: February 2015 Contact: "Jaegeuk Kim" Description: Controls the trimming rate in batch mode. + +What: /sys/fs/f2fs//cp_interval +Date: October 2015 +Contact: "Jaegeuk Kim" +Description: + Controls the checkpoint timing. 
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ff53405..0569097 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1114,6 +1114,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason == CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); + + /* do checkpoint periodically */ + sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); out: mutex_unlock(&sbi->cp_mutex); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 00bd470..aad4720 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -124,6 +124,7 @@ enum { (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define DEF_CP_INTERVAL 60 /* 60 secs */ struct cp_control { int reason; @@ -734,6 +735,7 @@ struct f2fs_sb_info { struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; + long cp_expires, cp_interval; /* next expected periodic cp */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6b8edf2..1d86a35 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -315,7 +316,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || - !available_free_memory(sbi, INO_ENTRIES)) + !available_free_memory(sbi, INO_ENTRIES) || + jiffies > sbi->cp_expires) f2fs_sync_fs(sbi->sb, true); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ba058d0..cb23d85 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -215,6 +215,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -231,6 +232,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), + ATTR_LIST(cp_interval), NULL, }; @@ -1014,6 +1016,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; + sbi->cp_interval = DEF_CP_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); @@ -1350,6 +1353,8 @@ try_onemore: f2fs_commit_super(sbi, true); } + sbi->cp_expires = round_jiffies_up(jiffies); + return 0; free_kobj: -- cgit v0.10.2 From 6066d8cdb6dd0fd51ccd96027f6ae8e6b3b5adff Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Oct 2015 16:42:55 -0700 Subject: f2fs: merge meta writes as many possible This patch tries to merge IOs as many as possible when background flusher conducts flushing the dirty meta pages. [Before] ... 
2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124320, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124560, size = 32768 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 95720, size = 987136 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123928, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123944, size = 8192 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123968, size = 45056 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124064, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 97648, size = 1007616 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123776, size = 8192 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123800, size = 32768 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124624, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 99616, size = 921600 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123608, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123624, size = 77824 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123792, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123864, size = 32768 ... [After] ... f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 92168, size = 892928 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 93912, size = 753664 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 95384, size = 716800 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 96784, size = 712704 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 104160, size = 364544 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 104872, size = 356352 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 105568, size = 278528 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 106112, size = 319488 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 106736, size = 258048 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 107240, size = 270336 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 107768, size = 180224 ... 
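The mechanism is small: when sync_meta_pages runs with a bounded nr_to_write (background writeback rather than a checkpoint), it now stops at the first dirty meta page whose index is not adjacent to the previous one, so every submitted bio covers one physically contiguous run. The rule, distilled into a standalone predicate (a hypothetical helper for illustration, not part of the patch):

/* true if writing @cur keeps the current run of meta pages contiguous;
 * prev == LONG_MAX means no page has been written yet */
static bool meta_run_continues(pgoff_t prev, pgoff_t cur, long nr_to_write)
{
	if (nr_to_write == LONG_MAX)	/* checkpoint: flush everything */
		return true;
	return prev == LONG_MAX || cur == prev + 1;
}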
Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0569097..0a9ec42 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -257,7 +257,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { @@ -277,6 +277,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (prev == LONG_MAX) + prev = page->index - 1; + if (nr_to_write != LONG_MAX && page->index != prev + 1) { + pagevec_release(&pvec); + goto stop; + } + lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -297,13 +304,14 @@ continue_unlock: break; } nwritten++; + prev = page->index; if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); cond_resched(); } - +stop: if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); -- cgit v0.10.2 From c912a8298c16ef15aa2b7203022c935f439f488b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Oct 2015 09:46:37 -0700 Subject: f2fs: add F2FS_GOING_DOWN_METAFLUSH to test power-failure This patch introduces F2FS_GOING_DOWN_METAFLUSH which flushes meta pages like SSA blocks and then blocks all the writes. This can be used by power-failure tests. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index aad4720..f05ae22 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ #define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ #define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ +#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b3985a6..6d3cfd5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1498,6 +1498,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi); break; + case F2FS_GOING_DOWN_METAFLUSH: + sync_meta_pages(sbi, META, LONG_MAX); + f2fs_stop_checkpoint(sbi); + break; default: return -EINVAL; } -- cgit v0.10.2 From a125702326d9c3b753fe9c9b9727d3b3dd1cba4a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 8 Oct 2015 10:40:07 -0700 Subject: Revert "f2fs: do not skip dentry block writes" The periodic checkpoint can resolve the previous issue. So, now we can use this again to improve the reported performance regression: https://lkml.org/lkml/2015/10/8/20 This reverts commit 15bec0ff5a9ba6d203178fa8772259df6207942a. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a903423..bc04e92 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1340,6 +1340,11 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && + available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. 
*/ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1fe49ca..4d9bedf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -52,6 +52,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == INO_ENTRIES) { int i; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 51c62ed..7427e95 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -118,6 +118,7 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9afb58..ee44d34 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -699,7 +699,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; - if (type == NODE) + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) return 3 * sbi->blocks_per_seg; else if (type == META) return MAX_BIO_BLOCKS(sbi); -- cgit v0.10.2 From 6e2c64ad7cebf8740c5e1241de374c6b6ea80f81 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Oct 2015 12:28:41 -0700 Subject: f2fs: fix SSA updates resulting in corruption f2fs_collapse_range and f2fs_insert_range change the block addresses directly, but that can cause uncovered SSA updates. In that case, we need to give up changing the block addresses and do buffered writes instead to keep the filesystem consistent.
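The shape of the fix, condensed: a block whose SSA entry is already covered by the last checkpoint (is_checkpointed_data) can still be moved by swapping block addresses, while a block written since the last checkpoint must instead be copied through the page cache so that fresh SSA entries are written along with it. A condensed sketch of that copy path, using the helpers the diff below introduces or uses (locking and some error handling trimmed):

static int move_via_page_cache(struct inode *inode, pgoff_t src, pgoff_t dst)
{
	struct page *psrc, *pdst;

	psrc = get_lock_data_page(inode, src);
	if (IS_ERR(psrc))
		return PTR_ERR(psrc);
	pdst = get_new_data_page(inode, NULL, dst, false);
	if (IS_ERR(pdst)) {
		f2fs_put_page(psrc, 1);
		return PTR_ERR(pdst);
	}
	f2fs_copy_page(psrc, pdst);	/* kmap + memcpy + kunmap */
	set_page_dirty(pdst);		/* rewritten via the normal write path,
					 * so its SSA entry stays consistent */
	f2fs_put_page(pdst, 1);
	f2fs_put_page(psrc, 1);

	return truncate_hole(inode, src, src + 1);
}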
Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f05ae22..6f2310c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1233,6 +1233,16 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) return sbi->total_valid_inode_count; } +static inline void f2fs_copy_page(struct page *src, struct page *dst) +{ + char *src_kaddr = kmap(src); + char *dst_kaddr = kmap(dst); + + memcpy(dst_kaddr, src_kaddr, PAGE_SIZE); + kunmap(dst); + kunmap(src); +} + static inline void f2fs_put_page(struct page *page, int unlock) { if (!page) @@ -1754,6 +1764,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6d3cfd5..8d5583b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -826,86 +826,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int __exchange_data_block(struct inode *inode, pgoff_t src, + pgoff_t dst, bool full) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; - - for (; end < nrpages; start++, end++) { - block_t new_addr, old_addr; - - f2fs_lock_op(sbi); + block_t new_addr; + bool do_replace = false; + int ret; - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + new_addr = NULL_ADDR; + } else { + new_addr = dn.data_blkaddr; + if (!is_checkpointed_data(sbi, new_addr)) { + dn.data_blkaddr = NULL_ADDR; + /* do not invalidate this block address */ + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + do_replace = true; } + f2fs_put_dnode(&dn); + } - if (new_addr == NULL_ADDR) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - f2fs_unlock_op(sbi); - continue; - } + if (new_addr == NULL_ADDR) + return full ? 
truncate_hole(inode, dst, dst + 1) : 0; - if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - continue; - } else { - truncate_data_blocks_range(&dn, 1); - } + if (do_replace) { + struct page *ipage = get_node_page(sbi, inode->i_ino); + struct node_info ni; - f2fs_put_dnode(&dn); - } else { - struct page *ipage; + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto err_out; + } - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, dst); + if (ret) + goto err_out; - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, start); - if (ret) - goto out; + truncate_data_blocks_range(&dn, 1); - old_addr = dn.data_blkaddr; - if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) { - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - invalidate_blocks(sbi, old_addr); + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true); + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(inode, src); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(inode, NULL, dst, false); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); + f2fs_put_page(psrc, 1); - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - } else if (new_addr != NEW_ADDR) { - struct node_info ni; + return truncate_hole(inode, src, src + 1); + } + return 0; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } +err_out: + if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { + dn.data_blkaddr = new_addr; + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + f2fs_put_dnode(&dn); + } + return ret; +} - f2fs_put_dnode(&dn); - } +static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + int ret = 0; + + for (; end < nrpages; start++, end++) { + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + ret = __exchange_data_block(inode, end, start, true); f2fs_unlock_op(sbi); + if (ret) + break; } - return 0; -out: - f2fs_unlock_op(sbi); return ret; } @@ -944,7 +958,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + new_size = i_size_read(inode) - len; + truncate_pagecache(inode, new_size); ret = truncate_blocks(inode, new_size, true); if (!ret) @@ -1067,7 +1086,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t pg_start, pg_end, delta, nrpages, idx; loff_t new_size; - int ret; + int ret = 0; new_size = i_size_read(inode) + len; if (new_size > inode->i_sb->s_maxbytes) @@ -1105,57 +1124,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { - struct dnode_of_data dn; - struct page *ipage; - block_t new_addr, old_addr; - f2fs_lock_op(sbi); - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, idx, 
LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - goto next; - } else if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - goto next; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); - } - - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, idx + delta); - if (ret) - goto out; - - old_addr = dn.data_blkaddr; - f2fs_bug_on(sbi, old_addr != NEW_ADDR); - - if (new_addr != NEW_ADDR) { - struct node_info ni; - - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } - f2fs_put_dnode(&dn); -next: + ret = __exchange_data_block(inode, idx, idx + delta, false); f2fs_unlock_op(sbi); + if (ret) + break; } - i_size_write(inode, new_size); - return 0; -out: - f2fs_unlock_op(sbi); + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + + if (!ret) + i_size_write(inode, new_size); return ret; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1d86a35..581a9af 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -768,6 +768,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) mutex_unlock(&sit_i->sentry_lock); } +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, offset; + struct seg_entry *se; + bool is_cp = false; + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return true; + + mutex_lock(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + is_cp = true; + + mutex_unlock(&sit_i->sentry_lock); + + return is_cp; +} + /* * This function should be resided under the curseg_mutex lock */ @@ -1370,7 +1394,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + if (!recover_curseg) + update_sit_entry(sbi, new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); + locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { -- cgit v0.10.2 From a56c7c6fb3c60857c1335bcb8b914e6f65655486 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 9 Oct 2015 15:11:38 -0700 Subject: f2fs: set GFP_NOFS for grab_cache_page For normal inodes, their pages are allocated with __GFP_FS, which can cause filesystem calls when reclaiming memory. This can incur a dead lock condition accordingly. So, this patch addresses this problem by introducing f2fs_grab_cache_page(.., bool for_write), which calls grab_cache_page_write_begin() with AOP_FLAG_NOFS. 
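The hazard being closed, sketched as a call chain in comment form (illustrative, not a literal backtrace):

/*
 * f2fs_write_begin()                      caller holds f2fs locks/pages
 *   grab_cache_page()                     allocates with __GFP_FS set
 *     alloc_pages() -> direct reclaim
 *       pageout() -> f2fs ->writepage()   reclaim re-enters f2fs and may
 *                                         block on what the caller holds
 *
 * grab_cache_page_write_begin(..., AOP_FLAG_NOFS) performs the same page
 * allocation with __GFP_FS cleared, so reclaim can no longer call back
 * into the filesystem.
 */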
Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bc04e92..d4f1c74 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -275,7 +275,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) return f2fs_reserve_block(dn, index); } -struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int rw, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -292,7 +293,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return read_mapping_page(mapping, index, NULL); - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); @@ -352,7 +353,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, false); if (IS_ERR(page)) return page; @@ -372,12 +373,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -411,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode, struct dnode_of_data dn; int err; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* * before exiting, we should make sure ipage will be released @@ -439,7 +441,7 @@ repeat: } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, true); if (IS_ERR(page)) goto repeat; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 6726c4a..7c1678b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -258,7 +258,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) if (f2fs_has_inline_dentry(dir)) return f2fs_parent_inline_dir(dir, p); - page = get_lock_data_page(dir, 0); + page = get_lock_data_page(dir, 0, false); if (IS_ERR(page)) return NULL; @@ -740,7 +740,7 @@ bool f2fs_empty_dir(struct inode *dir) return f2fs_empty_inline_dir(dir); for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = get_lock_data_page(dir, bidx); + dentry_page = get_lock_data_page(dir, bidx, false); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) continue; @@ -854,7 +854,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); for (; n < npages; n++) { - dentry_page = get_lock_data_page(inode, n); + dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) continue; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f2310c..6ba5a59 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1233,6 +1233,14 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) return sbi->total_valid_inode_count; } +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ 
+ if (!for_write) + return grab_cache_page(mapping, index); + return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); +} + static inline void f2fs_copy_page(struct page *src, struct page *dst) { char *src_kaddr = kmap(src); @@ -1831,9 +1839,9 @@ void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int); +struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t); +struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8d5583b..5d2a2ee 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -505,14 +505,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); return 0; } - page = get_lock_data_page(inode, index); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) return 0; truncate_out: @@ -879,7 +879,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src, } else { struct page *psrc, *pdst; - psrc = get_lock_data_page(inode, src); + psrc = get_lock_data_page(inode, src, true); if (IS_ERR(psrc)) return PTR_ERR(psrc); pdst = get_new_data_page(inode, NULL, dst, false); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e7cec86..94278db 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) int err; /* do not read out */ - page = grab_cache_page(inode->i_mapping, bidx); + page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); if (!page) return; @@ -620,7 +620,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) { struct page *page; - page = get_lock_data_page(inode, bidx); + page = get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return; @@ -714,7 +714,7 @@ next_step: start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA); + start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { iput(inode); continue; -- cgit v0.10.2 From b8c2940048adf4b2fcc5ae738f2bd4821ebf6a8a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:02:26 +0800 Subject: f2fs: add a tracepoint for f2fs_read_data_pages This patch adds a tracepoint for f2fs_read_data_pages to trace when pages are readahead by VFS. 
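One line is printed per ->readpages call, following the TP_printk format below; for example (values invented for illustration):

f2fs_readpages: dev = (8,18), ino = 132, start = 256 nrpage = 32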
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d4f1c74..2285376 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1032,6 +1032,9 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; + struct page *page = list_entry(pages->prev, struct page, lru); + + trace_f2fs_readpages(inode, page, nr_pages); /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7de751d..00b4a63 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1028,6 +1028,32 @@ TRACE_EVENT(f2fs_writepages, __entry->for_sync) ); +TRACE_EVENT(f2fs_readpages, + + TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage), + + TP_ARGS(inode, page, nrpage), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(pgoff_t, start) + __field(unsigned int, nrpage) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = page->index; + __entry->nrpage = nrpage; + ), + + TP_printk("dev = (%d,%d), ino = %lu, start = %lu nrpage = %u", + show_dev_ino(__entry), + (unsigned long)__entry->start, + __entry->nrpage) +); + TRACE_EVENT(f2fs_write_checkpoint, TP_PROTO(struct super_block *sb, int reason, char *msg), -- cgit v0.10.2 From 2b947003fa98d5a39f3b21214380d0b1daf750b5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:04:21 +0800 Subject: f2fs: don't tag REQ_META for temporary non-meta pages In recovery or checkpoint flow, we grab pages temporarily in the meta inode's mapping to cache temporary data. The data in these pages is not actually f2fs metadata, but we still tag them with the REQ_META flag. However, a lower device such as eMMC may do some optimization for data of that type. So in order to avoid wrong optimization, we'd better remove the flag for temporary non-meta pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0a9ec42..60a9599 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -47,7 +47,8 @@ repeat: /* * We guarantee no failure on the returned page.
*/ -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, + bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; @@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) .blk_addr = index, .encrypted_page = NULL, }; + + if (unlikely(!is_meta)) + fio.rw &= ~REQ_META; repeat: page = grab_cache_page(mapping, index); if (!page) { @@ -91,6 +95,17 @@ out: return page; } +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, true); +} + +/* for POR only */ +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, false); +} + bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { @@ -137,6 +152,9 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type .encrypted_page = NULL, }; + if (unlikely(type == META_POR)) + fio.rw &= ~REQ_META; + for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6ba5a59..6c0e83c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1806,6 +1806,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4d9bedf..c61dfb6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1807,7 +1807,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, ra_meta_pages(sbi, addr, nrpages, META_POR); for (idx = addr; idx < addr + nrpages; idx++) { - struct page *page = get_meta_page(sbi, idx); + struct page *page = get_tmp_page(sbi, idx); rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index c5daec5..75dbc07 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -188,7 +188,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) break; @@ -480,7 +480,7 @@ static int recover_data(struct f2fs_sb_info *sbi, ra_meta_pages_cond(sbi, blkaddr); - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 581a9af..13aa7a6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1317,6 +1317,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .encrypted_page = NULL, }; + if (unlikely(page->index >= MAIN_BLKADDR(sbi))) + fio.rw &= ~REQ_META; + set_page_writeback(page); f2fs_submit_page_mbio(&fio); } -- cgit v0.10.2 From 26879fb101f28c554294eaf25ac7817a2825b180 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:05:59 +0800 Subject: f2fs: support lower priority asynchronous readahead in ra_meta_pages Now, we use ra_meta_pages to reads continuous physical blocks as much as possible to improve performance of following reads. 
However, ra_meta_pages uses a synchronous readahead approach, submitting its bio with READ. Because READ has high priority, it is not suited to preloading blocks when it is not certain when the RAed pages will actually be used. This patch supports asynchronous readahead in ra_meta_pages by tagging the bio with the READA flag, in order to allow preloading. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 60a9599..f661d80 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -140,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type) +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync) { block_t prev_blk_addr = 0; struct page *page; @@ -148,7 +149,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; @@ -214,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); } static int f2fs_write_meta_page(struct page *page, @@ -521,7 +522,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP); + ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { struct page *page = get_meta_page(sbi, start_blk + i); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6c0e83c..5f372c0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1808,7 +1808,7 @@ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 94278db..3f19634 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -840,7 +840,7 @@ gc_more: /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA); + META_SSA, true); for (i = 0; i < sbi->segs_per_sec; i++) { /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c61dfb6..ad98e35 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1529,7 +1529,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + META_NAT, true); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1804,7 +1805,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, bio_blocks); /* readahead node pages */
- ra_meta_pages(sbi, addr, nrpages, META_POR); + ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; idx++) { struct page *page = get_tmp_page(sbi, idx); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 75dbc07..cbf74f4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,7 +180,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR); + ra_meta_pages(sbi, blkaddr, 1, META_POR, true); while (1) { struct fsync_inode_entry *entry; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 13aa7a6..5337fd6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1621,7 +1621,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (npages >= 2) ra_meta_pages(sbi, start_sum_block(sbi), npages, - META_CP); + META_CP, true); /* restore for compacted data summary */ if (read_compacted_summaries(sbi)) @@ -1631,7 +1631,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (__exist_node_summaries(sbi)) ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP); + NR_CURSEG_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -2118,7 +2118,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(sbi); do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; -- cgit v0.10.2 From 2db2388fcf0d880bd7160264274ed873baec4f0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:07:33 +0800 Subject: f2fs: readahead for free nids building When there is no free nid in the nid cache, all new node allocators stop to wait for reloading of free nids; however, reloading is synchronous, as we will read 4 NAT pages to build the nid cache, which causes long latency. This patch tries to readahead more NAT pages with the READA request flag after reloading of free nids. It helps to improve performance when users allocate node ids intensively. Env: Sandisk 32G sd card time for i in `seq 1 60000`; { echo -n > /mnt/f2fs/$i; echo XXXXXX > /mnt/f2fs/$i;} Before: real 0m2.814s user 0m1.220s sys 0m1.536s After: real 0m2.711s user 0m1.136s sys 0m1.568s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ad98e35..14f4606 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1560,6 +1560,9 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); + + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + FREE_NID_PAGES, META_NAT, false); } /* -- cgit v0.10.2 From ea1a29a0bdfffd56ca98335c0655308e8d7d0e22 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:08:48 +0800 Subject: f2fs: export ra_nid_pages to sysfs After finishing building the free nid cache, we will try to readahead 4 more pages asynchronously for the next reloading; the count of readahead nid pages is fixed. In some cases, such as on an SMR drive, reading a small fixed number of sectors each time we trigger readahead may be inefficient, since we will face high seeking overhead, so we'd better let the user configure this parameter from sysfs for their specific workload.
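A user-space sketch of tuning the knob (assumptions: the directory component of the sysfs path is the volume's block device name, sda1 here, and 32 is an arbitrary example value):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/fs/f2fs/sda1/ra_nid_pages", O_WRONLY);

	if (fd < 0)
		return 1;
	dprintf(fd, "32\n");	/* readahead 32 NAT pages after each reload */
	close(fd);
	return 0;
}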
Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index e066281d..0345f2d 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -86,3 +86,9 @@ Date:		October 2015
 Contact:	"Jaegeuk Kim"
 Description:
 		 Controls the checkpoint timing.
+
+What:		/sys/fs/f2fs/<disk>/ra_nid_pages
+Date:		October 2015
+Contact:	"Chao Yu"
+Description:
+		 Controls the count of nid pages to be readaheaded.
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5f372c0..94cf6bc 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -510,6 +510,7 @@ struct f2fs_nm_info {
 	nid_t available_nids;		/* maximum available node ids */
 	nid_t next_scan_nid;		/* the next nid to be scanned */
 	unsigned int ram_thresh;	/* control the memory footprint */
+	unsigned int ra_nid_pages;	/* # of nid pages to be readaheaded */
 
 	/* NAT cache management */
 	struct radix_tree_root nat_root;/* root of the nat entry cache */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 14f4606..7bcbc6e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1562,7 +1562,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 	mutex_unlock(&curseg->curseg_mutex);
 
 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
-					FREE_NID_PAGES, META_NAT, false);
+					nm_i->ra_nid_pages, META_NAT, false);
 }
 
 /*
@@ -2005,6 +2005,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	nm_i->fcnt = 0;
 	nm_i->nat_cnt = 0;
 	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+	nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
 
 	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
 	INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7427e95..e4fffd2 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -14,9 +14,11 @@
 /* node block offset on the NAT area dedicated to the given start node id */
 #define	NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
 
-/* # of pages to perform readahead before building free nids */
+/* # of pages to perform synchronous readahead before building free nids */
 #define FREE_NID_PAGES 4
 
+#define DEF_RA_NID_PAGES	4	/* # of nid pages to be readaheaded */
+
 /* maximum readahead size for node during getting data blocks */
 #define MAX_RA_NODE		128
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index cb23d85..3a65e01 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -213,6 +213,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval);
@@ -232,6 +233,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(max_victim_search),
 	ATTR_LIST(dir_level),
 	ATTR_LIST(ram_thresh),
+	ATTR_LIST(ra_nid_pages),
 	ATTR_LIST(cp_interval),
 	NULL,
 };
--
cgit v0.10.2


From 08b39fbd59781729da9fb6367decaf4804a22721 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Thu, 8 Oct 2015 13:27:34 +0800
Subject: f2fs crypto: fix racing of accessing encrypted page among different competitors

Since we use different page caches (normally the inode's page cache for
R/W and the meta inode's page cache for GC) to cache the same physical
block belonging to an encrypted inode, writeback of these two page caches
should be mutually exclusive. However, the writeback state is not handled
well at the moment, so there are potential races:

a)
kworker:				f2fs_gc:
 - f2fs_write_data_pages
  - f2fs_write_data_page
   - do_write_data_page
    - write_data_page
     - f2fs_submit_page_mbio
(page#1 in the inode's page cache was
queued in the f2fs bio cache, ready to
be written to the new blkaddr)
					 - gc_data_segment
					  - move_encrypted_block
					   - pagecache_get_page
					  (page#2 in the meta inode's page
					  cache was cached with the invalid
					  data of the physical block located
					  at the new blkaddr)
					   - f2fs_submit_page_mbio
					  (page#1 was submitted; later, page#2
					  with invalid data will be submitted
					  as well)

b)
f2fs_gc:
 - gc_data_segment
  - move_encrypted_block
   - f2fs_submit_page_mbio
(page#1 in the meta inode's page cache
was queued in the f2fs bio cache, ready
to be written to the new blkaddr)
					user thread:
					 - f2fs_write_begin
					  - f2fs_submit_page_bio
					 (we submit the request to the block
					 layer to update page#2 in the inode's
					 page cache with the physical block
					 located at the new blkaddr, so here we
					 may read garbage data from the new
					 blkaddr since GC has not written back
					 page#1 yet)

This patch fixes the above potential races for encrypted inodes.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2285376..77dfc9e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -956,21 +956,14 @@ submit_and_realloc:
 
 		if (f2fs_encrypted_inode(inode) &&
 					S_ISREG(inode->i_mode)) {
-			struct page *cpage;
 
 			ctx = f2fs_get_crypto_ctx(inode);
 			if (IS_ERR(ctx))
 				goto set_error_page;
 
 			/* wait the page to be moved by cleaning */
-			cpage = find_lock_page(
-					META_MAPPING(F2FS_I_SB(inode)),
-					block_nr);
-			if (cpage) {
-				f2fs_wait_on_page_writeback(cpage,
-							DATA);
-				f2fs_put_page(cpage, 1);
-			}
+			f2fs_wait_on_encrypted_page_writeback(
+				F2FS_I_SB(inode), block_nr);
 		}
 
 		bio = bio_alloc(GFP_KERNEL,
@@ -1064,6 +1057,11 @@ int do_write_data_page(struct f2fs_io_info *fio)
 	}
 
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+
+		/* wait for GCed encrypted page writeback */
+		f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
+							fio->blk_addr);
+
 		fio->encrypted_page = f2fs_encrypt(inode, fio->page);
 		if (IS_ERR(fio->encrypted_page)) {
 			err = PTR_ERR(fio->encrypted_page);
@@ -1452,6 +1450,10 @@ put_next:
 
 	f2fs_wait_on_page_writeback(page, DATA);
 
+	/* wait for GCed encrypted page writeback */
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
 	if (len == PAGE_CACHE_SIZE)
 		goto out_update;
 	if (PageUptodate(page))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 94cf6bc..c3443da 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1792,6 +1792,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
 		block_t, block_t *, struct f2fs_summary *, int);
 void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
 void write_data_summaries(struct f2fs_sb_info *, block_t);
 void write_node_summaries(struct f2fs_sb_info *, block_t);
 int lookup_journal_in_cursum(struct f2fs_summary_block *,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5d2a2ee..91c51a6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -87,6 +87,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 mapped:
 	/* fill the page */
 	f2fs_wait_on_page_writeback(page, DATA);
+
+	/* wait for GCed encrypted page writeback */
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
 	/* if gced page is attached, don't write to cold segment */
 	clear_cold_data(page);
 out:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 3f19634..af7c24c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -559,8 +559,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 	if (err)
 		goto out;
 
-	if (unlikely(dn.data_blkaddr == NULL_ADDR))
+	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+		ClearPageUptodate(page);
 		goto put_out;
+	}
+
+	/*
+	 * don't cache encrypted data into meta inode until previous dirty
+	 * data were writebacked to avoid racing between GC and flush.
+	 */
+	f2fs_wait_on_page_writeback(page, DATA);
 
 	get_node_info(fio.sbi, dn.nid, &ni);
 	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
@@ -589,7 +597,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 		goto put_page_out;
 
 	set_page_dirty(fio.encrypted_page);
-	f2fs_wait_on_page_writeback(fio.encrypted_page, META);
+	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
 	if (clear_page_dirty_for_io(fio.encrypted_page))
 		dec_page_count(fio.sbi, F2FS_DIRTY_META);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 5337fd6..f37c212 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1484,6 +1484,23 @@ void f2fs_wait_on_page_writeback(struct page *page,
 	}
 }
 
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
+							block_t blkaddr)
+{
+	struct page *cpage;
+
+	if (blkaddr == NEW_ADDR)
+		return;
+
+	f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
+
+	cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
+	if (cpage) {
+		f2fs_wait_on_page_writeback(cpage, DATA);
+		f2fs_put_page(cpage, 1);
+	}
+}
+
 static int read_compacted_summaries(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
--
cgit v0.10.2


From 84e4214f0868ae77771837d0ed4cc6eff10738ba Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Tue, 13 Oct 2015 10:00:53 -0700
Subject: f2fs: relocate the tracepoint for background_gc

Once f2fs_gc is done, wait_ms is changed once more, so the tracepoint
should be located after that final update.

Reported-by: He YunLei
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index af7c24c..fedbf67 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -77,13 +77,13 @@ static int gc_thread_func(void *data)
 
 		stat_inc_bggc_count(sbi);
 
-		trace_f2fs_background_gc(sbi->sb, wait_ms,
-				prefree_segments(sbi), free_segments(sbi));
-
 		/* if return value is not zero, no victim was selected */
 		if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
 			wait_ms = gc_th->no_gc_sleep_time;
 
+		trace_f2fs_background_gc(sbi->sb, wait_ms,
+				prefree_segments(sbi), free_segments(sbi));
+
 		/* balancing f2fs's metadata periodically */
 		f2fs_balance_fs_bg(sbi);
 
--
cgit v0.10.2


From 1d373a0ef7a7bc08f95ca820c627e961fb21e188 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Mon, 19 Oct 2015 10:29:51 -0700
Subject: f2fs: flush dirty data for bmap

Users expect bmap to return allocated block addresses, so let's behave
like ext4 does.
Reviewed-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 77dfc9e..b052e7c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1667,12 +1667,13 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 {
 	struct inode *inode = mapping->host;
 
-	/* we don't need to use inline_data strictly */
-	if (f2fs_has_inline_data(inode)) {
-		int err = f2fs_convert_inline_inode(inode);
-		if (err)
-			return err;
-	}
+	if (f2fs_has_inline_data(inode))
+		return 0;
+
+	/* make sure allocating whole blocks */
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		filemap_write_and_wait(mapping);
+
 	return generic_block_bmap(mapping, block, get_data_block_bmap);
 }
--
cgit v0.10.2


From 67f8cf3cee6f398d05de8333c04fea2ddb59c805 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Thu, 15 Oct 2015 11:34:49 -0700
Subject: f2fs: support fiemap for inline_data

There is a FIEMAP_EXTENT_DATA_INLINE flag for this case, as pointed out
by Marc.

Reviewed-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b052e7c..972eab7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -771,6 +771,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret)
 		return ret;
 
+	if (f2fs_has_inline_data(inode)) {
+		ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
+		if (ret != -EAGAIN)
+			return ret;
+	}
+
 	mutex_lock(&inode->i_mutex);
 
 	if (len >= isize) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c3443da..9db5500 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2054,6 +2054,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
 bool f2fs_empty_inline_dir(struct inode *);
 int f2fs_read_inline_dir(struct file *, struct dir_context *,
 						struct f2fs_str *);
+int f2fs_inline_data_fiemap(struct inode *,
+		struct fiemap_extent_info *, __u64, __u64);
 
 /*
  * shrinker.c
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3b18e23..bda7126 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -12,6 +12,7 @@
 #include <linux/f2fs_fs.h>
 
 #include "f2fs.h"
+#include "node.h"
 
 bool f2fs_may_inline_data(struct inode *inode)
 {
@@ -570,3 +571,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
 	f2fs_put_page(ipage, 1);
 	return 0;
 }
+
+int f2fs_inline_data_fiemap(struct inode *inode,
+		struct fiemap_extent_info *fieinfo, __u64 start, __u64 len)
+{
+	__u64 byteaddr, ilen;
+	__u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+		FIEMAP_EXTENT_LAST;
+	struct node_info ni;
+	struct page *ipage;
+	int err = 0;
+
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+	if (IS_ERR(ipage))
+		return PTR_ERR(ipage);
+
+	if (!f2fs_has_inline_data(inode)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+	if (start >= ilen)
+		goto out;
+	if (start + len < ilen)
+		ilen = start + len;
+	ilen -= start;
+
+	get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+	byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
+	byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+	err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+out:
+	f2fs_put_page(ipage, 1);
+	return err;
+}
--
cgit v0.10.2


From f96999c35f46fa9bce8a3a2812cd0a28fcde5903 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Tue, 20 Oct 2015 15:17:19 -0700
Subject: f2fs: refactor __find_rev_next_{zero}_bit

This patch refactors __find_rev_next_{zero}_bit, which was previously
disabled due to bugs. (A user-space sketch of the reversed-bitmap scan
it relies on appears at the end of this section.)
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f37c212..7835e41 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab;
 static struct kmem_cache *sit_entry_set_slab;
 static struct kmem_cache *inmem_entry_slab;
 
+static unsigned long __reverse_ulong(unsigned char *str)
+{
+	unsigned long tmp = 0;
+	int shift = 24, idx = 0;
+
+#if BITS_PER_LONG == 64
+	shift = 56;
+#endif
+	while (shift >= 0) {
+		tmp |= (unsigned long)str[idx++] << shift;
+		shift -= BITS_PER_BYTE;
+	}
+	return tmp;
+}
+
 /*
  * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
  * MSB and LSB are reversed in a byte by f2fs_set_bit.
@@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word)
 	int num = 0;
 
 #if BITS_PER_LONG == 64
-	if ((word & 0xffffffff) == 0) {
+	if ((word & 0xffffffff00000000UL) == 0)
 		num += 32;
+	else
 		word >>= 32;
-	}
 #endif
-	if ((word & 0xffff) == 0) {
+	if ((word & 0xffff0000) == 0)
 		num += 16;
+	else
 		word >>= 16;
-	}
-	if ((word & 0xff) == 0) {
+
+	if ((word & 0xff00) == 0)
 		num += 8;
+	else
 		word >>= 8;
-	}
+
 	if ((word & 0xf0) == 0)
 		num += 4;
 	else
 		word >>= 4;
+
 	if ((word & 0xc) == 0)
 		num += 2;
	else
 		word >>= 2;
+
 	if ((word & 0x2) == 0)
 		num += 1;
 	return num;
@@ -68,26 +87,16 @@ static inline unsigned long __reverse_ffs(unsigned long word)
  * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
  * f2fs_set_bit makes MSB and LSB reversed in a byte.
  * Example:
- *     LSB <--> MSB
- *     f2fs_set_bit(0, bitmap) => 0000 0001
- *     f2fs_set_bit(7, bitmap) => 1000 0000
+ *     MSB <--> LSB
+ *     f2fs_set_bit(0, bitmap) => 1000 0000
+ *     f2fs_set_bit(7, bitmap) => 0000 0001
  */
 static unsigned long __find_rev_next_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
-	while (!f2fs_test_bit(offset, (unsigned char *)addr))
-		offset++;
-
-	if (offset > size)
-		offset = size;
-
-	return offset;
-#if 0
 	const unsigned long *p = addr + BIT_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG - 1);
 	unsigned long tmp;
-	unsigned long mask, submask;
-	unsigned long quot, rest;
 
 	if (offset >= size)
 		return size;
@@ -97,14 +106,9 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
 	if (!offset)
 		goto aligned;
 
-	tmp = *(p++);
-	quot = (offset >> 3) << 3;
-	rest = offset & 0x7;
-	mask = ~0UL << quot;
-	submask = (unsigned char)(0xff << rest) >> rest;
-	submask <<= quot;
-	mask &= submask;
-	tmp &= mask;
+	tmp = __reverse_ulong((unsigned char *)p);
+	tmp &= ~0UL >> offset;
+
 	if (size < BITS_PER_LONG)
 		goto found_first;
 	if (tmp)
@@ -112,42 +116,34 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
 
 	size -= BITS_PER_LONG;
 	result += BITS_PER_LONG;
+	p++;
 aligned:
 	while (size & ~(BITS_PER_LONG-1)) {
-		tmp = *(p++);
+		tmp = __reverse_ulong((unsigned char *)p);
 		if (tmp)
 			goto found_middle;
 		result += BITS_PER_LONG;
 		size -= BITS_PER_LONG;
+		p++;
 	}
 	if (!size)
 		return result;
-	tmp = *p;
+
+	tmp = __reverse_ulong((unsigned char *)p);
 found_first:
-	tmp &= (~0UL >> (BITS_PER_LONG - size));
-	if (tmp == 0UL)		/* Are any bits set? */
+	tmp &= (~0UL << (BITS_PER_LONG - size));
+	if (!tmp)		/* Are any bits set? */
 		return result + size;	/* Nope. */
 found_middle:
 	return result + __reverse_ffs(tmp);
-#endif
 }
 
 static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
-	while (f2fs_test_bit(offset, (unsigned char *)addr))
-		offset++;
-
-	if (offset > size)
-		offset = size;
-
-	return offset;
-#if 0
 	const unsigned long *p = addr + BIT_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG - 1);
 	unsigned long tmp;
-	unsigned long mask, submask;
-	unsigned long quot, rest;
 
 	if (offset >= size)
 		return size;
@@ -157,40 +153,36 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
 	if (!offset)
 		goto aligned;
 
-	tmp = *(p++);
-	quot = (offset >> 3) << 3;
-	rest = offset & 0x7;
-	mask = ~(~0UL << quot);
-	submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
-	submask <<= quot;
-	mask += submask;
-	tmp |= mask;
+	tmp = __reverse_ulong((unsigned char *)p);
+	tmp |= ~((~0UL << offset) >> offset);
+
 	if (size < BITS_PER_LONG)
 		goto found_first;
-	if (~tmp)
+	if (tmp != ~0UL)
 		goto found_middle;
 
 	size -= BITS_PER_LONG;
 	result += BITS_PER_LONG;
+	p++;
 aligned:
 	while (size & ~(BITS_PER_LONG - 1)) {
-		tmp = *(p++);
-		if (~tmp)
+		tmp = __reverse_ulong((unsigned char *)p);
+		if (tmp != ~0UL)
 			goto found_middle;
 		result += BITS_PER_LONG;
 		size -= BITS_PER_LONG;
+		p++;
 	}
 	if (!size)
 		return result;
-	tmp = *p;
+	tmp = __reverse_ulong((unsigned char *)p);
 found_first:
-	tmp |= ~0UL << size;
-	if (tmp == ~0UL)	/* Are any bits zero? */
+	tmp |= ~(~0UL << (BITS_PER_LONG - size));
+	if (tmp == ~0UL)	/* Are any bits zero? */
 		return result + size;	/* Nope. */
 found_middle:
 	return result + __reverse_ffz(tmp);
-#endif
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
--
cgit v0.10.2


From d7b8b384b02f52b20a538e741c75c5d50c0e131f Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Wed, 21 Oct 2015 18:49:50 -0700
Subject: f2fs: fix leakage of inmemory atomic pages

If we hit a failure during commit_atomic_write, abort_volatile_write will
be called, but it will not drop the in-memory pages, since FI_ATOMIC_FILE
is not set. Actually, there is no reason to check that flag in
abort_volatile_write at all.

Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 91c51a6..a197215 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1443,13 +1443,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 
 	f2fs_balance_fs(F2FS_I_SB(inode));
 
-	if (f2fs_is_atomic_file(inode)) {
-		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-		commit_inmem_pages(inode, true);
-	}
-
-	if (f2fs_is_volatile_file(inode))
-		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	commit_inmem_pages(inode, true);
 
 	mnt_drop_write_file(filp);
 	return ret;
--
cgit v0.10.2


From 2b246fb0f60d318cec6901a0326b94b50d5e1dcb Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Wed, 21 Oct 2015 19:00:31 -0700
Subject: f2fs: don't need to submit bio on error case

If commit_atomic_write fails, we don't need to submit any bio.
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7835e41..7efd96ad 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -249,11 +249,11 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 				trace_f2fs_commit_inmem_page(cur->page, INMEM);
 				fio.page = cur->page;
 				err = do_write_data_page(&fio);
-				submit_bio = true;
 				if (err) {
 					unlock_page(cur->page);
 					break;
 				}
+				submit_bio = true;
 			}
 		} else {
 			trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
--
cgit v0.10.2


From 7fee740697e0d9a57d618b6fec79e4c4e09fd606 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Thu, 22 Oct 2015 18:18:11 +0800
Subject: f2fs: fix to clear GCed flag for atomic written page

An atomic-write page can be GCed; after committing such a page, we
should clear its GCed flag.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7efd96ad..f77b325 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -253,6 +253,7 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 					unlock_page(cur->page);
 					break;
 				}
+				clear_cold_data(cur->page);
 				submit_bio = true;
 			}
 		} else {
--
cgit v0.10.2


From a6be014e1d28339ba7c745fc4ac1efdbf6e2c1a2 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Thu, 22 Oct 2015 18:23:08 +0800
Subject: f2fs: fix error path of ->symlink

Currently, in f2fs's ->symlink, we keep a fixed invoking order between
f2fs_add_link and page_symlink, since the node info must first be
initialized in f2fs_add_link before it can be used in page_symlink.
However, our error path did not release the metadata set up before
page_symlink, which leaves a corrupt symlink entry in the parent's
dentry page. Fix this issue by adding f2fs_unlink in the error path to
remove that link.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index dfa01c8..e48b80c 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -410,11 +410,14 @@ err_out:
 	 * If the symlink path is stored into inline_data, there is no
 	 * performance regression.
 	 */
-	if (!err)
+	if (!err) {
 		filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
 
-	if (IS_DIRSYNC(dir))
-		f2fs_sync_fs(sbi->sb, 1);
+		if (IS_DIRSYNC(dir))
+			f2fs_sync_fs(sbi->sb, 1);
+	} else {
+		f2fs_unlink(dir, dentry);
+	}
 
 	kfree(sd);
 	f2fs_fname_crypto_free_buffer(&disk_link);
--
cgit v0.10.2


From beaa57dd986d4f398728c060692fc2452895cfd8 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Thu, 22 Oct 2015 18:24:12 +0800
Subject: f2fs: fix to skip shrinking extent nodes

In f2fs_shrink_extent_tree, we should stop the shrink flow once we have
already shrunk enough nodes in the extent cache.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index a38ee9b..7ddba81 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -620,7 +620,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 			write_unlock(&et->lock);
 
 			if (node_cnt + tree_cnt >= nr_shrink)
-				break;
+				goto unlock_out;
 		}
 	}
 unlock_out:
--
cgit v0.10.2
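
The sync/async split introduced for ra_meta_pages earlier in this series
boils down to one question: does the caller need the blocks now (READ_SYNC),
or is it merely warming the cache for a possible future reader (READA)? The
following user-space sketch is an analogue of that choice, not kernel code;
the helper name ra_pages() and the use of posix_fadvise() are illustrative
assumptions only.

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <unistd.h>

/* sync: read the range before returning; async: only hint a prefetch */
static int ra_pages(int fd, off_t start, off_t len, int sync)
{
	char buf[4096];
	off_t done = 0;

	if (!sync)
		/* like READA: queue readahead and return immediately */
		return posix_fadvise(fd, start, len, POSIX_FADV_WILLNEED);

	/* like READ_SYNC: block until the data has actually been read */
	while (done < len) {
		ssize_t n = pread(fd, buf, sizeof(buf), start + done);

		if (n < 0)
			return -1;
		if (n == 0)
			break;	/* EOF */
		done += n;
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "/etc/hosts", O_RDONLY);

	if (fd < 0)
		return 1;

	ra_pages(fd, 0, 1 << 20, 0);	/* preload: do not block */
	ra_pages(fd, 0, 4096, 1);	/* needed now: block */
	close(fd);
	return 0;
}

As with READA in the kernel, the async path is purely advisory: nothing
guarantees the pages are resident afterwards, which is why every
ra_meta_pages caller that consumes the data immediately passes sync = true.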
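
The __find_rev_next_{zero}_bit refactor above depends on f2fs's
byte-reversed bitmap convention, where bit 0 is the MSB of byte 0, and on
__reverse_ulong to scan one word at a time. The sketch below is a
user-space illustration of the same idea under those assumptions; the names
are invented, and GCC's __builtin_clzl stands in for __reverse_ffs.

#include <stdio.h>

#define BITS_PER_BYTE	8
#define BITS_PER_LONG	(sizeof(unsigned long) * BITS_PER_BYTE)

/* f2fs-style layout: bit 0 is the most significant bit of byte 0 */
static void set_bit_msb(unsigned char *bitmap, unsigned long nr)
{
	bitmap[nr / BITS_PER_BYTE] |= 1 << (7 - nr % BITS_PER_BYTE);
}

static int test_bit_msb(const unsigned char *bitmap, unsigned long nr)
{
	return (bitmap[nr / BITS_PER_BYTE] >> (7 - nr % BITS_PER_BYTE)) & 1;
}

/* big-endian word load: bitmap order equals decreasing bit significance */
static unsigned long reverse_ulong(const unsigned char *str)
{
	unsigned long tmp = 0;
	size_t i;

	for (i = 0; i < sizeof(tmp); i++)
		tmp = (tmp << BITS_PER_BYTE) | str[i];
	return tmp;
}

static unsigned long find_next_bit_msb(const unsigned char *bitmap,
				unsigned long size, unsigned long offset)
{
	while (offset < size) {
		if (offset % BITS_PER_LONG == 0 &&
				size - offset >= BITS_PER_LONG) {
			unsigned long tmp =
				reverse_ulong(bitmap + offset / BITS_PER_BYTE);

			if (!tmp) {	/* skip a whole zero word at once */
				offset += BITS_PER_LONG;
				continue;
			}
			/* leading zeros == first set position in this word */
			return offset + __builtin_clzl(tmp);
		}
		if (test_bit_msb(bitmap, offset))
			return offset;
		offset++;	/* per-bit steps until word-aligned */
	}
	return size;
}

int main(void)
{
	unsigned char bitmap[16] = { 0 };

	set_bit_msb(bitmap, 0);		/* byte 0 becomes 0x80 */
	set_bit_msb(bitmap, 77);

	printf("%lu\n", find_next_bit_msb(bitmap, 128, 0));	/* prints 0 */
	printf("%lu\n", find_next_bit_msb(bitmap, 128, 1));	/* prints 77 */
	return 0;
}

Like the kernel version, this scan touches each all-zero word once instead
of testing its bits individually, while agreeing bit-for-bit with a naive
per-bit walk over the reversed layout.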