From ecb94f5fdf4b72547fca022421a9dca1672bddd4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:44:17 -0400 Subject: ext4: collapse a single extent tree block into the inode if possible If an inode has more than 4 extents, but then later some of the extents are merged together, we can optimize the file system by moving the extents up into the inode, and discarding the extent tree block. This is important, because if there are a large number of inodes with an external extent tree blocks where the contents could fit in the inode, this can significantly increase the fsck time of the file system. Google-Bug-Id: 6801242 Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index aabbb3f..e8755c2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1656,16 +1656,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, } /* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + size_t s; + unsigned max_root = ext4_ext_space_root(inode, 0); + ext4_fsblk_t blk; + + if ((path[0].p_depth != 1) || + (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || + (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) + return; + + /* + * We need to modify the block allocation bitmap and the block + * group descriptor to release the extent tree block. If we + * can't get the journal credits, give up. + */ + if (ext4_journal_extend(handle, 2)) + return; + + /* + * Copy the extent data up to the inode + */ + blk = ext4_idx_pblock(path[0].p_idx); + s = le16_to_cpu(path[1].p_hdr->eh_entries) * + sizeof(struct ext4_extent_idx); + s += sizeof(struct ext4_extent_header); + + memcpy(path[0].p_hdr, path[1].p_hdr, s); + path[0].p_depth = 0; + path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); + path[0].p_hdr->eh_max = cpu_to_le16(max_root); + + brelse(path[1].p_bh); + ext4_free_blocks(handle, inode, NULL, blk, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); +} + +/* * This function tries to merge the @ex extent to neighbours in the tree. * return 1 if merge left else 0. */ -static int ext4_ext_try_to_merge(struct inode *inode, +static void ext4_ext_try_to_merge(handle_t *handle, + struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; - int ret = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode, merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) - ret = ext4_ext_try_to_merge_right(inode, path, ex); + (void) ext4_ext_try_to_merge_right(inode, path, ex); - return ret; + ext4_ext_try_to_merge_up(handle, inode, path); } /* @@ -1893,7 +1937,7 @@ has_space: merge: /* try to merge extents */ if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, nearex); + ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ @@ -1901,7 +1945,7 @@ merge: if (err) goto cleanup; - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: if (npath) { @@ -2924,9 +2968,9 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } @@ -2958,8 +3002,8 @@ static int ext4_split_extent_at(handle_t *handle, goto fix_extent_len; /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } else if (err) goto fix_extent_len; @@ -3191,8 +3235,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (err) goto out; ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } @@ -3333,10 +3377,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; -- cgit v0.10.2 From 01fc48e8929e45e67527200017cff4e74e4ba054 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:46:17 -0400 Subject: ext4: don't load the block bitmap for block groups which have no space Add a short circuit check to ext4_mb_group_group() so that we don't bother to load the block bitmap for a block group which does not have any space available. (Or which does not have enough space until we are in desperation mode, i.e., when cr == 3.) Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=45741 Reported-by: mirek@me.com Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8eae947..3a57975 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1862,6 +1862,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < 0 || cr >= 4); + free = grp->bb_free; + if (free == 0) + return 0; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + return 0; + /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); @@ -1869,10 +1875,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } - free = grp->bb_free; fragments = grp->bb_fragments; - if (free == 0) - return 0; if (fragments == 0) return 0; -- cgit v0.10.2 From df981d03eeff7971ac7e6ff37000bfa702327ef1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:48:17 -0400 Subject: ext4: add max_dir_size_kb mount option Very large directories can cause significant performance problems, or perhaps even invoke the OOM killer, if the process is running in a highly constrained memory environment (whether it is VM's with a small amount of memory or in a small memory cgroup). So it is useful, in cloud server/data center environments, to be able to set a filesystem-wide cap on the maximum size of a directory, to ensure that directories never get larger than a sane size. We do this via a new mount option, max_dir_size_kb. If there is an attempt to grow the directory larger than max_dir_size_kb, the system call will return ENOSPC instead. Google-Bug-Id: 6863013 Signed-off-by: "Theodore Ts'o" diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 1b7f9ac..104322b 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -375,6 +375,16 @@ dioread_nolock locking. If the dioread_nolock option is specified Because of the restrictions this options comprises it is off by default (e.g. dioread_lock). +max_dir_size_kb=n This limits the size of directories so that any + attempt to expand them beyond the specified + limit in kilobytes will cause an ENOSPC error. + This is useful in memory constrained + environments, where a very large directory can + cause severe performance problems or even + provoke the Out Of Memory killer. (For example, + if there is only 512mb memory available, a 176mb + directory may seriously cramp the system's style.) + i_version Enable 64-bit inode version support. This option is off by default. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c3411d4..7c0841e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1243,6 +1243,7 @@ struct ext4_sb_info { unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_writeback_mb_bump; + unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2a42cc0..7450ff01 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle, { struct buffer_head *bh; + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && + ((inode->i_size >> 10) >= + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) { + *err = -ENOSPC; + return NULL; + } + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; bh = ext4_bread(handle, inode, *block, 1, err); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5984989..5a97e59 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1230,6 +1230,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, }; static const match_table_t tokens = { @@ -1303,6 +1304,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1483,6 +1485,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_err, 0, 0} }; @@ -1598,6 +1601,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, if (!args->from) arg = EXT4_DEF_LI_WAIT_MULT; sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; } else if (token == Opt_stripe) { sbi->s_stripe = arg; } else if (m->flags & MOPT_DATAJ) { @@ -1829,6 +1834,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + if (nodefs || sbi->s_max_dir_size_kb) + SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); ext4_show_quota_options(seq, sb); return 0; -- cgit v0.10.2 From 210c05264d46ddf688bc26861ffb8e2323fb25cb Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Fri, 17 Aug 2012 09:50:17 -0400 Subject: ext4: realign trace events structs to make it smaller Most hardware architectures require that data (including struct fields) have to be aligned in memory. To make it happen compiler inserts padding between struct fields if they are not aligned correctly. Reorder fields to remove paddings and make structures denser. Making data smaller saves some memory that is very important for trace events. Tracing buffer has limited size and making objects smaller we can put more of them without overflowing the tracing buffer. To find data struct holes I used 'pahole -H 1 -E -I vmlinux.o' from 'dwarves' package. Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 69d8a69..db67c71 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -26,19 +26,19 @@ TRACE_EVENT(ext4_free_inode, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u16, mode ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u64, blocks ) + __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->mode = inode->i_mode; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->blocks = inode->i_blocks; + __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", @@ -300,10 +300,10 @@ TRACE_EVENT(ext4_da_writepages, __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) + __field( pgoff_t, writeback_index ) __field( int, sync_mode ) __field( char, for_kupdate ) __field( char, range_cyclic ) - __field( pgoff_t, writeback_index ) ), TP_fast_assign( @@ -313,10 +313,10 @@ TRACE_EVENT(ext4_da_writepages, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; + __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->range_cyclic = wbc->range_cyclic; - __entry->writeback_index = inode->i_mapping->writeback_index; ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " @@ -382,8 +382,8 @@ TRACE_EVENT(ext4_da_writepages_result, __field( int, ret ) __field( int, pages_written ) __field( long, pages_skipped ) - __field( int, sync_mode ) __field( pgoff_t, writeback_index ) + __field( int, sync_mode ) ), TP_fast_assign( @@ -392,8 +392,8 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; - __entry->sync_mode = wbc->sync_mode; __entry->writeback_index = inode->i_mapping->writeback_index; + __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " @@ -411,16 +411,16 @@ DECLARE_EVENT_CLASS(ext4__page_op, TP_ARGS(page), TP_STRUCT__entry( - __field( pgoff_t, index ) - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) ), TP_fast_assign( - __entry->index = page->index; - __entry->ino = page->mapping->host->i_ino; __entry->dev = page->mapping->host->i_sb->s_dev; + __entry->ino = page->mapping->host->i_ino; + __entry->index = page->index; ), TP_printk("dev %d,%d ino %lu page_index %lu", @@ -456,18 +456,18 @@ TRACE_EVENT(ext4_invalidatepage, TP_ARGS(page, offset), TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) __field( pgoff_t, index ) __field( unsigned long, offset ) - __field( ino_t, ino ) - __field( dev_t, dev ) ), TP_fast_assign( + __entry->dev = page->mapping->host->i_sb->s_dev; + __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; __entry->offset = offset; - __entry->ino = page->mapping->host->i_ino; - __entry->dev = page->mapping->host->i_sb->s_dev; ), TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", @@ -510,8 +510,8 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, pa_pstart ) - __field( __u32, pa_len ) __field( __u64, pa_lstart ) + __field( __u32, pa_len ) ), @@ -519,8 +519,8 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, __entry->dev = ac->ac_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; - __entry->pa_len = pa->pa_len; __entry->pa_lstart = pa->pa_lstart; + __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", @@ -645,7 +645,6 @@ TRACE_EVENT(ext4_request_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( unsigned int, flags ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) @@ -653,12 +652,12 @@ TRACE_EVENT(ext4_request_blocks, __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) + __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; - __entry->flags = ar->flags; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; @@ -666,6 +665,7 @@ TRACE_EVENT(ext4_request_blocks, __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; + __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " @@ -686,7 +686,6 @@ TRACE_EVENT(ext4_allocate_blocks, __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) - __field( unsigned int, flags ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) @@ -694,13 +693,13 @@ TRACE_EVENT(ext4_allocate_blocks, __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) + __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->block = block; - __entry->flags = ar->flags; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; @@ -708,6 +707,7 @@ TRACE_EVENT(ext4_allocate_blocks, __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; + __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " @@ -728,19 +728,19 @@ TRACE_EVENT(ext4_free_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u16, mode ) __field( __u64, block ) __field( unsigned long, count ) __field( int, flags ) + __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->mode = inode->i_mode; __entry->block = block; __entry->count = count; __entry->flags = flags; + __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", @@ -783,15 +783,15 @@ TRACE_EVENT(ext4_sync_file_exit, TP_ARGS(inode, ret), TP_STRUCT__entry( - __field( int, ret ) - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, ret ) ), TP_fast_assign( - __entry->ret = ret; - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", @@ -854,12 +854,6 @@ TRACE_EVENT(ext4_mballoc_alloc, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u16, found ) - __field( __u16, groups ) - __field( __u16, buddy ) - __field( __u16, flags ) - __field( __u16, tail ) - __field( __u8, cr ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) @@ -872,17 +866,17 @@ TRACE_EVENT(ext4_mballoc_alloc, __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) + __field( __u16, found ) + __field( __u16, groups ) + __field( __u16, buddy ) + __field( __u16, flags ) + __field( __u16, tail ) + __field( __u8, cr ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; - __entry->found = ac->ac_found; - __entry->flags = ac->ac_flags; - __entry->groups = ac->ac_groups_scanned; - __entry->buddy = ac->ac_buddy; - __entry->tail = ac->ac_tail; - __entry->cr = ac->ac_criteria; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; @@ -895,6 +889,12 @@ TRACE_EVENT(ext4_mballoc_alloc, __entry->result_start = ac->ac_f_ex.fe_start; __entry->result_group = ac->ac_f_ex.fe_group; __entry->result_len = ac->ac_f_ex.fe_len; + __entry->found = ac->ac_found; + __entry->flags = ac->ac_flags; + __entry->groups = ac->ac_groups_scanned; + __entry->buddy = ac->ac_buddy; + __entry->tail = ac->ac_tail; + __entry->cr = ac->ac_criteria; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " @@ -1015,17 +1015,17 @@ TRACE_EVENT(ext4_forget, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u16, mode ) - __field( int, is_metadata ) __field( __u64, block ) + __field( int, is_metadata ) + __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->mode = inode->i_mode; - __entry->is_metadata = is_metadata; __entry->block = block; + __entry->is_metadata = is_metadata; + __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", @@ -1042,19 +1042,18 @@ TRACE_EVENT(ext4_da_update_reserve_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) __field( int, reserved_meta_blocks ) __field( int, allocated_meta_blocks ) __field( int, quota_claim ) + __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; __entry->used_blocks = used_blocks; __entry->reserved_data_blocks = @@ -1064,6 +1063,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; __entry->quota_claim = quota_claim; + __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " @@ -1085,21 +1085,21 @@ TRACE_EVENT(ext4_da_reserve_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, md_needed ) __field( int, reserved_data_blocks ) __field( int, reserved_meta_blocks ) + __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; __entry->md_needed = md_needed; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " @@ -1119,23 +1119,23 @@ TRACE_EVENT(ext4_da_release_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) __field( int, reserved_meta_blocks ) __field( int, allocated_meta_blocks ) + __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; __entry->freed_blocks = freed_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; + __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " @@ -1203,16 +1203,16 @@ TRACE_EVENT(ext4_direct_IO_enter, TP_ARGS(inode, offset, len, rw), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned long, len ) __field( int, rw ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->pos = offset; __entry->len = len; __entry->rw = rw; @@ -1231,8 +1231,8 @@ TRACE_EVENT(ext4_direct_IO_exit, TP_ARGS(inode, offset, len, rw, ret), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned long, len ) __field( int, rw ) @@ -1240,8 +1240,8 @@ TRACE_EVENT(ext4_direct_IO_exit, ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->pos = offset; __entry->len = len; __entry->rw = rw; @@ -1261,16 +1261,16 @@ TRACE_EVENT(ext4_fallocate_enter, TP_ARGS(inode, offset, len, mode), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( loff_t, pos ) __field( loff_t, len ) __field( int, mode ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->pos = offset; __entry->len = len; __entry->mode = mode; @@ -1289,16 +1289,16 @@ TRACE_EVENT(ext4_fallocate_exit, TP_ARGS(inode, offset, max_blocks, ret), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, blocks ) __field( int, ret ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->pos = offset; __entry->blocks = max_blocks; __entry->ret = ret; @@ -1317,17 +1317,17 @@ TRACE_EVENT(ext4_unlink_enter, TP_ARGS(parent, dentry), TP_STRUCT__entry( - __field( ino_t, parent ) + __field( dev_t, dev ) __field( ino_t, ino ) + __field( ino_t, parent ) __field( loff_t, size ) - __field( dev_t, dev ) ), TP_fast_assign( - __entry->parent = parent->i_ino; + __entry->dev = dentry->d_inode->i_sb->s_dev; __entry->ino = dentry->d_inode->i_ino; + __entry->parent = parent->i_ino; __entry->size = dentry->d_inode->i_size; - __entry->dev = dentry->d_inode->i_sb->s_dev; ), TP_printk("dev %d,%d ino %lu size %lld parent %lu", @@ -1342,14 +1342,14 @@ TRACE_EVENT(ext4_unlink_exit, TP_ARGS(dentry, ret), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( - __entry->ino = dentry->d_inode->i_ino; __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = dentry->d_inode->i_ino; __entry->ret = ret; ), @@ -1365,14 +1365,14 @@ DECLARE_EVENT_CLASS(ext4__truncate, TP_ARGS(inode), TP_STRUCT__entry( - __field( ino_t, ino ) - __field( dev_t, dev ) + __field( dev_t, dev ) + __field( ino_t, ino ) __field( __u64, blocks ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->blocks = inode->i_blocks; ), @@ -1403,8 +1403,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_ARGS(inode, map, ux), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) @@ -1413,8 +1413,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); @@ -1441,8 +1441,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, TP_ARGS(inode, map, ux, ix), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) @@ -1454,8 +1454,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); @@ -1483,16 +1483,16 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_ARGS(inode, lblk, len, flags), TP_STRUCT__entry( - __field( ino_t, ino ) - __field( dev_t, dev ) + __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, flags ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->flags = flags; @@ -1525,19 +1525,19 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_ARGS(inode, lblk, pblk, len, ret), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) - __field( ext4_lblk_t, lblk ) + __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) + __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( int, ret ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; - __entry->lblk = lblk; + __entry->ino = inode->i_ino; __entry->pblk = pblk; + __entry->lblk = lblk; __entry->len = len; __entry->ret = ret; ), @@ -1569,17 +1569,17 @@ TRACE_EVENT(ext4_ext_load_extent, TP_ARGS(inode, lblk, pblk), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) - __field( ext4_lblk_t, lblk ) + __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) + __field( ext4_lblk_t, lblk ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; - __entry->lblk = lblk; + __entry->ino = inode->i_ino; __entry->pblk = pblk; + __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", @@ -1594,13 +1594,13 @@ TRACE_EVENT(ext4_load_inode, TP_ARGS(inode), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %ld", @@ -1615,14 +1615,14 @@ TRACE_EVENT(ext4_journal_start, TP_STRUCT__entry( __field( dev_t, dev ) - __field( int, nblocks ) __field(unsigned long, ip ) + __field( int, nblocks ) ), TP_fast_assign( __entry->dev = sb->s_dev; - __entry->nblocks = nblocks; __entry->ip = IP; + __entry->nblocks = nblocks; ), TP_printk("dev %d,%d nblocks %d caller %pF", @@ -1686,23 +1686,23 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, TP_ARGS(inode, map, allocated, newblock), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) - __field( int, flags ) __field( unsigned int, allocated ) __field( ext4_fsblk_t, newblk ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; - __entry->flags = map->m_flags; __entry->allocated = allocated; __entry->newblk = newblock; ), @@ -1724,19 +1724,19 @@ TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, TP_STRUCT__entry( __field( dev_t, dev ) + __field( unsigned int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) - __field( unsigned int, flags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = sb->s_dev; + __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; - __entry->flags = map->m_flags; __entry->ret = ret; ), @@ -1753,16 +1753,16 @@ TRACE_EVENT(ext4_ext_put_in_cache, TP_ARGS(inode, lblk, len, start), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( ext4_fsblk_t, start ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->start = start; @@ -1782,15 +1782,15 @@ TRACE_EVENT(ext4_ext_in_cache, TP_ARGS(inode, lblk, ret), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( int, ret ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->ret = ret; ), @@ -1810,8 +1810,8 @@ TRACE_EVENT(ext4_find_delalloc_range, TP_ARGS(inode, from, to, reverse, found, found_blk), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( int, reverse ) @@ -1820,8 +1820,8 @@ TRACE_EVENT(ext4_find_delalloc_range, ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->reverse = reverse; @@ -1844,15 +1844,15 @@ TRACE_EVENT(ext4_get_reserved_cluster_alloc, TP_ARGS(inode, lblk, len), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; ), @@ -1871,18 +1871,18 @@ TRACE_EVENT(ext4_ext_show_extent, TP_ARGS(inode, lblk, pblk, len), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) - __field( ext4_lblk_t, lblk ) + __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) + __field( ext4_lblk_t, lblk ) __field( unsigned short, len ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; - __entry->lblk = lblk; + __entry->ino = inode->i_ino; __entry->pblk = pblk; + __entry->lblk = lblk; __entry->len = len; ), @@ -1902,25 +1902,25 @@ TRACE_EVENT(ext4_remove_blocks, TP_ARGS(inode, ex, from, to, partial_cluster), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) - __field( ext4_lblk_t, ee_lblk ) - __field( ext4_fsblk_t, ee_pblk ) - __field( unsigned short, ee_len ) + __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( ext4_fsblk_t, partial ) + __field( ext4_fsblk_t, ee_pblk ) + __field( ext4_lblk_t, ee_lblk ) + __field( unsigned short, ee_len ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; - __entry->ee_lblk = cpu_to_le32(ex->ee_block); - __entry->ee_pblk = ext4_ext_pblock(ex); - __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->partial = partial_cluster; + __entry->ee_pblk = ext4_ext_pblock(ex); + __entry->ee_lblk = cpu_to_le32(ex->ee_block); + __entry->ee_len = ext4_ext_get_actual_len(ex); ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" @@ -1942,23 +1942,23 @@ TRACE_EVENT(ext4_ext_rm_leaf, TP_ARGS(inode, start, ex, partial_cluster), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_fsblk_t, partial ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) - __field( ext4_fsblk_t, partial ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->partial = partial_cluster; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); - __entry->partial = partial_cluster; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" @@ -1978,14 +1978,14 @@ TRACE_EVENT(ext4_ext_rm_idx, TP_ARGS(inode, pblk), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->pblk = pblk; ), @@ -2001,15 +2001,15 @@ TRACE_EVENT(ext4_ext_remove_space, TP_ARGS(inode, start, depth), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( int, depth ) ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->start = start; __entry->depth = depth; ), @@ -2028,8 +2028,8 @@ TRACE_EVENT(ext4_ext_remove_space_done, TP_ARGS(inode, start, depth, partial, eh_entries), TP_STRUCT__entry( - __field( ino_t, ino ) __field( dev_t, dev ) + __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( int, depth ) __field( ext4_lblk_t, partial ) @@ -2037,8 +2037,8 @@ TRACE_EVENT(ext4_ext_remove_space_done, ), TP_fast_assign( - __entry->ino = inode->i_ino; __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; __entry->start = start; __entry->depth = depth; __entry->partial = partial; -- cgit v0.10.2 From 81370291722ac1e0ec95234a0ea91a5bc76b6185 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Fri, 17 Aug 2012 09:52:17 -0400 Subject: ext4: add missing space to trace message Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index db67c71..d49b285 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -320,7 +320,7 @@ TRACE_EVENT(ext4_da_writepages, ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " - "range_start %lld range_end %lld sync_mode %d" + "range_start %lld range_end %lld sync_mode %d " "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, -- cgit v0.10.2 From 67a5da564f97f31c4054d358e00b34d7ee570da5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Fri, 17 Aug 2012 09:54:17 -0400 Subject: ext4: make the zero-out chunk size tunable Currently in ext4 the length of zero-out chunk is set to 7 file system blocks. But if an inode has uninitailized extents from using fallocate to preallocate space, and the workload issues many random writes, this can cause a fragmented extent tree that will unnecessarily grow the extent tree. So create a new sysfs tunable, extent_max_zeroout_kb, which controls the maximum size where blocks will be zeroed out instead of creating a new uninitialized extent. The default of this has been sent to 32kb. CC: Zach Brown CC: Andreas Dilger Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 index f22ac08..c631253 100644 --- a/Documentation/ABI/testing/sysfs-fs-ext4 +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -96,3 +96,16 @@ Contact: "Theodore Ts'o" Description: The maximum number of megabytes the writeback code will try to write out before move on to another inode. + +What: /sys/fs/ext4//extent_max_zeroout_kb +Date: August 2012 +Contact: "Theodore Ts'o" +Description: + The maximum number of kilobytes which will be zeroed + out in preference to creating a new uninitialized + extent when manipulating an inode's extent tree. Note + that using a larger value will increase the + variability of time necessary to complete a random + write operation (since a 4k random write might turn + into a much larger write due to the zeroout + operation). diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7c0841e..0df5ee1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1271,6 +1271,9 @@ struct ext4_sb_info { unsigned long s_sectors_written_start; u64 s_kbytes_written; + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e8755c2..2f082ab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3085,7 +3085,6 @@ out: return err ? err : map->m_len; } -#define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -3111,13 +3110,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_map_blocks *map, struct ext4_ext_path *path) { + struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth; - int allocated; + int allocated, max_zeroout = 0; int err = 0; int split_flag = 0; @@ -3125,6 +3125,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len); + sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) @@ -3224,9 +3225,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (EXT4_EXT_MAY_ZEROOUT & split_flag) + max_zeroout = sbi->s_extent_max_zeroout_kb >> + inode->i_sb->s_blocksize_bits; + + /* If extent is less than s_max_zeroout_kb, zeroout directly */ + if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; @@ -3250,9 +3254,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (allocated > map->m_len) { - if (allocated <= EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (max_zeroout && (allocated > map->m_len)) { + if (allocated <= max_zeroout) { /* case 3 */ zero_ex.ee_block = cpu_to_le32(map->m_lblk); @@ -3264,9 +3267,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if ((map->m_lblk - ee_block + map->m_len < - EXT4_EXT_ZERO_LEN) && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { /* case 2 */ if (map->m_lblk != ee_block) { zero_ex.ee_block = ex->ee_block; @@ -3286,7 +3287,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, 0); if (allocated < 0) err = allocated; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5a97e59..0423e2e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); static struct attribute *ext4_attrs[] = { @@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), NULL, }; @@ -3756,6 +3758,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; + sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode -- cgit v0.10.2 From 316e4cfd0b0b4ce846fd0fbda2deebcffbd3e440 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:56:17 -0400 Subject: jbd2: check return value of blkdev_issue_flush() blkdev_issue_flush() can fail; make sure the error gets properly propagated. This is a port of the equivalent jbd patch from commit 349ecd6a3c0e. Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0131e43..626846b 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -289,8 +289,11 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = err2; /* Make sure all replayed data is on permanent storage */ - if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (journal->j_flags & JBD2_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } return err; } -- cgit v0.10.2 From a4a39040e9dacb5fa28b7ee7f842237ee534e7bf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:58:17 -0400 Subject: ext4: check return value of blkdev_issue_flush() blkdev_issue_flush() can fail; make sure the error gets properly propagated. This is a port of the equivalent ext3 patch from commit 44f4f729e7a1. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 2a1dcea..323eb15 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -203,7 +203,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret; + int ret, err; tid_t commit_tid; bool needs_barrier = false; @@ -255,8 +255,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) needs_barrier = true; jbd2_log_start_commit(journal, commit_tid); ret = jbd2_log_wait_commit(journal, commit_tid); - if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (needs_barrier) { + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } out: mutex_unlock(&inode->i_mutex); trace_ext4_sync_file_exit(inode, ret); -- cgit v0.10.2 From cc6eb18d68fb52a7de65b7a318461ca600240177 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Fri, 17 Aug 2012 10:00:17 -0400 Subject: ext4: remove unused macro MB_DEFAULT_MAX_GROUPS_TO_SCAN Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index c070618..3ccd889 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -65,11 +65,6 @@ extern u8 mb_enable_debug; #define MB_DEFAULT_MIN_TO_SCAN 10 /* - * How many groups mballoc will scan looking for the best chunk - */ -#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 - -/* * with 'ext4_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ -- cgit v0.10.2 From 15c006a22f8e004afbce42a54c878162355f1587 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Fri, 17 Aug 2012 10:02:17 -0400 Subject: ext4: remove unused function argument 'order' in mb_find_extent() All the routines call mb_find_extent are setting argument 'order' to 0 just like: mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); therefore the useless argument should be removed. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3a57975..6873571 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1338,17 +1338,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); } -static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, +static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; - int max; + int max, order; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); - buddy = mb_find_buddy(e4b, order, &max); + buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { @@ -1358,12 +1358,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, return 0; } - /* FIXME dorp order completely ? */ - if (likely(order == 0)) { - /* find actual order */ - order = mb_find_order_for_block(e4b, block); - block = block >> order; - } + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; @@ -1549,7 +1546,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { ext4_mb_use_best_found(ac, e4b); return; @@ -1641,7 +1638,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; @@ -1672,7 +1669,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { @@ -1788,7 +1785,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, @@ -1840,7 +1837,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; ac->ac_b_ex = ex; -- cgit v0.10.2 From 0e376b1e3ccedee49cb8cc6b652fbc1e7c15eeef Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 10:04:17 -0400 Subject: ext4: return an error if kset_create_and_add fails in ext4_init_fs() In the very unlikely case that kset_create_and_add() fails when the ext4.ko module is being loaded (or during kernel startup) set err so that it's clear that the module load failed. https://bugzilla.kernel.org/show_bug.cgi?id=27912 Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0423e2e..3ab798d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5285,8 +5285,10 @@ static int __init ext4_init_fs(void) if (err) goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) + if (!ext4_kset) { + err = -ENOMEM; goto out5; + } ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); -- cgit v0.10.2 From 07724f98978ad39386a3996a4e1b6780489a4dda Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 19:08:42 -0400 Subject: ext4: drop lock_super()/unlock_super() We don't need lock_super()/unlock_super() any more, since the places where it is used, we are protected by the s_umount r/w semaphore. Signed-off-by: "Theodore Ts'o" Cc: Marco Stornelli diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ab798d..bae4124 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -861,7 +861,6 @@ static void ext4_put_super(struct super_block *sb) flush_workqueue(sbi->dio_unwritten_wq); destroy_workqueue(sbi->dio_unwritten_wq); - lock_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -928,7 +927,6 @@ static void ext4_put_super(struct super_block *sb) * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ - unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) @@ -4535,11 +4533,9 @@ static int ext4_unfreeze(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - lock_super(sb); /* Reset the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); - unlock_super(sb); return 0; } @@ -4575,7 +4571,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) char *orig_data = kstrdup(data, GFP_KERNEL); /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; @@ -4717,7 +4712,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); - unlock_super(sb); #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) @@ -4730,10 +4724,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { err = ext4_enable_quotas(sb); - if (err) { - lock_super(sb); + if (err) goto restore_opts; - } } } #endif @@ -4760,7 +4752,6 @@ restore_opts: sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); kfree(orig_data); return err; } -- cgit v0.10.2 From caecd0af8fe0635e8e6900399b951433af35bf52 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Sat, 18 Aug 2012 22:29:18 -0400 Subject: ext4: replace plain integer with NULL in super.c Fixes the following sparse warning: fs/ext4/super.c:1672:45: warning: Using plain integer as NULL pointer Signed-off-by: Sachin Kamat Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bae4124..b875ff5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1673,7 +1673,7 @@ static int parse_options(char *options, struct super_block *sb, * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); if (handle_mount_opt(sb, p, token, args, journal_devnum, journal_ioprio, is_remount) < 0) -- cgit v0.10.2 From eeecef0af5ea4efd763c9554cf2bd80fc4a0efd3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 18 Aug 2012 22:29:40 -0400 Subject: jbd2: don't write superblock when if its empty This sequence: # truncate --size=1g fsfile # mkfs.ext4 -F fsfile # mount -o loop,ro fsfile /mnt # umount /mnt # dmesg | tail results in an IO error when unmounting the RO filesystem: [ 318.020828] Buffer I/O error on device loop1, logical block 196608 [ 318.027024] lost page write due to I/O error on loop1 [ 318.032088] JBD2: Error -5 detected when updating journal superblock for loop1-8. This was a regression introduced by commit 24bcc89c7e7c: "jbd2: split updating of journal superblock and marking journal empty". Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bd23f2e..0f16edd 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1354,6 +1354,11 @@ static void jbd2_mark_journal_empty(journal_t *journal) BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); read_lock(&journal->j_state_lock); + /* Is it already empty? */ + if (sb->s_start == 0) { + read_unlock(&journal->j_state_lock); + return; + } jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); -- cgit v0.10.2 From e3d2e433e32a4b7a05818cbc9158037e26c74c8e Mon Sep 17 00:00:00 2001 From: Ashish Sangwan Date: Sat, 18 Aug 2012 22:29:46 -0400 Subject: ext4: no need to add inode to orphan list during hole punch While performing punch hole for an inode, i_disksize is not changed. So, there is no need to add the inode to orphan list. Signed-off-by: Ashish Sangwan Signed-off-by: Namjae Jeon Acked-by: Zheng Liu Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2f082ab..2e56903 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4860,9 +4860,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) return PTR_ERR(handle); - err = ext4_orphan_add(handle, inode); - if (err) - goto out; /* * Now we need to zero out the non-page-aligned data in the @@ -4948,7 +4945,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); out: - ext4_orphan_del(handle, inode); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); -- cgit v0.10.2 From 30cb27d661f5fdc582add0a9c297946a18ee4a4b Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Sat, 18 Aug 2012 22:38:07 -0400 Subject: ext4: fix trivial typo in comment Signed-off-by: Wang Sheng-Hui Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2e56903..cc6d2b9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3301,7 +3301,7 @@ out: * to an uninitialized extent. * * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple /initialized uninitialized extents (up to three) + * extent into multiple initialized/uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized * b> Splits in two extents: Write is happening at either end of the extent -- cgit v0.10.2 From 8a2f8460e816f4786939a0cefbda35af6bd1a1c5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 19 Aug 2012 18:07:40 -0400 Subject: ext4: remove duplicated declarations in inode.c In patch cb20d5188366f04d96d2e07b1240cc92170ade40, ext4_set_bh_endio and ext4_end_io_buffer_write are declared at the beginning of inode.c, and again later on in the middle of the file. Remove the second set of duplicated function declarations. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6324f74e..b4effbd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1954,9 +1954,6 @@ out: return ret; } -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); - /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't -- cgit v0.10.2 From 03c1c29053f678234dbd51bf3d65f3b7529021de Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 5 Sep 2012 01:21:50 -0400 Subject: ext4: ignore last group w/o enough space when resizing instead of BUG'ing If the last group does not have enough space for group tables, ignore it instead of calling BUG_ON(). Reported-by: Daniel Drake Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 41f6ef6..28031c4 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -200,8 +200,11 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) * be a partial of a flex group. * * @sb: super block of fs to which the groups belongs + * + * Returns 0 on a successful allocation of the metadata blocks in the + * block group. */ -static void ext4_alloc_group_tables(struct super_block *sb, +static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, int flexbg_size) { @@ -226,6 +229,8 @@ static void ext4_alloc_group_tables(struct super_block *sb, (last_group & ~(flexbg_size - 1)))); next_group: group = group_data[0].group; + if (src_group >= group_data[0].group + flex_gd->count) + return -ENOSPC; start_blk = ext4_group_first_block_no(sb, src_group); last_blk = start_blk + group_data[src_group - group].blocks_count; @@ -235,7 +240,6 @@ next_group: start_blk += overhead; - BUG_ON(src_group >= group_data[0].group + flex_gd->count); /* We collect contiguous blocks as much as possible. */ src_group++; for (; src_group <= last_group; src_group++) @@ -300,6 +304,7 @@ next_group: group_data[i].free_blocks_count); } } + return 0; } static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, @@ -1729,7 +1734,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) */ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, flexbg_size)) { - ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) + break; err = ext4_flex_group_add(sb, resize_inode, flex_gd); if (unlikely(err)) break; -- cgit v0.10.2 From d7574ad08bac1ef89cb679d2c76c91ff9281c2e2 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 5 Sep 2012 01:23:50 -0400 Subject: ext4: report the original old blocks count in a debug message when resizing Avoid changing o_blocks_count, since it is used later when reporting old blocks count in debug mode. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 28031c4..591f4bd 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1719,8 +1719,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) es->s_log_groups_per_flex) flexbg_size = 1 << es->s_log_groups_per_flex; - o_blocks_count = ext4_blocks_count(es); - if (o_blocks_count == n_blocks_count) + if (ext4_blocks_count(es) == n_blocks_count) goto out; flex_gd = alloc_flex_gd(flexbg_size); -- cgit v0.10.2 From 6df935ad2fced9033ab52078825fcaf6365f34b7 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 5 Sep 2012 01:25:50 -0400 Subject: ext4: don't copy non-existent gdt blocks when resizing The resize code was copying blocks at the beginning of each block group in order to copy the superblock and block group descriptor table (gdt) blocks. This was, unfortunately, being done even for block groups that did not have super blocks or gdt blocks. This is a complete waste of perfectly good I/O bandwidth, to skip writing those blocks for sparse bg's. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 591f4bd..a0ee26c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -456,6 +456,9 @@ static int setup_new_flex_group_blocks(struct super_block *sb, gdblocks = ext4_bg_num_gdb(sb, group); start = ext4_group_first_block_no(sb, group); + if (!ext4_bg_has_super(sb, group)) + goto handle_itb; + /* Copy all of the GDT blocks into the backup in this group */ for (j = 0, block = start + 1; j < gdblocks; j++, block++) { struct buffer_head *gdb; @@ -498,6 +501,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; } +handle_itb: /* Initialize group tables of the grop @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; -- cgit v0.10.2 From 2ebd1704ded88a8ae29b5f3998b13959c715c4be Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 5 Sep 2012 01:27:50 -0400 Subject: ext4: avoid duplicate writes of the backup bg descriptor blocks The resize code was needlessly writing the backup block group descriptor blocks multiple times (once per block group) during an online resize. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a0ee26c..365d800 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1358,13 +1358,15 @@ exit_journal: err = err2; if (!err) { - int i; + int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int gdb_num_end = ((group + flex_gd->count - 1) / + EXT4_DESC_PER_BLOCK(sb)); + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); - for (i = 0; i < flex_gd->count; i++, group++) { + for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; - int gdb_num; - gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, gdb_bh->b_size); -- cgit v0.10.2 From 117fff10d7f140e12dd43df20d3f9fda80577460 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Sep 2012 01:29:50 -0400 Subject: ext4: grow the s_flex_groups array as needed when resizing Previously, we allocated the s_flex_groups array to the maximum size that the file system could be resized. There was two problems with this approach. First, it wasted memory in the common case where the file system was not resized. Secondly, once we start allowing online resizing using the meta_bg scheme, there is no maximum size that the file system can be resized. So instead, we need to grow the s_flex_groups at inline resize time. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0df5ee1..464cff7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1276,6 +1276,7 @@ struct ext4_sb_info { unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; /* workqueue for dio unwritten */ struct workqueue_struct *dio_unwritten_wq; @@ -2055,6 +2056,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb, extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 365d800..3f5c67b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1503,6 +1503,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (err) goto out; + err = ext4_alloc_flex_bg_array(sb, input->group + 1); + if (err) + return err; + flex_gd.count = 1; flex_gd.groups = input; flex_gd.bg_flags = &bg_flags; @@ -1662,7 +1666,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) unsigned long n_desc_blocks; unsigned long o_desc_blocks; unsigned long desc_blocks; - int err = 0, flexbg_size = 1; + int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; o_blocks_count = ext4_blocks_count(es); @@ -1721,13 +1725,13 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) goto out; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - es->s_log_groups_per_flex) - flexbg_size = 1 << es->s_log_groups_per_flex; - if (ext4_blocks_count(es) == n_blocks_count) goto out; + err = ext4_alloc_flex_bg_array(sb, n_group + 1); + if (err) + return err; + flex_gd = alloc_flex_gd(flexbg_size); if (flex_gd == NULL) { err = -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b875ff5..b8de488 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1925,15 +1925,45 @@ done: return res; } +int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups *new_groups; + int size; + + if (!sbi->s_log_groups_per_flex) + return 0; + + size = ext4_flex_group(sbi, ngroup - 1) + 1; + if (size <= sbi->s_flex_groups_allocated) + return 0; + + size = roundup_pow_of_two(size * sizeof(struct flex_groups)); + new_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groups) { + ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", + size / (int) sizeof(struct flex_groups)); + return -ENOMEM; + } + + if (sbi->s_flex_groups) { + memcpy(new_groups, sbi->s_flex_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups))); + ext4_kvfree(sbi->s_flex_groups); + } + sbi->s_flex_groups = new_groups; + sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); + return 0; +} + static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - ext4_group_t flex_group_count; ext4_group_t flex_group; unsigned int groups_per_flex = 0; - size_t size; - int i; + int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { @@ -1942,17 +1972,9 @@ static int ext4_fill_flex_info(struct super_block *sb) } groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ - flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + - ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << - EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", - flex_group_count); + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); + if (err) goto failed; - } for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); -- cgit v0.10.2 From 28623c2f5b0dca3c3ea34fd6108940661352e276 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Sep 2012 01:31:50 -0400 Subject: ext4: grow the s_group_info array as needed Previously we allocated the s_group_info array with enough space for any future possible growth of the file system via online resize. This is unfortunate because it wastes memory, and it doesn't work for the meta_bg scheme, since there is no limit based on the number of reserved gdt blocks. So add the code to grow the s_group_info array as needed. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 464cff7..8b6902c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1233,6 +1233,7 @@ struct ext4_sb_info { spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; + unsigned int s_group_info_size; /* tunables */ unsigned long s_stripe; @@ -1971,6 +1972,8 @@ extern void ext4_exit_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6873571..2102c20 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -24,6 +24,7 @@ #include "ext4_jbd2.h" #include "mballoc.h" #include +#include #include #include @@ -2163,6 +2164,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) return cachep; } +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***new_groupinfo; + + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; + + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + if (sbi->s_group_info) { + memcpy(new_groupinfo, sbi->s_group_info, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + ext4_kvfree(sbi->s_group_info); + } + sbi->s_group_info = new_groupinfo; + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; +} + /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) @@ -2252,49 +2286,14 @@ static int ext4_mb_init_backend(struct super_block *sb) ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int num_meta_group_infos; - int num_meta_group_infos_max; - int array_size; + int err; struct ext4_group_desc *desc; struct kmem_cache *cachep; - /* This is the number of blocks used by GDT */ - num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); - - /* - * This is the total number of blocks used by GDT including - * the number of reserved blocks for GDT. - * The s_group_info array is allocated with this value - * to allow a clean online resize without a complex - * manipulation of pointer. - * The drawback is the unused memory when no resize - * occurs but it's very low in terms of pages - * (see comments below) - * Need to handle this properly when META_BG resizing is allowed - */ - num_meta_group_infos_max = num_meta_group_infos + - le16_to_cpu(es->s_reserved_gdt_blocks); + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; - /* - * array_size is the size of s_group_info array. We round it - * to the next power of two because this approximation is done - * internally by kmalloc so we can have some more memory - * for free here (e.g. may be used for META_BG resize). - */ - array_size = 1; - while (array_size < sizeof(*sbi->s_group_info) * - num_meta_group_infos_max) - array_size = array_size << 1; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); - if (sbi->s_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); - return -ENOMEM; - } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); @@ -2322,7 +2321,7 @@ err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) kmem_cache_free(cachep, ext4_get_group_info(sb, i)); - i = num_meta_group_infos; + i = sbi->s_group_info_size; while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3f5c67b..f288933 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1507,6 +1507,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (err) return err; + err = ext4_mb_alloc_groupinfo(sb, input->group + 1); + if (err) + goto out; + flex_gd.count = 1; flex_gd.groups = input; flex_gd.bg_flags = &bg_flags; @@ -1732,6 +1736,10 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) if (err) return err; + err = ext4_mb_alloc_groupinfo(sb, n_group + 1); + if (err) + goto out; + flex_gd = alloc_flex_gd(flexbg_size); if (flex_gd == NULL) { err = -ENOMEM; -- cgit v0.10.2 From 01f795f9e0d67adeccc61a8b20c28acb45fa5fd8 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 5 Sep 2012 01:33:50 -0400 Subject: ext4: add online resizing support for meta_bg and 64-bit file systems This patch adds support for resizing file systems with the meta_bg and 64bit features. [ Added a fix by tytso to fix a divide by zero when resizing a filesystem from 14 TB to 18TB. Also fixed overhead accounting for meta_bg file systems.] Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7f7dad7..8b84fe2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -365,26 +365,11 @@ group_add_out: return -EOPNOTSUPP; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_META_BG)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with meta_bg"); - return -EOPNOTSUPP; - } - if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; } - if (n_blocks_count > MAX_32_NUM && - !EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_64BIT)) { - ext4_msg(sb, KERN_ERR, - "File system only supports 32-bit block numbers"); - return -EOPNOTSUPP; - } - err = ext4_resize_begin(sb); if (err) return err; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f288933..7adc088 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -45,6 +45,28 @@ void ext4_resize_end(struct super_block *sb) smp_mb__after_clear_bit(); } +static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, + ext4_group_t group) { + return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << + EXT4_DESC_PER_BLOCK_BITS(sb); +} + +static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, + ext4_group_t group) { + group = ext4_meta_bg_first_group(sb, group); + return ext4_group_first_block_no(sb, group); +} + +static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, + ext4_group_t group) { + ext4_grpblk_t overhead; + overhead = ext4_bg_num_gdb(sb, group); + if (ext4_bg_has_super(sb, group)) + overhead += 1 + + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + return overhead; +} + #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -57,9 +79,7 @@ static int verify_group_input(struct super_block *sb, ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext4_bg_has_super(sb, group) ? - (1 + ext4_bg_num_gdb(sb, group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + unsigned overhead = ext4_group_overhead_blocks(sb, group); ext4_fsblk_t metaend = start + overhead; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; @@ -209,7 +229,6 @@ static int ext4_alloc_group_tables(struct super_block *sb, int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_fsblk_t start_blk; ext4_fsblk_t last_blk; ext4_group_t src_group; @@ -234,19 +253,19 @@ next_group: start_blk = ext4_group_first_block_no(sb, src_group); last_blk = start_blk + group_data[src_group - group].blocks_count; - overhead = ext4_bg_has_super(sb, src_group) ? - (1 + ext4_bg_num_gdb(sb, src_group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + overhead = ext4_group_overhead_blocks(sb, src_group); start_blk += overhead; /* We collect contiguous blocks as much as possible. */ src_group++; - for (; src_group <= last_group; src_group++) - if (!ext4_bg_has_super(sb, src_group)) + for (; src_group <= last_group; src_group++) { + overhead = ext4_group_overhead_blocks(sb, src_group); + if (overhead != 0) last_blk += group_data[src_group - group].blocks_count; else break; + } /* Allocate block bitmaps */ for (; bb_index < flex_gd->count; bb_index++) { @@ -438,11 +457,13 @@ static int setup_new_flex_group_blocks(struct super_block *sb, ext4_group_t group, count; struct buffer_head *bh = NULL; int reserved_gdb, i, j, err = 0, err2; + int meta_bg; BUG_ON(!flex_gd->count || !group_data || group_data[0].group != sbi->s_groups_count); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); @@ -452,15 +473,25 @@ static int setup_new_flex_group_blocks(struct super_block *sb, group = group_data[0].group; for (i = 0; i < flex_gd->count; i++, group++) { unsigned long gdblocks; + ext4_grpblk_t overhead; gdblocks = ext4_bg_num_gdb(sb, group); start = ext4_group_first_block_no(sb, group); - if (!ext4_bg_has_super(sb, group)) + if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) goto handle_itb; + if (meta_bg == 1) { + ext4_group_t first_group; + first_group = ext4_meta_bg_first_group(sb, group); + if (first_group != group + 1 && + first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) + goto handle_itb; + } + + block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ - for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + for (j = 0; j < gdblocks; j++, block++) { struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); @@ -530,11 +561,11 @@ handle_bb: err = PTR_ERR(bh); goto out; } - if (ext4_bg_has_super(sb, group)) { + overhead = ext4_group_overhead_blocks(sb, group); + if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + - 1); + ext4_set_bits(bh->b_data, 0, overhead); } ext4_mark_bitmap_end(group_data[i].blocks_count, sb->s_blocksize * 8, bh->b_data); @@ -831,6 +862,45 @@ exit_bh: } /* + * add_new_gdb_meta_bg is the sister of add_new_gdb. + */ +static int add_new_gdb_meta_bg(struct super_block *sb, + handle_t *handle, ext4_group_t group) { + ext4_fsblk_t gdblock; + struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int err; + + gdblock = ext4_meta_bg_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) + return -EIO; + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + return err; + } + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = gdb_bh; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + ext4_kvfree(o_group_desc); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + brelse(gdb_bh); + return err; +} + +/* * Called when we are adding a new group which has a backup copy of each of * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. * We need to add these reserved backup GDT blocks to the resize inode, so @@ -958,16 +1028,16 @@ exit_free: * do not copy the full number of backups at this time. The resize * which changed s_groups_count will backup again. */ -static void update_backups(struct super_block *sb, - int blk_off, char *data, int size) +static void update_backups(struct super_block *sb, int blk_off, char *data, + int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const ext4_group_t last = sbi->s_groups_count; + ext4_group_t last; const int bpg = EXT4_BLOCKS_PER_GROUP(sb); unsigned three = 1; unsigned five = 5; unsigned seven = 7; - ext4_group_t group; + ext4_group_t group = 0; int rest = sb->s_blocksize - size; handle_t *handle; int err = 0, err2; @@ -981,8 +1051,17 @@ static void update_backups(struct super_block *sb, ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); - while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + if (meta_bg == 0) { + group = ext4_list_backups(sb, &three, &five, &seven); + last = sbi->s_groups_count; + } else { + group = ext4_meta_bg_first_group(sb, group) + 1; + last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); + } + + while (group < sbi->s_groups_count) { struct buffer_head *bh; + ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ if (ext4_handle_valid(handle) && @@ -991,13 +1070,20 @@ static void update_backups(struct super_block *sb, (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) break; - bh = sb_getblk(sb, group * bpg + blk_off); + if (meta_bg == 0) + backup_block = group * bpg + blk_off; + else + backup_block = (ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group)); + + bh = sb_getblk(sb, backup_block); if (!bh) { err = -EIO; break; } - ext4_debug("update metadata backup %#04lx\n", - (unsigned long)bh->b_blocknr); + ext4_debug("update metadata backup %llu(+%llu)\n", + backup_block, backup_block - + ext4_group_first_block_no(sb, group)); if ((err = ext4_journal_get_write_access(handle, bh))) break; lock_buffer(bh); @@ -1010,6 +1096,13 @@ static void update_backups(struct super_block *sb, if (unlikely(err)) ext4_std_error(sb, err); brelse(bh); + + if (meta_bg == 0) + group = ext4_list_backups(sb, &three, &five, &seven); + else if (group == last) + break; + else + group = last; } if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -1052,7 +1145,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, struct ext4_super_block *es = sbi->s_es; struct buffer_head *gdb_bh; int i, gdb_off, gdb_num, err = 0; + int meta_bg; + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); for (i = 0; i < count; i++, group++) { int reserved_gdb = ext4_bg_has_super(sb, group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; @@ -1072,8 +1167,11 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, resize_inode, group); - } else + } else if (meta_bg != 0) { + err = add_new_gdb_meta_bg(sb, handle, group); + } else { err = add_new_gdb(handle, resize_inode, group); + } if (err) break; } @@ -1225,7 +1323,7 @@ static void ext4_update_super(struct super_block *sb, } reserved_blocks = ext4_r_blocks_count(es) * 100; - do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); reserved_blocks *= blocks_count; do_div(reserved_blocks, 100); @@ -1236,6 +1334,7 @@ static void ext4_update_super(struct super_block *sb, le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); /* * We need to protect s_groups_count against other CPUs seeing * inconsistent state in the superblock. @@ -1270,6 +1369,8 @@ static void ext4_update_super(struct super_block *sb, percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + ext4_debug("free blocks count %llu", + percpu_counter_read(&sbi->s_freeclusters_counter)); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && sbi->s_log_groups_per_flex) { @@ -1361,15 +1462,17 @@ exit_journal: int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); + int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG); update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); + sizeof(struct ext4_super_block), 0); for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; gdb_bh = sbi->s_group_desc[gdb_num]; update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, - gdb_bh->b_size); + gdb_bh->b_size, meta_bg); } } exit: @@ -1413,9 +1516,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, group_data[i].group = group + i; group_data[i].blocks_count = blocks_per_group; - overhead = ext4_bg_has_super(sb, group + i) ? - (1 + ext4_bg_num_gdb(sb, group + i) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + overhead = ext4_group_overhead_blocks(sb, group + i); group_data[i].free_blocks_count = blocks_per_group - overhead; if (ext4_has_group_desc_csum(sb)) flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | @@ -1563,11 +1664,13 @@ errout: err = err2; if (!err) { + ext4_fsblk_t first_block; + first_block = ext4_group_first_block_no(sb, 0); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, + (char *)es, sizeof(struct ext4_super_block), 0); } return err; } @@ -1662,15 +1765,16 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *bh; - struct inode *resize_inode; - ext4_fsblk_t o_blocks_count; - ext4_group_t o_group; - ext4_group_t n_group; - ext4_grpblk_t offset, add; + struct inode *resize_inode = NULL; + ext4_grpblk_t add, offset; unsigned long n_desc_blocks; unsigned long o_desc_blocks; unsigned long desc_blocks; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_fsblk_t o_blocks_count; int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int meta_bg; o_blocks_count = ext4_blocks_count(es); @@ -1692,22 +1796,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / - EXT4_DESC_PER_BLOCK(sb); + EXT4_DESC_PER_BLOCK(sb); o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); + EXT4_DESC_PER_BLOCK(sb); desc_blocks = n_desc_blocks - o_desc_blocks; - if (desc_blocks && - (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || - le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { - ext4_warning(sb, "No reserved GDT blocks, can't resize"); - return -EPERM; - } + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); - if (IS_ERR(resize_inode)) { - ext4_warning(sb, "Error opening resize inode"); - return PTR_ERR(resize_inode); + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (meta_bg) { + ext4_error(sb, "resize_inode and meta_bg enabled " + "simultaneously"); + return -EINVAL; + } + if (le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks) { + ext4_warning(sb, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + } else if (!meta_bg) { + ext4_warning(sb, "File system features do not permit " + "online resize"); + return -EPERM; } /* See if the device is actually as big as what was requested */ @@ -1761,8 +1876,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) out: if (flex_gd) free_flex_gd(flex_gd); - - iput(resize_inode); + if (resize_inode != NULL) + iput(resize_inode); if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " "upto %llu blocks", o_blocks_count, n_blocks_count); -- cgit v0.10.2 From 93f9052643409c13b3b5f76833865087351f55b8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 12 Sep 2012 14:32:42 -0400 Subject: ext4: set bg_itable_unused when resizing Set bg_itable_unused for file systems that have uninit_bg enabled. This will speed up the first e2fsck run after the file system is resized. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 7adc088..a5be589 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1268,6 +1268,9 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_free_group_clusters_set(sb, gdp, EXT4_B2C(sbi, group_data->free_blocks_count)); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + if (ext4_has_group_desc_csum(sb)) + ext4_itable_unused_set(sb, gdp, + EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(*bg_flags); ext4_group_desc_csum_set(sb, group, gdp); -- cgit v0.10.2 From 1c6bd7173d66b3dfdefcedb38cabc1fb03997509 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Sep 2012 10:19:24 -0400 Subject: ext4: convert file system to meta_bg if needed during resizing If we have run out of reserved gdt blocks, then clear the resize_inode feature and enable the meta_bg feature, so that we can continue resizing the file system seamlessly. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a5be589..5932ab5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1756,6 +1756,99 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, return err; } /* ext4_group_extend */ + +static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) +{ + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); +} + +/* + * Release the resize inode and drop the resize_inode feature if there + * are no more reserved gdt blocks, and then convert the file system + * to enable meta_bg + */ +static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) +{ + handle_t *handle; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = 0; + ext4_fsblk_t nr; + int i, ret, err = 0; + int credits = 1; + + ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (es->s_reserved_gdt_blocks) { + ext4_error(sb, "Unexpected non-zero " + "s_reserved_gdt_blocks"); + return -EPERM; + } + if (!inode) { + ext4_error(sb, "Unexpected NULL resize_inode"); + return -EPERM; + } + ei = EXT4_I(inode); + + /* Do a quick sanity check of the resize inode */ + if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + goto invalid_resize_inode; + for (i = 0; i < EXT4_N_BLOCKS; i++) { + if (i == EXT4_DIND_BLOCK) { + if (ei->i_data[i]) + continue; + else + goto invalid_resize_inode; + } + if (ei->i_data[i]) + goto invalid_resize_inode; + } + credits += 3; /* block bitmap, bg descriptor, resize inode */ + } + + handle = ext4_journal_start_sb(sb, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto errout; + + EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + sbi->s_es->s_first_meta_bg = + cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); + + err = ext4_handle_dirty_super(handle, sb); + if (err) { + ext4_std_error(sb, err); + goto errout; + } + + if (inode) { + nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + ei->i_data[EXT4_DIND_BLOCK] = 0; + inode->i_blocks = 0; + + err = ext4_mark_inode_dirty(handle, inode); + if (err) + ext4_std_error(sb, err); + } + +errout: + ret = ext4_journal_stop(handle); + if (!err) + err = ret; + return ret; + +invalid_resize_inode: + ext4_error(sb, "corrupted/inconsistent resize inode"); + return -EINVAL; +} + /* * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count * @@ -1772,13 +1865,14 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_grpblk_t add, offset; unsigned long n_desc_blocks; unsigned long o_desc_blocks; - unsigned long desc_blocks; ext4_group_t o_group; ext4_group_t n_group; ext4_fsblk_t o_blocks_count; + ext4_fsblk_t n_blocks_count_retry = 0; int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; int meta_bg; +retry: o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) @@ -1798,11 +1892,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); - n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / - EXT4_DESC_PER_BLOCK(sb); - o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); - desc_blocks = n_desc_blocks - o_desc_blocks; + n_desc_blocks = num_desc_blocks(sb, n_group + 1); + o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); @@ -1812,20 +1903,37 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) "simultaneously"); return -EINVAL; } - if (le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks) { - ext4_warning(sb, - "No reserved GDT blocks, can't resize"); - return -EPERM; + if (n_desc_blocks > o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks)) { + n_blocks_count_retry = n_blocks_count; + n_desc_blocks = o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks); + n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); + n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_group--; /* set to last group number */ } - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + + if (!resize_inode) + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); if (IS_ERR(resize_inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); } - } else if (!meta_bg) { - ext4_warning(sb, "File system features do not permit " - "online resize"); - return -EPERM; + } + + if ((!resize_inode && !meta_bg) || n_group == o_group) { + err = ext4_convert_meta_bg(sb, resize_inode); + if (err) + goto out; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + if (n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + goto retry; + } } /* See if the device is actually as big as what was requested */ @@ -1876,13 +1984,21 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) break; } + if (!err && n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + free_flex_gd(flex_gd); + flex_gd = NULL; + goto retry; + } + out: if (flex_gd) free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " - "upto %llu blocks", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, "resized filesystem to %llu", + n_blocks_count); return err; } -- cgit v0.10.2 From 4da4a56e4f83f52d71e2c5fa86fb1ad77be09753 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Sep 2012 10:24:21 -0400 Subject: ext4: log a resize update to the console every 10 seconds For very long online resizes, a periodic update to the console log is helpful for debugging and for progress reporting. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 5932ab5..3c9367b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1869,6 +1869,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_group_t n_group; ext4_fsblk_t o_blocks_count; ext4_fsblk_t n_blocks_count_retry = 0; + unsigned long last_update_time = 0; int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; int meta_bg; @@ -1977,6 +1978,13 @@ retry: */ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, flexbg_size)) { + if (jiffies - last_update_time > HZ * 10) { + if (last_update_time) + ext4_msg(sb, KERN_INFO, + "resized to %llu blocks", + ext4_blocks_count(es)); + last_update_time = jiffies; + } if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) break; err = ext4_flex_group_add(sb, resize_inode, flex_gd); -- cgit v0.10.2 From 5e7bbef19c8385895cb21c41a88bd937902e6316 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Sep 2012 12:11:40 -0400 Subject: ext4: advertise the fact that the kernel supports meta_bg resizing Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b8de488..eb7722a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2585,10 +2585,12 @@ static struct attribute *ext4_attrs[] = { /* Features this copy of ext4 supports */ EXT4_INFO_ATTR(lazy_itable_init); EXT4_INFO_ATTR(batched_discard); +EXT4_INFO_ATTR(meta_bg_resize); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), NULL, }; -- cgit v0.10.2 From bc0b75f77a944b482293972eb8fd5c88c576eb46 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 17 Sep 2012 22:54:36 -0400 Subject: ext4: do not enable delalloc by default for ext2 Signed-off-by: Brian Foster Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb7722a..e6784b3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3411,7 +3411,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * enable delayed allocation by default * Use -o nodelalloc to turn it off */ - if (!IS_EXT3_SB(sb) && + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); -- cgit v0.10.2 From 90b0a97323f42ead278bbccbdf0e123db2add400 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 17 Sep 2012 23:39:12 -0400 Subject: ext4: fix possible non-initialized variable in htree_dirblock_to_tree() htree_dirblock_to_tree() declares a non-initialized 'err' variable, which is passed as a reference to another functions expecting them to set this variable with their error codes. It's passed to ext4_bread(), which then passes it to ext4_getblk(). If ext4_map_blocks() returns 0 due to a lookup failure, leaving the ext4_getblk() buffer_head uninitialized, it will make ext4_getblk() return to ext4_bread() without initialize the 'err' variable, and ext4_bread() will return to htree_dirblock_to_tree() with this variable still uninitialized. htree_dirblock_to_tree() will pass this variable with garbage back to ext4_htree_fill_tree(), which expects a number of directory entries added to the rb-tree. which, in case, might return a fake non-zero value due the garbage left in the 'err' variable, leading the kernel to an Oops in ext4_dx_readdir(), once this is expecting a filled rb-tree node, when in turn it will have a NULL-ed one, causing an invalid page request when trying to get a fname struct from this NULL-ed rb-tree node in this line: fname = rb_entry(info->curr_node, struct fname, rb_hash); The patch itself initializes the err variable in htree_dirblock_to_tree() to avoid usage mistakes by the called functions, and also fix ext4_getblk() to return a initialized 'err' variable when ext4_map_blocks() fails a lookup. Signed-off-by: Carlos Maiolino Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b4effbd..ca76b5e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -732,11 +732,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_map_blocks(handle, inode, &map, create ? EXT4_GET_BLOCKS_CREATE : 0); + /* ensure we send some value back into *errp */ + *errp = 0; + if (err < 0) *errp = err; if (err <= 0) return NULL; - *errp = 0; bh = sb_getblk(inode->i_sb, map.m_pblk); if (!bh) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7450ff01..37c03b3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -846,7 +846,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; - int err, count = 0; + int err = 0, count = 0; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); -- cgit v0.10.2 From b5e2368baeddf401bf3da9e364fc1c96676279cd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 18 Sep 2012 13:33:44 -0400 Subject: ext4: re-enable -o discard functionality in no-journal mode This is a revert of commit b56ff9d397ce, which removed the call to ext4_issue_discard() to fix a BUG reported because ext4_issue_discard() was being called from inside a block group spinlock. As it turns out this bug had already been fixed by Lukas Czerner in commit 53fdcf992d61 by the simple expedient of moving when we call ext4_issue_discard() outside the spinlock. So it should be safe to re-enable this functionality, which I tested by putting an BUG_ON(in_atomic) just after the restored callsite to ext4_issue_discard(). Addresses-Google-Bug: #6750518 Signed-off-by: "Theodore Ts'o" Cc: Anatol Pomozov diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2102c20..2c7c082 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4656,6 +4656,8 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); -- cgit v0.10.2 From c9b92530a723ac5ef8e352885a1862b18f31b2f5 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Tue, 18 Sep 2012 13:38:59 -0400 Subject: ext4: make orphan functions be no-op in no-journal mode Instead of checking whether the handle is valid, we check if journal is enabled. This avoids taking the s_orphan_lock mutex in all cases when there is no journal in use, including the error paths where ext4_orphan_del() is called with a handle set to NULL. Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 37c03b3..8f4bda7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2369,7 +2369,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0, rc; - if (!ext4_handle_valid(handle)) + if (!EXT4_SB(sb)->s_journal) return 0; mutex_lock(&EXT4_SB(sb)->s_orphan_lock); @@ -2443,8 +2443,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; - /* ext4_handle_valid() assumes a valid handle_t pointer */ - if (handle && !ext4_handle_valid(handle)) + if (!EXT4_SB(inode->i_sb)->s_journal) return 0; mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); @@ -2463,7 +2462,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (sbi->s_journal && !handle) + if (!handle) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); -- cgit v0.10.2 From 59e31c156a24d483bbd2ea07d4dc96043a55b6bc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Sep 2012 00:55:56 -0400 Subject: ext4: fix online resizing when the # of block groups is constant Commit 1c6bd7173d66b3 introduced a regression where an online resize operation which did not change the number of block groups would fail, i.e: mke2fs -t /dev/vdc 60000 mount /dev/vdc resize2fs /dev/vdc 60001 This was due to a bug in the logic regarding when to try converting the filesystem to use meta_bg. Also fix up a number of other minor issues with the online resizing code: (a) Fix a sparse warning; (b) only check to make sure the device is large enough once, instead of multiple times through the resize loop. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3c9367b..ee985ca 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1772,23 +1772,18 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) handle_t *handle; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - struct ext4_inode_info *ei = 0; + struct ext4_inode_info *ei = EXT4_I(inode); ext4_fsblk_t nr; int i, ret, err = 0; int credits = 1; ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (inode) { if (es->s_reserved_gdt_blocks) { ext4_error(sb, "Unexpected non-zero " "s_reserved_gdt_blocks"); return -EPERM; } - if (!inode) { - ext4_error(sb, "Unexpected NULL resize_inode"); - return -EPERM; - } - ei = EXT4_I(inode); /* Do a quick sanity check of the resize inode */ if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) @@ -1873,12 +1868,19 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; int meta_bg; + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + retry: o_blocks_count = ext4_blocks_count(es); - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " - "to %llu blocks", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ @@ -1922,7 +1924,7 @@ retry: } } - if ((!resize_inode && !meta_bg) || n_group == o_group) { + if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { err = ext4_convert_meta_bg(sb, resize_inode); if (err) goto out; @@ -1937,14 +1939,6 @@ retry: } } - /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, n_blocks_count - 1); - if (!bh) { - ext4_warning(sb, "can't read last block, resize aborted"); - return -ENOSPC; - } - brelse(bh); - /* extend the last group */ if (n_group == o_group) add = n_blocks_count - o_blocks_count; @@ -2005,8 +1999,6 @@ out: free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, "resized filesystem to %llu", - n_blocks_count); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); return err; } -- cgit v0.10.2 From 18888cf0883c286f238d44ee565530fe82752f06 Mon Sep 17 00:00:00 2001 From: Andrey Sidorov Date: Wed, 19 Sep 2012 14:14:53 -0400 Subject: ext4: speed up truncate/unlink by not using bforget() unless needed Do not iterate over data blocks scanning for bh's to forget as they're never exist. This improves time taken by unlink / truncate syscall. Tested by continuously truncating file that is being written by dd. Another test is rm -rf of linux tree while tar unpacks it. With ordered data mode condition unlikely(!tbh) was always met in ext4_free_blocks. With journal data mode tbh was found only few times, so optimisation is also possible. Unlinking fallocated 60G file after doing sync && echo 3 > /proc/sys/vm/drop_caches && time rm --help X86 before (linux 3.6-rc4): # time rm -f test1 real 0m2.710s user 0m0.000s sys 0m1.530s X86 after: # time rm -f test1 real 0m0.644s user 0m0.003s sys 0m0.060s MIPS before (linux 2.6.37): # time rm -f test1 real 0m 4.93s user 0m 0.00s sys 0m 4.61s MIPS after: # time rm -f test1 real 0m 0.16s user 0m 0.00s sys 0m 0.06s Signed-off-by: "Theodore Ts'o" Signed-off-by: Andrey Sidorov diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cc6d2b9..a510917 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2318,10 +2318,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; - int flags = EXT4_FREE_BLOCKS_FORGET; + int flags = 0; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; + /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we make a note -- cgit v0.10.2 From 00d4e7362ed01987183e9528295de3213031309c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Sep 2012 22:42:36 -0400 Subject: ext4: fix potential deadlock in ext4_nonda_switch() In ext4_nonda_switch(), if the file system is getting full we used to call writeback_inodes_sb_if_idle(). The problem is that we can be holding i_mutex already, and this causes a potential deadlock when writeback_inodes_sb_if_idle() when it tries to take s_umount. (See lockdep output below). As it turns out we don't need need to hold s_umount; the fact that we are in the middle of the write(2) system call will keep the superblock pinned. Unfortunately writeback_inodes_sb() checks to make sure s_umount is taken, and the VFS uses a different mechanism for making sure the file system doesn't get unmounted out from under us. The simplest way of dealing with this is to just simply grab s_umount using a trylock, and skip kicking the writeback flusher thread in the very unlikely case that we can't take a read lock on s_umount without blocking. Also, we now check the cirteria for kicking the writeback thread before we decide to whether to fall back to non-delayed writeback, so if there are any outstanding delayed allocation writes, we try to get them resolved as soon as possible. [ INFO: possible circular locking dependency detected ] 3.6.0-rc1-00042-gce894ca #367 Not tainted ------------------------------------------------------- dd/8298 is trying to acquire lock: (&type->s_umount_key#18){++++..}, at: [] writeback_inodes_sb_if_idle+0x28/0x46 but task is already holding lock: (&sb->s_type->i_mutex_key#8){+.+...}, at: [] generic_file_aio_write+0x5f/0xd3 which lock already depends on the new lock. 2 locks held by dd/8298: #0: (sb_writers#2){.+.+.+}, at: [] generic_file_aio_write+0x56/0xd3 #1: (&sb->s_type->i_mutex_key#8){+.+...}, at: [] generic_file_aio_write+0x5f/0xd3 stack backtrace: Pid: 8298, comm: dd Not tainted 3.6.0-rc1-00042-gce894ca #367 Call Trace: [] ? console_unlock+0x345/0x372 [] print_circular_bug+0x190/0x19d [] __lock_acquire+0x86d/0xb6c [] ? mark_held_locks+0x5c/0x7b [] lock_acquire+0x66/0xb9 [] ? writeback_inodes_sb_if_idle+0x28/0x46 [] down_read+0x28/0x58 [] ? writeback_inodes_sb_if_idle+0x28/0x46 [] writeback_inodes_sb_if_idle+0x28/0x46 [] ext4_nonda_switch+0xe1/0xf4 [] ext4_da_write_begin+0x27/0x193 [] generic_file_buffered_write+0xc8/0x1bb [] __generic_file_aio_write+0x1dd/0x205 [] generic_file_aio_write+0x78/0xd3 [] ext4_file_write+0x480/0x4a6 [] ? __lock_acquire+0x41e/0xb6c [] ? sched_clock_cpu+0x11a/0x13e [] ? trace_hardirqs_off+0xb/0xd [] ? local_clock+0x37/0x4e [] do_sync_write+0x67/0x9d [] ? wait_on_retry_sync_kiocb+0x44/0x44 [] vfs_write+0x7b/0xe6 [] sys_write+0x3b/0x64 [] syscall_call+0x7/0xb Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ca76b5e..0a31197 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2462,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb) free_blocks = EXT4_C2B(sbi, percpu_counter_read_positive(&sbi->s_freeclusters_counter)); dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + /* + * Start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && + !writeback_in_progress(sb->s_bdi) && + down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } + if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { /* @@ -2470,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb) */ return 1; } - /* - * Even if we don't switch but are nearing capacity, - * start pushing delalloc when 1/2 of free blocks are dirty. - */ - if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); - return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index be3efc4..5602d73 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi) { return test_bit(BDI_writeback_running, &bdi->state); } +EXPORT_SYMBOL(writeback_in_progress); static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { -- cgit v0.10.2 From bef53b01faeb791e27605cba1a71ba21364cb23e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 20 Sep 2012 11:35:38 -0400 Subject: ext4: remove erroneous ext4_superblock_csum_set() in update_backups() The update_backups() function is used to backup all the metadata blocks, so we should not take it for granted that 'data' is pointed to a super block and use ext4_superblock_csum_set to calculate the checksum there. In case where the data is a group descriptor block, it will corrupt the last group descriptor, and then e2fsck will complain about it it. As all the metadata checksums should already be OK when we do the backup, remove the wrong ext4_superblock_csum_set and it should be just fine. Reported-by: "Theodore Ts'o" Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ee985ca..9f821ce 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1049,8 +1049,6 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, goto exit_err; } - ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); - if (meta_bg == 0) { group = ext4_list_backups(sb, &three, &five, &seven); last = sbi->s_groups_count; -- cgit v0.10.2 From 50df9fd55e4271e89a7adf3b1172083dd0ca199d Mon Sep 17 00:00:00 2001 From: Herton Ronaldo Krzesinski Date: Sun, 23 Sep 2012 22:49:12 -0400 Subject: ext4: fix crash when accessing /proc/mounts concurrently The crash was caused by a variable being erronously declared static in token2str(). In addition to /proc/mounts, the problem can also be easily replicated by accessing /proc/fs/ext4//options in parallel: $ cat /proc/fs/ext4//options > options.txt ... and then running the following command in two different terminals: $ while diff /proc/fs/ext4//options options.txt; do true; done This is also the cause of the following a crash while running xfstests #234, as reported in the following bug reports: https://bugs.launchpad.net/bugs/1053019 https://bugzilla.kernel.org/show_bug.cgi?id=47731 Signed-off-by: Herton Ronaldo Krzesinski Signed-off-by: "Theodore Ts'o" Cc: Brad Figg Cc: stable@vger.kernel.org diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e6784b3..7bef0a4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1749,7 +1749,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, static const char *token2str(int token) { - static const struct match_token *t; + const struct match_token *t; for (t = tokens; t->token != Opt_err; t++) if (t->token == token && !strchr(t->pattern, '=')) -- cgit v0.10.2 From 838cd0cf9af52e034ee81513642083bbe8e4ddb1 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sun, 23 Sep 2012 23:10:51 -0400 Subject: ext4: check free block counters in ext4_mb_find_by_goal Free block counters should be checked before doing allocation. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2c7c082..bb821a9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1660,10 +1660,13 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; + if (grp->bb_free == 0) + return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) -- cgit v0.10.2 From f2a09af645b762f8230e7eba7fee3b6f7e6e96e7 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sun, 23 Sep 2012 23:16:03 -0400 Subject: ext4: check free inode count before allocating an inode Recently, I ecountered some corrupted filesystems in which some groups' free inode counts were 65535, it seemed that free inode count was overflow. This patch teaches ext4 to check free inode count before allocaing an inode. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 26154b8..fa36372 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -697,6 +697,15 @@ got_group: if (!gdp) goto fail; + /* + * Check free inodes count before loading bitmap. + */ + if (ext4_free_inodes_count(sb, gdp) == 0) { + if (++group == ngroups) + group = 0; + continue; + } + brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!inode_bitmap_bh) -- cgit v0.10.2 From 7f1468d1d50d368097ab13596dc08eaba7eace7f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 25 Sep 2012 23:19:25 -0400 Subject: ext4: fix double unlock buffer mess during fs-resize bh_submit_read() is responsible for unlock bh on endio. In addition, we need to use bh_uptodate_or_lock() to avoid races. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9f821ce..f21fdbf 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1181,17 +1181,12 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) struct buffer_head *bh = sb_getblk(sb, block); if (!bh) return NULL; - - if (bitmap_uptodate(bh)) - return bh; - - lock_buffer(bh); - if (bh_submit_read(bh) < 0) { - unlock_buffer(bh); - brelse(bh); - return NULL; + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + brelse(bh); + return NULL; + } } - unlock_buffer(bh); return bh; } -- cgit v0.10.2 From 0acdb8876fead922c9ffa6768c5675a37485c48c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 26 Sep 2012 00:08:57 -0400 Subject: ext4: don't call update_backups() multiple times for the same bg When performing an online resize, we add a bunch of groups at one time in ext4_flex_group_add, so in most cases a lot of group descriptors will be in the same group block. But in the end of this function, update_backups will be called for every group descriptor and the same block will be copied and journalled again and again. It is really a waste. Fix things so we only update a particular bg descriptor block once and skip subsequent updates of the same block. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f21fdbf..7a75e10 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1460,6 +1460,7 @@ exit_journal: EXT4_DESC_PER_BLOCK(sb)); int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + sector_t old_gdb = 0; update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block), 0); @@ -1467,8 +1468,11 @@ exit_journal: struct buffer_head *gdb_bh; gdb_bh = sbi->s_group_desc[gdb_num]; + if (old_gdb == gdb_bh->b_blocknr) + continue; update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, gdb_bh->b_size, meta_bg); + old_gdb = gdb_bh->b_blocknr; } } exit: -- cgit v0.10.2 From 03bd8b9b896c8e940f282f540e6b4de90d666b7c Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 26 Sep 2012 12:32:19 -0400 Subject: ext4: move_extent code cleanup - Remove usless checks, because it is too late to check that inode != NULL at the moment it was referenced several times. - Double lock routines looks very ugly and locking ordering relays on order of i_ino, but other kernel code rely on order of pointers. Let's make them simple and clean. - check that inodes belongs to the same SB as soon as possible. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c5826c6..df5cde5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * mext_check_null_inode - NULL check for two inodes - * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. - */ -static int -mext_check_null_inode(struct inode *inode1, struct inode *inode2, - const char *function, unsigned int line) -{ - int ret = 0; - - if (inode1 == NULL) { - __ext4_error(inode2->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 NULL inode2 %lu", inode2->i_ino); - ret = -EIO; - } else if (inode2 == NULL) { - __ext4_error(inode1->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 %lu inode2 NULL", inode1->i_ino); - ret = -EIO; - } - return ret; -} - -/** * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire write lock of i_data_sem of the two inodes (orig and donor) by - * i_ino order. + * Acquire write lock of i_data_sem of the two inodes */ static void -double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +double_down_write_data_sem(struct inode *first, struct inode *second) { - struct inode *first = orig_inode, *second = donor_inode; + if (first < second) { + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + } else { + down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; } - - down_write(&EXT4_I(first)->i_data_sem); - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** @@ -969,14 +935,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - /* Files should be in the same ext4 FS */ - if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files " - "should be in same FS [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " @@ -1072,35 +1030,19 @@ mext_check_arguments(struct inode *orig_inode, * @inode1: the inode structure * @inode2: the inode structure * - * Lock two inodes' i_mutex by i_ino order. - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + * Lock two inodes' i_mutex */ -static int +static void mext_inode_double_lock(struct inode *inode1, struct inode *inode2) { - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1 == inode2) { - mutex_lock(&inode1->i_mutex); - goto out; - } - - if (inode1->i_ino < inode2->i_ino) { + BUG_ON(inode1 == inode2); + if (inode1 < inode2) { mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); } else { mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); } - -out: - return ret; } /** @@ -1109,28 +1051,13 @@ out: * @inode1: the inode that is released first * @inode2: the inode that is released second * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static int +static void mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) { - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); - -out: - return ret; + mutex_unlock(&inode1->i_mutex); + mutex_unlock(&inode2->i_mutex); } /** @@ -1187,16 +1114,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; ext4_lblk_t rest_blocks; pgoff_t orig_page_offset = 0, seq_end_page; - int ret1, ret2, depth, last_extent = 0; + int ret, depth, last_extent = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; int uninit; - /* orig and donor should be different file */ - if (orig_inode->i_ino == donor_inode->i_ino) { + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* orig and donor should be different inodes */ + if (orig_inode == donor_inode) { ext4_debug("ext4 move extent: The argument files should not " - "be same file [ino:orig %lu, donor %lu]\n", + "be same inode [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -1210,16 +1144,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } /* Protect orig and donor inodes against a truncate */ - ret1 = mext_inode_double_lock(orig_inode, donor_inode); - if (ret1 < 0) - return ret1; + mext_inode_double_lock(orig_inode, donor_inode); /* Protect extent tree against block allocations via delalloc */ double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + ret = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len); - if (ret1) + if (ret) goto out; file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; @@ -1227,13 +1159,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (file_end < block_end) len -= block_end - file_end; - ret1 = get_ext_path(orig_inode, block_start, &orig_path); - if (ret1) + ret = get_ext_path(orig_inode, block_start, &orig_path); + if (ret) goto out; /* Get path structure to check the hole */ - ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); - if (ret1) + ret = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret) goto out; depth = ext_depth(orig_inode); @@ -1252,13 +1184,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; goto out; } last_extent = mext_next_extent(orig_inode, orig_path, &ext_dummy); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; goto out; } seq_start = le32_to_cpu(ext_cur->ee_block); @@ -1272,7 +1204,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (le32_to_cpu(ext_cur->ee_block) > block_end) { ext4_debug("ext4 move extent: The specified range of file " "may be the hole\n"); - ret1 = -EINVAL; + ret = -EINVAL; goto out; } @@ -1292,7 +1224,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; break; } add_blocks = ext4_ext_get_actual_len(ext_cur); @@ -1349,18 +1281,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, orig_page_offset, data_offset_in_page, block_len_in_page, uninit, - &ret1); + &ret); /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; - if (ret1 < 0) + if (ret < 0) break; if (*moved_len > len) { EXT4_ERROR_INODE(orig_inode, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); - ret1 = -EIO; + ret = -EIO; break; } @@ -1374,22 +1306,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } double_down_write_data_sem(orig_inode, donor_inode); - if (ret1 < 0) + if (ret < 0) break; /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); - ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); - if (ret1) + ret = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret) break; depth = holecheck_path->p_depth; /* Decrease buffer counter */ if (orig_path) ext4_ext_drop_refs(orig_path); - ret1 = get_ext_path(orig_inode, seq_start, &orig_path); - if (ret1) + ret = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret) break; ext_cur = holecheck_path[depth].p_ext; @@ -1412,12 +1344,7 @@ out: kfree(holecheck_path); } double_up_write_data_sem(orig_inode, donor_inode); - ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + mext_inode_double_unlock(orig_inode, donor_inode); - if (ret1) - return ret1; - else if (ret2) - return ret2; - - return 0; + return ret; } -- cgit v0.10.2 From f066055a3449f0e5b0ae4f3ceab4445bead47638 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 26 Sep 2012 12:32:54 -0400 Subject: ext4: online defrag is not supported for journaled files Proper block swap for inodes with full journaling enabled is truly non obvious task. In order to be on a safe side let's explicitly disable it for now. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index df5cde5..e2016f3 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1142,7 +1142,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - + /* TODO: This is non obvious task to swap blocks for inodes with full + jornaling enabled */ + if (ext4_should_journal_data(orig_inode) || + ext4_should_journal_data(donor_inode)) { + return -EINVAL; + } /* Protect orig and donor inodes against a truncate */ mext_inode_double_lock(orig_inode, donor_inode); -- cgit v0.10.2 From bb5574880574fea38c674942cf0360253a2d60fe Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 26 Sep 2012 12:52:07 -0400 Subject: ext4: clean up online defrag bugs in move_extent_per_page() Non-full list of bugs: 1) uninitialized extent optimization does not hold page's lock, and simply replace brunches after that writeback code goes crazy because block mapping changed under it's feets kernel BUG at fs/ext4/inode.c:1434! ( 288'th xfstress) 2) uninitialized extent may became initialized right after we drop i_data_sem, so extent state must be rechecked 3) Locked pages goes uptodate via following sequence: ->readpage(page); lock_page(page); use_that_page(page) But after readpage() one may invalidate it because it is uptodate and unlocked (reclaimer does that) As result kernel bug at include/linux/buffer_head.c:133! 4) We call write_begin() with already opened stansaction which result in following deadlock: ->move_extent_per_page() ->ext4_journal_start()-> hold journal transaction ->write_begin() ->ext4_da_write_begin() ->ext4_nonda_switch() ->writeback_inodes_sb_if_idle() --> will wait for journal_stop() 5) try_to_release_page() may fail and it does fail if one of page's bh was pinned by journal 6) If we about to change page's mapping we MUST hold it's lock during entire remapping procedure, this is true for both pages(original and donor one) Fixes: - Avoid (1) and (2) simply by temproraly drop uninitialized extent handling optimization, this will be reimplemented later. - Fix (3) by manually forcing page to uptodate state w/o dropping it's lock - Fix (4) by rearranging existing locking: from: journal_start(); ->write_begin to: write_begin(); journal_extend() - Fix (5) simply by checking retvalue - Fix (6) by locking both (original and donor one) pages during extent swap with help of mext_page_double_lock() Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e2016f3..c87a746 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -736,6 +736,118 @@ out: } /** + * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * @index: page index + * @page: result page vector + * + * Grab two locked pages for inode's by inode order + */ +static int +mext_page_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index, struct page *page[2]) +{ + struct address_space *mapping[2]; + unsigned fl = AOP_FLAG_NOFS; + + BUG_ON(!inode1 || !inode2); + if (inode1 < inode2) { + mapping[0] = inode1->i_mapping; + mapping[1] = inode2->i_mapping; + } else { + mapping[0] = inode2->i_mapping; + mapping[1] = inode1->i_mapping; + } + + page[0] = grab_cache_page_write_begin(mapping[0], index, fl); + if (!page[0]) + return -ENOMEM; + + page[1] = grab_cache_page_write_begin(mapping[1], index, fl); + if (!page[1]) { + unlock_page(page[0]); + page_cache_release(page[0]); + return -ENOMEM; + } + + if (inode1 > inode2) { + struct page *tmp; + tmp = page[0]; + page[0] = page[1]; + page[1] = tmp; + } + return 0; +} + +/* Force page buffers uptodate w/o dropping page's lock */ +static int +mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + sector_t block; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, block_start, block_end; + int i, err, nr = 0, partial = 0; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (PageUptodate(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + head = page_buffers(page); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + continue; + } + if (buffer_uptodate(bh)) + continue; + if (!buffer_mapped(bh)) { + int err = 0; + err = ext4_get_block(inode, block, bh, 0); + if (err) { + SetPageError(page); + return err; + } + if (!buffer_mapped(bh)) { + zero_user(page, block_start, blocksize); + if (!err) + set_buffer_uptodate(bh); + continue; + } + } + BUG_ON(nr >= MAX_BUF_PER_PAGE); + arr[nr++] = bh; + } + /* No io required */ + if (!nr) + goto out; + + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); + if (err) + return err; + } + } +out: + if (!partial) + SetPageUptodate(page); + return 0; +} + +/** * move_extent_per_page - Move extent data per page * * @o_filp: file structure of original file @@ -757,26 +869,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int block_len_in_page, int uninit, int *err) { struct inode *orig_inode = o_filp->f_dentry->d_inode; - struct address_space *mapping = orig_inode->i_mapping; - struct buffer_head *bh; - struct page *page = NULL; - const struct address_space_operations *a_ops = mapping->a_ops; + struct page *pagep[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset; long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; - void *fsdata; - int i, jblocks; - int err2 = 0; + int err2, jblocks, retries = 0; int replaced_count = 0; + int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* * It needs twice the amount of ordinary journal buffers because * inode and donor_inode may change each different metadata blocks. */ +again: + *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, jblocks); if (IS_ERR(handle)) { @@ -790,19 +900,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; - /* - * If orig extent is uninitialized one, - * it's not necessary force the page into memory - * and then force it to be written out again. - * Just swap data blocks between orig and donor. - */ - if (uninit) { - replaced_count = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page, err); - goto out2; - } - offs = (long long)orig_blk_offset << orig_inode->i_blkbits; /* Calculate data_size */ @@ -824,75 +921,81 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, replaced_size = data_size; - *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, - &page, &fsdata); + *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, + pagep); if (unlikely(*err < 0)) - goto out; + goto stop_journal; - if (!PageUptodate(page)) { - mapping->a_ops->readpage(o_filp, page); - lock_page(page); + *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + if (*err) + goto unlock_pages; + + /* At this point all buffers in range are uptodate, old mapping layout + * is no longer required, try to drop it now. */ + if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto unlock_pages; } - /* - * try_to_release_page() doesn't call releasepage in writeback mode. - * We should care about the order of writing to the same file - * by multiple move extent processes. - * It needs to call wait_on_page_writeback() to wait for the - * writeback of the page. - */ - wait_on_page_writeback(page); - - /* Release old bh and drop refs */ - try_to_release_page(page, 0); - replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page, - &err2); - if (err2) { + orig_blk_offset, + block_len_in_page, err); + if (*err) { if (replaced_count) { block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto out; + goto unlock_pages; } + /* Perform all necessary steps similar write_begin()/write_end() + * but keeping in mind that i_size will not change */ + *err = __block_write_begin(pagep[0], from, from + replaced_size, + ext4_get_block); + if (!*err) + *err = block_commit_write(pagep[0], from, from + replaced_size); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); - - bh = page_buffers(page); - for (i = 0; i < data_offset_in_page; i++) - bh = bh->b_this_page; - - for (i = 0; i < block_len_in_page; i++) { - *err = ext4_get_block(orig_inode, - (sector_t)(orig_blk_offset + i), bh, 0); - if (*err < 0) - goto out; - - if (bh->b_this_page != NULL) - bh = bh->b_this_page; - } - - *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, - page, fsdata); - page = NULL; - -out: - if (unlikely(page)) { - if (PageLocked(page)) - unlock_page(page); - page_cache_release(page); - ext4_journal_stop(handle); - } -out2: + if (unlikely(*err < 0)) + goto repair_branches; + + /* Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss */ + *err = ext4_jbd2_file_inode(handle, orig_inode); + +unlock_pages: + unlock_page(pagep[0]); + page_cache_release(pagep[0]); + unlock_page(pagep[1]); + page_cache_release(pagep[1]); +stop_journal: ext4_journal_stop(handle); - - if (err2) - *err = err2; - + /* Buffer was busy because probably is pinned to journal transaction, + * force transaction commit may help to free it. */ + if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, + &retries)) + goto again; return replaced_count; + +repair_branches: + /* + * This should never ever happen! + * Extents are swapped already, but we are not able to copy data. + * Try to swap extents to it's original places + */ + double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, + orig_blk_offset, + block_len_in_page, &err2); + double_up_write_data_sem(orig_inode, donor_inode); + if (replaced_count != block_len_in_page) { + EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), + "Unable to copy data block," + " data will be lost."); + *err = -EIO; + } + replaced_count = 0; + goto unlock_pages; } /** -- cgit v0.10.2 From 8c85447391735469f407add6fdb0630ce59d7f6d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 26 Sep 2012 12:54:52 -0400 Subject: ext4: reimplement uninit extent optimization for move_extent_per_page() Uninitialized extent may became initialized(parallel writeback task) at any moment after we drop i_data_sem, so we have to recheck extent's state after we hold page's lock and i_data_sem. If we about to change page's mapping we must hold page's lock in order to serialize other users. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c87a746..c2e47da 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -595,6 +595,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, } /** + * mext_check_coverage - Check that all extents in range has the same type + * + * @inode: inode in question + * @from: block offset of inode + * @count: block count to be checked + * @uninit: extents expected to be uninitialized + * @err: pointer to save error value + * + * Return 1 if all extents in range has expected type, and zero otherwise. + */ +static int +mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, + int uninit, int *err) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ext; + ext4_lblk_t last = from + count; + while (from < last) { + *err = get_ext_path(inode, from, &path); + if (*err) + return 0; + ext = path[ext_depth(inode)].p_ext; + if (!ext) { + ext4_ext_drop_refs(path); + return 0; + } + if (uninit != ext4_ext_is_uninitialized(ext)) { + ext4_ext_drop_refs(path); + return 0; + } + from += ext4_ext_get_actual_len(ext); + ext4_ext_drop_refs(path); + } + return 1; +} + +/** * mext_replace_branches - Replace original extents with new extents * * @handle: journal handle @@ -629,9 +666,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; - /* Protect extent trees against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); - /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -730,8 +764,6 @@ out: ext4_ext_invalidate_cache(orig_inode); ext4_ext_invalidate_cache(donor_inode); - double_up_write_data_sem(orig_inode, donor_inode); - return replaced_count; } @@ -925,7 +957,46 @@ again: pagep); if (unlikely(*err < 0)) goto stop_journal; + /* + * If orig extent was uninitialized it can become initialized + * at any time after i_data_sem was dropped, in order to + * serialize with delalloc we have recheck extent while we + * hold page's lock, if it is still the case data copy is not + * necessary, just swap data blocks between orig and donor. + */ + if (uninit) { + double_down_write_data_sem(orig_inode, donor_inode); + /* If any of extents in range became initialized we have to + * fallback to data copying */ + uninit = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + uninit &= mext_check_coverage(donor_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + + if (!uninit) { + double_up_write_data_sem(orig_inode, donor_inode); + goto data_copy; + } + if ((page_has_private(pagep[0]) && + !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && + !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto drop_data_sem; + } + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); + drop_data_sem: + double_up_write_data_sem(orig_inode, donor_inode); + goto unlock_pages; + } +data_copy: *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); if (*err) goto unlock_pages; -- cgit v0.10.2 From 85556c9a503ababa889bd7bb7a9b3effba795d00 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 26 Sep 2012 20:43:37 -0400 Subject: ext4: use kmem_cache_zalloc instead of kmem_cache_alloc/memset Using kmem_cache_zalloc() instead of kmem_cache_alloc() and memset(). spatch with a semantic match is used to found this problem. (http://coccinelle.lip6.fr/) Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bb821a9..8ec6f88 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2232,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } - memset(meta_group_info[i], 0, kmem_cache_size(cachep)); set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); @@ -4010,7 +4009,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - memset(ac, 0, sizeof(struct ext4_allocation_context)); ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; @@ -4293,7 +4291,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } } - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; -- cgit v0.10.2 From 63fedaf1c2dd546da22ff5b4ae06e9b4efba5146 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 26 Sep 2012 21:09:06 -0400 Subject: ext4: remove unused function ext4_ext_check_cache Remove unused function ext4_ext_check_cache() and merge the code back to the ext4_ext_in_cache(). Reviewed-by: Carlos Maiolino Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a510917..cbcc6b3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2136,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_check_cache() + * ext4_ext_in_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given - * cache extent pointer. If the cached extent is a hole, - * this routine should be used instead of - * ext4_ext_in_cache if the calling function needs to - * know the size of the hole. + * cache extent pointer. * * @inode: The files inode * @block: The block to look for in the cache @@ -2151,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, * * Return 0 if cache is invalid; 1 if the cache is valid */ -static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_cache *ex){ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ struct ext4_ext_cache *cex; struct ext4_sb_info *sbi; int ret = 0; @@ -2169,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, goto errout; if (in_range(block, cex->ec_block, cex->ec_len)) { - memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ex->ee_block = cpu_to_le32(cex->ec_block); + ext4_ext_store_pblock(ex, cex->ec_start); + ex->ee_len = cpu_to_le16(cex->ec_len); ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); @@ -2182,37 +2183,6 @@ errout: } /* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache cex; - int ret = 0; - - if (ext4_ext_check_cache(inode, block, &cex)) { - ex->ee_block = cpu_to_le32(cex.ec_block); - ext4_ext_store_pblock(ex, cex.ec_start); - ex->ee_len = cpu_to_le16(cex.ec_len); - ret = 1; - } - - return ret; -} - - -/* * ext4_ext_rm_idx: * removes index from the index block. */ -- cgit v0.10.2 From 6a08f447facb4f9e29fcc30fb68060bb5a0d21c2 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 26 Sep 2012 21:24:57 -0400 Subject: ext4: always set i_op in ext4_mknod() ext4_special_inode_operations have their own ifdef CONFIG_EXT4_FS_XATTR to mask those methods. And ext4_iget also always sets it, so there is an inconsistency. Signed-off-by: Bernd Schubert Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8f4bda7..bd87b7a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2156,9 +2156,7 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT4_FS_XATTR inode->i_op = &ext4_special_inode_operations; -#endif err = ext4_add_nondir(handle, dentry, inode); } ext4_journal_stop(handle); -- cgit v0.10.2 From b71fc079b5d8f42b2a52743c8d2f1d35d655b1c5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 26 Sep 2012 21:52:20 -0400 Subject: ext4: fix fdatasync() for files with only i_size changes Code tracking when transaction needs to be committed on fdatasync(2) forgets to handle a situation when only inode's i_size is changed. Thus in such situations fdatasync(2) doesn't force transaction with new i_size to disk and that can result in wrong i_size after a crash. Fix the issue by updating inode's i_datasync_tid whenever its size is updated. CC: # >= 2.6.32 Reported-by: Kristian Nielsen Signed-off-by: Jan Kara diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0a31197..4df5e95 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4054,6 +4054,7 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; uid_t i_uid; gid_t i_gid; @@ -4104,7 +4105,10 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize != ext4_isize(raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -4157,7 +4161,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); - ext4_update_inode_fsync_trans(handle, inode, 0); + ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); -- cgit v0.10.2 From aaf7d73e54b6915310ece11aedb19ec06a833642 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 26 Sep 2012 22:21:21 -0400 Subject: ext4: enable FITRIM ioctl on bigalloc file system With a minor tweaks regarding minimum extent size to discard and discarded bytes reporting the FITRIM can be enabled on bigalloc file system and it works without any problem. This patch fixes minlen handling and discarded bytes reporting to take into consideration bigalloc enabled file systems and finally removes the restriction and allow FITRIM to be used on file system with bigalloc feature enabled. Reviewed-by: Carlos Maiolino Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8b84fe2..4c8174a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -404,13 +404,6 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "FITRIM not supported with bigalloc"); - return -EOPNOTSUPP; - } - if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8ec6f88..a415465 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4990,7 +4990,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; - minlen = range->minlen >> sb->s_blocksize_bits; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || unlikely(start >= max_blks)) @@ -5050,6 +5051,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); out: - range->len = trimmed * sb->s_blocksize; + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } -- cgit v0.10.2 From 9b68733273665a4c0d98041a657dabfb4fd6bd80 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 26 Sep 2012 22:58:50 -0400 Subject: ext4: release donor reference when EXT4_IOC_MOVE_EXT ioctl fails When the EXT4_IOC_MOVE_EXT ioctl() fails on bigalloc file systems, we should jump to the 'mext_out' label to release the donor file reference. Signed-off-by: Djalal Harouni Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4c8174a..17c53a6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -258,7 +258,8 @@ group_extend_out: EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto mext_out; } err = mnt_want_write_file(filp); -- cgit v0.10.2 From b794e7a6ebfbddb819b0e75ab59ada6b08a285f2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 26 Sep 2012 23:11:13 -0400 Subject: jbd2: fix assertion failure in commit code due to lacking transaction credits ext4 users of data=journal mode with blocksize < pagesize were occasionally hitting assertion failure in jbd2_journal_commit_transaction() checking whether the transaction has at least as many credits reserved as buffers attached. The core of the problem is that when a file gets truncated, buffers that still need checkpointing or that are attached to the committing transaction are left with buffer_mapped set. When this happens to buffers beyond i_size attached to a page stradding i_size, subsequent write extending the file will see these buffers and as they are mapped (but underlying blocks were freed) things go awry from here. The assertion failure just coincidentally (and in this case luckily as we would start corrupting filesystem) triggers due to journal_head not being properly cleaned up as well. We fix the problem by unmapping buffers if possible (in lots of cases we just need a buffer attached to a transaction as a place holder but it must not be written out anyway). And in one case, we just have to bite the bullet and wait for transaction commit to finish. CC: Josef Bacik Signed-off-by: Jan Kara diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index af5280f..3091d42 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1014,17 +1014,35 @@ restart_loop: * there's no point in keeping a checkpoint record for * it. */ - /* A buffer which has been freed while still being - * journaled by a previous transaction may end up still - * being dirty here, but we want to avoid writing back - * that buffer in the future after the "add to orphan" - * operation been committed, That's not only a performance - * gain, it also stops aliasing problems if the buffer is - * left behind for writeback and gets reallocated for another - * use in a different page. */ - if (buffer_freed(bh) && !jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ + if (buffer_freed(bh)) { + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather through in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. + */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index fb1ab953..a74ba46 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1841,15 +1841,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; - int ret; BUFFER_TRACE(bh, "entry"); +retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1878,10 +1879,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * clear the buffer dirty bit at latest at the moment when the * transaction marking the buffer as freed in the filesystem * structures is committed because from that moment on the - * buffer can be reallocated and used by a different page. + * block can be reallocated and used by a different page. * Since the block hasn't been freed yet but the inode has * already been added to orphan list, it is safe for us to add * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. */ transaction = jh->b_transaction; if (transaction == NULL) { @@ -1908,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_running_transaction); - jbd2_journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - write_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have @@ -1922,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_committing_transaction); - jbd2_journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - write_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ @@ -1940,10 +1941,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) JBUFFER_TRACE(jh, "on committing transaction"); /* * The buffer is committing, we simply cannot touch - * it. So we just set j_next_transaction to the - * running transaction (if there is one) and mark - * buffer as freed so that commit code knows it should - * clear dirty bits when it is done with the buffer. + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); + goto retry; + } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) @@ -1966,6 +1981,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. + */ + jh->b_modified = 0; jbd2_journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); @@ -2017,7 +2041,8 @@ void jbd2_journal_invalidatepage(journal_t *journal, if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh); + may_free &= journal_unmap_buffer(journal, bh, + offset > 0); unlock_buffer(bh); } curr_off = next_off; -- cgit v0.10.2 From c25f9bc6143f4cb4dc31d2ad7a6fe4e4005fc414 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 26 Sep 2012 23:30:12 -0400 Subject: ext4: don't clear orphan list on ro mount with errors If the file system contains errors and it is being mounted read-only, don't clear the orphan list. We should minimize changes to the file system if it is mounted read-only. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7bef0a4..a53a23a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2177,10 +2177,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } -- cgit v0.10.2 From cbb4ee830e0057f8d60b4e0a8c4b6daa6cdd28d7 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Thu, 27 Sep 2012 08:00:01 -0400 Subject: ext4: remove redundant offset check in mext_check_arguments() In the check code above, if orig_start != donor_start, we would return -EINVAL. So here, orig_start should be equal with donor_start. Remove the redundant check here. Signed-off-by: Wang Sheng-Hui Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c2e47da..cee4bd0 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1134,7 +1134,6 @@ mext_check_arguments(struct inode *orig_inode, } if ((orig_start >= EXT_MAX_BLOCKS) || - (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " -- cgit v0.10.2 From bbdd68086ca4a8976226e23efd08e2058d34dd81 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 27 Sep 2012 09:29:33 -0400 Subject: fs: reserve fallocate flag codepoint As discussed at the Plumber's Conference, reserve the bit 0x04 in fallocate() to prevent collisions with a commonly used out-of-tree patch which implements the no-hide-stale feature. Signed-off-by: "Theodore Ts'o" diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 73e0b62..d39b824 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -3,6 +3,7 @@ #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ +#define FALLOC_FL_NO_HIDE_STALE 0x04 /* reserved codepoint */ #ifdef __KERNEL__ -- cgit v0.10.2 From 6d1ab10e69ff5f3cb63920ba965ec0f1f0bdaf8d Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 27 Sep 2012 09:31:33 -0400 Subject: ext4: ext4_bread usage audit When ext4_bread() returns NULL and err is set to zero, this means there is no phyical block mapped to the specified logical block number. (Previous to commit 90b0a97323, err was uninitialized in this case, which caused other problems.) The directory handling routines use ext4_bread() in many places, the fact that ext4_bread() now returns NULL with err set to zero could cause problems since a number of these functions will simply return the value of err if the result of ext4_bread() was the NULL pointer, causing the caller of the function to think that the function was successful. Since directories should never contain holes, this case can only happen if the file system is corrupted. This commit audits all of the callers of ext4_bread(), and makes sure they do the right thing if a hole in a directory is found by ext4_bread(). Some ext4_bread() callers did not need any changes either because they already had its own hole detector paths. Signed-off-by: Carlos Maiolino Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bd87b7a..6d600a6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -74,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle, bh = NULL; } } + if (!bh && !(*err)) { + *err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } return bh; } @@ -601,8 +607,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir, u32 hash; frame->bh = NULL; - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) { + if (*err == 0) + *err = ERR_BAD_DX_DIR; goto fail; + } root = (struct dx_root *) bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && @@ -703,8 +712,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) { + if (!(*err)) + *err = ERR_BAD_DX_DIR; goto fail2; + } at = entries = ((struct dx_node *) bh->b_data)->entries; if (!buffer_verified(bh) && @@ -814,8 +826,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, */ while (num_frames--) { if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) + 0, &err))) { + if (!err) { + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + return -EIO; + } return err; /* Failure */ + } if (!buffer_verified(bh) && !ext4_dx_csum_verify(dir, @@ -850,8 +869,15 @@ static int htree_dirblock_to_tree(struct file *dir_file, dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); - if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) + if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) { + if (!err) { + err = -EIO; + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + } return err; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) @@ -1274,8 +1300,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread(NULL, dir, block, 0, err))) + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) { + if (!(*err)) { + *err = -EIO; + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + } goto errout; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, @@ -1808,9 +1841,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { - bh = ext4_bread(handle, dir, block, 0, &retval); - if(!bh) + if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) { + if (!retval) { + retval = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } return retval; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) @@ -1867,8 +1906,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, entries = frame->entries; at = frame->at; - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) { + if (!err) { + err = -EIO; + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + } goto cleanup; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) @@ -2204,9 +2250,15 @@ retry: inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); - if (!dir_block) + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { + if (!err) { + err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } goto out_clear_inode; + } BUFFER_TRACE(dir_block, "get_write_access"); err = ext4_journal_get_write_access(handle, dir_block); if (err) @@ -2323,6 +2375,11 @@ static int empty_dir(struct inode *inode) EXT4_ERROR_INODE(inode, "error %d reading directory " "lblock %u", err, lblock); + else + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + offset += sb->s_blocksize; continue; } @@ -2830,9 +2887,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); - if (!dir_bh) + if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { + if (!retval) { + retval = -EIO; + ext4_error(old_inode->i_sb, + "Directory hole detected on inode %lu\n", + old_inode->i_ino); + } goto end_rename; + } if (!buffer_verified(dir_bh) && !ext4_dirent_csum_verify(old_inode, (struct ext4_dir_entry *)dir_bh->b_data)) -- cgit v0.10.2 From ba39ebb61401cfe0ccd58dd0cd4da88465528c0a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 27 Sep 2012 09:37:53 -0400 Subject: ext4: convert to use leXX_add_cpu() Convert cpu_to_leXX(leXX_to_cpu(E1) + E2) to use leXX_add_cpu(). dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cbcc6b3..8f08c7b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); + le16_add_cpu(&neh->eh_depth, 1); ext4_mark_inode_dirty(handle, inode); out: brelse(bh); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index cee4bd0..8076b96 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -570,9 +570,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, diff = donor_off - le32_to_cpu(tmp_dext->ee_block); ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); - tmp_dext->ee_block = - cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + le32_add_cpu(&tmp_dext->ee_block, diff); + le16_add_cpu(&tmp_dext->ee_len, -diff); if (max_count < ext4_ext_get_actual_len(tmp_dext)) tmp_dext->ee_len = cpu_to_le16(max_count); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a53a23a..ae45632 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -431,7 +431,7 @@ static void __save_error_info(struct super_block *sb, const char *func, */ if (!es->s_error_count) mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); - es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); + le32_add_cpu(&es->s_error_count, 1); } static void save_error_info(struct super_block *sb, const char *func, -- cgit v0.10.2 From f45ee3a1ea438af96e4fd2c0b16d195e67ef235f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:21:09 -0400 Subject: ext4: ext4_inode_info diet Generic inode has unused i_private pointer which may be used as cur_aio_dio storage. TODO: If cur_aio_dio will be passed as an argument to get_block_t this allow to have concurent AIO_DIO requests. Reviewed-by: Zheng Liu Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b6902c..3e8e185 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -912,8 +912,6 @@ struct ext4_inode_info { struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - /* current io_end structure for async DIO write*/ - ext4_io_end_t *cur_aio_dio; atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1338,6 +1336,16 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, } } +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) +{ + return inode->i_private; +} + +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) +{ + inode->i_private = io; +} + /* * Inode dynamic state flags */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8f08c7b..a1f56c3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3618,7 +3618,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, { int ret = 0; int err = 0; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", @@ -3876,7 +3876,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; ext_debug("blocks %u/%u requested for inode %lu\n", diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4df5e95..a995886 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3056,7 +3056,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * hook to the iocb. */ iocb->private = NULL; - EXT4_I(inode)->cur_aio_dio = NULL; + ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); @@ -3073,7 +3073,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * is a unwritten extents needs to be converted * when IO is completed. */ - EXT4_I(inode)->cur_aio_dio = iocb->private; + ext4_inode_aio_set(inode, io_end); } if (overwrite) @@ -3093,7 +3093,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, NULL, DIO_LOCKING); if (iocb->private) - EXT4_I(inode)->cur_aio_dio = NULL; + ext4_inode_aio_set(inode, NULL); /* * The io_end structure takes a reference to the inode, * that structure needs to be destroyed and the diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ae45632..e05267a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -965,7 +965,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); - ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); -- cgit v0.10.2 From e27f41e1b789e60e7d8cc9c81fd93ca49ef31f13 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:24:52 -0400 Subject: ext4: give i_aiodio_unwritten a more appropriate name AIO/DIO prefix is wrong because it account unwritten extents which also may be scheduled from buffered write endio Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3e8e185..6ec9856 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -912,7 +912,7 @@ struct ext4_inode_info { struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1332,7 +1332,7 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + atomic_inc(&EXT4_I(inode)->i_unwritten); } } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3b0e3bd..39335bd 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_aiodio_wait(struct inode *inode) +static void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); } /* @@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, "performance will be poor.", inode->i_ino, current->comm); mutex_lock(ext4_aio_mutex(inode)); - ext4_aiodio_wait(inode); + ext4_unwritten_wait(inode); } BUG_ON(iocb->ki_pos != pos); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dcdeef1..de77e31 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -113,7 +113,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->flag & EXT4_IO_END_DIRECT) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) wake_up_all(ext4_ioend_wq(io->inode)); return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e05267a..982f6fc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -968,7 +968,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); - atomic_set(&ei->i_aiodio_unwritten, 0); + atomic_set(&ei->i_unwritten, 0); return &ei->vfs_inode; } -- cgit v0.10.2 From 82e54229118785badffb4ef5ba4803df25fe007f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:36:25 -0400 Subject: ext4: fix unwritten counter leakage ext4_set_io_unwritten_flag() will increment i_unwritten counter, so once we mark end_io with EXT4_END_IO_UNWRITTEN we have to revert it back on error path. - add missed error checks to prevent counter leakage - ext4_end_io_nolock() will clear EXT4_END_IO_UNWRITTEN flag to signal that conversion finished. - add BUG_ON to ext4_free_end_io() to prevent similar leakage in future. Visible effect of this bug is that unaligned aio_stress may deadlock Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a1f56c3..54a9442 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3633,6 +3633,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, map, path, flags); + if (ret <= 0) + goto out; /* * Flag the inode(non aio case) or end_io struct (aio case) * that this IO needs to conversion to written when IO is @@ -3878,6 +3880,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_request ar; ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; + int set_unwritten = 0; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4100,13 +4103,8 @@ got_allocated_blocks: * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) + set_unwritten = 1; if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; } @@ -4118,6 +4116,15 @@ got_allocated_blocks: if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + + if (!err && set_unwritten) { + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index de77e31..9970022 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -71,6 +71,8 @@ void ext4_free_io_end(ext4_io_end_t *io) int i; BUG_ON(!io); + BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); + if (io->page) put_page(io->page); for (i = 0; i < io->num_io_pages; i++) @@ -94,6 +96,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) ssize_t size = io->size; int ret = 0; + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); @@ -106,7 +110,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - + io->flag &= ~EXT4_IO_END_UNWRITTEN; if (io->iocb) aio_complete(io->iocb, io->result, 0); -- cgit v0.10.2 From 28a535f9a0df060569dcc786e5bc2e1de43d7dc7 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:14:55 -0400 Subject: ext4: completed_io locking cleanup Current unwritten extent conversion state-machine is very fuzzy. - For unknown reason it performs conversion under i_mutex. What for? My diagnosis: We already protect extent tree with i_data_sem, truncate and punch_hole should wait for DIO, so the only data we have to protect is end_io->flags modification, but only flush_completed_IO and end_io_work modified this flags and we can serialize them via i_completed_io_lock. Currently all these games with mutex_trylock result in the following deadlock truncate: kworker: ext4_setattr ext4_end_io_work mutex_lock(i_mutex) inode_dio_wait(inode) ->BLOCK DEADLOCK<- mutex_trylock() inode_dio_done() #TEST_CASE1_BEGIN MNT=/mnt_scrach unlink $MNT/file fallocate -l $((1024*1024*1024)) $MNT/file aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file sleep 2 truncate -s 0 $MNT/file #TEST_CASE1_END Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286 This patch makes state machine simple and clean: (1) xxx_end_io schedule final extent conversion simply by calling ext4_add_complete_io(), which append it to ei->i_completed_io_list NOTE1: because of (2A) work should be queued only if ->i_completed_io_list was empty, otherwise the work is scheduled already. (2) ext4_flush_completed_IO is responsible for handling all pending end_io from ei->i_completed_io_list Flushing sequence consists of following stages: A) LOCKED: Atomically drain completed_io_list to local_list B) Perform extents conversion C) LOCKED: move converted io's to to_free list for final deletion This logic depends on context which we was called from. D) Final end_io context destruction NOTE1: i_mutex is no longer required because end_io->flags modification is protected by ei->ext4_complete_io_lock Full list of changes: - Move all completion end_io related routines to page-io.c in order to improve logic locality - Move open coded logic from various xx_end_xx routines to ext4_add_complete_io() - remove EXT4_IO_END_FSYNC - Improve SMP scalability by removing useless i_mutex which does not protect io->flags anymore. - Reduce lock contention on i_completed_io_lock by optimizing list walk. - Rename ext4_end_io_nolock to end4_end_io and make it static - Check flush completion status to ext4_ext_punch_hole(). Because it is not good idea to punch blocks from corrupted inode. Changes since V3 (in request to Jan's comments): Fall back to active flush_completed_IO() approach in order to prevent performance issues with nolocked DIO reads. Changes since V2: Fix use-after-free caused by race truncate vs end_io_work Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6ec9856..edb0495 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -186,7 +186,6 @@ struct mpage_da_data { #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_QUEUED 0x0004 #define EXT4_IO_END_DIRECT 0x0008 -#define EXT4_IO_END_IN_FSYNC 0x0010 struct ext4_io_page { struct page *p_page; @@ -2418,11 +2417,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); +extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern int ext4_end_io_nolock(ext4_io_end_t *io); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54a9442..2320774 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4833,7 +4833,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } /* finish any pending end_io work */ - ext4_flush_completed_IO(inode); + err = ext4_flush_completed_IO(inode); + if (err) + return err; credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, credits); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 323eb15..4600008 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -34,87 +34,6 @@ #include -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4FS_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; - unsigned long flags; - - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } - - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); - - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); -#endif -} - -/* - * This function is called from ext4_sync_file(). - * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. When fsync is called, we need to ensure the - * conversion is complete before fsync returns. - * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. - */ -int ext4_flush_completed_IO(struct inode *inode) -{ - ext4_io_end_t *io; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret = 0; - int ret2 = 0; - - dump_completed_IO(inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&ei->i_completed_io_list)){ - io = list_entry(ei->i_completed_io_list.next, - ext4_io_end_t, list); - list_del_init(&io->list); - io->flag |= EXT4_IO_END_IN_FSYNC; - /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. - * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. - * - * Thus we need to keep the io structure still valid here after - * conversion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. - */ - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - ret = ext4_end_io_nolock(io); - if (ret < 0) - ret2 = ret; - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - io->flag &= ~EXT4_IO_END_IN_FSYNC; - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - return (ret2 < 0) ? ret2 : 0; -} - /* * If we're not journaling and this is a just-created file, we have to * sync our parent directory (if it was freshly created) since diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 830e1b2..61f13e5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, retry: if (rw == READ && ext4_should_dioread_nolock(inode)) { - if (unlikely(!list_empty(&ei->i_completed_io_list))) { - mutex_lock(&inode->i_mutex); + if (unlikely(!list_empty(&ei->i_completed_io_list))) ext4_flush_completed_IO(inode); - mutex_unlock(&inode->i_mutex); - } + ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a995886..09d0488 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2881,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; ext4_io_end_t *io_end = iocb->private; - struct workqueue_struct *wq; - unsigned long flags; - struct ext4_inode_info *ei; /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) @@ -2912,24 +2909,14 @@ out: io_end->iocb = iocb; io_end->result = ret; } - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - - /* Add the io_end to per-inode completed aio dio list*/ - ei = EXT4_I(io_end->inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &ei->i_completed_io_list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); + ext4_add_complete_io(io_end); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) { ext4_io_end_t *io_end = bh->b_private; - struct workqueue_struct *wq; struct inode *inode; - unsigned long flags; if (!test_clear_buffer_uninit(bh) || !io_end) goto out; @@ -2948,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) */ inode = io_end->inode; ext4_set_io_unwritten_flag(inode, io_end); - - /* Add the io_end to per-inode completed io list*/ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); + ext4_add_complete_io(io_end); out: bh->b_private = NULL; bh->b_end_io = NULL; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9970022..5b24c40 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -71,6 +71,7 @@ void ext4_free_io_end(ext4_io_end_t *io) int i; BUG_ON(!io); + BUG_ON(!list_empty(&io->list)); BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); if (io->page) @@ -83,21 +84,14 @@ void ext4_free_io_end(ext4_io_end_t *io) kmem_cache_free(io_end_cachep, io); } -/* - * check a range of space and convert unwritten extents to written. - * - * Called with inode->i_mutex; we depend on this when we manipulate - * io->flag, since we could otherwise race with ext4_flush_completed_IO() - */ -int ext4_end_io_nolock(ext4_io_end_t *io) +/* check a range of space and convert unwritten extents to written. */ +static int ext4_end_io(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; int ret = 0; - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); @@ -110,7 +104,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - io->flag &= ~EXT4_IO_END_UNWRITTEN; if (io->iocb) aio_complete(io->iocb, io->result, 0); @@ -122,51 +115,122 @@ int ext4_end_io_nolock(ext4_io_end_t *io) return ret; } -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) +static void dump_completed_IO(struct inode *inode) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + unsigned long flags; + + if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { + ext4_debug("inode %lu completed_io list is empty\n", + inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} + +/* Add the io_end to per-inode completed end_io list. */ +void ext4_add_complete_io(ext4_io_end_t *io_end) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct workqueue_struct *wq; + unsigned long flags; + + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (io->flag & EXT4_IO_END_IN_FSYNC) - goto requeue; - if (list_empty(&io->list)) { - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - goto free; + if (list_empty(&ei->i_completed_io_list)) { + io_end->flag |= EXT4_IO_END_QUEUED; + queue_work(wq, &io_end->work); } + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} - if (!mutex_trylock(&inode->i_mutex)) { - bool was_queued; -requeue: - was_queued = !!(io->flag & EXT4_IO_END_QUEUED); - io->flag |= EXT4_IO_END_QUEUED; - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - /* - * Requeue the work instead of waiting so that the work - * items queued after this can be processed. - */ - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); - /* - * To prevent the ext4-dio-unwritten thread from keeping - * requeueing end_io requests and occupying cpu for too long, - * yield the cpu if it sees an end_io request that has already - * been requeued. - */ - if (was_queued) - yield(); - return; +static int ext4_do_flush_completed_IO(struct inode *inode, + ext4_io_end_t *work_io) +{ + ext4_io_end_t *io; + struct list_head unwritten, complete, to_free; + unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, ret = 0; + + INIT_LIST_HEAD(&complete); + INIT_LIST_HEAD(&to_free); + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + dump_completed_IO(inode); + list_replace_init(&ei->i_completed_io_list, &unwritten); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + while (!list_empty(&unwritten)) { + io = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io->list); + + err = ext4_end_io(io); + if (unlikely(!ret && err)) + ret = err; + + list_add_tail(&io->list, &complete); + } + /* It is important to update all flags for all end_io in one shot w/o + * dropping the lock.*/ + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&complete)) { + io = list_entry(complete.next, ext4_io_end_t, list); + io->flag &= ~EXT4_IO_END_UNWRITTEN; + /* end_io context can not be destroyed now because it still + * used by queued worker. Worker thread will destroy it later */ + if (io->flag & EXT4_IO_END_QUEUED) + list_del_init(&io->list); + else + list_move(&io->list, &to_free); + } + /* If we are called from worker context, it is time to clear queued + * flag, and destroy it's end_io if it was converted already */ + if (work_io) { + work_io->flag &= ~EXT4_IO_END_QUEUED; + if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) + list_add_tail(&work_io->list, &to_free); } - list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - (void) ext4_end_io_nolock(io); - mutex_unlock(&inode->i_mutex); -free: - ext4_free_io_end(io); + + while (!list_empty(&to_free)) { + io = list_entry(to_free.next, ext4_io_end_t, list); + list_del_init(&io->list); + ext4_free_io_end(io); + } + return ret; +} + +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_io_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + ext4_do_flush_completed_IO(io->inode, io); +} + +int ext4_flush_completed_IO(struct inode *inode) +{ + return ext4_do_flush_completed_IO(inode, NULL); } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) @@ -199,9 +263,7 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - struct workqueue_struct *wq; struct inode *inode; - unsigned long flags; int i; sector_t bi_sector = bio->bi_sector; @@ -259,14 +321,7 @@ static void ext4_end_bio(struct bio *bio, int error) return; } - /* Add the io_end to per-inode completed io list*/ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); + ext4_add_complete_io(io_end); } void ext4_io_submit(struct ext4_io_submit *io) -- cgit v0.10.2 From 17335dcc471199717839b2fa3492ca36f70f1168 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:41:21 -0400 Subject: ext4: serialize dio nonlocked reads with defrag workers Inode's block defrag and ext4_change_inode_journal_flag() may affect nonlocked DIO reads result, so proper synchronization required. - Add missed inode_dio_wait() calls where appropriate - Check inode state under extra i_dio_count reference. Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index edb0495..1be2b44 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1358,6 +1358,8 @@ enum { EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -2469,6 +2471,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) /* For ioend & aio unwritten conversion wait queues */ diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 61f13e5..8d849da 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -810,11 +810,25 @@ retry: if (unlikely(!list_empty(&ei->i_completed_io_list))) ext4_flush_completed_IO(inode); + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + atomic_inc(&inode->i_dio_count); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) { + inode_dio_done(inode); + goto locked; + } ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL, NULL, 0); + inode_dio_done(inode); } else { +locked: ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, ext4_get_block); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 09d0488..bdd399b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4720,6 +4720,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + jbd2_journal_lock_updates(journal); /* @@ -4739,6 +4743,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8076b96..292daee 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1323,6 +1323,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Protect orig and donor inodes against a truncate */ mext_inode_double_lock(orig_inode, donor_inode); + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(orig_inode); + ext4_inode_block_unlocked_dio(donor_inode); + inode_dio_wait(orig_inode); + inode_dio_wait(donor_inode); + /* Protect extent tree against block allocations via delalloc */ double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ @@ -1521,6 +1527,8 @@ out: kfree(holecheck_path); } double_up_write_data_sem(orig_inode, donor_inode); + ext4_inode_resume_unlocked_dio(orig_inode); + ext4_inode_resume_unlocked_dio(donor_inode); mext_inode_double_unlock(orig_inode, donor_inode); return ret; -- cgit v0.10.2 From 1c9114f9c0f10f58dd7e568a7152025af47b27e5 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:55:23 -0400 Subject: ext4: serialize unlocked dio reads with truncate Current serialization will works only for DIO which holds i_mutex, but nonlocked DIO following race is possible: dio_nolock_read_task truncate_task ->ext4_setattr() ->inode_dio_wait() ->ext4_ext_direct_IO ->ext4_ind_direct_IO ->__blockdev_direct_IO ->ext4_get_block ->truncate_setsize() ->ext4_truncate() #alloc truncated blocks #to other inode ->submit_io() #INFORMATION LEAK In order to serialize with unlocked DIO reads we have to rearrange wait sequence 1) update i_size first 2) if i_size about to be reduced wait for outstanding DIO requests 3) and only after that truncate inode blocks Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bdd399b..0bfc633 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4283,7 +4283,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - inode_dio_wait(inode); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4332,8 +4331,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) + if (attr->ia_size != i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); + /* Inode size will be reduced, wait for dio in flight */ + if (orphan) + inode_dio_wait(inode); + } ext4_truncate(inode); } -- cgit v0.10.2 From 1b65007e9870e0021397b548e8cd6bbc584f9152 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:56:15 -0400 Subject: ext4: endless truncate due to nonlocked dio readers If we have enough aggressive DIO readers, truncate and other dio waiters will wait forever inside inode_dio_wait(). It is reasonable to disable nonlock DIO read optimization during truncate. Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0bfc633..05ab70d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4333,9 +4333,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { if (attr->ia_size != i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - /* Inode size will be reduced, wait for dio in flight */ - if (orphan) + /* Inode size will be reduced, wait for dio in flight. + * Temporarily disable dioread_nolock to prevent + * livelock. */ + if (orphan) { + ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } } ext4_truncate(inode); } -- cgit v0.10.2 From 1f555cfa29e8f787d675e8390f88ce517a37271a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:58:26 -0400 Subject: ext4: serialize truncate with owerwrite DIO workers Jan Kara have spotted interesting issue: There are potential data corruption issue with direct IO overwrites racing with truncate: Like: dio write truncate_task ->ext4_ext_direct_IO ->overwrite == 1 ->down_read(&EXT4_I(inode)->i_data_sem); ->mutex_unlock(&inode->i_mutex); ->ext4_setattr() ->inode_dio_wait() ->truncate_setsize() ->ext4_truncate() ->down_write(&EXT4_I(inode)->i_data_sem); ->__blockdev_direct_IO ->ext4_get_block ->submit_io() ->up_read(&EXT4_I(inode)->i_data_sem); # truncate data blocks, allocate them to # other inode - bad stuff happens because # dio is still in flight. In order to serialize with truncate dio worker should grab extra i_dio_count reference before drop i_mutex. Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 05ab70d..09308ad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3010,6 +3010,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, overwrite = *((int *)iocb->private); if (overwrite) { + atomic_inc(&inode->i_dio_count); down_read(&EXT4_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } @@ -3107,6 +3108,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, retake_lock: /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { + inode_dio_done(inode); up_read(&EXT4_I(inode)->i_data_sem); mutex_lock(&inode->i_mutex); } -- cgit v0.10.2 From 02d262dffcf4c74e5c4612ee736bdb94f18ed5b9 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 30 Sep 2012 23:03:42 -0400 Subject: ext4: punch_hole should wait for DIO writers punch_hole is the place where we have to wait for all existing writers (writeback, aio, dio), but currently we simply flush pended end_io request which is not sufficient. Other issue is that punch_hole performed w/o i_mutex held which obviously result in dangerous data corruption due to write-after-free. This patch performs following changes: - Guard punch_hole with i_mutex - Recheck inode flags under i_mutex - Block all new dio readers in order to prevent information leak caused by read-after-free pattern. - punch_hole now wait for all writers in flight NOTE: XXX write-after-free race is still possible because new dirty pages may appear due to mmap(), and currently there is no easy way to stop writeback while punch_hole is in progress. [ Fixed error return from ext4_ext_punch_hole() to make sure that we release i_mutex before returning EPERM or ETXTBUSY -- Ted ] Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2320774..5920e75 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4794,9 +4794,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t first_page_offset, last_page_offset; int credits, err = 0; + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + offset, offset + length - 1); + + if (err) + return err; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + err = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + err = -ETXTBSY; + goto out_mutex; + } + /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) - return 0; + goto out_mutex; /* * If the hole extends beyond i_size, set the hole @@ -4814,33 +4837,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) first_page_offset = first_page << PAGE_CACHE_SHIFT; last_page_offset = last_page << PAGE_CACHE_SHIFT; - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - /* Now release the pages */ if (last_page_offset > first_page_offset) { truncate_pagecache_range(inode, first_page_offset, last_page_offset - 1); } - /* finish any pending end_io work */ + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); err = ext4_flush_completed_IO(inode); if (err) - return err; + goto out_dio; credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_dio; + } /* @@ -4930,6 +4945,10 @@ out: inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); return err; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, -- cgit v0.10.2 From 6f2080e64487b9963f9c6ff8a252e1abce98f2d4 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 30 Sep 2012 23:03:50 -0400 Subject: ext4: fix ext_remove_space for punch_hole case Inode is allowed to have empty leaf only if it this is blockless inode. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5920e75..c1fcf48 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2589,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_ext_path *path = NULL; ext4_fsblk_t partial_cluster = 0; handle_t *handle; - int i = 0, err; + int i = 0, err = 0; ext_debug("truncate since %u to %u\n", start, end); @@ -2621,12 +2621,16 @@ again: return PTR_ERR(path); } depth = ext_depth(inode); + /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); - path = NULL; - goto cont; + if (depth) { + EXT4_ERROR_INODE(inode, + "path[%d].p_hdr == NULL", + depth); + err = -EIO; + } + goto out; } ee_block = le32_to_cpu(ex->ee_block); @@ -2658,8 +2662,6 @@ again: goto out; } } -cont: - /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. -- cgit v0.10.2 From 041bbb6d369811e948ae01f3d00414264076be35 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 30 Sep 2012 23:04:56 -0400 Subject: ext4: fix mtime update in nodelalloc mode Commits 5e8830dc85d0 and 41c4d25f78c0 introduced a regression into v3.6-rc1 for ext4 in nodealloc mode, such that mtime updates would not take place for files modified via mmap if the page was already in the page cache. This would also affect ext3 file systems mounted using the ext4 file system driver. The problem was that ext4_page_mkwrite() had a shortcut which would avoid calling __block_page_mkwrite() under some circumstances, and the above two commit transferred the responsibility of calling file_update_time() to __block_page_mkwrite --- which woudln't get called in some circumstances. Since __block_page_mkwrite() only has three callers, block_page_mkwrite(), ext4_page_mkwrite, and nilfs_page_mkwrite(), the best way to solve this is to move the responsibility for calling file_update_time() to its caller. This problem was found via xfstests #215 with a file system mounted with -o nodelalloc. Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: KONISHI Ryusuke Cc: stable@vger.kernel.org diff --git a/fs/buffer.c b/fs/buffer.c index 9f6d2e4..1fe3968 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2318,12 +2318,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, loff_t size; int ret; - /* - * Update file times before taking page lock. We may end up failing the - * fault so this update may be superfluous but who really cares... - */ - file_update_time(vma->vm_file); - lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || @@ -2361,6 +2355,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; sb_start_pagefault(sb); + + /* + * Update file times before taking page lock. We may end up failing the + * fault so this update may be superfluous but who really cares... + */ + file_update_time(vma->vm_file); + ret = __block_page_mkwrite(vma, vmf, get_block); sb_end_pagefault(sb); return block_page_mkwrite_return(ret); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 09308ad..f18e786 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4788,6 +4788,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int retries = 0; sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a4d56ac..5b387a4 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -116,6 +116,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (unlikely(ret)) goto out; + file_update_time(vma->vm_file); ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); if (ret) { nilfs_transaction_abort(inode->i_sb); -- cgit v0.10.2 From c278531d39f3158bfee93dc67da0b77e09776de2 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 5 Oct 2012 11:31:55 -0400 Subject: ext4: fix ext4_flush_completed_IO wait semantics BUG #1) All places where we call ext4_flush_completed_IO are broken because buffered io and DIO/AIO goes through three stages 1) submitted io, 2) completed io (in i_completed_io_list) conversion pended 3) finished io (conversion done) And by calling ext4_flush_completed_IO we will flush only requests which were in (2) stage, which is wrong because: 1) punch_hole and truncate _must_ wait for all outstanding unwritten io regardless to it's state. 2) fsync and nolock_dio_read should also wait because there is a time window between end_page_writeback() and ext4_add_complete_io() As result integrity fsync is broken in case of buffered write to fallocated region: fsync blkdev_completion ->filemap_write_and_wait_range ->ext4_end_bio ->end_page_writeback <-- filemap_write_and_wait_range return ->ext4_flush_completed_IO sees empty i_completed_io_list but pended conversion still exist ->ext4_add_complete_io BUG #2) Race window becomes wider due to the 'ext4: completed_io locking cleanup V4' patch series This patch make following changes: 1) ext4_flush_completed_io() now first try to flush completed io and when wait for any outstanding unwritten io via ext4_unwritten_wait() 2) Rename function to more appropriate name. 3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to prevent endless wait Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1be2b44..3ab2539 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1947,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_completed_IO(struct inode *); +extern int ext4_flush_unwritten_io(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -2371,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations; extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); +extern void ext4_unwritten_wait(struct inode *inode); /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c1fcf48..1c94cca 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4268,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode) * finish any pending end_io work so we won't run the risk of * converting any truncated blocks to initialized later */ - ext4_flush_completed_IO(inode); + ext4_flush_unwritten_io(inode); /* * probably first extent we're gonna free will be last in block @@ -4847,10 +4847,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - err = ext4_flush_completed_IO(inode); + err = ext4_flush_unwritten_io(inode); if (err) goto out_dio; + inode_dio_wait(inode); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, credits); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 39335bd..ca6f07a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -55,7 +55,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_unwritten_wait(struct inode *inode) +void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 4600008..be1d89f 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -138,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) goto out; - ret = ext4_flush_completed_IO(inode); + ret = ext4_flush_unwritten_io(inode); if (ret < 0) goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 8d849da..792e388 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -807,9 +807,11 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, retry: if (rw == READ && ext4_should_dioread_nolock(inode)) { - if (unlikely(!list_empty(&ei->i_completed_io_list))) - ext4_flush_completed_IO(inode); - + if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { + mutex_lock(&inode->i_mutex); + ext4_flush_unwritten_io(inode); + mutex_unlock(&inode->i_mutex); + } /* * Nolock dioread optimization may be dynamically disabled * via ext4_inode_block_unlocked_dio(). Check inode's state diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5b24c40..68e896e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -189,8 +189,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode, list_add_tail(&io->list, &complete); } - /* It is important to update all flags for all end_io in one shot w/o - * dropping the lock.*/ spin_lock_irqsave(&ei->i_completed_io_lock, flags); while (!list_empty(&complete)) { io = list_entry(complete.next, ext4_io_end_t, list); @@ -228,9 +226,14 @@ static void ext4_end_io_work(struct work_struct *work) ext4_do_flush_completed_IO(io->inode, io); } -int ext4_flush_completed_IO(struct inode *inode) +int ext4_flush_unwritten_io(struct inode *inode) { - return ext4_do_flush_completed_IO(inode, NULL); + int ret; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && + !(inode->i_state & I_FREEING)); + ret = ext4_do_flush_completed_IO(inode, NULL); + ext4_unwritten_wait(inode); + return ret; } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) -- cgit v0.10.2