From 395a87bfefbc400011417e9eaae33169f9f036c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 10 Mar 2009 18:18:47 -0400 Subject: ext4: fix header check in ext4_ext_search_right() for deep extent trees. The ext4_ext_search_right() function is confusing; it uses a "depth" variable which is 0 at the root and maximum at the leaves, but the on-disk metadata uses a "depth" (actually eh_depth) which is opposite: maximum at the root, and 0 at the leaves. The ext4_ext_check_header() function is given a depth and checks the header agaisnt that depth; it expects the on-disk semantics, but we are giving it the opposite in the while loop in this function. We should be giving it the on-disk notion of "depth" which we can get from (p_depth - depth) - and if you look, the last (more commonly hit) call to ext4_ext_check_header() does just this. Sending in the wrong depth results in (incorrect) messages about corruption: EXT4-fs error (device sdb1): ext4_ext_search_right: bad header in inode #2621457: unexpected eh_depth - magic f30a, entries 340, max 340(0), depth 1(2) http://bugzilla.kernel.org/show_bug.cgi?id=12821 Reported-by: David Dindorp Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e2eab19..e0aa4fe 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; - int depth, ee_len; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; BUG_ON(path == NULL); depth = path->p_depth; @@ -1179,7 +1180,8 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, depth)) { + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } -- cgit v0.10.2 From 2842c3b5449f31470b61db716f1926b594fb6156 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 12 Mar 2009 12:20:01 -0400 Subject: ext4: Print the find_group_flex() warning only once This is a short-term warning, and even printk_ratelimit() can result in too much noise in system logs. So only print it once as a warning. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 627f8c3..2d2b358 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -698,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -719,7 +720,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group); - if (ret2 == 0 && printk_ratelimit()) + if (ret2 == 0 && once) + once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); -- cgit v0.10.2 From 8d03c7a0c550e7ab24cadcef5e66656bfadec8b9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 14 Mar 2009 11:51:46 -0400 Subject: ext4: fix bogus BUG_ONs in in mballoc code Thiemo Nagel reported that: # dd if=/dev/zero of=image.ext4 bs=1M count=2 # mkfs.ext4 -v -F -b 1024 -m 0 -g 512 -G 4 -I 128 -N 1 \ -O large_file,dir_index,flex_bg,extent,sparse_super image.ext4 # mount -o loop image.ext4 mnt/ # dd if=/dev/zero of=mnt/file oopsed, with a BUG_ON in ext4_mb_normalize_request because size == EXT4_BLOCKS_PER_GROUP It appears to me (esp. after talking to Andreas) that the BUG_ON is bogus; a request of exactly EXT4_BLOCKS_PER_GROUP should be allowed, though larger sizes do indicate a problem. Fix that an another (apparently rare) codepath with a similar check. Reported-by: Thiemo Nagel Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4415bee..41f4348 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); @@ -3292,7 +3292,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ -- cgit v0.10.2 From d33a1976fbee1ee321d6f014333d8f03a39d526c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 16 Mar 2009 23:25:40 -0400 Subject: ext4: fix bb_prealloc_list corruption due to wrong group locking This is for Red Hat bug 490026: EXT4 panic, list corruption in ext4_mb_new_inode_pa ext4_lock_group(sb, group) is supposed to protect this list for each group, and a common code flow to remove an album is like this: ext4_get_group_no_and_offset(sb, pa->pa_pstart, &grp, NULL); ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); so it's critical that we get the right group number back for this prealloc context, to lock the right group (the one associated with this pa) and prevent concurrent list manipulation. however, ext4_mb_put_pa() passes in (pa->pa_pstart - 1) with a comment, "-1 is to protect from crossing allocation group". This makes sense for the group_pa, where pa_pstart is advanced by the length which has been used (in ext4_mb_release_context()), and when the entire length has been used, pa_pstart has been advanced to the first block of the next group. However, for inode_pa, pa_pstart is never advanced; it's just set once to the first block in the group and not moved after that. So in this case, if we subtract one in ext4_mb_put_pa(), we are actually locking the *previous* group, and opening the race with the other threads which do not subtract off the extra block. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 41f4348..9f61e62 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3589,6 +3589,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; + ext4_fsblk_t grp_blk; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3603,8 +3604,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); - /* -1 is to protect from crossing allocation group */ - ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + grp_blk = pa->pa_pstart; + /* If linear, pa_pstart may be in the next group when pa is used up */ + if (pa->pa_linear) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); /* * possible race: -- cgit v0.10.2