From d3ad8434aa83ef7c88bc91edcfe012cdcbab9f3e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 27 Jun 2011 12:36:29 -0400 Subject: jbd2: use WRITE_SYNC in journal checkpoint In journal checkpoint, we write the buffer and wait for its finish. But in cfq, the async queue has a very low priority, and in our test, if there are too many sync queues and every queue is filled up with requests, the write request will be delayed for quite a long time and all the tasks which are waiting for journal space will end with errors like: INFO: task attr_set:3816 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. attr_set D ffff880028393480 0 3816 1 0x00000000 ffff8802073fbae8 0000000000000086 ffff8802140847c8 ffff8800283934e8 ffff8802073fb9d8 ffffffff8103e456 ffff8802140847b8 ffff8801ed728080 ffff8801db4bc080 ffff8801ed728450 ffff880028393480 0000000000000002 Call Trace: [] ? __dequeue_entity+0x33/0x38 [] ? need_resched+0x23/0x2d [] ? thread_return+0xa2/0xbc [] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [] __mutex_lock_common+0x14e/0x1a9 [] ? brelse+0x13/0x15 [ext4] [] __mutex_lock_slowpath+0x19/0x1b [] mutex_lock+0x1b/0x32 [] __jbd2_journal_insert_checkpoint+0xe3/0x20c [jbd2] [] start_this_handle+0x438/0x527 [jbd2] [] ? autoremove_wake_function+0x0/0x3e [] jbd2_journal_start+0xa1/0xcc [jbd2] [] ext4_journal_start_sb+0x57/0x81 [ext4] [] ext4_xattr_set+0x6c/0xe3 [ext4] [] ext4_xattr_user_set+0x42/0x4b [ext4] [] generic_setxattr+0x6b/0x76 [] __vfs_setxattr_noperm+0x47/0xc0 [] vfs_setxattr+0x7f/0x9a [] setxattr+0xb5/0xe8 [] ? do_filp_open+0x571/0xa6e [] sys_fsetxattr+0x6b/0x91 [] system_call_fastpath+0x16/0x1b So this patch tries to use WRITE_SYNC in __flush_batch so that the request will be moved into sync queue and handled by cfq timely. We also use the new plug, sot that all the WRITE_SYNC requests can be given as a whole when we unplug it. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: Jan Kara Reported-by: Robin Dong diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2c62c5a..16a698b 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -257,9 +257,12 @@ static void __flush_batch(journal_t *journal, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; -- cgit v0.10.2 From ed7a7e16724a4123fce1fc0ff1f5131a0596f189 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 27 Jun 2011 15:35:53 -0400 Subject: ext4: fix incorrect error msg in ext4_ext_insert_index In function ext4_ext_insert_index when eh_entries of curp is bigger than eh_max, error messages will be printed out, but the content is about logical and ei_block, that's incorret. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f815cc8..eb63c7b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -808,8 +808,9 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) > le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, - "logical %d == ei_block %d!", - logical, le32_to_cpu(curp->p_idx->ei_block)); + "eh_entries %d > eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); return -EIO; } if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { -- cgit v0.10.2 From ff9893dc8aa622a4f122293a6861566a284edea5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Jun 2011 16:36:31 -0400 Subject: ext4: split ext4_ind_truncate from ext4_truncate We are about to move all indirect inode functions to a new file. Before we do that, let's split ext4_ind_truncate() out of ext4_truncate() leaving only generic code in the latter, so we will be able to move ext4_ind_truncate() to the new file. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1921392..8532dd4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1834,6 +1834,8 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern void ext4_ind_truncate(struct inode *inode); + /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3126c0..a8f310b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4471,6 +4471,26 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { + trace_ext4_truncate_enter(inode); + + if (!ext4_can_truncate(inode)) + return; + + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_ext_truncate(inode); + else + ext4_ind_truncate(inode); + + trace_ext4_truncate_exit(inode); +} + +void ext4_ind_truncate(struct inode *inode) +{ handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; @@ -4484,22 +4504,6 @@ void ext4_truncate(struct inode *inode) ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; - trace_ext4_truncate_enter(inode); - - if (!ext4_can_truncate(inode)) - return; - - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ext4_ext_truncate(inode); - trace_ext4_truncate_exit(inode); - return; - } - handle = start_transaction(inode); if (IS_ERR(handle)) return; /* AKPM: return what? */ -- cgit v0.10.2 From 8bb2b247124ba6093455d4aef26743b1bef27bc5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Jun 2011 17:10:28 -0400 Subject: ext4: rename ext4_indirect_* funcs to ext4_ind_* We are going to move all ext4_ind_* functions to indirect.c. Before we do that, let's rename 2 functions called ext4_indirect_* to ext4_ind_*, to keep to the naming convention. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a8f310b..6c1d28e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1075,8 +1075,7 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) * Calculate the number of metadata blocks need to reserve * to allocate a new block at @lblocks for non extent file based file */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, - sector_t lblock) +static int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); @@ -1107,7 +1106,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, lblock); + return ext4_ind_calc_metadata_amount(inode, lblock); } /* @@ -5456,8 +5455,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, - int chunk) +static int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int indirects; @@ -5483,7 +5481,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, nrblocks, chunk); return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } -- cgit v0.10.2 From 1f7d1e77419050831a905353683807fa69a26625 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Jun 2011 19:16:02 -0400 Subject: ext4: move __ext4_check_blockref to block_validity.c In preparation for moving the indirect functions to a separate file, move __ext4_check_blockref() to block_validity.c and rename it to ext4_check_blockref() which is exported as globally visible function. Also, rename the cpp macro ext4_check_inode_blockref() to ext4_ind_check_inode(), to make it clear that it is only valid for use with non-extent mapped inodes. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fac90f3..af103be 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -246,3 +246,23 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, return 1; } +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8532dd4..82ba7eb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2125,6 +2125,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb) } /* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* * Inodes and files operations */ @@ -2153,6 +2166,8 @@ extern void ext4_exit_system_zone(void); extern int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6c1d28e..3dca526 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -360,39 +360,6 @@ static int ext4_block_to_path(struct inode *inode, return n; } -static int __ext4_check_blockref(const char *function, unsigned int line, - struct inode *inode, - __le32 *p, unsigned int max) -{ - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - __le32 *bref = p; - unsigned int blk; - - while (bref < p+max) { - blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); - ext4_error_inode(inode, function, line, blk, - "invalid block"); - return -EIO; - } - } - return 0; -} - - -#define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question @@ -5010,7 +4977,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { /* Validate block references which are part of inode */ - ret = ext4_check_inode_blockref(inode); + ret = ext4_ind_check_inode(inode); } if (ret) goto bad_inode; -- cgit v0.10.2 From 9f125d641beb898f5bf2fe69583192c18043517a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Jun 2011 19:16:04 -0400 Subject: ext4: move common truncate functions to header file Move two functions that will be needed by the indirect functions to be moved to indirect.c as well as inode.c to truncate.h as inline functions, so that we can avoid having duplicate copies of the function (which can be a maintenance problem) without having to expose them as globally functions. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3dca526..9b82ac7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,6 +47,7 @@ #include "xattr.h" #include "acl.h" #include "ext4_extents.h" +#include "truncate.h" #include @@ -89,33 +90,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) -{ - ext4_lblk_t needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext4 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT4_MAX_TRANS_DATA) - needed = EXT4_MAX_TRANS_DATA; - - return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - -/* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. @@ -129,7 +103,7 @@ static handle_t *start_transaction(struct inode *inode) { handle_t *result; - result = ext4_journal_start(inode, blocks_for_truncate(inode)); + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); if (!IS_ERR(result)) return result; @@ -149,7 +123,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) return 0; if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; - if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) return 0; return 1; } @@ -204,7 +178,7 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -1555,16 +1529,6 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static void ext4_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); -} - static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, @@ -4134,7 +4098,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (unlikely(err)) goto out_err; err = ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); + ext4_blocks_for_truncate(inode)); if (unlikely(err)) goto out_err; if (bh) { @@ -4329,7 +4293,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); + ext4_blocks_for_truncate(inode)); } /* diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 0000000..011ba66 --- /dev/null +++ b/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + -- cgit v0.10.2 From dae1e52cb1267bf8f52e5e47a80fab566d7e8aa4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Jun 2011 19:40:50 -0400 Subject: ext4: move ext4_ind_* functions from inode.c to indirect.c This patch moves functions from inode.c to indirect.c. The moved functions are ext4_ind_* functions and their helpers. Functions called from inode.c are declared extern. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 04109460..56fd8f86 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o + mmp.o indirect.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index af103be..8efb2f0 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -266,3 +266,4 @@ int ext4_check_blockref(const char *function, unsigned int line, } return 0; } + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 82ba7eb..ddaf504 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1834,6 +1834,15 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern void ext4_ind_truncate(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 0000000..c3e85a8 --- /dev/null +++ b/fs/ext4/indirect.c @@ -0,0 +1,1510 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include +#include "ext4_jbd2.h" +#include "truncate.h" + +#include + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + else { + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. + */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b82ac7..de50b16 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -12,10 +12,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * @@ -90,45 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) - return 0; - return 1; -} - -/* * Restart the transaction associated with *handle. This does a commit, * so before we call here everything must be consistently dirtied against * this transaction. @@ -251,760 +208,6 @@ no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -/** - * ext4_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext4 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. - */ - -static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) -{ - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT4_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ((i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT4_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT4_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT4_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -/** - * ext4_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) - */ -static Indirect *ext4_get_branch(struct inode *inode, int depth, - ext4_lblk_t *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) - goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext4_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; - __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; -} - -/** - * ext4_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - * Because this is only used for non-extent files, we limit the block nr - * to 32 bits. - */ -static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) -{ - ext4_fsblk_t goal; - - /* - * XXX need to get goal block from mballoc's data structures - */ - - goal = ext4_find_near(inode, partial); - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - return goal; -} - -/** - * ext4_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) -{ - unsigned int count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - WARN_ON(1); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** - * ext4_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext4_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext4_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. - */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -EIO; - goto failed; - } - - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); - if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ - unlock_buffer(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); - } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - - return err; -} - -/** - * ext4_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext4_alloc_branch) - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) -{ - int i; - int err = 0; - ext4_fsblk_t current_block; - - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. - */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i) = cpu_to_le32(current_block++); - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - */ - ext4_mark_inode_dirty(handle, inode); - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, - EXT4_FREE_BLOCKS_FORGET); - } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); - - return err; -} - -/* - * The ext4_ind_map_blocks() function handles non-extents inodes - * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_map_blocks(). - * - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - * - * The ext4_ind_get_blocks() function should be called with - * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem - * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system - * blocks. - */ -static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - int flags) -{ - int err = -EIO; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext4_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - int count = 0; - ext4_fsblk_t first_block = 0; - - trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, map->m_lblk, offsets, - &blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext4_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - count++; - /*map more blocks*/ - while (count < map->m_len && count <= blocks_to_boundary) { - ext4_fsblk_t blk; - - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) - goto cleanup; - - /* - * Okay, we need to do block allocation. - */ - goal = ext4_find_goal(inode, map->m_lblk, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. - */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); - /* - * Block out ext4_truncate while we alter the tree - */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext4_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); - if (err) - goto cleanup; - - map->m_flags |= EXT4_MAP_NEW; - - ext4_update_inode_fsync_trans(handle, inode, 1); -got_it: - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = le32_to_cpu(chain[depth-1].key); - map->m_len = count; - if (count > blocks_to_boundary) - map->m_flags |= EXT4_MAP_BOUNDARY; - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); - return err; -} - #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { @@ -1014,32 +217,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) /* * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -static int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -/* - * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock */ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) @@ -3380,114 +2557,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } /* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_length(iov, nr_segs); - int retries = 0; - - if (rw == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL, NULL, 0); - else { - ret = blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL); - - if (unlikely((rw & WRITE) && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -/* * ext4_get_block used when preparing for a DIO write or buffer write. * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. @@ -3958,383 +3027,6 @@ unlock: return err; } -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext4_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext4_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext4_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], - __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext4_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext4. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - * - * Return 0 on success, 1 on invalid block range - * and < 0 on fatal error. - */ -static int ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) -{ - __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; - int err; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { - EXT4_ERROR_INODE(inode, "attempt to clear invalid " - "blocks %llu len %lu", - (unsigned long long) block_to_free, count); - return 1; - } - - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } - - for (p = first; p < last; p++) - *p = 0; - - ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); - return 0; -out_err: - ext4_std_error(inode->i_sb, err); - return err; -} - -/** - * ext4_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext4_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext4_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err = 0; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. */ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - err = ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p); - if (err) - break; - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (!err && count > 0) - err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - if (err < 0) - /* fatal error */ - return; - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) - ext4_handle_dirty_metadata(handle, inode, this_bh); - else - EXT4_ERROR_INODE(inode, - "circular indirect block detected at " - "block %llu", - (unsigned long long) this_bh->b_blocknr); - } -} - -/** - * ext4_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext4_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext4_fsblk_t nr; - __le32 *p; - - if (ext4_handle_is_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { - EXT4_ERROR_INODE(inode, - "invalid indirect mapped " - "block %lu (level %d)", - (unsigned long) nr, depth); - break; - } - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, - "Read failure"); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext4_free_branches(handle, inode, bh, - (__le32 *) bh->b_data, - (__le32 *) bh->b_data + addr_per_block, - depth); - brelse(bh); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. - */ - if (ext4_handle_is_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - } - - /* - * The forget flag here is critical because if - * we are journaling (and not doing data - * journaling), we have to make sure a revoke - * record is written to prevent the journal - * replay from overwriting the (former) - * indirect block if it gets reallocated as a - * data block. This must happen in the same - * transaction where the data blocks are - * actually freed. - */ - ext4_free_blocks(handle, inode, NULL, nr, 1, - EXT4_FREE_BLOCKS_METADATA| - EXT4_FREE_BLOCKS_FORGET); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext4_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, - inode, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext4_free_data(handle, inode, parent_bh, first, last); - } -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -4419,161 +3111,6 @@ void ext4_truncate(struct inode *inode) trace_ext4_truncate_exit(inode); } -void ext4_ind_truncate(struct inode *inode) -{ - handle_t *handle; - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n = 0; - ext4_lblk_t last_block, max_block; - unsigned blocksize = inode->i_sb->s_blocksize; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ - - last_block = (inode->i_size + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) - goto out_stop; - - if (last_block != max_block) { - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - } - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext4 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - if (last_block == max_block) { - /* - * It is unnecessary to free any data blocks if last_block is - * equal to the indirect block limit. - */ - goto out_unlock; - } else if (n == 1) { /* direct blocks */ - ext4_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT4_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext4_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); - ext4_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT4_IND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT4_IND_BLOCK] = 0; - } - case EXT4_IND_BLOCK: - nr = i_data[EXT4_DIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT4_DIND_BLOCK] = 0; - } - case EXT4_DIND_BLOCK: - nr = i_data[EXT4_TIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT4_TIND_BLOCK] = 0; - } - case EXT4_TIND_BLOCK: - ; - } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); -} - /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If 'in_mem' is true, we have all @@ -5386,29 +3923,6 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) -{ - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } - /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block - */ - indirects = nrblocks * 2 + 1; - return indirects; -} - static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) -- cgit v0.10.2 From f86186b44b4164600cce03d0d93ad48ec21fa429 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 28 Jun 2011 10:01:31 -0400 Subject: ext4: refactor duplicated block placement code I found that ext4_ext_find_goal() and ext4_find_near() share the same code for returning a coloured start block based on i_block_group. We can refactor this into a common function so that they don't diverge in the future. Thanks to adilger for suggesting the new function name. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f694..f8224ad 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ddaf504..49d2cea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1743,6 +1743,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc); #define ext4_free_blocks_after_init(sb, group, desc) \ ext4_init_block_bitmap(sb, NULL, group, desc) +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb63c7b..f331e50 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour + block; + return ext4_inode_to_goal_block(inode); } /* diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c3e85a8..6c27111 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -207,11 +207,6 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -227,28 +222,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; + return ext4_inode_to_goal_block(inode); } /** -- cgit v0.10.2 From 9331b6261058eb85ae7c57ab8ac279e7fdaa9f04 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 28 Jun 2011 10:19:05 -0400 Subject: ext4: quiet 'unused variables' compile warnings Unused variables was deleted. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f331e50..31ae5fb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3073,12 +3073,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_ext_path *path) { struct ext4_extent *ex; - struct ext4_extent_header *eh; int depth; int err = 0; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d..389386b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4666,12 +4666,10 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, struct ext4_buddy e4b; int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. -- cgit v0.10.2 From 275d3ba6b40d0f098693b9089c6fee9bd4e55d74 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 29 Jun 2011 21:44:45 -0400 Subject: ext4: remove loop around bio_alloc() These days, bio_alloc() is guaranteed to never fail (as long as nvecs is less than BIO_MAX_PAGES), so we don't need the loop around the struct bio allocation. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76..430c401 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -285,11 +285,7 @@ static int io_submit_init(struct ext4_io_submit *io, io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) return -ENOMEM; - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (bio == NULL); - + bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_private = io->io_end = io_end; -- cgit v0.10.2 From 7132de744ba76930d13033061018ddd7e3e8cd91 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Sun, 10 Jul 2011 19:37:48 -0400 Subject: ext4: fix i_blocks/quota accounting when extent insertion fails The current implementation of ext4_free_blocks() always calls dquot_free_block This looks quite sensible in the most cases: blocks to be freed are associated with inode and were accounted in quota and i_blocks some time ago. However, there is a case when blocks to free were not accounted by the time calling ext4_free_blocks() yet: 1. delalloc is on, write_begin pre-allocated some space in quota 2. write-back happens, ext4 allocates some blocks in ext4_ext_map_blocks() 3. then ext4_ext_map_blocks() gets an error (e.g. ENOSPC) from ext4_ext_insert_extent() and calls ext4_free_blocks(). In this scenario, ext4_free_blocks() calls dquot_free_block() who, in turn, decrements i_blocks for blocks which were not accounted yet (due to delalloc) After clean umount, e2fsck reports something like: > Inode 21, i_blocks is 5080, should be 5128. Fix? because i_blocks was erroneously decremented as explained above. The patch fixes the problem by passing the new flag EXT4_FREE_BLOCKS_NO_QUOT_UPDATE to ext4_free_blocks(), to request that the dquot_free_block() call be skipped. Signed-off-by: Maxim Patlasov Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 49d2cea..d13f3b5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -526,6 +526,7 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 /* * ioctl commands diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 31ae5fb..a862138 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3565,12 +3565,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_ext_get_actual_len(&newex), fb_flags); goto out2; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 389386b..1900ec7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4637,7 +4637,7 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed) + if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); -- cgit v0.10.2 From 575a1d4bdfa2ea9fc10733013136145b497e1be0 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Sun, 10 Jul 2011 20:07:25 -0400 Subject: ext4: free allocated and pre-allocated blocks when check_eofblocks_fl fails Upon corrupted inode or disk failures, we may fail after we already allocate some blocks from the inode or take some blocks from the inode's preallocation list, but before we successfully insert the corresponding extent to the extent tree. In this case, we should free any allocated blocks and discard the inode's preallocated blocks because the entries in the inode's preallocation list may be in an inconsistent state. Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a862138..c969ae2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3560,10 +3560,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); - if (err) - goto out2; - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); if (err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; -- cgit v0.10.2 From 4862fd6047ed02e2726667c54d35f538eecc56aa Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 10 Jul 2011 22:05:08 -0400 Subject: jbd2: remove jbd2_dev_to_name() from jbd2 tracepoints Using function calls in TP_printk causes perf heartburn, so print the MAJOR/MINOR device numbers instead. Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0dfa5b59..f24df13 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2390,73 +2390,6 @@ static void __exit journal_exit(void) jbd2_journal_destroy_caches(); } -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - * - * The caller should use rcu_read_lock() in order to make sure the - * device name stays valid until its done with it. We use - * rcu_read_lock() as well to make sure we're safe in case the caller - * gets sloppy, and because rcu_read_lock() is cheap and can be safely - * nested. - */ -struct devname_cache { - struct rcu_head rcu; - dev_t device; - char devname[BDEVNAME_SIZE]; -}; -#define CACHE_SIZE_BITS 6 -static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; -static DEFINE_SPINLOCK(devname_cache_lock); - -static void free_devcache(struct rcu_head *rcu) -{ - kfree(rcu); -} - -const char *jbd2_dev_to_name(dev_t device) -{ - int i = hash_32(device, CACHE_SIZE_BITS); - char *ret; - struct block_device *bd; - static struct devname_cache *new_dev; - - rcu_read_lock(); - if (devcache[i] && devcache[i]->device == device) { - ret = devcache[i]->devname; - rcu_read_unlock(); - return ret; - } - rcu_read_unlock(); - - new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); - if (!new_dev) - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ - bd = bdget(device); - spin_lock(&devname_cache_lock); - if (devcache[i]) { - if (devcache[i]->device == device) { - kfree(new_dev); - bdput(bd); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; - } - call_rcu(&devcache[i]->rcu, free_devcache); - } - devcache[i] = new_dev; - devcache[i]->device = device; - if (bd) { - bdevname(bd, devcache[i]->devname); - bdput(bd); - } else - __bdevname(device, devcache[i]->devname); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; -} -EXPORT_SYMBOL(jbd2_dev_to_name); - MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d087c2e..38f307b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1329,12 +1329,6 @@ extern int jbd_blocks_per_page(struct inode *inode); #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - */ -extern const char *jbd2_dev_to_name(dev_t device); - #endif /* __KERNEL__ */ #endif /* _LINUX_JBD2_H */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index bf16545..7596441 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -26,8 +26,8 @@ TRACE_EVENT(jbd2_checkpoint, __entry->result = result; ), - TP_printk("dev %s result %d", - jbd2_dev_to_name(__entry->dev), __entry->result) + TP_printk("dev %d,%d result %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, @@ -48,9 +48,9 @@ DECLARE_EVENT_CLASS(jbd2_commit, __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %s transaction %d sync %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit) + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, @@ -100,9 +100,9 @@ TRACE_EVENT(jbd2_end_commit, __entry->head = journal->j_tail_sequence; ), - TP_printk("dev %s transaction %d sync %d head %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit, __entry->head) + TP_printk("dev %d,%d transaction %d sync %d head %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, @@ -120,8 +120,9 @@ TRACE_EVENT(jbd2_submit_inode_data, __entry->ino = inode->i_ino; ), - TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) + TP_printk("dev %d,%d ino %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino) ); TRACE_EVENT(jbd2_run_stats, @@ -156,9 +157,9 @@ TRACE_EVENT(jbd2_run_stats, __entry->blocks_logged = stats->rs_blocks_logged; ), - TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " + TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u " "logging %u handle_count %u blocks %u blocks_logged %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), @@ -192,9 +193,9 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->dropped = stats->cs_dropped; ), - TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " + TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u " "written %u dropped %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); @@ -222,9 +223,10 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, __entry->freed = freed; ), - TP_printk("dev %s from %u to %u offset %lu freed %lu", - jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, - __entry->first_tid, __entry->block_nr, __entry->freed) + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tail_sequence, __entry->first_tid, + __entry->block_nr, __entry->freed) ); #endif /* _TRACE_JBD2_H */ -- cgit v0.10.2 From 12706394bcaa48e3d5e19c97d7b4e5683ebb12fb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 10 Jul 2011 22:37:50 -0400 Subject: ext4: add tracepoint for ext4_journal_start This will help debug who is responsible for starting a jbd2 transaction. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa..7910e61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -269,6 +269,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) journal_t *journal; handle_t *handle; + trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 5ce2b2f..6f27a59 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1520,6 +1520,28 @@ TRACE_EVENT(ext4_load_inode, (unsigned long) __entry->ino) ); +TRACE_EVENT(ext4_journal_start, + TP_PROTO(struct super_block *sb, int nblocks, unsigned long IP), + + TP_ARGS(sb, nblocks, IP), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nblocks ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nblocks = nblocks; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d nblocks %d caller %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nblocks, (void *)__entry->ip) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v0.10.2 From 22f10457432387615fa1ae6e0375d9cacc50819b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 10 Jul 2011 23:52:37 -0400 Subject: ext4: fix trim length underflow with small trim length In 0f0a25b, we adjust 'len' with s_first_data_block - start, but it could underflow in case blocksize=1K, fstrim_range.len=512 and fstrim_range.start = 0. In this case, when we run the code: len -= first_data_blk - start; len will be underflow to -1ULL. In the end, although we are safe that last_group check later will limit the trim to the whole volume, but that isn't what the user really want. So this patch fix it. It also adds the check for 'start' like ext3 so that we can break immediately if the start is invalid. Cc: Lukas Czerner Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1900ec7..b189cb4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4902,6 +4902,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; + if (start + len <= first_data_blk) + goto out; if (start < first_data_blk) { len -= first_data_blk - start; start = first_data_blk; @@ -4950,5 +4952,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } range->len = trimmed * sb->s_blocksize; +out: return ret; } -- cgit v0.10.2 From 169ddc3ec83b5f732e51d975befb191d50795844 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:00:07 -0400 Subject: ext4: speed up group trim with the right free block count When we trim some free blocks in a group of ext4, we need to calculate the free blocks properly and check whether there are enough freed blocks left for us to trim. Current solution will only calculate free spaces if they are large for a trim which isn't appropriate. Let us see a small example: a group has 1.5M free which are 300k, 300k, 300k, 300k, 300k. And minblocks is 1M. With current solution, we have to iterate the whole group since these 300k will never be subtracted from 1.5M. But actually we should exit after we find the first 2 free spaces since the left 3 chunks only sum up to 900K if we subtract the first 600K although they can't be trimed. Reviewed-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b189cb4..4a25725 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4821,7 +4821,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t minblocks) { void *bitmap; - ext4_grpblk_t next, count = 0; + ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; int ret; @@ -4848,6 +4848,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next - start, group, &e4b); count += next - start; } + free_count += next - start; start = next + 1; if (fatal_signal_pending(current)) { @@ -4861,7 +4862,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); } - if ((e4b.bd_info->bb_free - count) < minblocks) + if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } ext4_unlock_group(sb, group); -- cgit v0.10.2 From b3d4c2b10b68d205d3eb1b5c17dcb4649a502798 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:01:52 -0400 Subject: ext4: Add new ext4 trim tracepoints Add ext4_trim_extent and ext4_trim_all_free. Reviewed-by: Lukas Czerner Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4a25725..7aa4c16 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4780,6 +4780,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, { struct ext4_free_extent ex; + trace_ext4_trim_extent(sb, group, start, count); + assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; @@ -4825,6 +4827,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, struct ext4_buddy e4b; int ret; + trace_ext4_trim_all_free(sb, group, start, max); + ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_error(sb, "Error in loading buddy " diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6f27a59..51d8813 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1542,6 +1542,55 @@ TRACE_EVENT(ext4_journal_start, __entry->nblocks, (void *)__entry->ip) ); +DECLARE_EVENT_CLASS(ext4__trim, + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( __u32, group ) + __field( int, start ) + __field( int, len ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); + __entry->group = group; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d group %u, start %d, len %d", + __entry->dev_major, __entry->dev_minor, + __entry->group, __entry->start, __entry->len) +); + +DEFINE_EVENT(ext4__trim, ext4_trim_extent, + + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len) +); + +DEFINE_EVENT(ext4__trim, ext4_trim_all_free, + + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v0.10.2 From 3d56b8d2c74cc3f375ce332b3ac3519e009d79ee Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:03:38 -0400 Subject: ext4: Speed up FITRIM by recording flags in ext4_group_info In ext4, when FITRIM is called every time, we iterate all the groups and do trim one by one. It is a bit time wasting if the group has been trimmed and there is no change since the last trim. So this patch adds a new flag in ext4_group_info->bb_state to indicate that the group has been trimmed, and it will be cleared if some blocks is freed(in release_blocks_on_commit). Another trim_minlen is added in ext4_sb_info to record the last minlen we use to trim the volume, so that if the caller provide a small one, we will go on the trim regardless of the bb_state. A simple test with my intel x25m ssd: df -h shows: /dev/sdb1 40G 21G 17G 56% /mnt/ext4 Block size: 4096 run the FITRIM with the following parameter: range.start = 0; range.len = UINT64_MAX; range.minlen = 1048576; without the patch: [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.505s user 0m0.000s sys 0m1.224s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.359s user 0m0.000s sys 0m1.178s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.228s user 0m0.000s sys 0m1.151s with the patch: [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.625s user 0m0.000s sys 0m1.269s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m0.002s user 0m0.000s sys 0m0.001s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m0.002s user 0m0.000s sys 0m0.001s A big improvement for the 2nd and 3rd run. Even after I delete some big image files, it is still much faster than iterating the whole disk. [root@boyu-tm test]# time ./ftrim /mnt/ext4/a real 0m1.217s user 0m0.000s sys 0m0.196s Cc: Lukas Czerner Reviewed-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d13f3b5..62cee2b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1215,6 +1215,9 @@ struct ext4_sb_info { /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2080,11 +2083,19 @@ struct ext4_group_info { * 5 free 8-block regions. */ }; -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7aa4c16..73c2540 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2628,6 +2628,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) rb_erase(&entry->node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); + if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() @@ -4838,6 +4847,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; @@ -4869,6 +4882,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } + + if (!ret) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); +out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4957,6 +4974,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } range->len = trimmed * sb->s_blocksize; + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + out: return ret; } -- cgit v0.10.2 From 22612283f7da1ce9849d9b3716010b07a0446fd9 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:04:34 -0400 Subject: ext4: Change the wrong param comment for ext4_trim_all_free at ext4_trim_all_free() comment, there is no longer an @e4b parameter, instead it is @group. Reported-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 73c2540..04a3d92 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4811,7 +4811,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system - * @e4b: ext4 buddy + * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count -- cgit v0.10.2 From ffb505ff0f7b52318dea46dd139107a8371b4ad7 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 11 Jul 2011 11:43:59 -0400 Subject: ext4: remove redundant goto in ext4_ext_insert_extent() If eh->eh_entries is smaller than eh->eh_max, the routine will go to the "repeat" and then go to "has_space" directlly , since argument "depth" and "eh" are not even changed. Therefore, goto "has_space" directly and remove redundant "repeat" tag. Signed-off-by: Robin Dong diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c969ae2..9cbdcb2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1723,7 +1723,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, goto merge; } -repeat: depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) @@ -1745,7 +1744,7 @@ repeat: ext_debug("next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; - goto repeat; + goto has_space; } ext_debug("next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); -- cgit v0.10.2 From 598dbdf2433cad55bd44d923f67a053871e3eabf Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 11 Jul 2011 18:24:01 -0400 Subject: ext4: avoid unneeded ext4_ext_next_leaf_block() while inserting extents Optimize ext4_ext_insert_extent() by avoiding ext4_ext_next_leaf_block() when the result is not used/needed. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9cbdcb2..f1c538e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1730,9 +1730,10 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); - next = ext4_ext_next_leaf_block(inode, path); - if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCKS) { + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(inode, path); + if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); -- cgit v0.10.2 From 823ba01fc07751200c43e45733925a98b73eac3a Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 18:26:01 -0400 Subject: ext4: fix a race which could leak memory in ext4_groupinfo_create_slab() In ext4_groupinfo_create_slab, we create ext4_groupinfo_caches within ext4_grpinfo_slab_create_mutex, but set it outside the lock, and there does exist some case that we may create it twice and causes a memory leak. So set it before we call mutex_unlock. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 04a3d92..2b9a71b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2404,14 +2404,14 @@ static int ext4_groupinfo_create_slab(size_t size) slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_groupinfo_caches[cache_index] = cachep; + mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); return -ENOMEM; } - ext4_groupinfo_caches[cache_index] = cachep; - return 0; } -- cgit v0.10.2 From caaf7a29d31da21bb8d8200d5e42d1c93d3c6e00 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 18:42:42 -0400 Subject: ext4: Fix a double free of sbi->s_group_info in ext4_mb_init_backend If we meet with an error in ext4_mb_add_groupinfo, we kfree sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)], but fail to reset it to NULL. So the caller ext4_mb_init_backend will try to kfree it again and causes a double free. So fix it by resetting it to NULL. Some typo in comments of mballoc.c are also changed. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2b9a71b..b97a2d2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -75,8 +75,8 @@ * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -84,7 +84,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -152,7 +152,7 @@ * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -2279,8 +2279,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, exit_group_info: /* If a meta_group_info table has been allocated, release it now */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ -- cgit v0.10.2 From afb86178cb9b6a7329cf8709aa210fb0a245b606 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 11 Jul 2011 18:47:04 -0400 Subject: ext4: remove unnecessary comments in ext4_orphan_add() The comment from Al Viro about possible race in the ext4_orphan_add() is not justified. There is no race possible as we always have either i_mutex locked, or the inode can not be referenced from outside hence the J_ASSERS should not be hit from the reason described in comment. This commit replaces it with notion that we are holding i_mutex so it should not be possible for i_nlink to be changed while waiting for s_orphan_lock. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b754b77..8dde5ab 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1989,18 +1989,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ - - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on s_orphan_lock), so race with ext4_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. - * - * tytso, 4/25/2009: I'm not sure how that could happen; - * shouldn't the fs core protect us from these sort of - * unlink()/link() races? + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); -- cgit v0.10.2 From 265c6a0f9290c8f470b839257dc6af3c46b24da1 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Sat, 16 Jul 2011 19:41:23 -0400 Subject: ext4: fix compilation with -DDX_DEBUG Compilation of ext4/namei.c brought up an error and warning messages when compiled with -DDX_DEBUG Signed-off-by: Bernd Schubert Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8dde5ab..aaf3131 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q *err = -ENOENT; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); dx_release (frames); return NULL; } -- cgit v0.10.2 From d7a1fee135771e6e5185642bdc17df19bbdbcc48 Mon Sep 17 00:00:00 2001 From: Dan Ehrenberg Date: Sun, 17 Jul 2011 21:11:30 -0400 Subject: ext4: make the preallocation size be a multiple of stripe size Previously, if a stripe width was provided, then it would be used as the preallocation granularity, with no santiy checking and no way to override this. Now, mb_prealloc_size defaults to the smallest multiple of stripe size that is greater than or equal to the old default mb_prealloc_size, and this can be overridden with the sysfs interface. Signed-off-by: Dan Ehrenberg Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b97a2d2..037f680 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -128,12 +128,13 @@ * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is * 512 blocks. This can be tuned via - * /sys/fs/ext4//mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe= option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4//mb_min_to_scan * /sys/fs/ext4//mb_max_to_scan @@ -2474,6 +2475,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { @@ -2841,8 +2854,9 @@ out_err: /* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via * /sys/fs/ext4//mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? @@ -2853,10 +2867,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } -- cgit v0.10.2 From 3eb08658431abd65c0fe6855d1860859c2d416f7 Mon Sep 17 00:00:00 2001 From: Dan Ehrenberg Date: Sun, 17 Jul 2011 21:18:51 -0400 Subject: ext4: ignore a stripe width of 1 If the stripe width was set to 1, then this patch will ignore that stripe width and ext4 will act as if the stripe width were 0 with respect to optimizing allocations. Signed-off-by: Dan Ehrenberg Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7910e61..143d763 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2384,17 +2384,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) - return sbi->s_stripe; - - if (stripe_width <= sbi->s_blocks_per_group) - return stripe_width; + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; - if (stride <= sbi->s_blocks_per_group) - return stride; + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; - return 0; + return ret; } /* sysfs supprt */ -- cgit v0.10.2 From f7d0d3797fac6cad24ad9f86dd9baf65c586b434 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sun, 17 Jul 2011 23:17:02 -0400 Subject: ext4: punch hole optimizations: skip un-needed extent lookup This patch optimizes the punch hole operation by skipping the tree walking code that is used by truncate. Since punch hole is done through map blocks, the path to the extent is already known in this function, so we do not need to look it up again. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f1c538e..06b30b6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3461,8 +3461,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_mark_uninitialized(ex); - err = ext4_ext_remove_space(inode, map->m_lblk, - map->m_lblk + punched_out); + ext4_ext_invalidate_cache(inode); + + err = ext4_ext_rm_leaf(handle, inode, path, + map->m_lblk, map->m_lblk + punched_out); + + if (!err && path->p_hdr->eh_entries == 0) { + /* + * Punch hole freed all of this sub tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root( + inode, 0)); + + err = ext4_ext_dirty( + handle, inode, path); + } + } goto out2; } -- cgit v0.10.2 From c6a0371cbefade85376bbc326d18451860632dce Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sun, 17 Jul 2011 23:21:03 -0400 Subject: ext4: remove unneeded parameter to ext4_ext_remove_space() This patch removes the extra parameter in ext4_ext_remove_space() which is no longer needed. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 06b30b6..3d8c5f5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2500,8 +2500,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2541,7 +2540,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, end); + start, EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3683,7 +3682,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final * transaction synchronous. -- cgit v0.10.2 From 015861badd0db43d025bbb538f8fc62dfaf3f18d Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sun, 17 Jul 2011 23:27:43 -0400 Subject: ext4: avoid wasted extent cache lookup if !PUNCH_OUT_EXT This patch avoids an extraneous lookup of the extent cache in ext4_ext_map_blocks() when the flag EXT4_GET_BLOCKS_PUNCH_OUT_EXT is absent. The existing logic was performing the lookup but not making use of the result. The patch simply reverses the order of evaluation in the condition. Since ext4_ext_in_cache() does not initialize newex on misses, bypassing its invocation does not introduce any new issue in this regard. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner Reviewed-by: Eric Gouriou diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3d8c5f5..b8acfab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3320,8 +3320,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && - ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && + ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* -- cgit v0.10.2 From d46203159ed376fdbe2b05aa57e58207bf27a8f9 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sun, 17 Jul 2011 23:43:42 -0400 Subject: ext4: avoid eh_entries overflow before insert extent_idx If eh_entries is equal to (or greater than) eh_max, the operation of inserting new extent_idx will make number of entries overflow. So check eh_entries before inserting the new extent_idx. Although there is no bug case according the code (function ext4_ext_insert_index is called by ext4_ext_split and ext4_ext_split is called only if the index block has free space), the right logic should be "lookup the capacity before insertion". Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b8acfab..9bec432 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -741,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, logical, le32_to_cpu(curp->p_idx->ei_block)); return -EIO; } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ @@ -770,14 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max))) { - EXT4_ERROR_INODE(inode, - "eh_entries %d > eh_max %d!", - le16_to_cpu(curp->p_hdr->eh_entries), - le16_to_cpu(curp->p_hdr->eh_max)); - return -EIO; - } if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EIO; -- cgit v0.10.2 From 529da704ad1ead755d9e40721f29446cb278e099 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 23 Jul 2011 16:07:26 -0400 Subject: ext4: remove unnecessary ext4_get_group_info in ext4_mb_load_buddy ext4_mb_load_buddy() calls ext4_get_group_info() for setting both "grp" and "e4b->bd_info", but it could do "e4b->bd_info = grp". Reported-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 037f680..447c0f3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1126,7 +1126,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; -- cgit v0.10.2 From ced156e464e49b6b4153ede9aaa04d9a4ad24e0c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 23 Jul 2011 16:18:05 -0400 Subject: ext4: don't increment s_mb_buddies_generated in ext4_mb_release In ext4_mb_release, we use s_mb_buddies_generated++. Although the output is OK, but I don't think we need this extra ++. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 447c0f3..e165830 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2581,7 +2581,7 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_mb_lost_chunks)); printk(KERN_INFO "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); printk(KERN_INFO "EXT4-fs: mballoc: %u preallocated, %u discarded\n", -- cgit v0.10.2 From 6a0fe49308dcac10ab510b3c45e00eb8d5ef440e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 23 Jul 2011 16:18:55 -0400 Subject: ext4: remove ac_repeats from ext4_allocation_context ac_repeats isn't referenced in the mballoc code. So remove it. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7b..9d4a636 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,7 +187,6 @@ struct ext4_allocation_context { __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; - __u8 ac_repeats; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ -- cgit v0.10.2 From 5718789da5b94bd4148cb7ea0f457089c26bc1c3 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 23 Jul 2011 21:49:07 -0400 Subject: ext4: remove unused argument in ext4_ext_next_leaf_block The argument "inode" in function ext4_ext_next_allocated_block looks useless, so clean it. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9bec432..33bbe84 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1414,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, - struct ext4_ext_path *path) +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; @@ -1734,7 +1733,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) - next = ext4_ext_next_leaf_block(inode, path); + next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); -- cgit v0.10.2 From 0737964bc98202776f4d10bc8e108f45b3115037 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 23 Jul 2011 21:51:07 -0400 Subject: ext4: correct the debug message in ext4_ext_insert_extent The debug message in ext4_ext_insert_extent before moving extent is incorrect (the "from xx to xx"). Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 33bbe84..a637d83 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1806,7 +1806,7 @@ has_space: ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); + nearex, len, nearex, nearex + 1); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; } -- cgit v0.10.2 From b7ca1e8ec53259359db5313f923a0a20fa04bdb6 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 23 Jul 2011 21:53:25 -0400 Subject: ext4: correct comment for ext4_ext_check_cache The comment for ext4_ext_check_cache has a litte mistake. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a637d83..4d73e11 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2019,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_in_cache() + * ext4_ext_check_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given * cache extent pointer. If the cached extent is a hole, -- cgit v0.10.2 From 2d859db3e4a82a365572592d57624a5f996ed0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 26 Jul 2011 09:07:11 -0400 Subject: ext4: fix data corruption in inodes with journalled data When journalling data for an inode (either because it is a symlink or because the filesystem is mounted in data=journal mode), ext4_evict_inode() can discard unwritten data by calling truncate_inode_pages(). This is because we don't mark the buffer / page dirty when journalling data but only add the buffer to the running transaction and thus mm does not know there are still unwritten data. Fix the problem by carefully tracking transaction containing inode's data, committing this transaction, and writing uncheckpointed buffers when inode should be reaped. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index de50b16..43e4abd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -121,6 +121,33 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_log_start_commit(journal, commit_tid); + jbd2_log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); goto no_delete; } @@ -970,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file, if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1678,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page, write_end_fn); if (ret == 0) ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; err = ext4_journal_stop(handle); if (!ret) ret = err; -- cgit v0.10.2 From 8f82f840ec6ab873f520364d443ff6fa1b3f8e22 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 21:35:44 -0400 Subject: ext4: prevent parallel resizers by atomic bit ops Before this patch, parallel resizers are allowed and protected by a mutex lock, actually, there is no need to support parallel resizer, so this patch prevents parallel resizers by atmoic bit ops, like lock_page() and unlock_page() do. To do this, the patch removed the mutex lock s_resize_lock from struct ext4_sb_info and added a unsigned long field named s_resize_flags which inidicates if there is a resizer. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 62cee2b..bb0f776 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1127,7 +1127,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; - struct mutex s_resize_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; @@ -2269,6 +2270,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; +#define EXT4_RESIZING 0 +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554..f18bfe3 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -202,8 +202,9 @@ setversion_out: struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; @@ -221,6 +222,7 @@ setversion_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } @@ -271,8 +273,9 @@ mext_out: struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) @@ -291,6 +294,7 @@ mext_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 80bbc9c..0213f63 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -16,6 +16,25 @@ #include "ext4_jbd2.h" +int ext4_resize_begin(struct super_block *sb) +{ + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + ret = -EBUSY; + + return ret; +} + +void ext4_resize_end(struct super_block *sb) +{ + clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + smp_mb__after_clear_bit(); +} + #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -181,11 +200,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - err = -EBUSY; - goto exit_journal; - } + BUG_ON(input->group != sbi->s_groups_count); if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { err = PTR_ERR(bh); @@ -285,7 +300,6 @@ exit_bh: brelse(bh); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -799,13 +813,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - err = -EBUSY; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) goto exit_journal; @@ -829,7 +836,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -886,13 +892,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold s_resize_lock - * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold s_resize_lock over the access - * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. * @@ -937,7 +939,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_handle_dirty_super(handle, sb); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -972,9 +973,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, int err; ext4_group_t group; - /* We don't need to worry about locking wrt other resizers just - * yet: we're going to revalidate es->s_blocks_count after - * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) @@ -995,7 +993,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if (n_blocks_count < o_blocks_count) { ext4_warning(sb, "can't shrink FS - resize aborted"); - return -EBUSY; + return -EINVAL; } /* Handle the remaining blocks in the last group only. */ @@ -1038,24 +1036,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - mutex_lock(&EXT4_SB(sb)->s_resize_lock); - if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); - ext4_journal_stop(handle); - err = -EBUSY; - goto exit_put; - } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, "error %d on journal write access", err); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 143d763..cfe9f39 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3500,7 +3500,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - mutex_init(&sbi->s_resize_lock); + sbi->s_resize_flags = 0; sb->s_root = NULL; -- cgit v0.10.2 From ce723c31b58f54fb865036805475ee7a8c5dc173 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 21:39:09 -0400 Subject: ext4: prevent a fs with errors from being resized A filesystem with errors is not allowed to being resized, otherwise, it is easy to destroy the filesystem. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0213f63..53d9795 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -23,6 +23,16 @@ int ext4_resize_begin(struct super_block *sb) if (!capable(CAP_SYS_RESOURCE)) return -EPERM; + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed\n"); + return -EPERM; + } + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) ret = -EBUSY; -- cgit v0.10.2 From 0529155e8a4bcb77dfc9ceaea19c6501487e452b Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 21:43:56 -0400 Subject: ext4: rename ext4_add_groupblocks() to ext4_group_add_blocks() Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bb0f776..bbe81db 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1799,7 +1799,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +extern void ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e165830..93035ea 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4667,7 +4667,7 @@ error_return: } /** - * ext4_add_groupblocks() -- Add given blocks to an existing group + * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physcial block to add to the block group @@ -4675,7 +4675,7 @@ error_return: * * This marks the blocks as free in the bitmap and buddy. */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +void ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 53d9795..d241ecb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1056,7 +1056,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ - ext4_add_groupblocks(handle, sb, o_blocks_count, add); + ext4_group_add_blocks(handle, sb, o_blocks_count, add); ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); -- cgit v0.10.2 From cc7365dfe48cb2191f1572bf69e30d3e58716313 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 21:46:07 -0400 Subject: ext4: let ext4_group_add_blocks() return an error code This patch lets ext4_group_add_blocks() return an error code if it fails, so that upper functions can handle error correctly. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bbe81db..da7ab48 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1799,7 +1799,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_group_add_blocks(handle_t *handle, struct super_block *sb, +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 93035ea..dbe4295 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4675,7 +4675,7 @@ error_return: * * This marks the blocks as free in the bitmap and buddy. */ -void ext4_group_add_blocks(handle_t *handle, struct super_block *sb, +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; @@ -4696,15 +4696,24 @@ void ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; goto error_return; + } bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) + if (!bitmap_bh) { + err = -EIO; goto error_return; + } + desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) + if (!desc) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, desc), block, count) || in_range(ext4_inode_bitmap(sb, desc), block, count) || @@ -4714,6 +4723,7 @@ void ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); + err = -EINVAL; goto error_return; } @@ -4782,7 +4792,7 @@ void ext4_group_add_blocks(handle_t *handle, struct super_block *sb, error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; + return err; } /** diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d241ecb..4c041e3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -980,7 +980,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t add; struct buffer_head *bh; handle_t *handle; - int err; + int err, err2; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1056,11 +1056,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ - ext4_group_add_blocks(handle, sb, o_blocks_count, add); + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); - if ((err = ext4_journal_stop(handle))) + err2 = ext4_journal_stop(handle); + if (!err && err2) + err = err2; + + if (err) goto exit_put; if (test_opt(sb, DEBUG)) -- cgit v0.10.2 From 4740b830ed5720ade6c780dbf3fdfe9089b3552d Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 21:51:08 -0400 Subject: ext4: let ext4_group_add_blocks() handle 0 blocks quickly If ext4_group_add_blocks() is called with 0 block, make it return 0 without doing any extra work. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbe4295..b6ef4da 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4691,6 +4691,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + if (count == 0) + return 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group -- cgit v0.10.2 From 2b79b09d13e35577151bd13ba08809911baccd1c Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 21:53:35 -0400 Subject: ext4: fix a typo in ext4_group_extend() Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 4c041e3..5f0aefd 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", + printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) -- cgit v0.10.2 From c3e94d1df9bdd9e2c4ba7e8f534f7925f1756f97 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 22:05:53 -0400 Subject: ext4: let setup_new_group_blocks() set multiple bits at a time Rename mb_set_bits() to ext4_set_bits() and make it a global function so that setup_new_group_blocks() can use it. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index da7ab48..ba2009b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -940,6 +940,8 @@ struct ext4_inode_info { #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le +extern void ext4_set_bits(void *bm, int cur, int len); + /* * Maximal mount counts between two filesystem checks */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b6ef4da..fa716c9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1282,7 +1282,7 @@ static void mb_clear_bits(void *bm, int cur, int len) } } -static void mb_set_bits(void *bm, int cur, int len) +void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1511,7 +1511,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -2795,8 +2795,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) @@ -2814,7 +2814,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -3284,7 +3285,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(bitmap, entry->start_blk, entry->count); + ext4_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3326,7 +3327,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(bitmap, start, len); + ext4_set_bits(bitmap, start, len); preallocated += len; count++; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 5f0aefd..178fb2f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -217,11 +217,6 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_journal; } - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bit(0, bh->b_data); - } - /* Copy all of the GDT blocks into the backup in this group */ for (i = 0, bit = 1, block = start + 1; i < gdblocks; i++, block++, bit++) { @@ -250,7 +245,6 @@ static int setup_new_group_blocks(struct super_block *sb, brelse(gdb); goto exit_bh; } - ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -261,8 +255,11 @@ static int setup_new_group_blocks(struct super_block *sb, GFP_NOFS); if (err) goto exit_bh; - for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) - ext4_set_bit(bit, bh->b_data); + + if (ext4_bg_has_super(sb, input->group)) { + ext4_debug("mark backup group tables %#04llx (+0)\n", start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); + } ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); @@ -278,9 +275,8 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; - for (i = 0, bit = input->inode_table - start; - i < sbi->s_itb_per_group; i++, bit++) - ext4_set_bit(bit, bh->b_data); + ext4_set_bits(bh->b_data, input->inode_table - start, + sbi->s_itb_per_group); if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; -- cgit v0.10.2 From 6d40bc5a7e8fc71795d131e835f38f161ed7e1b1 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 26 Jul 2011 22:24:41 -0400 Subject: ext4: simplify journal handling in setup_new_group_blocks() This patch simplifies journal handling in setup_new_group_blocks(). In previous code, block bitmap is modified everywhere in setup_new_group_blocks(), ext4_get_write_access() in extend_or_restart_transaction() is used to guarantee that the block bitmap stays in the new handle, this makes things complicated. The previous commit changed things so that the modifications on the block bitmap are batched and done by ext4_set_bits() at the end of the for loop. This allows us to simplify things. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 178fb2f..5b423f8 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -161,8 +161,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, * If that fails, restart the transaction & regain write access for the * buffer head which is used for block_bitmap modifications. */ -static int extend_or_restart_transaction(handle_t *handle, int thresh, - struct buffer_head *bh) +static int extend_or_restart_transaction(handle_t *handle, int thresh) { int err; @@ -173,9 +172,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, if (err < 0) return err; if (err) { - if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) - return err; - if ((err = ext4_journal_get_write_access(handle, bh))) + err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); + if (err) return err; } @@ -212,29 +210,24 @@ static int setup_new_group_blocks(struct super_block *sb, BUG_ON(input->group != sbi->s_groups_count); - if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - /* Copy all of the GDT blocks into the backup in this group */ for (i = 0, bit = 1, block = start + 1; i < gdblocks; i++, block++, bit++) { struct buffer_head *gdb; ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto exit_journal; gdb = sb_getblk(sb, block); if (!gdb) { err = -EIO; - goto exit_bh; + goto exit_journal; } if ((err = ext4_journal_get_write_access(handle, gdb))) { brelse(gdb); - goto exit_bh; + goto exit_journal; } lock_buffer(gdb); memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); @@ -243,7 +236,7 @@ static int setup_new_group_blocks(struct super_block *sb, err = ext4_handle_dirty_metadata(handle, NULL, gdb); if (unlikely(err)) { brelse(gdb); - goto exit_bh; + goto exit_journal; } brelse(gdb); } @@ -254,7 +247,17 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, GFP_NOFS); if (err) - goto exit_bh; + goto exit_journal; + + err = extend_or_restart_transaction(handle, 2); + if (err) + goto exit_journal; + + bh = bclean(handle, sb, input->block_bitmap); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto exit_journal; + } if (ext4_bg_has_super(sb, input->group)) { ext4_debug("mark backup group tables %#04llx (+0)\n", start); @@ -278,8 +281,6 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_set_bits(bh->b_data, input->inode_table - start, sbi->s_itb_per_group); - if ((err = extend_or_restart_transaction(handle, 2, bh))) - goto exit_bh; ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); -- cgit v0.10.2 From e6075e984d100c12bb79267639c3f661d9788a67 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 27 Jul 2011 20:40:18 -0400 Subject: ext4: remove lock_buffer in bclean() and setup_new_group_blocks() There is no need to lock the buffers since no one else should be touching these buffers besides the file system. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 5b423f8..65e5cb6 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -147,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, brelse(bh); bh = ERR_PTR(err); } else { - lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bh); - unlock_buffer(bh); } return bh; @@ -229,10 +227,8 @@ static int setup_new_group_blocks(struct super_block *sb, brelse(gdb); goto exit_journal; } - lock_buffer(gdb); memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); - unlock_buffer(gdb); err = ext4_handle_dirty_metadata(handle, NULL, gdb); if (unlikely(err)) { brelse(gdb); -- cgit v0.10.2 From 2f919710143cb2025157c3c193ee22de86f3ed73 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 27 Jul 2011 21:16:33 -0400 Subject: ext4: simplify parameters of add_new_gdb() add_new_gdb() only needs the block group number; there is no need to pass a pointer to struct ext4_new_group_data to add_new_gdb(). Instead of filling in a pointer the struct buffer_head in add_new_gdb(), it's simpler to have the caller fetch it from the s_group_desc[] array. [Fixed error path to handle the case where struct buffer_head *primary hasn't been set yet. -- Ted] Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 65e5cb6..9e45355 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -394,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb, * fail once we start modifying the data on disk, because JBD has no rollback. */ static int add_new_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input, - struct buffer_head **primary) + ext4_group_t group) { struct super_block *sb = inode->i_sb; struct ext4_super_block *es = EXT4_SB(sb)->s_es; - unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc; struct buffer_head *dind; + struct buffer_head *gdb_bh; int gdbackups; struct ext4_iloc iloc; __le32 *data; @@ -425,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return -EPERM; } - *primary = sb_bread(sb, gdblock); - if (!*primary) + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) return -EIO; - if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + gdbackups = verify_reserved_gdb(sb, gdb_bh); + if (gdbackups < 0) { err = gdbackups; goto exit_bh; } @@ -444,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { ext4_warning(sb, "new group %u GDT block %llu not reserved", - input->group, gdblock); + group, gdblock); err = -EINVAL; goto exit_dind; } @@ -453,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dind; - err = ext4_journal_get_write_access(handle, *primary); + err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) goto exit_sbh; @@ -492,8 +493,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); - memset((*primary)->b_data, 0, sb->s_blocksize); - err = ext4_handle_dirty_metadata(handle, NULL, *primary); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); goto exit_inode; @@ -503,7 +504,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = *primary; + n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; kfree(o_group_desc); @@ -525,7 +526,7 @@ exit_sbh: exit_dind: brelse(dind); exit_bh: - brelse(*primary); + brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); return err; @@ -833,8 +834,16 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && (err = reserve_backup_gdb(handle, inode, input))) goto exit_journal; - } else if ((err = add_new_gdb(handle, inode, input, &primary))) - goto exit_journal; + } else { + /* + * Note that we can access new group descriptor block safely + * only if add_new_gdb() succeeds. + */ + err = add_new_gdb(handle, inode, input->group); + if (err) + goto exit_journal; + primary = sbi->s_group_desc[gdb_num]; + } /* * OK, now we've set up the new group. Time to make it active. @@ -944,7 +953,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) exit_journal: if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; - if (!err) { + if (!err && primary) { update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); update_backups(sb, primary->b_blocknr, primary->b_data, -- cgit v0.10.2 From 668f4dc5593327fadc95b33189c375f7178ef88e Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 27 Jul 2011 21:23:13 -0400 Subject: ext4: simplify parameters of reserve_backup_gdb() The reserve_backup_gdb() function only needs the block group number; there's no need to pass a pointer to struct ext4_new_group_data to it. Signed-off-by: Yongqiang Yang diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9e45355..6e3327d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -546,7 +546,7 @@ exit_bh: * backup GDT blocks are stored in their reserved primary GDT block. */ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input) + ext4_group_t group) { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); @@ -617,7 +617,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, * Finally we can add each of the reserved backup GDT blocks from * the new group to its reserved primary GDT block. */ - blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + blk = group * EXT4_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; @@ -831,9 +831,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if ((err = ext4_journal_get_write_access(handle, primary))) goto exit_journal; - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && - (err = reserve_backup_gdb(handle, inode, input))) - goto exit_journal; + if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { + err = reserve_backup_gdb(handle, inode, input->group); + if (err) + goto exit_journal; + } } else { /* * Note that we can access new group descriptor block safely -- cgit v0.10.2 From 0e1147b001793593624e80b3c0a1790822b6baca Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 27 Jul 2011 21:29:33 -0400 Subject: ext4: add action of moving index in ext4_ext_rm_idx for Punch Hole The old function ext4_ext_rm_idx is used only for truncate case because it just remove last index in extent-index-block. When punching hole, it usually needed to remove "middle" index, therefore we must move indexes which after it forward. (I create a file with 1 depth extent tree and punch hole in the middle of it, the last index in index-block strangly gone, so I find out this bug) Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4d73e11..a25bbdc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2101,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, /* * ext4_ext_rm_idx: * removes index from the index block. - * It's used in truncate case only, thus all requests are for - * last index in the block only. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -2120,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path); if (err) return err; + + if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { + int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path->p_idx, path->p_idx + 1, len); + } + le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) -- cgit v0.10.2 From 29ae07b702cb77dbc24b0843f15ee8cf8a642311 Mon Sep 17 00:00:00 2001 From: Utako Kusaka Date: Wed, 27 Jul 2011 22:11:20 -0400 Subject: ext4: Fix overflow caused by missing cast in ext4_fallocate() The logical block number in map.l_blk is a __u32, and so before we shift it left, by the block size, we neeed cast it to a 64-bit size. Otherwise i_size can be corrupted on an ENOSPC. # df -T /mnt/mp1 Filesystem Type 1K-blocks Used Available Use% Mounted on /dev/sda6 ext4 9843276 153056 9190200 2% /mnt/mp1 # fallocate -o 0 -l 2199023251456 /mnt/mp1/testfile fallocate: /mnt/mp1/testfile: fallocate failed: No space left on device # stat /mnt/mp1/testfile File: `/mnt/mp1/testfile' Size: 4293656576 Blocks: 19380440 IO Block: 4096 regular file Device: 806h/2054d Inode: 12 Links: 1 Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root) Access: 2011-07-25 13:01:31.414490496 +0900 Modify: 2011-07-25 13:01:31.414490496 +0900 Change: 2011-07-25 13:01:31.454490495 +0900 Signed-off-by: Utako Kusaka Signed-off-by: "Theodore Ts'o" -- fs/ext4/extents.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a25bbdc..57cf568 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3824,7 +3824,7 @@ retry: blkbits) >> blkbits)) new_size = offset + len; else - new_size = (map.m_lblk + ret) << blkbits; + new_size = ((loff_t) map.m_lblk + ret) << blkbits; ext4_falloc_update_inode(inode, mode, new_size, (map.m_flags & EXT4_MAP_NEW)); -- cgit v0.10.2 From d59729f4e794f814b25ccd2aebfbe606242c4544 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 30 Jul 2011 12:34:19 -0400 Subject: ext4: fix races in ext4_sync_parent() Fix problems if fsync() races against a rename of a parent directory as pointed out by Al Viro in his own inimitable way: >While we are at it, could somebody please explain what the hell is ext4 >doing in >static int ext4_sync_parent(struct inode *inode) >{ > struct writeback_control wbc; > struct dentry *dentry = NULL; > int ret = 0; > > while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { > ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); > dentry = list_entry(inode->i_dentry.next, > struct dentry, d_alias); > if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) > break; > inode = dentry->d_parent->d_inode; > ret = sync_mapping_buffers(inode->i_mapping); > ... >Note that dentry obviously can't be NULL there. dentry->d_parent is never >NULL. And dentry->d_parent would better not be negative, for crying out >loud! What's worse, there's no guarantees that dentry->d_parent will >remain our parent over that sync_mapping_buffers() *and* that inode won't >just be freed under us (after rename() and memory pressure leading to >eviction of what used to be our dentry->d_parent)...... Reported-by: Al Viro Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index ce66d2f..f9dbe33 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode) { struct writeback_control wbc; struct dentry *dentry = NULL; + struct inode *next; int ret = 0; - while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + inode = igrab(inode); + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = list_entry(inode->i_dentry.next, - struct dentry, d_alias); - if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + dentry = NULL; + spin_lock(&inode->i_lock); + if (!list_empty(&inode->i_dentry)) { + dentry = list_first_entry(&inode->i_dentry, + struct dentry, d_alias); + dget(dentry); + } + spin_unlock(&inode->i_lock); + if (!dentry) + break; + next = igrab(dentry->d_parent->d_inode); + dput(dentry); + if (!next) break; - inode = dentry->d_parent->d_inode; + iput(inode); + inode = next; ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } + iput(inode); return ret; } -- cgit v0.10.2 From 59be8e7280c10fd8f078ba6dc2bcdc2b1453b6ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 30 Jul 2011 12:38:46 -0400 Subject: ext4: change umode_t in tracepoint headers to be an explicit __u16 As requested by Al Viro, since umode_t may be changing to a u32 for some architectures. Signed-off-by: "Theodore Ts'o" Cc: Al Viro diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 51d8813..bf5518d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -23,7 +23,7 @@ TRACE_EVENT(ext4_free_inode, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u64, blocks ) @@ -52,7 +52,7 @@ TRACE_EVENT(ext4_request_inode, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, dir ) - __field( umode_t, mode ) + __field( __u16, mode ) ), TP_fast_assign( @@ -75,7 +75,7 @@ TRACE_EVENT(ext4_allocate_inode, __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, dir ) - __field( umode_t, mode ) + __field( __u16, mode ) ), TP_fast_assign( @@ -727,7 +727,7 @@ TRACE_EVENT(ext4_free_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, block ) __field( unsigned long, count ) __field( int, flags ) @@ -1014,7 +1014,7 @@ TRACE_EVENT(ext4_forget, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( int, is_metadata ) __field( __u64, block ) ), @@ -1041,7 +1041,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) @@ -1078,7 +1078,7 @@ TRACE_EVENT(ext4_da_reserve_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, md_needed ) __field( int, reserved_data_blocks ) @@ -1112,7 +1112,7 @@ TRACE_EVENT(ext4_da_release_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) -- cgit v0.10.2 From c49bafa3842751b8955a962859f42d307673d75d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 30 Jul 2011 12:58:41 -0400 Subject: ext4: add missing kfree() on error return path in add_new_gdb() We added some more error handling in b40971426a "ext4: add error checking to calls to ext4_handle_dirty_metadata()". But we need to call kfree() as well to avoid a memory leak. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6e3327d..71085df 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -517,6 +517,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: + kfree(n_group_desc); /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: -- cgit v0.10.2 From 33853a0dde359ded0534204eb6857ad5166d515b Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 1 Aug 2011 06:32:19 -0400 Subject: ext4: use the correct error exit path in ext4_init_inode_table() This patch lets ext4_init_inode_table() handle errors right. ext4_init_inode_table() should down_write() alloc_sem which has been up_write()ed and stop the started journal handle. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21bb2f6..9c63f27 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; - goto out; + goto err_out; } blk = ext4_inode_table(sb, gdp) + used_blks; -- cgit v0.10.2 From 9933fc0ac1ac14b795819cd63d05ea92112f690a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Aug 2011 08:45:02 -0400 Subject: ext4: introduce ext4_kvmalloc(), ext4_kzalloc(), and ext4_kvfree() Introduce new helper functions which try kmalloc, and then fall back to vmalloc if necessary, and use them for allocating and deallocating s_flex_groups. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ba2009b..db9fead 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1874,6 +1874,9 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern void ext4_kvfree(void *ptr); extern void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...) __attribute__ ((format (printf, 4, 5))); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cfe9f39..658f586 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = { #define IS_EXT3_SB(sb) (0) #endif +void *ext4_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +void *ext4_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + +void ext4_kvfree(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); + +} + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -791,10 +820,7 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); + ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1977,15 +2003,11 @@ static int ext4_fill_flex_info(struct super_block *sb) ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vzalloc(size); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, - "not enough memory for %u flex groups", - flex_group_count); - goto failed; - } + ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", + flex_group_count); + goto failed; } for (i = 0; i < sbi->s_groups_count; i++) { @@ -3750,12 +3772,8 @@ failed_mount_wq: } failed_mount3: del_timer(&sbi->s_err_report); - if (sbi->s_flex_groups) { - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); - } + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); -- cgit v0.10.2 From f18a5f21c25707b4fe64b326e2b4d150565e7300 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Aug 2011 08:45:38 -0400 Subject: ext4: use ext4_kvzalloc()/ext4_kvmalloc() for s_group_desc and s_group_info Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index fa716c9..d5021e8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2331,7 +2331,7 @@ static int ext4_mb_init_backend(struct super_block *sb) /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); return -ENOMEM; @@ -2365,7 +2365,7 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); return -ENOMEM; } @@ -2559,7 +2559,7 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 71085df..707d3f1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -467,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dindj; - n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning(sb, - "not enough memory for %lu groups", gdb_num + 1); + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); goto exit_inode; } @@ -507,7 +508,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - kfree(o_group_desc); + ext4_kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); @@ -517,7 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: - kfree(n_group_desc); + ext4_kvfree(n_group_desc); /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 658f586..e2d88ba 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -819,7 +819,7 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_group_desc); ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -3439,8 +3439,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), - GFP_KERNEL); + sbi->s_group_desc = ext4_kvmalloc(db_count * + sizeof(struct buffer_head *), + GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); goto failed_mount; @@ -3783,7 +3784,7 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); -- cgit v0.10.2 From 9d8b9ec44234b2f6e0225300632d250210c04f11 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Aug 2011 17:41:35 -0400 Subject: ext4: use ext4_msg() instead of printk in mballoc Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d5021e8..70d1b3e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -493,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %u " - "at byte %u(%u): %x in copy != %x " - "on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -2224,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + "for a buddy group"); goto exit_meta_group_info; } sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = @@ -2238,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); goto exit_group_info; } memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2333,12 +2334,12 @@ static int ext4_mb_init_backend(struct super_block *sb) * So a two level scheme suffices for now. */ sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } sbi->s_buddy_cache->i_ino = get_next_ino(); @@ -2346,8 +2347,7 @@ static int ext4_mb_init_backend(struct super_block *sb) for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR - "EXT4-fs: can't read descriptor %u\n", i); + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2411,7 +2411,8 @@ static int ext4_groupinfo_create_slab(size_t size) mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { - printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } @@ -2566,25 +2567,25 @@ int ext4_mb_release(struct super_block *sb) if (sbi->s_buddy_cache) iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { - printk(KERN_INFO - "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); - printk(KERN_INFO - "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); - printk(KERN_INFO - "EXT4-fs: mballoc: %lu generated and it took %Lu\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); - printk(KERN_INFO - "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } @@ -3024,9 +3025,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { - printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); @@ -3607,10 +3609,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, bit = next + 1; } if (free != pa->pa_free) { - printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* @@ -3798,7 +3801,8 @@ repeat: * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); - printk(KERN_ERR "uh-oh! used pa while discarding\n"); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; @@ -3875,12 +3879,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - printk(KERN_ERR "EXT4-fs: Can't allocate:" - " Allocation context details:\n"); - printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", ac->ac_status, ac->ac_flags); - printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, @@ -3894,9 +3899,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, - ac->ac_found); - printk(KERN_ERR "EXT4-fs: groups: \n"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", + ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); -- cgit v0.10.2 From 48e6061bf4bb25eec151b91f22fd90a5b9a4920a Mon Sep 17 00:00:00 2001 From: Yu Jian Date: Mon, 1 Aug 2011 17:41:39 -0400 Subject: ext4: use EXT4_BAD_INO for buddy cache to avoid colliding with valid inode # Signed-off-by: Yu Jian Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 70d1b3e..e41620b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2342,7 +2342,11 @@ static int ext4_mb_init_backend(struct super_block *sb) ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } - sbi->s_buddy_cache->i_ino = get_next_ino(); + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); -- cgit v0.10.2 From 79a77c5ac34cc27ccbfbdf7113b41cdd93534eab Mon Sep 17 00:00:00 2001 From: Yu Jian Date: Mon, 1 Aug 2011 17:41:46 -0400 Subject: ext4: prevent memory leaks from ext4_mb_init_backend() on error path In ext4_mb_init(), if the s_locality_group allocation fails it will currently cause the allocations made in ext4_mb_init_backend() to be leaked. Moving the ext4_mb_init_backend() allocation after the s_locality_group allocation avoids that problem. Signed-off-by: Yu Jian Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e41620b..17a5a57 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2465,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } - spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); @@ -2507,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) { + goto out; + } + if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); -- cgit v0.10.2