From e3014d14a81edde488d9a6758eea8afc41752d2d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:38:11 -0400 Subject: ext4: fixup free space calculations when expanding inodes Conditions checking whether there is enough free space in an xattr block and when xattr is large enough to make enough space in the inode forgot to account for the fact that inode need not be completely filled up with xattrs. Thus we could move unnecessarily many xattrs out of inode or even falsely claim there is not enough space to expand the inode. We also forgot to update the amount of free space in xattr block when moving more xattrs and thus could decide to move too big xattr resulting in unexpected failure. Fix these problems by properly updating free space in the inode and xattr block as we move xattrs. To simplify the math, avoid shifting xattrs after removing each one xattr and instead just shift xattrs only once there is enough free space in the inode. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2eb935c..22d2ebc 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1350,7 +1350,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; - size_t min_offs, free; + size_t min_offs; + size_t ifree, bfree; int total_ino; void *base, *start, *end; int error = 0, tried_min_extra_isize = 0; @@ -1385,17 +1386,9 @@ retry: if (error) goto cleanup; - free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= isize_diff) { - entry = IFIRST(header); - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - new_extra_isize, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, - (void *)header, total_ino, - inode->i_sb->s_blocksize); - EXT4_I(inode)->i_extra_isize = new_extra_isize; - goto out; - } + ifree = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (ifree >= isize_diff) + goto shift; /* * Enough free space isn't available in the inode, check if @@ -1416,8 +1409,8 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < isize_diff) { + bfree = ext4_xattr_free_space(first, &min_offs, base, NULL); + if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; @@ -1428,10 +1421,10 @@ retry: goto cleanup; } } else { - free = inode->i_sb->s_blocksize; + bfree = inode->i_sb->s_blocksize; } - while (isize_diff > 0) { + while (isize_diff > ifree) { size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; struct ext4_xattr_info i = { @@ -1439,7 +1432,6 @@ retry: .value_len = 0, }; unsigned int total_size; /* EA entry size + value size */ - unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ unsigned int min_total_size = ~0U; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); @@ -1461,8 +1453,9 @@ retry: total_size = EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); - if (total_size <= free && total_size < min_total_size) { - if (total_size < isize_diff) { + if (total_size <= bfree && + total_size < min_total_size) { + if (total_size + ifree < isize_diff) { small_entry = last; } else { entry = last; @@ -1491,6 +1484,7 @@ retry: offs = le16_to_cpu(entry->e_value_offs); size = le32_to_cpu(entry->e_value_size); entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + EXT4_XATTR_SIZE(size); i.name_index = entry->e_name_index, buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); @@ -1518,21 +1512,8 @@ retry: if (error) goto cleanup; total_ino -= entry_size; - - entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) - shift_bytes = isize_diff; - else - shift_bytes = entry_size + EXT4_XATTR_SIZE(size); - /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, -shift_bytes, - (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + - EXT4_I(inode)->i_extra_isize + shift_bytes, - (void *)header, total_ino, inode->i_sb->s_blocksize); - - isize_diff -= shift_bytes; - EXT4_I(inode)->i_extra_isize += shift_bytes; - header = IHDR(inode, raw_inode); + ifree += total_size; + bfree -= total_size; i.name = b_entry_name; i.value = buffer; @@ -1553,6 +1534,15 @@ retry: kfree(is); kfree(bs); } + +shift: + /* Adjust the offsets and shift the remaining entries ahead */ + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); -- cgit v0.10.2 From 2de58f1102cf1ac6091209e1dfa8eccbcb039570 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:39:11 -0400 Subject: ext4: Check that external xattr value block is zero Currently we don't support xattrs with values stored out of line. Check for that in ext4_xattr_check_names() to make sure we never work with such xattrs since not all the code counts with that resulting is possible weird corruption issues. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 22d2ebc..f845cb7 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -199,6 +199,8 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, } while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_block != 0) + return -EFSCORRUPTED; if (entry->e_value_size != 0 && (value_start + le16_to_cpu(entry->e_value_offs) < (void *)e + sizeof(__u32) || -- cgit v0.10.2 From 1cba423707b47886391c7fcb3614fc67394be06e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:40:11 -0400 Subject: ext4: remove checks for e_value_block Currently we don't support xattrs with e_value_block set. We don't allow them to pass initial xattr check so there's no point for checking for this later. Since these tests were untested, bugs were creeping in and not all places which should have checked were checking e_value_block anyway. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f845cb7..1447860 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -643,7 +643,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -663,7 +663,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; @@ -671,7 +671,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) } free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { size_t size = le32_to_cpu(s->here->e_value_size); free += EXT4_XATTR_SIZE(size); } @@ -693,7 +693,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) s->here->e_name_len = name_len; memcpy(s->here->e_name, i->name, name_len); } else { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(s->here->e_value_offs); void *val = s->base + offs; @@ -727,8 +727,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && - last->e_value_size && o < offs) + if (last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + size); last = EXT4_XATTR_NEXT(last); @@ -1327,7 +1326,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; BUG_ON(new_offs + le32_to_cpu(last->e_value_size) @@ -1726,7 +1725,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, *name++; } - if (entry->e_value_block == 0 && entry->e_value_size != 0) { + if (entry->e_value_size != 0) { __le32 *value = (__le32 *)((char *)header + le16_to_cpu(entry->e_value_offs)); for (n = (le32_to_cpu(entry->e_value_size) + -- cgit v0.10.2 From 94405713889d4a9d341b4ad92956e4e2ec8ec2c2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:41:11 -0400 Subject: ext4: replace bogus assertion in ext4_xattr_shift_entries() We were checking whether computed offsets do not exceed end of block in ext4_xattr_shift_entries(). However this does not make sense since we always only decrease offsets. So replace that assertion with a check whether we really decrease xattrs value offsets. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1447860..82b025c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1319,18 +1319,19 @@ retry: */ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, - void *from, size_t n, int blocksize) + void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; + /* We always shift xattr headers further thus offsets get lower */ + BUG_ON(value_offs_shift > 0); + /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; - BUG_ON(new_offs + le32_to_cpu(last->e_value_size) - > blocksize); last->e_value_offs = cpu_to_le16(new_offs); } } @@ -1542,7 +1543,7 @@ shift: ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, - (void *)header, total_ino, inode->i_sb->s_blocksize); + (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: -- cgit v0.10.2 From 3f2571c1f91f2de729562344b4956786a2c74d73 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:42:11 -0400 Subject: ext4: factor out xattr moving Factor out function for moving xattrs from inode into external xattr block from ext4_expand_extra_isize_ea(). That function is already quite long and factoring out this rather standalone functionality helps readability. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 82b025c..8f582ae 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1340,6 +1340,84 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, } /* + * Move xattr pointed to by 'entry' from inode into external xattr block + */ +static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t value_offs, value_size; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + .name_index = entry->e_name_index, + }; + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int error; + + value_offs = le16_to_cpu(entry->e_value_offs); + value_size = le32_to_cpu(entry->e_value_size); + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + buffer = kmalloc(value_size, GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!is || !bs || !buffer || !b_entry_name) { + error = -ENOMEM; + goto out; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto out; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto out; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto out; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = value_size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto out; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto out; + error = 0; +out: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + + return error; +} + +/* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ @@ -1349,9 +1427,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry, *last, *first; struct buffer_head *bh = NULL; - struct ext4_xattr_ibody_find *is = NULL; - struct ext4_xattr_block_find *bs = NULL; - char *buffer = NULL, *b_entry_name = NULL; size_t min_offs; size_t ifree, bfree; int total_ino; @@ -1427,27 +1502,11 @@ retry: } while (isize_diff > ifree) { - size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; - struct ext4_xattr_info i = { - .value = NULL, - .value_len = 0, - }; - unsigned int total_size; /* EA entry size + value size */ + unsigned int entry_size; /* EA entry size */ + unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size = ~0U; - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); - bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - if (!is || !bs) { - error = -ENOMEM; - goto cleanup; - } - - is->s.not_found = -ENODATA; - bs->s.not_found = -ENODATA; - is->iloc.bh = NULL; - bs->bh = NULL; - last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ entry = NULL; @@ -1474,8 +1533,6 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - kfree(is); is = NULL; - kfree(bs); bs = NULL; brelse(bh); goto retry; } @@ -1483,58 +1540,18 @@ retry: goto cleanup; } } - offs = le16_to_cpu(entry->e_value_offs); - size = le32_to_cpu(entry->e_value_size); - entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + EXT4_XATTR_SIZE(size); - i.name_index = entry->e_name_index, - buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); - b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!buffer || !b_entry_name) { - error = -ENOMEM; - goto cleanup; - } - /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + offs, - EXT4_XATTR_SIZE(size)); - memcpy(b_entry_name, entry->e_name, entry->e_name_len); - b_entry_name[entry->e_name_len] = '\0'; - i.name = b_entry_name; - - error = ext4_get_inode_loc(inode, &is->iloc); - if (error) - goto cleanup; - error = ext4_xattr_ibody_find(inode, &i, is); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + error = ext4_xattr_move_to_block(handle, inode, raw_inode, + entry); if (error) goto cleanup; - /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); - if (error) - goto cleanup; total_ino -= entry_size; ifree += total_size; bfree -= total_size; - - i.name = b_entry_name; - i.value = buffer; - i.value_len = size; - error = ext4_xattr_block_find(inode, &i, bs); - if (error) - goto cleanup; - - /* Add entry which was removed from the inode into the block */ - error = ext4_xattr_block_set(handle, inode, &i, bs); - if (error) - goto cleanup; - kfree(b_entry_name); - kfree(buffer); - b_entry_name = NULL; - buffer = NULL; - brelse(is->iloc.bh); - kfree(is); - kfree(bs); } shift: @@ -1552,12 +1569,6 @@ out: return 0; cleanup: - kfree(b_entry_name); - kfree(buffer); - if (is) - brelse(is->iloc.bh); - kfree(is); - kfree(bs); brelse(bh); /* * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode -- cgit v0.10.2 From 6e0cd088c01023c02b1e887e02c8b6f3f395344f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:43:11 -0400 Subject: ext4: remove (almost) unused variables from ext4_expand_extra_isize_ea() 'start' variable is completely unused in ext4_expand_extra_isize_ea(). Variable 'first' is used only once in one place. So just remove them. Variables 'entry' and 'last' are only really used later in the function inside a loop. Move their declarations there. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8f582ae..2ef6876 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1425,12 +1425,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry, *last, *first; struct buffer_head *bh = NULL; size_t min_offs; size_t ifree, bfree; int total_ino; - void *base, *start, *end; + void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ @@ -1446,24 +1445,22 @@ retry: goto out; header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ - base = start = entry; + base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - last = entry; total_ino = sizeof(struct ext4_xattr_ibody_header); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; - ifree = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; @@ -1483,10 +1480,10 @@ retry: goto cleanup; } base = BHDR(bh); - first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - bfree = ext4_xattr_free_space(first, &min_offs, base, NULL); + bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, + NULL); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; @@ -1502,14 +1499,14 @@ retry: } while (isize_diff > ifree) { - struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_entry *small_entry = NULL, *entry = NULL; + struct ext4_xattr_entry *last; unsigned int entry_size; /* EA entry size */ unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size = ~0U; last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ - entry = NULL; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { total_size = EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + @@ -1556,8 +1553,7 @@ retry: shift: /* Adjust the offsets and shift the remaining entries ahead */ - entry = IFIRST(header); - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); -- cgit v0.10.2 From dfa2064b222c901b05c19ec5b7f42a25f7bee0e3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:44:11 -0400 Subject: ext4: factor out loop for freeing inode xattr space Move loop to make enough space in the inode from ext4_expand_extra_isize_ea() into a separate function to make that function smaller and better readable and also to avoid delaration of variables inside a loop block. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2ef6876..c15d633 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1417,6 +1417,63 @@ out: return error; } +static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + int isize_diff, size_t ifree, + size_t bfree, int *total_ino) +{ + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + struct ext4_xattr_entry *small_entry; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *last; + unsigned int entry_size; /* EA entry size */ + unsigned int total_size; /* EA entry size + value size */ + unsigned int min_total_size; + int error; + + while (isize_diff > ifree) { + entry = NULL; + small_entry = NULL; + min_total_size = ~0U; + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= bfree && + total_size < min_total_size) { + if (total_size + ifree < isize_diff) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry == NULL) + return -ENOSPC; + entry = small_entry; + } + + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + error = ext4_xattr_move_to_block(handle, inode, raw_inode, + entry); + if (error) + return error; + + *total_ino -= entry_size; + ifree += total_size; + bfree -= total_size; + } + + return 0; +} + /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. @@ -1491,66 +1548,26 @@ retry: brelse(bh); goto retry; } - error = -1; + error = -ENOSPC; goto cleanup; } } else { bfree = inode->i_sb->s_blocksize; } - while (isize_diff > ifree) { - struct ext4_xattr_entry *small_entry = NULL, *entry = NULL; - struct ext4_xattr_entry *last; - unsigned int entry_size; /* EA entry size */ - unsigned int total_size; /* EA entry size + value size */ - unsigned int min_total_size = ~0U; - - last = IFIRST(header); - /* Find the entry best suited to be pushed into EA block */ - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); - if (total_size <= bfree && - total_size < min_total_size) { - if (total_size + ifree < isize_diff) { - small_entry = last; - } else { - entry = last; - min_total_size = total_size; - } - } - } - - if (entry == NULL) { - if (small_entry) { - entry = small_entry; - } else { - if (!tried_min_extra_isize && - s_min_extra_isize) { - tried_min_extra_isize++; - new_extra_isize = s_min_extra_isize; - brelse(bh); - goto retry; - } - error = -1; - goto cleanup; - } + error = ext4_xattr_make_inode_space(handle, inode, raw_inode, + isize_diff, ifree, bfree, + &total_ino); + if (error) { + if (error == -ENOSPC && !tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; } - - entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); - error = ext4_xattr_move_to_block(handle, inode, raw_inode, - entry); - if (error) - goto cleanup; - - total_ino -= entry_size; - ifree += total_size; - bfree -= total_size; + goto cleanup; } - shift: /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize -- cgit v0.10.2 From 14fbd4aa613bd5110556c281799ce36dc6f3ba97 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 29 Aug 2016 15:45:11 -0400 Subject: ext4: enforce online defrag restriction for encrypted files Online defragging of encrypted files is not currently implemented. However, the move extent ioctl can still return successfully when called. For example, this occurs when xfstest ext4/020 is run on an encrypted file system, resulting in a corrupted test file and a corresponding test failure. Until the proper functionality is implemented, fail the move extent ioctl if either the original or donor file is encrypted. Cc: stable@vger.kernel.org Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a920c5d..6fc14de 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -598,6 +598,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } + if (ext4_encrypted_inode(orig_inode) || + ext4_encrypted_inode(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); -- cgit v0.10.2 From 8913f343cdb568222c95afe3cad78aee1a4df56b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 31 Aug 2016 11:44:36 -0400 Subject: mbcache: fix to detect failure of register_shrinker register_shrinker in mb_cache_create may fail due to no memory. This patch fixes to do the check of return value of register_shrinker and handle the error case, otherwise mb_cache_create may return with no error, but losing the inner shrinker. Signed-off-by: Chao Yu Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/mbcache.c b/fs/mbcache.c index eccda3a..c5bd19f 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -366,7 +366,11 @@ struct mb_cache *mb_cache_create(int bucket_bits) cache->c_shrink.count_objects = mb_cache_count; cache->c_shrink.scan_objects = mb_cache_scan; cache->c_shrink.seeks = DEFAULT_SEEKS; - register_shrinker(&cache->c_shrink); + if (register_shrinker(&cache->c_shrink)) { + kfree(cache->c_hash); + kfree(cache); + goto err_out; + } INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); -- cgit v0.10.2 From 93e3b4e6631d2a74a8cf7429138096862ff9f452 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 5 Sep 2016 22:56:10 -0400 Subject: ext4: reinforce check of i_dtime when clearing high fields of uid and gid Now, ext4_do_update_inode() clears high 16-bit fields of uid/gid of deleted and evicted inode to fix up interoperability with old kernels. However, it checks only i_dtime of an inode to determine whether the inode was deleted and evicted, and this is very risky, because i_dtime can be used for the pointer maintaining orphan inode list, too. We need to further check whether the i_dtime is being used for the orphan inode list even if the i_dtime is not NULL. We found that high 16-bit fields of uid/gid of inode are unintentionally and permanently cleared when the inode truncation is just triggered, but not finished, and the inode metadata, whose high uid/gid bits are cleared, is written on disk, and the sudden power-off follows that in order. Cc: stable@vger.kernel.org Signed-off-by: Daeho Jeong Signed-off-by: Hobin Woo Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6ea25a..15d4b41 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4814,14 +4814,14 @@ static int ext4_do_update_inode(handle_t *handle, * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if (!ei->i_dtime) { + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); -- cgit v0.10.2 From 49da939272f417ff94c40f132a308748b46efe68 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 5 Sep 2016 23:08:16 -0400 Subject: ext4: enable quota enforcement based on mount options When quota information is stored in quota files, we enable only quota accounting on mount and enforcement is enabled only in response to Q_QUOTAON quotactl. To make ext4 behavior consistent with XFS, we add a possibility to enable quota enforcement on mount by specifying corresponding quota mount option (usrquota, grpquota, prjquota). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ea31931..0c2bf44 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1117,9 +1117,15 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ec8708..5819b0e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1267,7 +1267,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, @@ -1327,6 +1327,7 @@ static const match_table_t tokens = { {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, + {Opt_prjquota, "prjquota"}, {Opt_barrier, "barrier=%u"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, @@ -1546,8 +1547,11 @@ static const struct mount_opts { MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, + {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, + MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | - EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), + MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_offusrjquota, 0, MOPT_Q}, @@ -1836,13 +1840,17 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA - if (ext4_has_feature_quota(sb) && - (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " - "mount options ignored."); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) { + ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return 0; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -5250,12 +5258,18 @@ static int ext4_enable_quotas(struct super_block *sb) le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; + bool quota_mopt[EXT4_MAXQUOTAS] = { + test_opt(sb, USRQUOTA), + test_opt(sb, GRPQUOTA), + test_opt(sb, PRJQUOTA), + }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, - DQUOT_USAGE_ENABLED); + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " -- cgit v0.10.2 From 0b7b77791cc1d99cbca08d5bc8210e57e6165612 Mon Sep 17 00:00:00 2001 From: Kaho Ng Date: Mon, 5 Sep 2016 23:11:58 -0400 Subject: ext4: remove old feature helpers Use the ext4_{has,set,clear}_feature_* helpers to replace the old feature helpers. Signed-off-by: Kaho Ng Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0c2bf44..962a044 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1642,26 +1642,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) * Feature set definitions */ -/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */ -#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) -#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) - #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9e66cd1..170421e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -802,7 +802,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, } else inode_init_owner(inode, dir, mode); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 15d4b41..f058afb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4414,7 +4414,7 @@ static inline void ext4_iget_extra_inode(struct inode *inode, int ext4_get_projid(struct inode *inode, kprojid_t *projid) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; @@ -4481,7 +4481,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); @@ -4885,8 +4885,7 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT) && + BUG_ON(!ext4_has_feature_project(inode->i_sb) && i_projid != EXT4_DEF_PROJID); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd..ef4430b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -310,8 +310,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else @@ -842,8 +841,7 @@ resizefs_out: ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (ext4_has_feature_project(inode->i_sb)) { fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, EXT4_I(inode)->i_projid); } -- cgit v0.10.2 From 6ae4c5a69877e5afb6ce63f7d82432133934073a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 5 Sep 2016 23:21:43 -0400 Subject: ext4: cleanup ext4_sync_parent() A condition !hlist_empty(&inode->i_dentry) is always true for open file. Just remove it. Also ext4_sync_parent() could use some explanation why races with rmdir() are not an issue - add a comment explaining that. Reported-by: Al Viro Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5c43725..88effb1 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -61,6 +61,13 @@ static int ext4_sync_parent(struct inode *inode) break; iput(inode); inode = next; + /* + * The directory inode may have gone through rmdir by now. But + * the inode itself and its blocks are still allocated (we hold + * a reference to the inode so it didn't go through + * ext4_evict_inode()) and so we are safe to flush metadata + * blocks and the inode. + */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -107,7 +114,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!journal) { ret = __generic_file_fsync(file, start, end, datasync); - if (!ret && !hlist_empty(&inode->i_dentry)) + if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) goto issue_flush; -- cgit v0.10.2 From e22834f0248d0fa841ead6436d6c19f65539dc9c Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 5 Sep 2016 23:38:36 -0400 Subject: ext4: improve ext4lazyinit scalability ext4lazyinit is a global thread. This thread performs itable initalization under li_list_mtx mutex. It basically does the following: ext4_lazyinit_thread ->mutex_lock(&eli->li_list_mtx); ->ext4_run_li_request(elr) ->ext4_init_inode_table-> Do a lot of IO if the list is large And when new mount/umount arrive they have to block on ->li_list_mtx because lazy_thread holds it during full walk procedure. ext4_fill_super ->ext4_register_li_request ->mutex_lock(&ext4_li_info->li_list_mtx); ->list_add(&elr->lr_request, &ext4_li_info >li_request_list); In my case mount takes 40minutes on server with 36 * 4Tb HDD. Common user may face this in case of very slow dev ( /dev/mmcblkXXX) Even more. If one of filesystems was frozen lazyinit_thread will simply block on sb_start_write() so other mount/umount will be stuck forever. This patch changes logic like follows: - grab ->s_umount read sem before processing new li_request. After that it is safe to drop li_list_mtx because all callers of li_remove_request are holding ->s_umount for write. - li_thread skips frozen SB's Locking order: Mh KOrder is asserted by umount path like follows: s_umount ->li_list_mtx so the only way to to grab ->s_mount inside li_thread is via down_read_trylock xfstests:ext4/023 #PSBM-49658 Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5819b0e..50912cc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2749,7 +2749,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; - sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2776,8 +2775,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } - sb_end_write(sb); - return ret; } @@ -2842,19 +2839,43 @@ cont_thread: mutex_unlock(&eli->li_list_mtx); goto exit_thread; } - list_for_each_safe(pos, n, &eli->li_request_list) { + int err = 0; + int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) { - if (ext4_run_li_request(elr) != 0) { - /* error, remove the lazy_init job */ - ext4_remove_li_request(elr); - continue; + if (time_before(jiffies, elr->lr_next_sched)) { + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + continue; + } + if (down_read_trylock(&elr->lr_super->s_umount)) { + if (sb_start_write_trylock(elr->lr_super)) { + progress = 1; + /* + * We hold sb->s_umount, sb can not + * be removed from the list, it is + * now safe to drop li_list_mtx + */ + mutex_unlock(&eli->li_list_mtx); + err = ext4_run_li_request(elr); + sb_end_write(elr->lr_super); + mutex_lock(&eli->li_list_mtx); + n = pos->next; } + up_read((&elr->lr_super->s_umount)); + } + /* error, remove the lazy_init job */ + if (err) { + ext4_remove_li_request(elr); + continue; + } + if (!progress) { + elr->lr_next_sched = jiffies + + (prandom_u32() + % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); } - if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } -- cgit v0.10.2 From 4e800c0359d9a53e6bf0ab216954971b2515247f Mon Sep 17 00:00:00 2001 From: wangguang Date: Thu, 15 Sep 2016 11:32:46 -0400 Subject: ext4: bugfix for mmaped pages in mpage_release_unused_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pages clear buffers after ext4 delayed block allocation failed, However, it does not clean its pte_dirty flag. if the pages unmap ,in cording to the pte_dirty , unmap_page_range may try to call __set_page_dirty, which may lead to the bugon at mpage_prepare_extent_to_map:head = page_buffers(page);. This patch just call clear_page_dirty_for_io to clean pte_dirty at mpage_release_unused_pages for pages mmaped. Steps to reproduce the bug: (1) mmap a file in ext4 addr = (char *)mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); memset(addr, 'i', 4096); (2) return EIO at ext4_writepages->mpage_map_and_submit_extent->mpage_map_one_extent which causes this log message to be print: ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); (3)Unmap the addr cause warning at __set_page_dirty:WARN_ON_ONCE(warn && !PageUptodate(page)); (4) wait for a minute,then bugon happen. Cc: stable@vger.kernel.org Signed-off-by: wangguang Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f058afb..9b464e5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1649,6 +1649,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { + if (page_mapped(page)) + clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_SIZE); ClearPageUptodate(page); } -- cgit v0.10.2 From edf15aa180d7b98fe16bd3eda42f9dd0e60dee20 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:39:52 -0400 Subject: ext4: fix memory leak in ext4_insert_range() Running xfstests generic/013 with kmemleak gives the following: unreferenced object 0xffff8801d3d27de0 (size 96): comm "fsstress", pid 4941, jiffies 4294860168 (age 53.485s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 01 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x23/0x40 [] __kmalloc+0xf5/0x1d0 [] ext4_find_extent+0x1ec/0x2f0 [] ext4_insert_range+0x34c/0x4a0 [] ext4_fallocate+0x4e2/0x8b0 [] vfs_fallocate+0x134/0x210 [] SyS_fallocate+0x3f/0x60 [] entry_SYSCALL_64_fastpath+0x13/0x8f [] 0xffffffffffffffff Problem seems mitigated by dropping refs and freeing path when there's no path[depth].p_ext Cc: stable@vger.kernel.org Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d7ccb7f..7f69347 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5734,6 +5734,9 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + } else { + ext4_ext_drop_refs(path); + kfree(path); } ret = ext4_es_remove_extent(inode, offset_lblk, -- cgit v0.10.2 From c3fe493ccdb1f443c30155150391835004014c6a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:52:07 -0400 Subject: ext4: remove unneeded test in ext4_alloc_file_blocks() ext4_alloc_file_blocks() is called from ext4_zero_range() and ext4_fallocate() both already testing EXT4_INODE_EXTENTS We can call ext_depth(inode) unconditionnally. [ Added BUG_ON check to make sure ext4_alloc_file_blocks() won't get called for a indirect-mapped inode in the future. -- tytso ] Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7f69347..ef28855 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4679,6 +4679,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, unsigned int credits; loff_t epos; + BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* @@ -4693,13 +4694,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); - /* - * We can only call ext_depth() on extent based inodes - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - depth = ext_depth(inode); - else - depth = -1; + depth = ext_depth(inode); retry: while (ret >= 0 && len) { -- cgit v0.10.2 From 518eaa6387df796d54e065e04502e800cae1df80 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:55:01 -0400 Subject: ext4: create EXT4_MAX_BLOCKS() macro Create a macro to calculate length + offset -> maximum blocks This adds more readability. Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 962a044..282a51b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -262,6 +262,9 @@ struct ext4_io_submit { (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ef28855..c930a01 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4961,13 +4961,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - lblk; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -5030,12 +5025,8 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, unsigned int credits, blkbits = inode->i_blkbits; map.m_lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk); + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 261ac37..34acda7 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -144,8 +144,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int err, len; map.m_lblk = pos >> blkbits; - map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) - - map.m_lblk; + map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); len = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); -- cgit v0.10.2 From be32197cd6b8c37425ca954e719d236e6117a8ee Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:58:47 -0400 Subject: ext4: remove unused definition for MAX_32_NUM MAX_32_NUM isn't used in ext4 Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ef4430b..c64239b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,8 +19,6 @@ #include "ext4_jbd2.h" #include "ext4.h" -#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) - /** * Swap memory between @a and @b for @len bytes. * -- cgit v0.10.2 From f0c9fd5458bacf7b12a9a579a727dc740cbe047e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 15 Sep 2016 12:02:32 -0400 Subject: jbd2: move more common code into journal_init_common() There are some repetitive code in jbd2_journal_init_dev() and jbd2_journal_init_inode(). So this patch moves the common code into journal_init_common() helper to simplify the code. And fix the coding style warnings reported by checkpatch.pl by the way. Signed-off-by: Geliang Tang Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 46261a6..927da49 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1090,11 +1090,15 @@ static void jbd2_stats_proc_exit(journal_t *journal) * very few fields yet: that has to wait until we have created the * journal structures from from scratch, or loaded them from disk. */ -static journal_t * journal_init_common (void) +static journal_t *journal_init_common(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) { static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; + struct buffer_head *bh; + int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) @@ -1131,6 +1135,32 @@ static journal_t * journal_init_common (void) lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", &jbd2_trans_commit_key, 0); + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!journal->j_wbuf) { + kfree(journal); + return NULL; + } + + bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + pr_err("%s: Cannot get buffer for journal superblock\n", + __func__); + kfree(journal->j_wbuf); + kfree(journal); + return NULL; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + return journal; } @@ -1157,51 +1187,21 @@ static journal_t * journal_init_common (void) * range of blocks on an arbitrary block device. * */ -journal_t * jbd2_journal_init_dev(struct block_device *bdev, +journal_t *jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int blocksize) { - journal_t *journal = journal_init_common(); - struct buffer_head *bh; - int n; + journal_t *journal; + journal = journal_init_common(bdev, fs_dev, start, len, blocksize); if (!journal) return NULL; - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; bdevname(journal->j_dev, journal->j_devname); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; -out_err: - kfree(journal->j_wbuf); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; } /** @@ -1212,67 +1212,36 @@ out_err: * the journal. The inode must exist already, must support bmap() and * must have all data blocks preallocated. */ -journal_t * jbd2_journal_init_inode (struct inode *inode) +journal_t *jbd2_journal_init_inode(struct inode *inode) { - struct buffer_head *bh; - journal_t *journal = journal_init_common(); + journal_t *journal; char *p; - int err; - int n; unsigned long long blocknr; + blocknr = bmap(inode, 0); + if (!blocknr) { + pr_err("%s: Cannot locate journal superblock\n", + __func__); + return NULL; + } + + jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, + blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, + inode->i_sb->s_blocksize); if (!journal) return NULL; - journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; journal->j_inode = inode; bdevname(journal->j_dev, journal->j_devname); p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); - jbd_debug(1, - "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", - journal, inode->i_sb->s_id, inode->i_ino, - (long long) inode->i_size, - inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - - journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; - journal->j_blocksize = inode->i_sb->s_blocksize; jbd2_stats_proc_init(journal); - /* journal descriptor can store up to n blocks -bzzz */ - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - err = jbd2_journal_bmap(journal, 0, &blocknr); - /* If that failed, give up */ - if (err) { - printk(KERN_ERR "%s: Cannot locate journal superblock\n", - __func__); - goto out_err; - } - - bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - return journal; -out_err: - kfree(journal->j_wbuf); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; } /* -- cgit v0.10.2 From dcce7a46c6f28f41447272fb44348ead8f584573 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 13:13:13 -0400 Subject: ext4: fix memory leak when symlink decryption fails This bug was introduced in v4.8-rc1. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 4d83d9e..04a7850 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -65,13 +65,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; + paddr = pstr.name; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - paddr = pstr.name; - /* Null-terminate the name */ if (res <= pstr.len) paddr[res] = '\0'; -- cgit v0.10.2 From 8f39850dffa9cba0f6920ff907710bcddc7f2a26 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 13:32:11 -0400 Subject: fscrypto: improved validation when loading inode encryption metadata - Validate fscrypt_context.format and fscrypt_context.flags. If unrecognized values are set, then the kernel may not know how to interpret the encrypted file, so it should fail the operation. - Validate that AES_256_XTS is used for contents and that AES_256_CTS is used for filenames. It was previously possible for the kernel to accept these reversed, though it would have taken manual editing of the block device. This was not intended. - Fail cleanly rather than BUG()-ing if a file has an unexpected type. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263e..b5374ef 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -139,6 +139,38 @@ out: return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +187,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; + int keysize; u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; int res; res = fscrypt_initialize(); @@ -179,13 +211,19 @@ retry: if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +236,11 @@ retry: crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,7 +275,7 @@ got_key: crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index cfa6cde..00813c2 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -111,23 +111,6 @@ struct fscrypt_completion_result { struct fscrypt_completion_result ecr = { \ COMPLETION_INITIALIZER((ecr).completion), 0 } -static inline int fscrypt_key_size(int mode) -{ - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - return FS_AES_256_XTS_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_GCM: - return FS_AES_256_GCM_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CBC: - return FS_AES_256_CBC_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CTS: - return FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - #define FS_FNAME_NUM_SCATTER_ENTRIES 4 #define FS_CRYPTO_BLOCK_SIZE 16 #define FS_FNAME_CRYPTO_DIGEST_SIZE 32 @@ -202,13 +185,6 @@ static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); } -static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) -{ - if (size == fscrypt_key_size(mode)) - return size; - return 0; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') -- cgit v0.10.2 From d83ae730b6f9464e46eaf052ff84d9486c5b4107 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 16:41:09 -0400 Subject: fscrypto: remove unnecessary includes This patch removes some #includes that are clearly not needed, such as a reference to ecryptfs, which is unrelated to the new filesystem encryption code. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c502c11..7c39eab 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include #include #include -#include static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d491..3108806 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,8 +10,6 @@ * This has not yet undergone a rigorous security audit. */ -#include -#include #include #include #include diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index b5374ef..82f0285 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. */ -#include #include -#include #include -#include #include static void derive_crypt_complete(struct crypto_async_request *req, int rc) -- cgit v0.10.2 From 53fd7550ec40571e26f730a0d3fc0a5dd93ecda2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 16:51:01 -0400 Subject: fscrypto: rename completion callbacks to reflect usage fscrypt_complete() was used only for data pages, not for all encryption/decryption. Rename it to page_crypt_complete(). dir_crypt_complete() was used for filename encryption/decryption for both directory entries and symbolic links. Rename it to fname_crypt_complete(). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 7c39eab..61057b7d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -127,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -169,7 +169,7 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); memcpy(xts_tweak, &index, sizeof(index)); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 3108806..6bbc3b1 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -20,9 +20,11 @@ static u32 size_round_up(size_t size, size_t blksize) } /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -82,7 +84,7 @@ static int fname_encrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -144,7 +146,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); -- cgit v0.10.2 From ef1eb3aa50930f026135085cd160b1212cdfe817 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 17:25:55 -0400 Subject: fscrypto: make filename crypto functions return 0 on success Several filename crypto functions: fname_decrypt(), fscrypt_fname_disk_to_usr(), and fscrypt_fname_usr_to_disk(), returned the output length on success or -errno on failure. However, the output length was redundant with the value written to 'oname->len'. It is also potentially error-prone to make callers have to check for '< 0' instead of '!= 0'. Therefore, make these functions return 0 instead of a length, and make the callers who cared about the return value being a length use 'oname->len' instead. For consistency also make other callers check for a nonzero result rather than a negative result. This change also fixes the inconsistency of fname_encrypt() actually already returning 0 on success, not a length like the other filename crypto functions and as documented in its function comment. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Acked-by: Jaegeuk Kim diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6bbc3b1..90697c7 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -35,11 +35,11 @@ static void fname_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) @@ -105,20 +105,22 @@ static int fname_encrypt(struct inode *inode, } kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); + return res; + } oname->len = ciphertext_len; - return res; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -168,7 +170,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -279,6 +281,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +293,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +308,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +320,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +341,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +375,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 67415e0..4d4b910 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -260,11 +260,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, 0, 0, &de_name, &fstr); + de_name = fstr; fstr.len = save_len; - if (err < 0) + if (err) goto errout; if (!dir_emit(ctx, - fstr.name, err, + de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 34c0142..2243ae2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -639,7 +639,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_alloc_buffer( dir, len, &fname_crypto_str); - if (res < 0) + if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " @@ -647,7 +647,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); - if (res < 0) { + if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" @@ -1011,7 +1011,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); - if (err < 0) { + if (err) { count = err; goto errout; } @@ -3144,7 +3144,7 @@ static int ext4_symlink(struct inode *dir, istr.name = (const unsigned char *) symname; istr.len = len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); disk_link.name = (char *) sd; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 04a7850..0a26cbd 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -68,12 +68,11 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, paddr = pstr.name; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* Null-terminate the name */ - if (res <= pstr.len) - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; if (cpage) put_page(cpage); set_delayed_call(done, kfree_link, paddr); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9054aea..8716943 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -786,7 +786,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS); if (!de_name.name) @@ -794,11 +794,11 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, memcpy(de_name.name, d->filename[bit_pos], de_name.len); - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); kfree(de_name.name); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 73fa356..afd5633 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -449,7 +449,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1048,7 +1048,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1060,7 +1060,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); set_delayed_call(done, kfree_link, paddr); -- cgit v0.10.2 From 9e5ab85deb2c80f9707be39cd0a2c5f90c89dd97 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 18:25:07 -0400 Subject: blockgroup_lock.h: remove debris from bgl_lock_ptr() conversion An obsolete comment and extra parentheses were left over from when the sb_bgl_lock() macro was replaced with the bgl_lock_ptr() function. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger diff --git a/include/linux/blockgroup_lock.h b/include/linux/blockgroup_lock.h index e44b88b..61b583d 100644 --- a/include/linux/blockgroup_lock.h +++ b/include/linux/blockgroup_lock.h @@ -49,14 +49,10 @@ static inline void bgl_lock_init(struct blockgroup_lock *bgl) spin_lock_init(&bgl->locks[i].lock); } -/* - * The accessor is a macro so we can embed a blockgroup_lock into different - * superblock types - */ static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { - return &bgl->locks[(block_group) & (NR_BG_LOCKS-1)].lock; + return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif -- cgit v0.10.2 From 7c5f6b320b59cb4a674750bbb29a248b5bae7641 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 18:29:06 -0400 Subject: blockgroup_lock.h: simplify definition of NR_BG_LOCKS We can use ilog2() to more easily produce the desired NR_BG_LOCKS. This works because ilog2() is evaluated at compile-time when its argument is a compile-time constant. I did not change the chosen NR_BG_LOCKS values. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger diff --git a/include/linux/blockgroup_lock.h b/include/linux/blockgroup_lock.h index 61b583d..225bdb7 100644 --- a/include/linux/blockgroup_lock.h +++ b/include/linux/blockgroup_lock.h @@ -10,28 +10,10 @@ #include #ifdef CONFIG_SMP - -/* - * We want a power-of-two. Is there a better way than this? - */ - -#if NR_CPUS >= 32 -#define NR_BG_LOCKS 128 -#elif NR_CPUS >= 16 -#define NR_BG_LOCKS 64 -#elif NR_CPUS >= 8 -#define NR_BG_LOCKS 32 -#elif NR_CPUS >= 4 -#define NR_BG_LOCKS 16 -#elif NR_CPUS >= 2 -#define NR_BG_LOCKS 8 +#define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else -#define NR_BG_LOCKS 4 -#endif - -#else /* CONFIG_SMP */ #define NR_BG_LOCKS 1 -#endif /* CONFIG_SMP */ +#endif struct bgl_lock { spinlock_t lock; -- cgit v0.10.2 From e03a9976afce6634826d56c33531dd10bb9a9166 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Sep 2016 11:44:06 -0400 Subject: jbd2: fix lockdep annotation in add_transaction_credits() Thomas has reported a lockdep splat hitting in add_transaction_credits(). The problem is that that function calls jbd2_might_wait_for_commit() while holding j_state_lock which is wrong (we do not really wait for transaction commit while holding that lock). Fix the problem by moving jbd2_might_wait_for_commit() into places where we are ready to wait for transaction commit and thus j_state_lock is unlocked. Cc: stable@vger.kernel.org Fixes: 1eaa566d368b214d99cbb973647c1b0b8102a9ae Reported-by: Thomas Gleixner Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b5bc3e2..3d8246a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -159,6 +159,7 @@ static void wait_transaction_locked(journal_t *journal) read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); + jbd2_might_wait_for_commit(journal); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } @@ -182,8 +183,6 @@ static int add_transaction_credits(journal_t *journal, int blocks, int needed; int total = blocks + rsv_blocks; - jbd2_might_wait_for_commit(journal); - /* * If the current transaction is locked down for commit, wait * for the lock to be released. @@ -214,6 +213,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (atomic_read(&journal->j_reserved_credits) + total > journal->j_max_transaction_buffers) { read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); @@ -238,6 +238,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) __jbd2_log_wait_for_space(journal); @@ -255,6 +256,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, sub_reserved_credits(journal, rsv_blocks); atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); -- cgit v0.10.2 From cca32b7eeb4ea24fa6596650e06279ad9130af98 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 22 Sep 2016 11:49:38 -0400 Subject: ext4: allow DAX writeback for hole punch Currently when doing a DAX hole punch with ext4 we fail to do a writeback. This is because the logic around filemap_write_and_wait_range() in ext4_punch_hole() only looks for dirty page cache pages in the radix tree, not for dirty DAX exceptional entries. Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b464e5..3988315 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3892,7 +3892,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, } /* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode @@ -3921,7 +3921,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Write out all dirty pages to avoid race conditions * Then release them. */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) -- cgit v0.10.2 From 16c54688592ce8eea85d2a26d37b64fa07e3e233 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 01:03:17 -0400 Subject: ext4: Allow parallel DIO reads We can easily support parallel direct IO reads. We only have to make sure we cannot expose uninitialized data by reading allocated block to which data was not written yet, or which was already truncated. That is easily achieved by holding inode_lock in shared mode - that excludes all writes, truncates, hole punches. We also have to guard against page writeback allocating blocks for delay-allocated pages - that race is handled by the fact that we writeback all the pages in the affected range and the lock protects us from new pages being created there. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3988315..d8a4afc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3528,35 +3528,31 @@ out: static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { - int unlocked = 0; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; ssize_t ret; - if (ext4_should_dioread_nolock(inode)) { - /* - * Nolock dioread optimization may be dynamically disabled - * via ext4_inode_block_unlocked_dio(). Check inode's state - * while holding extra i_dio_count ref. - */ - inode_dio_begin(inode); - smp_mb(); - if (unlikely(ext4_test_inode_state(inode, - EXT4_STATE_DIOREAD_LOCK))) - inode_dio_end(inode); - else - unlocked = 1; - } + /* + * Shared inode_lock is enough for us - it protects against concurrent + * writes & truncates and since we take care of writing back page cache, + * we are protected against page writeback as well. + */ + inode_lock_shared(inode); if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, - NULL, unlocked ? 0 : DIO_LOCKING); + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); } else { + size_t count = iov_iter_count(iter); + + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, ext4_dio_get_block, - NULL, NULL, - unlocked ? 0 : DIO_LOCKING); + NULL, NULL, 0); } - if (unlocked) - inode_dio_end(inode); +out_unlock: + inode_unlock_shared(inode); return ret; } -- cgit v0.10.2 From e81d44778d1d57bbaef9e24c4eac7c8a7a401d40 Mon Sep 17 00:00:00 2001 From: gmail Date: Fri, 30 Sep 2016 01:33:37 -0400 Subject: ext4: release bh in make_indexed_dir The commit 6050d47adcad: "ext4: bail out from make_indexed_dir() on first error" could end up leaking bh2 in the error path. [ Also avoid renaming bh2 to bh, which just confuses things --tytso ] Cc: stable@vger.kernel.org Signed-off-by: yangsheng Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2243ae2..c344b81 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2044,33 +2044,31 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, frame->entries = entries; frame->at = entries; frame->bh = bh; - bh = bh2; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; - retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + retval = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &fname->hinfo); + de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } - dx_release(frames); - retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); - brelse(bh); - return retval; + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ - ext4_mark_inode_dirty(handle, dir); + if (retval) + ext4_mark_inode_dirty(handle, dir); dx_release(frames); + brelse(bh2); return retval; } -- cgit v0.10.2 From cc91542ac8f29d8446ef59234796b95f0fbce09a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 30 Sep 2016 01:44:17 -0400 Subject: ext4: do not unnecessarily null-terminate encrypted symlink data Null-terminating the fscrypt_symlink_data on read is unnecessary because it is not string data --- it contains binary ciphertext. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 0a26cbd..fdf1c61 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -30,7 +30,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, char *caddr, *paddr = NULL; struct fscrypt_str cstr, pstr; struct fscrypt_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 max_size = inode->i_sb->s_blocksize; @@ -49,7 +48,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, if (IS_ERR(cpage)) return ERR_CAST(cpage); caddr = page_address(cpage); - caddr[size] = 0; } /* Symlink is encrypted */ -- cgit v0.10.2 From 55be3145d174c04c44f1996eca8eb3a4476b1063 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 30 Sep 2016 01:46:18 -0400 Subject: fscrypto: use standard macros to compute length of fname ciphertext Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 90697c7..9a28133 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,11 +14,6 @@ #include #include -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** * fname_crypt_complete() - completion callback for filename crypto * @req: The asynchronous cipher request context @@ -60,10 +55,9 @@ static int fname_encrypt(struct inode *inode, if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; + ciphertext_len = max(iname->len, (u32)FS_CRYPTO_BLOCK_SIZE); + ciphertext_len = round_up(ciphertext_len, padding); + ciphertext_len = min(ciphertext_len, lim); if (ciphertext_len <= sizeof(buf)) { workbuf = buf; @@ -233,9 +227,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); -- cgit v0.10.2 From 9a200d075e5d05be1fcad4547a0f8aee4e2f9a04 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 30 Sep 2016 01:49:55 -0400 Subject: ext4: require encryption feature for EXT4_IOC_SET_ENCRYPTION_POLICY ...otherwise an user can enable encryption for certain files even when the filesystem is unable to support it. Such a case would be a filesystem created by mkfs.ext4's default settings, 1KiB block size. Ext4 supports encyption only when block size is equal to PAGE_SIZE. But this constraint is only checked when the encryption feature flag is set. Signed-off-by: Richard Weinberger Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c64239b..22abf7a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -769,6 +769,9 @@ resizefs_out: #ifdef CONFIG_EXT4_FS_ENCRYPTION struct fscrypt_policy policy; + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) -- cgit v0.10.2 From 4b0524aae0082272737c97d2b160d55d6e8f0b2b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 01:55:32 -0400 Subject: ext4: allow unlocked direct IO when pages are cached Currently we do not allow unlocked (meaning without inode_lock) direct IO when the file has any pages cached. This check is not needed anymore as we keep inode lock until ext4_direct_IO_write() and thus can happily writeback and evict any pages conflicting with current direct IO write. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 34acda7..25342c8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -91,7 +91,6 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; @@ -138,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, len; -- cgit v0.10.2 From 51e8137b82622d8ea22e993d613db568f11c1523 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 01:57:41 -0400 Subject: ext4: remove plugging from ext4_file_write_iter() do_blockdev_direct_IO() takes care of properly plugging direct IO so there's no need to plug again inside ext4_file_write_iter(). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 25342c8..25f763f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -92,7 +92,6 @@ static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; int unaligned_aio = 0; int overwrite = 0; @@ -133,7 +132,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) { size_t length = iov_iter_count(from); loff_t pos = iocb->ki_pos; - blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !unaligned_aio && @@ -169,8 +167,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) ret = generic_write_sync(iocb, ret); - if (o_direct) - blk_finish_plug(&plug); return ret; -- cgit v0.10.2 From 9b623df614576680cadeaa4d7e0b5884de8f7c17 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 02:02:29 -0400 Subject: ext4: unmap metadata when zeroing blocks When zeroing blocks for DAX allocations, we also have to unmap aliases in the block device mappings. Otherwise writeback can overwrite zeros with stale data from block device page cache. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8a4afc..cd91882 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -647,11 +647,19 @@ found: /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + ext4_lblk_t i; + + for (i = 0; i < map->m_len; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + map->m_pblk + i); + } ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { -- cgit v0.10.2 From c6cb7e776ad59feba5d84b2524efd6dcbc618e42 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Fri, 30 Sep 2016 02:05:09 -0400 Subject: ext4: create function to read journal inode Factor out the code used in ext4_get_journal() to read a valid journal inode from storage, enabling its reuse in other functions. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 50912cc..056ca1b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -78,6 +78,8 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum); /* * Lock ordering @@ -4237,18 +4239,16 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) write_unlock(&journal->j_state_lock); } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; - journal_t *journal; - - BUG_ON(!ext4_has_feature_journal(sb)); - - /* First, test for the existence of a valid inode on disk. Bad - * things happen if we iget() an unused inode, as the subsequent - * iput() will try to delete it. */ + /* + * Test for the existence of a valid inode on disk. Bad things + * happen if we iget() an unused inode, as the subsequent iput() + * will try to delete it. + */ journal_inode = ext4_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); @@ -4268,6 +4268,20 @@ static journal_t *ext4_get_journal(struct super_block *sb, iput(journal_inode); return NULL; } + return journal_inode; +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!ext4_has_feature_journal(sb)); + + journal_inode = ext4_get_journal_inode(sb, journal_inum); + if (!journal_inode) + return NULL; journal = jbd2_journal_init_inode(journal_inode); if (!journal) { -- cgit v0.10.2 From 3c816ded78bbd645d61ee36f461deb8376d3723a Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Fri, 30 Sep 2016 02:08:49 -0400 Subject: ext4: use journal inode to determine journal overhead When a file system contains an internal journal that has not been loaded, use the journal inode's i_size field to determine its contribution to the file system's overhead. (The journal's j_maxlen field is normally used to determine its size, but it's unavailable when the journal has not been loaded.) Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 056ca1b..6db81fb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3210,6 +3210,8 @@ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + struct inode *j_inode; + unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); @@ -3240,10 +3242,23 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } - /* Add the internal journal blocks as well */ + + /* + * Add the internal journal blocks whether the journal has been + * loaded or not + */ if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); - + else if (ext4_has_feature_journal(sb) && !sbi->s_journal) { + j_inode = ext4_get_journal_inode(sb, j_inum); + if (j_inode) { + j_blocks = j_inode->i_size >> sb->s_blocksize_bits; + overhead += EXT4_NUM_B2C(sbi, j_blocks); + iput(j_inode); + } else { + ext4_msg(sb, KERN_ERR, "can't get journal size"); + } + } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); -- cgit v0.10.2 From 18017479cabaeb5c659d789f04ecf7939f8ee28f Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Fri, 30 Sep 2016 02:14:56 -0400 Subject: ext4: remove unused variable Signed-off-by: Eric Engestrom diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 4d4b910..e8b3650 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -628,7 +628,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; - int nlen, rlen; + int rlen; unsigned int offset = 0; char *top; @@ -638,7 +638,6 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; - nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a6132a7..b4cbee9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -405,14 +405,12 @@ int ext4_bio_write_page(struct ext4_io_submit *io, { struct page *data_page = NULL; struct inode *inode = page->mapping->host; - unsigned block_start, blocksize; + unsigned block_start; struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; int nr_to_submit = 0; - blocksize = 1 << inode->i_blkbits; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); -- cgit v0.10.2