From 5b643f9ce34df945e58c7176275d406aa0db704f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 18 May 2015 13:14:47 -0400 Subject: ext4 crypto: optimize filename encryption Encrypt the filename as soon as it is passed in by the user. This avoids our needing to encrypt the filename 2 or 3 times while in the process of creating a filename. Similarly, when looking up a directory entry, encrypt the filename early, or if the encryption key is not available, base-64 decode the user-supplied name so that the hash value and the last 16 bytes of the encrypted filename are available in the new struct ext4_filename data structure. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index fded02f..ad5e328 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -611,109 +611,82 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return -EACCES; } -/* - * Calculate the htree hash from a filename from user space - */ -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo) +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) { - struct ext4_str tmp; - int ret = 0; - char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; + struct ext4_fname_crypto_ctx *ctx; + int ret = 0, bigname = 0; + + memset(fname, 0, sizeof(struct ext4_filename)); + fname->usr_fname = iname; - if (!ctx || + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + if ((ctx == NULL) || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { - ext4fs_dirhash(iname->name, iname->len, hinfo); - return 0; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + goto out; } - - if (!ctx->has_valid_key && iname->name[0] == '_') { - if (iname->len != 33) - return -ENOENT; - ret = digest_decode(iname->name+1, iname->len, 
buf); - if (ret != 24) - return -ENOENT; - memcpy(&hinfo->hash, buf, 4); - memcpy(&hinfo->minor_hash, buf + 4, 4); - return 0; + if (ctx->has_valid_key) { + ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, + &fname->crypto_buf); + if (ret < 0) + goto out; + ret = ext4_fname_encrypt(ctx, iname, &fname->crypto_buf); + if (ret < 0) + goto out; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + ret = 0; + goto out; } - - if (!ctx->has_valid_key && iname->name[0] != '_') { - if (iname->len > 43) - return -ENOENT; - ret = digest_decode(iname->name, iname->len, buf); - ext4fs_dirhash(buf, ret, hinfo); - return 0; + if (!lookup) { + ret = -EACCES; + goto out; } - /* First encrypt the plaintext name */ - ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); - if (ret < 0) - return ret; - - ret = ext4_fname_encrypt(ctx, iname, &tmp); - if (ret >= 0) { - ext4fs_dirhash(tmp.name, tmp.len, hinfo); - ret = 0; + /* We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if (iname->name[0] == '_') + bigname = 1; + if ((bigname && (iname->len != 33)) || + (!bigname && (iname->len > 43))) { + ret = -ENOENT; } - - ext4_fname_crypto_free_buffer(&tmp); + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + if (fname->crypto_buf.name == NULL) { + ret = -ENOMEM; + goto out; + } + ret = digest_decode(iname->name + bigname, iname->len - bigname, + fname->crypto_buf.name); + if (ret < 0) { + ret = -ENOENT; + goto out; + } + fname->crypto_buf.len = ret; + if (bigname) { + memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4); + memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4); + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + ret = 0; +out: + ext4_put_fname_crypto_ctx(&ctx); return ret; } -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct 
ext4_dir_entry_2 *de) +void ext4_fname_free_filename(struct ext4_filename *fname) { - int ret = -ENOENT; - int bigname = (*name == '_'); - - if (ctx->has_valid_key) { - if (cstr->name == NULL) { - struct qstr istr; - - ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); - if (ret < 0) - goto errout; - istr.name = name; - istr.len = len; - ret = ext4_fname_encrypt(ctx, &istr, cstr); - if (ret < 0) - goto errout; - } - } else { - if (cstr->name == NULL) { - cstr->name = kmalloc(32, GFP_KERNEL); - if (cstr->name == NULL) - return -ENOMEM; - if ((bigname && (len != 33)) || - (!bigname && (len > 43))) - goto errout; - ret = digest_decode(name+bigname, len-bigname, - cstr->name); - if (ret < 0) { - ret = -ENOENT; - goto errout; - } - cstr->len = ret; - } - if (bigname) { - if (de->name_len < 16) - return 0; - ret = memcmp(de->name + de->name_len - 16, - cstr->name + 8, 16); - return (ret == 0) ? 1 : 0; - } - } - if (de->name_len != cstr->len) - return 0; - ret = memcmp(de->name, cstr->name, cstr->len); - return (ret == 0) ? 
1 : 0; -errout: - kfree(cstr->name); - cstr->name = NULL; - return ret; + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9a83f14..866831e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1838,6 +1838,17 @@ struct dx_hash_info */ #define HASH_NB_ALWAYS 1 +struct ext4_filename { + const struct qstr *usr_fname; + struct ext4_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_str crypto_buf; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory @@ -2098,21 +2109,16 @@ int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct ext4_str *oname); -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo); int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen); -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct ext4_dir_entry_2 *de); - - #ifdef CONFIG_EXT4_FS_ENCRYPTION void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, u32 max_len); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); +void ext4_fname_free_filename(struct ext4_filename *fname); #else static inline void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } @@ -2123,6 +2129,16 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, return NULL; } static inline void ext4_fname_crypto_free_buffer(struct ext4_str 
*p) { } +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + return 0; +} +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #endif @@ -2156,14 +2172,13 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const struct qstr *iname, - const char *name, int namelen); + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, @@ -2317,13 +2332,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); -extern int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, @@ -2768,7 +2784,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, extern int ext4_da_write_inline_data_end(struct inode *inode, 
loff_t pos, unsigned len, unsigned copied, struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, @@ -2782,6 +2800,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 095c7a2..cd944a7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,20 +995,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_filename *fname, struct dentry *dentry, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, - inline_start, inline_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + inline_size, fname, &de); if (err) return err; @@ -1016,8 +1014,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name, - name, namelen); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1248,8 +1245,8 @@ out: * If succeeds, return 0. 
If not, extended the inline dir and copied data to * the new created block. */ -int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { int ret, inline_size; void *inline_start; @@ -1268,7 +1265,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1289,8 +1286,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); + ret = ext4_add_dirent_to_inline(handle, fname, dentry, + inode, &iloc, inline_start, + inline_size); if (ret != -ENOSPC) goto out; @@ -1611,6 +1609,7 @@ out: } struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) @@ -1632,8 +1631,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1645,8 +1644,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - ret = 
search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 814f3be..56c60cb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -248,7 +248,7 @@ static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry *entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(const struct qstr *d_name, +static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); @@ -267,10 +267,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, - const struct qstr *d_name, + struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -724,7 +724,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. 
*/ static struct dx_frame * -dx_probe(const struct qstr *d_name, struct inode *dir, +dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; @@ -746,32 +746,14 @@ dx_probe(const struct qstr *d_name, struct inode *dir, root->info.hash_version); goto fail; } + if (fname) + hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (d_name) { - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; - - /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - ret_err = ERR_PTR(PTR_ERR(ctx)); - goto fail; - } - res = ext4_fname_usr_to_hash(ctx, d_name, hinfo); - if (res < 0) { - ret_err = ERR_PTR(res); - goto fail; - } - ext4_put_fname_crypto_ctx(&ctx); - } -#else - if (d_name) - ext4fs_dirhash(d_name->name, d_name->len, hinfo); -#endif + if (fname && fname_name(fname)) + ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -1155,12 +1137,13 @@ errout: static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { - return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - d_name, offset, res_dir); + return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + fname, d_name, offset, res_dir); } /* @@ -1242,54 +1225,54 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * `len <= EXT4_NAME_LEN' is guaranteed by caller. * `de != NULL' is guaranteed by caller. 
*/ -static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, - struct ext4_str *fname_crypto_str, - int len, const char * const name, +static inline int ext4_match(struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { - int res; + const void *name = fname_name(fname); + u32 len = fname_len(fname); if (!de->inode) return 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) - return ext4_fname_match(ctx, fname_crypto_str, len, name, de); + if (unlikely(!name)) { + if (fname->usr_fname->name[0] == '_') { + int ret; + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + fname->crypto_buf.name + 8, 16); + return (ret == 0) ? 1 : 0; + } + name = fname->crypto_buf.name; + len = fname->crypto_buf.len; + } #endif - if (len != de->name_len) + if (de->name_len != len) return 0; - res = memcmp(name, de->name, len); - return (res == 0) ? 1 : 0; + return (memcmp(de->name, name, len) == 0) ? 1 : 0; } /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, - struct inode *dir, const struct qstr *d_name, - unsigned int offset, struct ext4_dir_entry_2 **res_dir) +int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = d_name->name; - int namelen = d_name->len; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(ctx, &fname_crypto_str, 
namelen, - name, de); + res = ext4_match(fname, de); if (res < 0) { res = -1; goto return_result; @@ -1322,8 +1305,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, res = 0; return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1370,7 +1351,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, namelen; + int i, namelen, retval; + struct ext4_filename fname; *res_dir = NULL; sb = dir->i_sb; @@ -1378,14 +1360,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; + retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval) + return ERR_PTR(retval); + if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) *inlined = 1; - return ret; + goto cleanup_and_exit; } } @@ -1400,14 +1386,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir); + ret = ext4_dx_find_entry(dir, &fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. 
*/ - if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) - return bh; + if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); } @@ -1438,8 +1424,10 @@ restart: num++; bh = ext4_getblk(NULL, dir, b++, 0); if (unlikely(IS_ERR(bh))) { - if (ra_max == 0) - return bh; + if (ra_max == 0) { + ret = bh; + goto cleanup_and_exit; + } break; } bh_use[ra_max] = bh; @@ -1469,7 +1457,7 @@ restart: goto next; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, d_name, + i = search_dirblock(bh, dir, &fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1500,15 +1488,17 @@ cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); + ext4_fname_free_filename(&fname); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir) +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_hash_info hinfo; struct dx_frame frames[2], *frame; + const struct qstr *d_name = fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1516,7 +1506,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q #ifdef CONFIG_EXT4_FS_ENCRYPTION *res_dir = NULL; #endif - frame = dx_probe(d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; do { @@ -1525,7 +1515,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, d_name, + retval = search_dirblock(bh, dir, fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1537,7 +1527,7 @@ 
static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q } /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hinfo.hash, frame, + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning(sb, @@ -1796,32 +1786,16 @@ journal_error: int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); int nlen, rlen; unsigned int offset = 0; char *top; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - - if (ctx != NULL) { - /* Calculate record length needed to store the entry */ - res = ext4_fname_crypto_namelen_on_disk(ctx, namelen); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return res; - } - reclen = EXT4_DIR_REC_LEN(res); - } - de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { @@ -1831,7 +1805,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, goto return_result; } /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(ctx, &fname_crypto_str, namelen, name, de); + res = ext4_match(fname, de); if (res < 0) goto return_result; if (res > 0) { @@ -1853,8 +1827,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, res = 0; } return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1862,39 +1834,10 @@ int ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, - const struct qstr *iname, - const char *name, 
int namelen) + struct ext4_filename *fname) { int nlen, rlen; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_str tmp_str; - int res; - - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -EIO; - /* By default, the input name would be written to the disk */ - tmp_str.name = (unsigned char *)name; - tmp_str.len = namelen; - if (ctx != NULL) { - /* Directory is encrypted */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -ENOMEM; - } - res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); - return res; - } - tmp_str.name = fname_crypto_str.name; - tmp_str.len = fname_crypto_str.len; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); @@ -1908,11 +1851,8 @@ int ext4_insert_dentry(struct inode *dir, de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = tmp_str.len; - - memcpy(de->name, tmp_str.name, tmp_str.len); - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); return 0; } @@ -1924,13 +1864,11 @@ int ext4_insert_dentry(struct inode *dir, * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. 
*/ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, +static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; @@ -1939,9 +1877,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, - bh, bh->b_data, blocksize - csum_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + blocksize - csum_size, fname, &de); if (err) return err; } @@ -1954,8 +1891,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling. Due to crypto operations, * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name, - name, namelen); + err = ext4_insert_dentry(dir, inode, de, blocksize, fname); if (err < 0) return err; @@ -1985,17 +1921,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. 
*/ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, +static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { struct inode *dir = d_inode(dentry->d_parent); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; -#else - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; -#endif struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2006,17 +1936,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, unsigned len; int retval; unsigned blocksize; - struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); -#endif - if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -2078,22 +2001,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_mark_inode_dirty(handle, dir); - brelse(bh); - return res; - } - ext4_put_fname_crypto_ctx(&ctx); -#else - ext4fs_dirhash(name, namelen, &hinfo); -#endif + fname->hinfo.hash_version = root->info.hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo); + 
memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; @@ -2108,14 +2021,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &hinfo); + de = do_split(handle,dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } dx_release(frames); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); brelse(bh); return retval; out_frames: @@ -2147,6 +2060,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; struct super_block *sb; + struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; @@ -2161,10 +2075,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!dentry->d_name.len) return -EINVAL; + retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); + if (retval) + return retval; + if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, + dentry, inode); if (retval < 0) - return retval; + goto out; if (retval == 1) { retval = 0; goto out; @@ -2172,7 +2091,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2182,24 +2101,31 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) - return PTR_ERR(bh); - - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + 
goto out; + } + retval = add_dirent_to_buf(handle, &fname, dir, inode, + NULL, bh); if (retval != -ENOSPC) goto out; if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { - retval = make_indexed_dir(handle, dentry, inode, bh); + retval = make_indexed_dir(handle, &fname, dentry, + inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; } brelse(bh); } bh = ext4_append(handle, dir, &block); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -2209,8 +2135,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, initialize_dirent_tail(t, blocksize); } - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); out: + ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); @@ -2220,19 +2147,18 @@ out: /* * Returns 0 for success, or a negative error value */ -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; - struct dx_hash_info hinfo; struct buffer_head *bh; struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; @@ -2249,7 +2175,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + err = add_dirent_to_buf(handle, 
fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; @@ -2345,12 +2271,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo); + de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } - err = add_dirent_to_buf(handle, dentry, inode, de, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: -- cgit v0.10.2 From d229959072eba40e1c2a4f53f8af17f1e770eb66 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 18 May 2015 13:15:47 -0400 Subject: ext4 crypto: don't allocate a page when encrypting/decrypting file names Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index ad5e328..23d7f1d 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -65,9 +65,9 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, struct crypto_ablkcipher *tfm = ctx->ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - struct scatterlist sg[1]; + struct scatterlist src_sg, dst_sg; int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - char *workbuf; + char *workbuf, buf[32], *alloc_buf = NULL; if (iname->len <= 0 || iname->len > ctx->lim) return -EIO; @@ -78,20 +78,27 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, ciphertext_len = (ciphertext_len > ctx->lim) ? 
ctx->lim : ciphertext_len; + if (ciphertext_len <= sizeof(buf)) { + workbuf = buf; + } else { + alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); + if (!alloc_buf) + return -ENOMEM; + workbuf = alloc_buf; + } + /* Allocate request */ req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + kfree(alloc_buf); return -ENOMEM; } ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = kmap(ctx->workpage); - /* Copy the input */ memcpy(workbuf, iname->name, iname->len); if (iname->len < ciphertext_len) @@ -101,21 +108,16 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); + sg_init_one(&src_sg, workbuf, ciphertext_len); + sg_init_one(&dst_sg, oname->name, ciphertext_len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, ciphertext_len); - res = ciphertext_len; - } - kunmap(ctx->workpage); + kfree(alloc_buf); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -139,11 +141,10 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, struct ext4_str tmp_in[2], tmp_out[1]; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist sg[1]; + struct scatterlist src_sg, dst_sg; struct crypto_ablkcipher *tfm = ctx->ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - char *workbuf; if (iname->len <= 0 || iname->len > ctx->lim) return -EIO; 
@@ -163,31 +164,19 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = kmap(ctx->workpage); - - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - /* Initialize IV */ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, iname->len); - res = iname->len; - } - kunmap(ctx->workpage); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -267,8 +256,6 @@ void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) crypto_free_ablkcipher(ctx->ctfm); if (ctx->htfm && !IS_ERR(ctx->htfm)) crypto_free_hash(ctx->htfm); - if (ctx->workpage && !IS_ERR(ctx->workpage)) - __free_page(ctx->workpage); kfree(ctx); } @@ -322,7 +309,6 @@ struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( ctx->ctfm_key_is_ready = 0; ctx->ctfm = NULL; ctx->htfm = NULL; - ctx->workpage = NULL; return ctx; } @@ -390,24 +376,6 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( ext4_put_fname_crypto_ctx(&ctx); return ERR_PTR(-ENOMEM); } - if (ctx->workpage == NULL) - ctx->workpage = alloc_page(GFP_NOFS); - if (IS_ERR(ctx->workpage)) { - res = PTR_ERR(ctx->workpage); - printk( - KERN_DEBUG "%s: error (%d) allocating work page\n", - __func__, res); - ctx->workpage = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->workpage == 
NULL) { - printk( - KERN_DEBUG "%s: could not allocate work page\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } ctx->lim = max_ciphertext_len; crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5665d82..d799d5d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -247,9 +247,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) get_dtype(sb, de->file_type))) goto done; } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ err = ext4_fname_disk_to_usr(enc_ctx, NULL, de, &fname_crypto_str); + fname_crypto_str.len = save_len; if (err < 0) goto errout; if (!dir_emit(ctx, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index d75159c..552424a 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -123,10 +123,8 @@ struct ext4_str { struct ext4_fname_crypto_ctx { u32 lim; - char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE]; struct crypto_ablkcipher *ctfm; struct crypto_hash *htfm; - struct page *workpage; struct ext4_encryption_key key; unsigned flags : 8; unsigned has_valid_key : 1; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 56c60cb..b340643 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -998,6 +998,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ err = ext4_fname_disk_to_usr(ctx, hinfo, de, &fname_crypto_str); @@ -1008,6 +1010,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); + fname_crypto_str.len = save_len; } if (err != 0) { count = err; @@ -3126,6 +3129,7 @@ static int ext4_symlink(struct inode *dir, istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; err = 
ext4_fname_usr_to_disk(ctx, &istr, &ostr); ext4_put_fname_crypto_ctx(&ctx); if (err < 0) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 187b789..ca65d45 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -74,6 +74,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) goto errout; } pstr.name = paddr; + pstr.len = plen; res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); if (res < 0) goto errout; -- cgit v0.10.2 From e2881b1b51d871a72911faf2fc7e090655940506 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 18 May 2015 13:16:47 -0400 Subject: ext4 crypto: separate kernel and userspace structure for the key Use struct ext4_encryption_key only for the master key passed via the kernel keyring. For internal kernel space users, we now use struct ext4_crypt_info. This will allow us to put information from the policy structure into it so we can cache it and avoid needing to constantly look up the extended attribute. We will do this in a separate patch. This patch is mostly mechanical to make it easier for patch review. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 8ff1527..918200e 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -118,7 +118,7 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypto_ctx *ctx = NULL; int res = 0; unsigned long flags; - struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key; + struct ext4_crypt_info *ci = &EXT4_I(inode)->i_crypt_info; if (!ext4_read_workqueue) ext4_init_crypto(); @@ -152,14 +152,14 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) /* Allocate a new Crypto API context if we don't already have * one or if it isn't the right mode. 
*/ - BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID); - if (ctx->tfm && (ctx->mode != key->mode)) { + BUG_ON(ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID); + if (ctx->tfm && (ctx->mode != ci->ci_mode)) { crypto_free_tfm(ctx->tfm); ctx->tfm = NULL; ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; } if (!ctx->tfm) { - switch (key->mode) { + switch (ci->ci_mode) { case EXT4_ENCRYPTION_MODE_AES_256_XTS: ctx->tfm = crypto_ablkcipher_tfm( crypto_alloc_ablkcipher("xts(aes)", 0, 0)); @@ -177,9 +177,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) ctx->tfm = NULL; goto out; } - ctx->mode = key->mode; + ctx->mode = ci->ci_mode; } - BUG_ON(key->size != ext4_encryption_key_size(key->mode)); + BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_mode)); /* There shouldn't be a bounce page attached to the crypto * context at this point. */ @@ -322,7 +322,7 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, int res = 0; BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_encryption_key.mode); + BUG_ON(ctx->mode != ei->i_crypt_info.ci_mode); if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { printk_ratelimited(KERN_ERR @@ -334,8 +334,8 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, crypto_ablkcipher_clear_flags(atfm, ~0); crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw, - ei->i_encryption_key.size); + res = crypto_ablkcipher_setkey(atfm, ei->i_crypt_info.ci_raw, + ei->i_crypt_info.ci_size); if (res) { printk_ratelimited(KERN_ERR "%s: crypto_ablkcipher_setkey() failed\n", diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 23d7f1d..d9f08dd 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -278,33 +278,24 @@ void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) } /** - * ext4_search_fname_crypto_ctx() - - */ -static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx( - const struct ext4_encryption_key *key) -{ - return 
NULL; -} - -/** * ext4_alloc_fname_crypto_ctx() - */ struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( - const struct ext4_encryption_key *key) + const struct ext4_crypt_info *ci) { struct ext4_fname_crypto_ctx *ctx; ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); if (ctx == NULL) return ERR_PTR(-ENOMEM); - if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) { + if (ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID) { /* This will automatically set key mode to invalid * As enum for ENCRYPTION_MODE_INVALID is zero */ - memset(&ctx->key, 0, sizeof(ctx->key)); + memset(&ctx->ci, 0, sizeof(ctx->ci)); } else { - memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key)); + memcpy(&ctx->ci, ci, sizeof(struct ext4_crypt_info)); } - ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode) + ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == ci->ci_mode) ? 0 : 1; ctx->ctfm_key_is_ready = 0; ctx->ctfm = NULL; @@ -335,21 +326,17 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( if (!ext4_has_encryption_key(inode)) ext4_generate_encryption_key(inode); - /* Get a crypto context based on the key. - * A new context is allocated if no context matches the requested key. - */ - ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key)); - if (ctx == NULL) - ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key)); + /* Get a crypto context based on the key. 
*/ + ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_crypt_info)); if (IS_ERR(ctx)) return ctx; ctx->flags = ei->i_crypt_policy_flags; if (ctx->has_valid_key) { - if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + if (ctx->ci.ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", - ctx->key.mode); + ctx->ci.ci_mode); return ERR_PTR(-ENOKEY); } @@ -389,7 +376,7 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( * are pretty weak, * we directly use the inode master key */ res = crypto_ablkcipher_setkey(ctx->ctfm, - ctx->key.raw, ctx->key.size); + ctx->ci.ci_raw, ctx->ci.ci_size); if (res) { ext4_put_fname_crypto_ctx(&ctx); return ERR_PTR(-EIO); diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 52170d0..ec6635d 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -91,7 +91,7 @@ out: int ext4_generate_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; struct key *keyring_key = NULL; @@ -112,17 +112,17 @@ int ext4_generate_encryption_key(struct inode *inode) ei->i_crypt_policy_flags = ctx.flags; if (S_ISREG(inode->i_mode)) - crypt_key->mode = ctx.contents_encryption_mode; + crypt_info->ci_mode = ctx.contents_encryption_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_key->mode = ctx.filenames_encryption_mode; + crypt_info->ci_mode = ctx.filenames_encryption_mode; else { printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); BUG(); } - crypt_key->size = ext4_encryption_key_size(crypt_key->mode); - BUG_ON(!crypt_key->size); + crypt_info->ci_size = ext4_encryption_key_size(crypt_info->ci_mode); + BUG_ON(!crypt_info->ci_size); if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); 
+ memset(crypt_info->ci_raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); goto out; } memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, @@ -148,19 +148,20 @@ int ext4_generate_encryption_key(struct inode *inode) BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, + crypt_info->ci_raw); out: if (keyring_key) key_put(keyring_key); if (res < 0) - crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID; + crypt_info->ci_mode = EXT4_ENCRYPTION_MODE_INVALID; return res; } int ext4_has_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; - return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID); + return (crypt_info->ci_mode != EXT4_ENCRYPTION_MODE_INVALID); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 866831e..3cf3bcb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -955,7 +955,7 @@ struct ext4_inode_info { #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Encryption params */ - struct ext4_encryption_key i_encryption_key; + struct ext4_crypt_info i_crypt_info; #endif }; diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 552424a..deecbe8 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -66,10 +66,17 @@ struct ext4_encryption_context { #define EXT4_KEY_DESC_PREFIX "ext4:" #define EXT4_KEY_DESC_PREFIX_SIZE 5 +/* This is passed in from userspace into the kernel keyring */ struct ext4_encryption_key { - uint32_t mode; - char raw[EXT4_MAX_KEY_SIZE]; - uint32_t size; + __u32 mode; + char raw[EXT4_MAX_KEY_SIZE]; + __u32 size; +} __attribute__((__packed__)); + +struct ext4_crypt_info { + unsigned char ci_mode; + unsigned char ci_size; + char ci_raw[EXT4_MAX_KEY_SIZE]; }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 
0x00000001 @@ -125,7 +132,7 @@ struct ext4_fname_crypto_ctx { u32 lim; struct crypto_ablkcipher *ctfm; struct crypto_hash *htfm; - struct ext4_encryption_key key; + struct ext4_crypt_info ci; unsigned flags : 8; unsigned has_valid_key : 1; unsigned ctfm_key_is_ready : 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ca9d4a2..bcd7a4b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -879,7 +879,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID; + ei->i_crypt_info.ci_mode = EXT4_ENCRYPTION_MODE_INVALID; #endif return &ei->vfs_inode; -- cgit v0.10.2 From b7236e21d55ff9008737621c84dd8ee6c37c7c6d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 18 May 2015 13:17:47 -0400 Subject: ext4 crypto: reorganize how we store keys in the inode This is a pretty massive patch which does a number of different things: 1) The per-inode encryption information is now stored in an allocated data structure, ext4_crypt_info, instead of directly in the inode. This reduces the size of an in-memory inode when it is not using encryption. 2) We drop the ext4_fname_crypto_ctx entirely, and use the per-inode encryption structure instead. This removes an unnecessary memory allocation and free for the fname_crypto_ctx as well as allowing us to reuse the ctfm in a directory for multiple lookups and file creations. 3) We also cache the inode's policy information in the ext4_crypt_info structure so we don't have to continually read it out of the extended attributes. 4) We now keep the keyring key in the inode's encryption structure instead of releasing it after we are done using it to derive the per-inode key. This allows us to test to see if the key has been revoked; if it has, we prevent the use of the derived key and free it. 
5) When an inode is released (or when the derived key is freed), we will use memzero_explicit() to zero out the derived key, so it's not left hanging around in memory. This implies that when a user logs out, it is important to first revoke the key, and then unlink it, and then finally, to use "echo 3 > /proc/sys/vm/drop_caches" to release any decrypted pages and dcache entries from the system caches. 6) All this, and we also shrink the number of lines of code by around 100. :-) Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 918200e..3a25aa4 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -118,8 +118,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypto_ctx *ctx = NULL; int res = 0; unsigned long flags; - struct ext4_crypt_info *ci = &EXT4_I(inode)->i_crypt_info; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + BUG_ON(ci == NULL); if (!ext4_read_workqueue) ext4_init_crypto(); @@ -322,7 +323,7 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, int res = 0; BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_crypt_info.ci_mode); + BUG_ON(ctx->mode != ei->i_crypt_info->ci_mode); if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { printk_ratelimited(KERN_ERR @@ -334,8 +335,8 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, crypto_ablkcipher_clear_flags(atfm, ~0); crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(atfm, ei->i_crypt_info.ci_raw, - ei->i_crypt_info.ci_size); + res = crypto_ablkcipher_setkey(atfm, ei->i_crypt_info->ci_raw, + ei->i_crypt_info->ci_size); if (res) { printk_ratelimited(KERN_ERR "%s: crypto_ablkcipher_setkey() failed\n", diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index d9f08dd..374d0e7 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -48,6 +48,12 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); } +static 
unsigned max_name_len(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + EXT4_NAME_LEN; +} + /** * ext4_fname_encrypt() - * @@ -55,28 +61,30 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. */ -static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_encrypt(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { u32 ciphertext_len; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); char *workbuf, buf[32], *alloc_buf = NULL; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : iname->len; ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? ctx->lim : ciphertext_len; + ciphertext_len = (ciphertext_len > lim) + ? lim : ciphertext_len; if (ciphertext_len <= sizeof(buf)) { workbuf = buf; @@ -134,7 +142,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. 
*/ -static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_decrypt(struct inode *inode, const struct ext4_str *iname, struct ext4_str *oname) { @@ -142,11 +150,13 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; tmp_in[0].name = iname->name; @@ -242,171 +252,50 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -/** - * ext4_free_fname_crypto_ctx() - - * - * Frees up a crypto context. - */ -void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) -{ - if (ctx == NULL || IS_ERR(ctx)) - return; - - if (ctx->ctfm && !IS_ERR(ctx->ctfm)) - crypto_free_ablkcipher(ctx->ctfm); - if (ctx->htfm && !IS_ERR(ctx->htfm)) - crypto_free_hash(ctx->htfm); - kfree(ctx); -} - -/** - * ext4_put_fname_crypto_ctx() - - * - * Return: The crypto context onto free list. If the free list is above a - * threshold, completely frees up the context, and returns the memory. - * - * TODO: Currently we directly free the crypto context. Eventually we should - * add code it to return to free list. Such an approach will increase - * efficiency of directory lookup. 
- */ -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) +int ext4_setup_fname_crypto(struct inode *inode) { - if (*ctx == NULL || IS_ERR(*ctx)) - return; - ext4_free_fname_crypto_ctx(*ctx); - *ctx = NULL; -} - -/** - * ext4_alloc_fname_crypto_ctx() - - */ -struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( - const struct ext4_crypt_info *ci) -{ - struct ext4_fname_crypto_ctx *ctx; - - ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); - if (ctx == NULL) - return ERR_PTR(-ENOMEM); - if (ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID) { - /* This will automatically set key mode to invalid - * As enum for ENCRYPTION_MODE_INVALID is zero */ - memset(&ctx->ci, 0, sizeof(ctx->ci)); - } else { - memcpy(&ctx->ci, ci, sizeof(struct ext4_crypt_info)); - } - ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == ci->ci_mode) - ? 0 : 1; - ctx->ctfm_key_is_ready = 0; - ctx->ctfm = NULL; - ctx->htfm = NULL; - return ctx; -} - -/** - * ext4_get_fname_crypto_ctx() - - * - * Allocates a free crypto context and initializes it to hold - * the crypto material for the inode. - * - * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise. - */ -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( - struct inode *inode, u32 max_ciphertext_len) -{ - struct ext4_fname_crypto_ctx *ctx; struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *ci = ei->i_crypt_info; + struct crypto_ablkcipher *ctfm; int res; /* Check if the crypto policy is set on the inode */ res = ext4_encrypted_inode(inode); if (res == 0) - return NULL; - - if (!ext4_has_encryption_key(inode)) - ext4_generate_encryption_key(inode); - - /* Get a crypto context based on the key. 
*/ - ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_crypt_info)); - if (IS_ERR(ctx)) - return ctx; - - ctx->flags = ei->i_crypt_policy_flags; - if (ctx->has_valid_key) { - if (ctx->ci.ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { - printk_once(KERN_WARNING - "ext4: unsupported key mode %d\n", - ctx->ci.ci_mode); - return ERR_PTR(-ENOKEY); - } + return 0; - /* As a first cut, we will allocate new tfm in every call. - * later, we will keep the tfm around, in case the key gets - * re-used */ - if (ctx->ctfm == NULL) { - ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", - 0, 0); - } - if (IS_ERR(ctx->ctfm)) { - res = PTR_ERR(ctx->ctfm); - printk( - KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", - __func__, res); - ctx->ctfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->ctfm == NULL) { - printk( - KERN_DEBUG "%s: could not allocate crypto tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - ctx->lim = max_ciphertext_len; - crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - - /* If we are lucky, we will get a context that is already - * set up with the right key. Else, we will have to - * set the key */ - if (!ctx->ctfm_key_is_ready) { - /* Since our crypto objectives for filename encryption - * are pretty weak, - * we directly use the inode master key */ - res = crypto_ablkcipher_setkey(ctx->ctfm, - ctx->ci.ci_raw, ctx->ci.ci_size); - if (res) { - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-EIO); - } - ctx->ctfm_key_is_ready = 1; - } else { - /* In the current implementation, key should never be - * marked "ready" for a context that has just been - * allocated. 
So we should never reach here */ - BUG(); - } - } - if (ctx->htfm == NULL) - ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(ctx->htfm)) { - res = PTR_ERR(ctx->htfm); - printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n", - __func__, res); - ctx->htfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); + res = ext4_get_encryption_info(inode); + if (res < 0) + return res; + ci = ei->i_crypt_info; + + if (!ci || ci->ci_ctfm) + return 0; + + if (ci->ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", + ci->ci_mode); + return -ENOKEY; } - if (ctx->htfm == NULL) { - printk(KERN_DEBUG "%s: could not allocate hash tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); + + ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", + __func__, res); + return res; } + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); - return ctx; + res = crypto_ablkcipher_setkey(ctfm, ci->ci_raw, ci->ci_size); + if (res) { + crypto_free_ablkcipher(ctfm); + return -EIO; + } + ci->ci_ctfm = ctfm; + return 0; } /** @@ -420,40 +309,20 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) } /** - * ext4_fname_crypto_namelen_on_disk() - - */ -int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, - u32 namelen) -{ - u32 ciphertext_len; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - - if (ctx == NULL) - return -EIO; - if (!(ctx->has_valid_key)) - return -EACCES; - ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? - EXT4_CRYPTO_BLOCK_SIZE : namelen; - ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? 
ctx->lim : ciphertext_len; - return (int) ciphertext_len; -} - -/** * ext4_fname_crypto_alloc_obuff() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str) { unsigned int olen; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + int padding = 16; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (!ctx) - return -EIO; + if (ci) + padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); if (padding < EXT4_CRYPTO_BLOCK_SIZE) padding = EXT4_CRYPTO_BLOCK_SIZE; olen = ext4_fname_crypto_round_up(ilen, padding); @@ -484,7 +353,7 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) /** * ext4_fname_disk_to_usr() - converts a filename from disk space to user space */ -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname) @@ -492,8 +361,6 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, char buf[24]; int ret; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' 
&& iname->name[iname->len-1] == '.') { @@ -503,8 +370,8 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) - return ext4_fname_decrypt(ctx, iname, oname); + if (EXT4_I(inode)->i_crypt_info) + return ext4_fname_decrypt(inode, iname, oname); if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); @@ -523,7 +390,7 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return ret + 1; } -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname) @@ -531,21 +398,20 @@ int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, struct ext4_str iname = {.name = (unsigned char *) de->name, .len = de->name_len }; - return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); + return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname); } /** * ext4_fname_usr_to_disk() - converts a filename from user space to disk space */ -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { int res; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' 
&& @@ -556,8 +422,8 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) { - res = ext4_fname_encrypt(ctx, iname, oname); + if (ci) { + res = ext4_fname_encrypt(inode, iname, oname); return res; } /* Without a proper key, a user is not allowed to modify the filenames @@ -569,16 +435,13 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { - struct ext4_fname_crypto_ctx *ctx; + struct ext4_crypt_info *ci; int ret = 0, bigname = 0; memset(fname, 0, sizeof(struct ext4_filename)); fname->usr_fname = iname; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - if ((ctx == NULL) || + if (!ext4_encrypted_inode(dir) || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { @@ -586,12 +449,16 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; goto out; } - if (ctx->has_valid_key) { - ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, + ret = ext4_setup_fname_crypto(dir); + if (ret) + return ret; + ci = EXT4_I(dir)->i_crypt_info; + if (ci) { + ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, &fname->crypto_buf); if (ret < 0) goto out; - ret = ext4_fname_encrypt(ctx, iname, &fname->crypto_buf); + ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) goto out; fname->disk_name.name = fname->crypto_buf.name; @@ -634,7 +501,6 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, } ret = 0; out: - ext4_put_fname_crypto_ctx(&ctx); return ret; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index ec6635d..0075e43 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -84,14 +84,26 @@ out: return res; } -/** - * ext4_generate_encryption_key() - generates an encryption key - * 
@inode: The inode to generate the encryption key for. - */ -int ext4_generate_encryption_key(struct inode *inode) +void ext4_free_encryption_info(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *ci = ei->i_crypt_info; + + if (!ci) + return; + + if (ci->ci_keyring_key) + key_put(ci->ci_keyring_key); + crypto_free_ablkcipher(ci->ci_ctfm); + memzero_explicit(&ci->ci_raw, sizeof(ci->ci_raw)); + kfree(ci); + ei->i_crypt_info = NULL; +} + +int _ext4_get_encryption_info(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; + struct ext4_crypt_info *crypt_info; char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; struct key *keyring_key = NULL; @@ -99,18 +111,40 @@ int ext4_generate_encryption_key(struct inode *inode) struct ext4_encryption_context ctx; struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + int res; - if (res != sizeof(ctx)) { - if (res > 0) - res = -EINVAL; - goto out; + if (ei->i_crypt_info) { + if (!ei->i_crypt_info->ci_keyring_key || + key_validate(ei->i_crypt_info->ci_keyring_key) == 0) + return 0; + ext4_free_encryption_info(inode); } + + res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res < 0) { + if (!DUMMY_ENCRYPTION_ENABLED(sbi)) + return res; + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) + return -EINVAL; res = 0; + crypt_info = kmalloc(sizeof(struct ext4_crypt_info), GFP_KERNEL); + if (!crypt_info) + return -ENOMEM; + ei->i_crypt_policy_flags = ctx.flags; + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = 
ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) crypt_info->ci_mode = ctx.contents_encryption_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) @@ -151,17 +185,23 @@ int ext4_generate_encryption_key(struct inode *inode) res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_info->ci_raw); out: + if (res < 0) { + if (res == -ENOKEY) + res = 0; + kfree(crypt_info); + } else { + ei->i_crypt_info = crypt_info; + crypt_info->ci_keyring_key = keyring_key; + keyring_key = NULL; + } if (keyring_key) key_put(keyring_key); - if (res < 0) - crypt_info->ci_mode = EXT4_ENCRYPTION_MODE_INVALID; return res; } int ext4_has_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; - return (crypt_info->ci_mode != EXT4_ENCRYPTION_MODE_INVALID); + return (ei->i_crypt_info != NULL); } diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a6d6291..370d3aa 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -126,7 +126,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) int ext4_is_child_context_consistent_with_parent(struct inode *parent, struct inode *child) { - struct ext4_encryption_context parent_ctx, child_ctx; + struct ext4_crypt_info *parent_ci, *child_ci; int res; if ((parent == NULL) || (child == NULL)) { @@ -136,26 +136,28 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, /* no restrictions if the parent directory is not encrypted */ if (!ext4_encrypted_inode(parent)) return 1; - res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx)); - if (res != sizeof(parent_ctx)) - return 0; /* if the child directory is not 
encrypted, this is always a problem */ if (!ext4_encrypted_inode(child)) return 0; - res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx)); - if (res != sizeof(child_ctx)) + res = ext4_get_encryption_info(parent); + if (res) + return 0; + res = ext4_get_encryption_info(child); + if (res) + return 0; + parent_ci = EXT4_I(parent)->i_crypt_info; + child_ci = EXT4_I(child)->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) return 0; - return (memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode)); + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); } /** @@ -168,31 +170,37 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, int ext4_inherit_context(struct inode *parent, struct inode *child) { struct ext4_encryption_context ctx; - int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + struct ext4_crypt_info *ci; + int res; + + res = ext4_get_encryption_info(parent); + if (res < 0) + return res; + ci = EXT4_I(parent)->i_crypt_info; + BUG_ON(ci == NULL); - if (res != sizeof(ctx)) { - if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, - EXT4_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - goto out; - } + ctx.format = 
EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + EXT4_KEY_DESCRIPTOR_SIZE); } get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); -out: if (!res) ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); return res; + } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d799d5d..28cb94f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -110,7 +110,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; int dir_has_error = 0; - struct ext4_fname_crypto_ctx *enc_ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (is_dx_dir(inode)) { @@ -134,16 +133,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN); - if (IS_ERR(enc_ctx)) - return PTR_ERR(enc_ctx); - if (enc_ctx) { - err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN, + err = ext4_setup_fname_crypto(inode); + if (err) + return err; + if (ext4_encrypted_inode(inode)) { + err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, &fname_crypto_str); - if (err < 0) { - ext4_put_fname_crypto_ctx(&enc_ctx); + if (err < 0) return err; - } } offset = ctx->pos & (sb->s_blocksize - 1); @@ -239,8 +236,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, 
sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (enc_ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), @@ -250,7 +246,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int save_len = fname_crypto_str.len; /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(enc_ctx, + err = ext4_fname_disk_to_usr(inode, NULL, de, &fname_crypto_str); fname_crypto_str.len = save_len; if (err < 0) @@ -275,7 +271,6 @@ done: err = 0; errout: #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&enc_ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif brelse(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3cf3bcb..cac1968 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -955,7 +955,7 @@ struct ext4_inode_info { #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Encryption params */ - struct ext4_crypt_info i_crypt_info; + struct ext4_crypt_info *i_crypt_info; #endif }; @@ -2096,37 +2096,30 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) /* crypto_fname.c */ bool ext4_valid_filenames_enc_mode(uint32_t mode); u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str); -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname); -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname); -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname); -int ext4_fname_crypto_namelen_on_disk(struct 
ext4_fname_crypto_ctx *ctx, - u32 namelen); #ifdef CONFIG_EXT4_FS_ENCRYPTION -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len); +int ext4_setup_fname_crypto(struct inode *inode); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); void ext4_fname_free_filename(struct ext4_filename *fname); #else static inline -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } -static inline -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len) +int ext4_setup_fname_crypto(struct inode *inode) { - return NULL; + return 0; } static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } static inline int ext4_fname_setup_filename(struct inode *dir, @@ -2143,15 +2136,34 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } /* crypto_key.c */ -int ext4_generate_encryption_key(struct inode *inode); +void ext4_free_encryption_info(struct inode *inode); +int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_has_encryption_key(struct inode *inode); + +static inline int ext4_get_encryption_info(struct inode *inode) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return _ext4_get_encryption_info(inode); + return 0; +} + #else static inline int ext4_has_encryption_key(struct inode *inode) { return 0; } +static inline int ext4_get_encryption_info(struct inode *inode) +{ + return 0; +} #endif diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index deecbe8..d29687c 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -76,7 +76,13 @@ 
struct ext4_encryption_key { struct ext4_crypt_info { unsigned char ci_mode; unsigned char ci_size; + char ci_data_mode; + char ci_filename_mode; + char ci_flags; + struct crypto_ablkcipher *ci_ctfm; + struct key *ci_keyring_key; char ci_raw[EXT4_MAX_KEY_SIZE]; + char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 @@ -128,16 +134,6 @@ struct ext4_str { u32 len; }; -struct ext4_fname_crypto_ctx { - u32 lim; - struct crypto_ablkcipher *ctfm; - struct crypto_hash *htfm; - struct ext4_crypt_info ci; - unsigned flags : 8; - unsigned has_valid_key : 1; - unsigned ctfm_key_is_ready : 1; -}; - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0613c25..875ca6b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -223,7 +223,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_mapping->host; if (ext4_encrypted_inode(inode)) { - int err = ext4_generate_encryption_key(inode); + int err = ext4_get_encryption_info(inode); if (err) return 0; } @@ -289,7 +289,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } ret = dquot_file_open(inode, filp); if (!ret && ext4_encrypted_inode(inode)) { - ret = ext4_generate_encryption_key(inode); + ret = ext4_get_encryption_info(inode); if (ret) ret = -EACCES; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b340643..9bed99f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -607,17 +607,14 @@ static struct stats dx_show_leaf(struct inode *dir, char *name; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_fname_crypto_ctx *ctx = NULL; int res; name = de->name; len = de->name_len; - ctx = ext4_get_fname_crypto_ctx(dir, - EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - printk(KERN_WARNING "Error acquiring" - " crypto ctxt--skipping crypto\n"); - ctx = NULL; + res = 
ext4_setup_fname_crypto(dir); + if (res) { + printk(KERN_WARNING "Error setting up" + " fname crypto: %d\n", res); } if (ctx == NULL) { /* Directory is not encrypted */ @@ -637,7 +634,6 @@ static struct stats dx_show_leaf(struct inode *dir, "allocating crypto " "buffer--skipping " "crypto\n"); - ext4_put_fname_crypto_ctx(&ctx); ctx = NULL; } res = ext4_fname_disk_to_usr(ctx, NULL, de, @@ -658,7 +654,6 @@ static struct stats dx_show_leaf(struct inode *dir, printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer( &fname_crypto_str); } @@ -944,7 +939,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", @@ -959,17 +953,15 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); + err = ext4_setup_fname_crypto(dir); + if (err) { brelse(bh); return err; } - if (ctx != NULL) { - err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + if (ext4_encrypted_inode(dir)) { + err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { - ext4_put_fname_crypto_ctx(&ctx); brelse(bh); return err; } @@ -990,8 +982,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if (ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, @@ -1001,7 +992,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, int save_len = 
fname_crypto_str.len; /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(ctx, hinfo, de, + err = ext4_fname_disk_to_usr(dir, hinfo, de, &fname_crypto_str); if (err < 0) { count = err; @@ -1021,7 +1012,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, errout: brelse(bh); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif return count; @@ -3107,7 +3097,6 @@ static int ext4_symlink(struct inode *dir, } if (encryption_required) { - struct ext4_fname_crypto_ctx *ctx = NULL; struct qstr istr; struct ext4_str ostr; @@ -3119,19 +3108,14 @@ static int ext4_symlink(struct inode *dir, err = ext4_inherit_context(dir, inode); if (err) goto err_drop_inode; - ctx = ext4_get_fname_crypto_ctx(inode, - inode->i_sb->s_blocksize); - if (IS_ERR_OR_NULL(ctx)) { - /* We just set the policy, so ctx should not be NULL */ - err = (ctx == NULL) ? -EIO : PTR_ERR(ctx); + err = ext4_setup_fname_crypto(inode); + if (err) goto err_drop_inode; - } istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; ostr.len = disk_link.len; - err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); - ext4_put_fname_crypto_ctx(&ctx); + err = ext4_fname_usr_to_disk(inode, &istr, &ostr); if (err < 0) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bcd7a4b..e0dac10 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -879,9 +879,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_crypt_info.ci_mode = EXT4_ENCRYPTION_MODE_INVALID; + ei->i_crypt_info = NULL; #endif - return &ei->vfs_inode; } @@ -958,6 +957,10 @@ void ext4_clear_inode(struct inode *inode) jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if 
(EXT4_I(inode)->i_crypt_info) + ext4_free_encryption_info(inode); +#endif } static struct inode *ext4_nfs_get_inode(struct super_block *sb, diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ca65d45..3287088 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -29,7 +29,6 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; struct inode *inode = d_inode(dentry); - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; @@ -38,19 +37,17 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ext4_encrypted_inode(inode)) return page_follow_link_light(dentry, nd); - ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); - if (IS_ERR(ctx)) - return ctx; + res = ext4_setup_fname_crypto(inode); + if (res) + return ERR_PTR(res); if (ext4_inode_is_fast_symlink(inode)) { caddr = (char *) EXT4_I(inode)->i_data; max_size = sizeof(EXT4_I(inode)->i_data); } else { cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) { - ext4_put_fname_crypto_ctx(&ctx); + if (IS_ERR(cpage)) return cpage; - } caddr = kmap(cpage); caddr[size] = 0; } @@ -75,21 +72,19 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) } pstr.name = paddr; pstr.len = plen; - res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); + res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr); if (res < 0) goto errout; /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; nd_set_link(nd, paddr); - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); } return NULL; errout: - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); -- cgit v0.10.2 From f5aed2c2a825618553b20e8a67109570489b40d7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 18 May 2015 13:18:47 -0400 
Subject: ext4: clean up superblock encryption mode fields The superblock fields s_file_encryption_mode and s_dir_encryption_mode are vestigial, so remove them as a cleanup. While we're at it, allow file systems with both encryption and inline_data enabled at the same time to work correctly. We can't have encrypted inodes with inline data, but there's no reason to prohibit unencrypted inodes from using the inline data feature. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 370d3aa..683391f 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -51,6 +51,10 @@ static int ext4_create_encryption_context_from_policy( struct ext4_encryption_context ctx; int res = 0; + res = ext4_convert_inline_data(inode); + if (res) + return res; + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); @@ -199,8 +203,9 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); - if (!res) + if (!res) { ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); + } return res; - } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cac1968..213536f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1374,12 +1374,6 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; - -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Encryption */ - uint32_t s_file_encryption_mode; - uint32_t s_dir_encryption_mode; -#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1eaa6cb..ddca169 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1034,28 +1034,9 @@ got: ext4_set_inode_state(inode, 
EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) && - (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) { - ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_set_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } else { - /* Inline data and encryption are incompatible - * We turn off inline data since encryption is enabled */ - ei->i_inline_off = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } -#else ei->i_inline_off = 0; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -#endif ret = inode; err = dquot_alloc_inode(inode); if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e0dac10..b0bd1c1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3452,11 +3452,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_bdev->bd_part) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, sectors[1]); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Modes of operations for file and directory encryption. */ - sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID; -#endif /* Cleanup superblock name */ for (cp = sb->s_id; (cp = strchr(cp, '/'));) -- cgit v0.10.2 From 8ee0371470038371729a39ee6669a2132ac47649 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 18 May 2015 13:19:47 -0400 Subject: ext4 crypto: use slab caches Use slab caches for the ext4_crypto_ctx and ext4_crypt_info structures for slightly better memory efficiency and debuggability. 
Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 3a25aa4..1c34f0e 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -55,6 +55,9 @@ static mempool_t *ext4_bounce_page_pool; static LIST_HEAD(ext4_free_crypto_ctxs); static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); +static struct kmem_cache *ext4_crypto_ctx_cachep; +struct kmem_cache *ext4_crypt_info_cachep; + /** * ext4_release_crypto_ctx() - Releases an encryption context * @ctx: The encryption context to release. @@ -79,7 +82,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { if (ctx->tfm) crypto_free_tfm(ctx->tfm); - kfree(ctx); + kmem_cache_free(ext4_crypto_ctx_cachep, ctx); } else { spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); list_add(&ctx->free_list, &ext4_free_crypto_ctxs); @@ -88,23 +91,6 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) } /** - * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context - * @mask: The allocation mask. - * - * Return: An allocated and initialized encryption context on success. An error - * value or NULL otherwise. 
- */ -static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask) -{ - struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx), - mask); - - if (!ctx) - return ERR_PTR(-ENOMEM); - return ctx; -} - -/** * ext4_get_crypto_ctx() - Gets an encryption context * @inode: The inode for which we are doing the crypto * @@ -121,8 +107,6 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; BUG_ON(ci == NULL); - if (!ext4_read_workqueue) - ext4_init_crypto(); /* * We first try getting the ctx from a free list because in @@ -141,9 +125,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); if (!ctx) { - ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto out; } ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; @@ -217,7 +201,7 @@ void ext4_exit_crypto(void) } if (pos->tfm) crypto_free_tfm(pos->tfm); - kfree(pos); + kmem_cache_free(ext4_crypto_ctx_cachep, pos); } INIT_LIST_HEAD(&ext4_free_crypto_ctxs); if (ext4_bounce_page_pool) @@ -226,6 +210,12 @@ void ext4_exit_crypto(void) if (ext4_read_workqueue) destroy_workqueue(ext4_read_workqueue); ext4_read_workqueue = NULL; + if (ext4_crypto_ctx_cachep) + kmem_cache_destroy(ext4_crypto_ctx_cachep); + ext4_crypto_ctx_cachep = NULL; + if (ext4_crypt_info_cachep) + kmem_cache_destroy(ext4_crypt_info_cachep); + ext4_crypt_info_cachep = NULL; } /** @@ -238,23 +228,31 @@ void ext4_exit_crypto(void) */ int ext4_init_crypto(void) { - int i, res; + int i, res = -ENOMEM; mutex_lock(&crypto_init); if (ext4_read_workqueue) goto already_initialized; ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); - if (!ext4_read_workqueue) { - res = -ENOMEM; + if (!ext4_read_workqueue) + goto fail; + + 
ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypto_ctx_cachep) + goto fail; + + ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypt_info_cachep) goto fail; - } for (i = 0; i < num_prealloc_crypto_ctxs; i++) { struct ext4_crypto_ctx *ctx; - ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto fail; } list_add(&ctx->free_list, &ext4_free_crypto_ctxs); diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 0075e43..d6abe46 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -96,7 +96,7 @@ void ext4_free_encryption_info(struct inode *inode) key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); memzero_explicit(&ci->ci_raw, sizeof(ci->ci_raw)); - kfree(ci); + kmem_cache_free(ext4_crypt_info_cachep, ci); ei->i_crypt_info = NULL; } @@ -113,6 +113,12 @@ int _ext4_get_encryption_info(struct inode *inode) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int res; + if (!ext4_read_workqueue) { + res = ext4_init_crypto(); + if (res) + return res; + } + if (ei->i_crypt_info) { if (!ei->i_crypt_info->ci_keyring_key || key_validate(ei->i_crypt_info->ci_keyring_key) == 0) @@ -134,7 +140,7 @@ int _ext4_get_encryption_info(struct inode *inode) return -EINVAL; res = 0; - crypt_info = kmalloc(sizeof(struct ext4_crypt_info), GFP_KERNEL); + crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; @@ -188,7 +194,7 @@ out: if (res < 0) { if (res == -ENOKEY) res = 0; - kfree(crypt_info); + kmem_cache_free(ext4_crypt_info_cachep, crypt_info); } else { ei->i_crypt_info = crypt_info; crypt_info->ci_keyring_key = keyring_key; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 213536f..23e33fb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2059,6 +2059,7 @@ int ext4_get_policy(struct inode 
*inode, struct ext4_encryption_policy *policy); /* crypto.c */ +extern struct kmem_cache *ext4_crypt_info_cachep; bool ext4_valid_contents_enc_mode(uint32_t mode); uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); extern struct workqueue_struct *ext4_read_workqueue; -- cgit v0.10.2 From 1aaa6e8b24114757a836ae0e62d2096deb76f274 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 18 May 2015 13:20:47 -0400 Subject: ext4 crypto: get rid of ci_mode from struct ext4_crypt_info The ci_mode field was superfluous, and getting rid of it gets rid of an unused hole in the structure. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1c34f0e..9969d05 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -137,14 +137,13 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) /* Allocate a new Crypto API context if we don't already have * one or if it isn't the right mode. */ - BUG_ON(ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID); - if (ctx->tfm && (ctx->mode != ci->ci_mode)) { + if (ctx->tfm && (ctx->mode != ci->ci_data_mode)) { crypto_free_tfm(ctx->tfm); ctx->tfm = NULL; ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; } if (!ctx->tfm) { - switch (ci->ci_mode) { + switch (ci->ci_data_mode) { case EXT4_ENCRYPTION_MODE_AES_256_XTS: ctx->tfm = crypto_ablkcipher_tfm( crypto_alloc_ablkcipher("xts(aes)", 0, 0)); @@ -162,9 +161,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) ctx->tfm = NULL; goto out; } - ctx->mode = ci->ci_mode; + ctx->mode = ci->ci_data_mode; } - BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_mode)); + BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_data_mode)); /* There shouldn't be a bounce page attached to the crypto * context at this point. 
*/ @@ -321,7 +320,7 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, int res = 0; BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_crypt_info->ci_mode); + BUG_ON(ctx->mode != ei->i_crypt_info->ci_data_mode); if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { printk_ratelimited(KERN_ERR diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 374d0e7..e63dd29 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -272,9 +272,9 @@ int ext4_setup_fname_crypto(struct inode *inode) if (!ci || ci->ci_ctfm) return 0; - if (ci->ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + if (ci->ci_filename_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", - ci->ci_mode); + ci->ci_filename_mode); return -ENOKEY; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index d6abe46..858d7d6 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -152,14 +152,13 @@ int _ext4_get_encryption_info(struct inode *inode) memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) - crypt_info->ci_mode = ctx.contents_encryption_mode; + crypt_info->ci_size = + ext4_encryption_key_size(crypt_info->ci_data_mode); else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_info->ci_mode = ctx.filenames_encryption_mode; - else { - printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + crypt_info->ci_size = + ext4_encryption_key_size(crypt_info->ci_filename_mode); + else BUG(); - } - crypt_info->ci_size = ext4_encryption_key_size(crypt_info->ci_mode); BUG_ON(!crypt_info->ci_size); if (DUMMY_ENCRYPTION_ENABLED(sbi)) { memset(crypt_info->ci_raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index d29687c..69faf0e 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -74,7 +74,6 @@ struct ext4_encryption_key { } __attribute__((__packed__)); struct ext4_crypt_info { - 
unsigned char ci_mode; unsigned char ci_size; char ci_data_mode; char ci_filename_mode; -- cgit v0.10.2 From 614def7013574ffcd54019b6df40ac1c0df754af Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:31:34 -0400 Subject: ext4 crypto: shrink size of the ext4_crypto_ctx structure Some fields are only used when the crypto_ctx is being used on the read path, some are only used on the write path, and some are only used when the structure is on free list. Optimize memory use by using a union. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 9969d05..28a0e4bd 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -71,14 +71,14 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) { unsigned long flags; - if (ctx->bounce_page) { + if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) { if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) - __free_page(ctx->bounce_page); + __free_page(ctx->w.bounce_page); else - mempool_free(ctx->bounce_page, ext4_bounce_page_pool); - ctx->bounce_page = NULL; + mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); } - ctx->control_page = NULL; + ctx->w.bounce_page = NULL; + ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { if (ctx->tfm) crypto_free_tfm(ctx->tfm); @@ -134,6 +134,7 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) } else { ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; } + ctx->flags &= ~EXT4_WRITE_PATH_FL; /* Allocate a new Crypto API context if we don't already have * one or if it isn't the right mode. */ @@ -165,10 +166,6 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) } BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_data_mode)); - /* There shouldn't be a bounce page attached to the crypto - * context at this point. 
*/ - BUG_ON(ctx->bounce_page); - out: if (res) { if (!IS_ERR_OR_NULL(ctx)) @@ -189,15 +186,6 @@ void ext4_exit_crypto(void) struct ext4_crypto_ctx *pos, *n; list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { - if (pos->bounce_page) { - if (pos->flags & - EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) { - __free_page(pos->bounce_page); - } else { - mempool_free(pos->bounce_page, - ext4_bounce_page_pool); - } - } if (pos->tfm) crypto_free_tfm(pos->tfm); kmem_cache_free(ext4_crypto_ctx_cachep, pos); @@ -425,8 +413,9 @@ struct page *ext4_encrypt(struct inode *inode, } else { ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; } - ctx->bounce_page = ciphertext_page; - ctx->control_page = plaintext_page; + ctx->flags |= EXT4_WRITE_PATH_FL; + ctx->w.bounce_page = ciphertext_page; + ctx->w.control_page = plaintext_page; err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { @@ -505,7 +494,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) } else { ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; } - ctx->bounce_page = ciphertext_page; + ctx->w.bounce_page = ciphertext_page; while (len--) { err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 69faf0e..c5258f2 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -86,16 +86,23 @@ struct ext4_crypt_info { #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 +#define EXT4_WRITE_PATH_FL 0x00000004 struct ext4_crypto_ctx { struct crypto_tfm *tfm; /* Crypto API context */ - struct page *bounce_page; /* Ciphertext page on write path */ - struct page *control_page; /* Original page on write path */ - struct bio *bio; /* The bio for this context */ - struct work_struct work; /* Work queue for read complete path */ - struct list_head free_list; /* Free list */ - int flags; /* Flags 
*/ - int mode; /* Encryption mode for tfm */ + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + char flags; /* Flags */ + char mode; /* Encryption mode for tfm */ }; struct ext4_completion_result { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5765f88..79636e2 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -84,7 +84,7 @@ static void ext4_finish_bio(struct bio *bio) /* The bounce data pages are unmapped. */ data_page = page; ctx = (struct ext4_crypto_ctx *)page_private(data_page); - page = ctx->control_page; + page = ctx->w.control_page; } #endif diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 171b9ac..ec3ef93 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -54,8 +54,8 @@ static void completion_pages(struct work_struct *work) { #ifdef CONFIG_EXT4_FS_ENCRYPTION struct ext4_crypto_ctx *ctx = - container_of(work, struct ext4_crypto_ctx, work); - struct bio *bio = ctx->bio; + container_of(work, struct ext4_crypto_ctx, r.work); + struct bio *bio = ctx->r.bio; struct bio_vec *bv; int i; @@ -109,9 +109,9 @@ static void mpage_end_io(struct bio *bio, int err) if (err) { ext4_release_crypto_ctx(ctx); } else { - INIT_WORK(&ctx->work, completion_pages); - ctx->bio = bio; - queue_work(ext4_read_workqueue, &ctx->work); + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(ext4_read_workqueue, &ctx->r.work); return; } } -- cgit v0.10.2 From 71dea01ea2edb73f3c5d9a0cd7ba028bb9313287 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:31:37 -0400 Subject: ext4 crypto: require CONFIG_CRYPTO_CTR if ext4 encryption is enabled On arm64 this is apparently needed for CTS mode to function correctly. Otherwise attempts to use CTS return ENOENT. 
Change-Id: I732ea9a5157acc76de5b89edec195d0365f4ca63 Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 024f228..bf8bc8a 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -72,6 +72,7 @@ config EXT4_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_CTR select CRYPTO_SHA256 select KEYS select ENCRYPTED_KEYS -- cgit v0.10.2 From c936e1ec2879e43599d801dfa6fe58e7ccfee433 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:34:22 -0400 Subject: ext4 crypto: use per-inode tfm structure As suggested by Herbert Xu, we shouldn't allocate a new tfm each time we read or write a page. Instead we can use a single tfm hanging off the inode's crypt_info structure for all of our encryption needs for that inode, since the tfm can be used by multiple crypto requests in parallel. Also use cmpxchg() to avoid races that could result in crypt_info structure getting doubly allocated or doubly freed. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 28a0e4bd..c3a9b08 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -80,8 +80,6 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) ctx->w.bounce_page = NULL; ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { - if (ctx->tfm) - crypto_free_tfm(ctx->tfm); kmem_cache_free(ext4_crypto_ctx_cachep, ctx); } else { spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); @@ -136,36 +134,6 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) } ctx->flags &= ~EXT4_WRITE_PATH_FL; - /* Allocate a new Crypto API context if we don't already have - * one or if it isn't the right mode. 
*/ - if (ctx->tfm && (ctx->mode != ci->ci_data_mode)) { - crypto_free_tfm(ctx->tfm); - ctx->tfm = NULL; - ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; - } - if (!ctx->tfm) { - switch (ci->ci_data_mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - ctx->tfm = crypto_ablkcipher_tfm( - crypto_alloc_ablkcipher("xts(aes)", 0, 0)); - break; - case EXT4_ENCRYPTION_MODE_AES_256_GCM: - /* TODO(mhalcrow): AEAD w/ gcm(aes); - * crypto_aead_setauthsize() */ - ctx->tfm = ERR_PTR(-ENOTSUPP); - break; - default: - BUG(); - } - if (IS_ERR_OR_NULL(ctx->tfm)) { - res = PTR_ERR(ctx->tfm); - ctx->tfm = NULL; - goto out; - } - ctx->mode = ci->ci_data_mode; - } - BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_data_mode)); - out: if (res) { if (!IS_ERR_OR_NULL(ctx)) @@ -185,11 +153,8 @@ void ext4_exit_crypto(void) { struct ext4_crypto_ctx *pos, *n; - list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { - if (pos->tfm) - crypto_free_tfm(pos->tfm); + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) kmem_cache_free(ext4_crypto_ctx_cachep, pos); - } INIT_LIST_HEAD(&ext4_free_crypto_ctxs); if (ext4_bounce_page_pool) mempool_destroy(ext4_bounce_page_pool); @@ -303,32 +268,11 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist dst, src; - struct ext4_inode_info *ei = EXT4_I(inode); - struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm); + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; - BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_crypt_info->ci_data_mode); - - if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { - printk_ratelimited(KERN_ERR - "%s: unsupported crypto algorithm: %d\n", - __func__, ctx->mode); - return -ENOTSUPP; - } - - crypto_ablkcipher_clear_flags(atfm, ~0); - crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - - res = 
crypto_ablkcipher_setkey(atfm, ei->i_crypt_info->ci_raw, - ei->i_crypt_info->ci_size); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_setkey() failed\n", - __func__); - return res; - } - req = ablkcipher_request_alloc(atfm, GFP_NOFS); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index e63dd29..29a2dc9 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -252,52 +252,6 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -int ext4_setup_fname_crypto(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *ci = ei->i_crypt_info; - struct crypto_ablkcipher *ctfm; - int res; - - /* Check if the crypto policy is set on the inode */ - res = ext4_encrypted_inode(inode); - if (res == 0) - return 0; - - res = ext4_get_encryption_info(inode); - if (res < 0) - return res; - ci = ei->i_crypt_info; - - if (!ci || ci->ci_ctfm) - return 0; - - if (ci->ci_filename_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { - printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", - ci->ci_filename_mode); - return -ENOKEY; - } - - ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", - __func__, res); - return res; - } - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - - res = crypto_ablkcipher_setkey(ctfm, ci->ci_raw, ci->ci_size); - if (res) { - crypto_free_ablkcipher(ctfm); - return -EIO; - } - ci->ci_ctfm = ctfm; - return 0; -} - /** * ext4_fname_crypto_round_up() - * @@ -449,7 +403,7 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; goto out; } - ret = ext4_setup_fname_crypto(dir); + ret = ext4_get_encryption_info(dir); if (ret) return ret; ci = EXT4_I(dir)->i_crypt_info; diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 858d7d6..442d24e 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -84,20 +84,32 @@ out: return res; } -void ext4_free_encryption_info(struct inode *inode) +void ext4_free_crypt_info(struct ext4_crypt_info *ci) { - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *ci = ei->i_crypt_info; - if (!ci) return; if (ci->ci_keyring_key) key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); - memzero_explicit(&ci->ci_raw, sizeof(ci->ci_raw)); kmem_cache_free(ext4_crypt_info_cachep, ci); - ei->i_crypt_info = NULL; +} + +void ext4_free_encryption_info(struct inode *inode, + struct ext4_crypt_info *ci) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(ei->i_crypt_info); + if (ci == NULL) + return; + prev = cmpxchg(&ei->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + ext4_free_crypt_info(ci); } int _ext4_get_encryption_info(struct inode *inode) @@ -111,6 +123,10 @@ int _ext4_get_encryption_info(struct inode *inode) struct ext4_encryption_context ctx; struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct crypto_ablkcipher *ctfm; + const char 
*cipher_str; + char raw_key[EXT4_MAX_KEY_SIZE]; + char mode; int res; if (!ext4_read_workqueue) { @@ -119,11 +135,14 @@ int _ext4_get_encryption_info(struct inode *inode) return res; } - if (ei->i_crypt_info) { - if (!ei->i_crypt_info->ci_keyring_key || - key_validate(ei->i_crypt_info->ci_keyring_key) == 0) +retry: + crypt_info = ACCESS_ONCE(ei->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) return 0; - ext4_free_encryption_info(inode); + ext4_free_encryption_info(inode, crypt_info); + goto retry; } res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, @@ -144,26 +163,37 @@ int _ext4_get_encryption_info(struct inode *inode) if (!crypt_info) return -ENOMEM; - ei->i_crypt_policy_flags = ctx.flags; crypt_info->ci_flags = ctx.flags; crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) - crypt_info->ci_size = - ext4_encryption_key_size(crypt_info->ci_data_mode); + mode = crypt_info->ci_data_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_info->ci_size = - ext4_encryption_key_size(crypt_info->ci_filename_mode); + mode = crypt_info->ci_filename_mode; else BUG(); - BUG_ON(!crypt_info->ci_size); - if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(crypt_info->ci_raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "ext4: unsupported key mode %d (ino %u)\n", + mode, (unsigned) inode->i_ino); + res = -ENOKEY; goto out; } + if (DUMMY_ENCRYPTION_ENABLED(sbi)) { + memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto got_key; + } 
memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, EXT4_KEY_DESC_PREFIX_SIZE); sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, @@ -177,6 +207,7 @@ int _ext4_get_encryption_info(struct inode *inode) keyring_key = NULL; goto out; } + crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); ukp = ((struct user_key_payload *)keyring_key->payload.data); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { @@ -188,19 +219,36 @@ int _ext4_get_encryption_info(struct inode *inode) EXT4_KEY_DERIVATION_NONCE_SIZE); BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); res = ext4_derive_key_aes(ctx.nonce, master_key->raw, - crypt_info->ci_raw); -out: - if (res < 0) { - if (res == -ENOKEY) - res = 0; - kmem_cache_free(ext4_crypt_info_cachep, crypt_info); - } else { - ei->i_crypt_info = crypt_info; - crypt_info->ci_keyring_key = keyring_key; - keyring_key = NULL; + raw_key); +got_key: + ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_ablkcipher_setkey(ctfm, raw_key, + ext4_encryption_key_size(mode)); + if (res) + goto out; + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { + ext4_free_crypt_info(crypt_info); + goto retry; } - if (keyring_key) - key_put(keyring_key); + return 0; + +out: + if (res == -ENOKEY) + res = 0; + ext4_free_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); return res; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 28cb94f..e11e6ae 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -133,9 +133,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - err = ext4_setup_fname_crypto(inode); - if (err) - return err; if (ext4_encrypted_inode(inode)) { err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, &fname_crypto_str); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 23e33fb..7435ff2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -911,7 +911,6 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; - char i_crypt_policy_flags; /* Indicate the inline data space. 
*/ u16 i_inline_off; @@ -2105,7 +2104,6 @@ int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname); #ifdef CONFIG_EXT4_FS_ENCRYPTION -int ext4_setup_fname_crypto(struct inode *inode); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); @@ -2131,7 +2129,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } /* crypto_key.c */ -void ext4_free_encryption_info(struct inode *inode); +void ext4_free_crypt_info(struct ext4_crypt_info *ci); +void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index c5258f2..34e0d24 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -74,13 +74,11 @@ struct ext4_encryption_key { } __attribute__((__packed__)); struct ext4_crypt_info { - unsigned char ci_size; char ci_data_mode; char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; struct key *ci_keyring_key; - char ci_raw[EXT4_MAX_KEY_SIZE]; char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; @@ -89,7 +87,6 @@ struct ext4_crypt_info { #define EXT4_WRITE_PATH_FL 0x00000004 struct ext4_crypto_ctx { - struct crypto_tfm *tfm; /* Crypto API context */ union { struct { struct page *bounce_page; /* Ciphertext page */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9bed99f..6ab50f8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -607,11 +607,12 @@ static struct stats dx_show_leaf(struct inode *dir, char *name; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - int res; + int res = 0; name = de->name; len = de->name_len; - res = ext4_setup_fname_crypto(dir); + if (ext4_encrypted_inode(inode)) + res = ext4_get_encryption_info(dir); if (res) { printk(KERN_WARNING "Error setting up" " 
fname crypto: %d\n", res); @@ -953,12 +954,12 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ - err = ext4_setup_fname_crypto(dir); - if (err) { - brelse(bh); - return err; - } if (ext4_encrypted_inode(dir)) { + err = ext4_get_encryption_info(dir); + if (err < 0) { + brelse(bh); + return err; + } err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { @@ -3108,7 +3109,7 @@ static int ext4_symlink(struct inode *dir, err = ext4_inherit_context(dir, inode); if (err) goto err_drop_inode; - err = ext4_setup_fname_crypto(inode); + err = ext4_get_encryption_info(inode); if (err) goto err_drop_inode; istr.name = (const unsigned char *) symname; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b0bd1c1..56bfc2f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -959,7 +959,7 @@ void ext4_clear_inode(struct inode *inode) } #ifdef CONFIG_EXT4_FS_ENCRYPTION if (EXT4_I(inode)->i_crypt_info) - ext4_free_encryption_info(inode); + ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info); #endif } diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 3287088..68e915a 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -37,7 +37,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ext4_encrypted_inode(inode)) return page_follow_link_light(dentry, nd); - res = ext4_setup_fname_crypto(inode); + res = ext4_get_encryption_info(inode); if (res) return ERR_PTR(res); -- cgit v0.10.2 From 95ea68b4c7105179f507d31f7bf571623373aa0b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:34:24 -0400 Subject: ext4 crypto: fix memory leaks in ext4_encrypted_zeroout ext4_encrypted_zeroout() could end up leaking a bio and bounce page. Fortunately it's not used much. 
While we're fixing things up, refactor out common code into the static function alloc_bounce_page() and fix up error handling if mempool_alloc() fails. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index c3a9b08..1c9a8c4 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -314,6 +314,26 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, return 0; } +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) +{ + struct page *ciphertext_page = alloc_page(GFP_NOFS); + + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. */ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (ciphertext_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags |= EXT4_WRITE_PATH_FL; + ctx->w.bounce_page = ciphertext_page; + return ciphertext_page; +} + /** * ext4_encrypt() - Encrypts a page * @inode: The inode for which the encryption should take place @@ -343,28 +363,17 @@ struct page *ext4_encrypt(struct inode *inode, return (struct page *) ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. 
*/ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags |= EXT4_WRITE_PATH_FL; - ctx->w.bounce_page = ciphertext_page; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) + goto errout; ctx->w.control_page = plaintext_page; err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { + ciphertext_page = ERR_PTR(err); + errout: ext4_release_crypto_ctx(ctx); - return ERR_PTR(err); + return ciphertext_page; } SetPagePrivate(ciphertext_page); set_page_private(ciphertext_page, (unsigned long)ctx); @@ -424,21 +433,11 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. 
*/ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; } - ctx->w.bounce_page = ciphertext_page; while (len--) { err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, @@ -460,6 +459,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) goto errout; } err = submit_bio_wait(WRITE, bio); + bio_put(bio); if (err) goto errout; } -- cgit v0.10.2 From 5555702955326ac1b9971f81569d8a6851384d49 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:34:29 -0400 Subject: ext4 crypto: set up encryption info for new inodes in ext4_inherit_context() Set up the encryption information for newly created inodes immediately after they inherit their encryption context from their parent directories. 
Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 683391f..81980a15 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -206,6 +206,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) if (!res) { ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); + res = ext4_get_encryption_info(child); } return res; } -- cgit v0.10.2 From 6bc445e0ff44c7e83225124c214d350407e68ccf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:34:57 -0400 Subject: ext4 crypto: make sure the encryption info is initialized on opendir(2) Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e11e6ae..f9e1491 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -593,6 +593,13 @@ finished: return 0; } +static int ext4_dir_open(struct inode * inode, struct file * filp) +{ + if (ext4_encrypted_inode(inode)) + return ext4_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) @@ -635,5 +642,6 @@ const struct file_operations ext4_dir_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, + .open = ext4_dir_open, .release = ext4_release_dir, }; -- cgit v0.10.2 From e709e9df64928a99d41da75910b844976a535db7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:35:02 -0400 Subject: ext4 crypto: encrypt tmpfile located in encryption protected directory Factor out calls to ext4_inherit_context() and move them to __ext4_new_inode(); this fixes a problem where ext4_tmpfile() wasn't calling ext4_inherit_context(), so the temporary file wasn't getting protected. Since the blocks for the tmpfile could end up on disk, they really should be protected if the tmpfile is created within the context of an encrypted directory. 
Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7435ff2..bd8d32d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2149,6 +2149,11 @@ static inline int ext4_get_encryption_info(struct inode *inode) return 0; } +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return EXT4_I(inode)->i_crypt_info; +} + #else static inline int ext4_has_encryption_key(struct inode *inode) { @@ -2158,6 +2163,10 @@ static inline int ext4_get_encryption_info(struct inode *inode) { return 0; } +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return NULL; +} #endif diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ddca169..173c1ae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -726,11 +726,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; + int encrypt = 0; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); + if ((ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + err = ext4_get_encryption_info(dir); + if (err) + return ERR_PTR(err); + if (ext4_encryption_info(dir) == NULL) + return ERR_PTR(-EPERM); + if (!handle) + nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); + encrypt = 1; + } + sb = dir->i_sb; ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -996,12 +1010,6 @@ got: ei->i_block_group = group; ei->i_last_alloc_group = ~0; - /* If the directory encrypted, then we should encrypt the inode. 
*/ - if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && - (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(sbi))) - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); @@ -1063,6 +1071,12 @@ got: ei->i_datasync_tid = handle->h_transaction->t_tid; } + if (encrypt) { + err = ext4_inherit_context(dir, inode); + if (err) + goto fail_free_drop; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6ab50f8..1e7d65d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2437,20 +2437,7 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (!err && (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) { - err = ext4_inherit_context(dir, inode); - if (err) { - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - } - } -#endif - if (!err) - err = ext4_add_nondir(handle, dentry, inode); + err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); } @@ -2631,14 +2618,6 @@ retry: err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) { - err = ext4_inherit_context(dir, inode); - if (err) - goto out_clear_inode; - } -#endif err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -3106,12 +3085,6 @@ static int ext4_symlink(struct inode *dir, err = -ENOMEM; goto err_drop_inode; } - err = ext4_inherit_context(dir, inode); - if (err) - goto err_drop_inode; - err = ext4_get_encryption_info(inode); - if (err) - goto err_drop_inode; istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; -- cgit v0.10.2 From 
c2faccaff6a16d331df832135ede6d4774c2d2a0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:35:09 -0400 Subject: ext4 crypto: enforce crypto policy restrictions on cross-renames Thanks to Chao Yu for pointing out the need for this check. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1e7d65d..401b099 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3647,6 +3647,15 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) || + ext4_encrypted_inode(new_dir)) && + (old_dir != new_dir) && + (!ext4_is_child_context_consistent_with_parent(new_dir, + old.inode) || + !ext4_is_child_context_consistent_with_parent(old_dir, + new.inode))) + return -EPERM; + dquot_initialize(old.dir); dquot_initialize(new.dir); -- cgit v0.10.2 From d87f6d78e996bbba27b649c0e0eed7a37d6b73ba Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:35:14 -0400 Subject: ext4 crypto: policies may only be set on directories Thanks to Chao Yu for pointing out we were missing this check. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 81980a15..a1d434d 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -93,6 +93,8 @@ int ext4_process_policy(const struct ext4_encryption_policy *policy, return -EINVAL; if (!ext4_inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; if (!ext4_empty_dir(inode)) return -ENOTEMPTY; return ext4_create_encryption_context_from_policy(inode, -- cgit v0.10.2 From 82d0d3e7e69ab509b5c91b61f12bd3593a7c6dcb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:35:22 -0400 Subject: ext4 crypto: clean up error handling in ext4_fname_setup_filename Fix a potential memory leak where fname->crypto_buf.name wouldn't get freed in some error paths, and also make the error handling easier to understand/audit. 
Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 29a2dc9..23af41f 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -401,7 +401,7 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, ((iname->name[1] == '.') && (iname->len == 2))))) { fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; - goto out; + return 0; } ret = ext4_get_encryption_info(dir); if (ret) @@ -411,19 +411,16 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, &fname->crypto_buf); if (ret < 0) - goto out; + return ret; ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) - goto out; + goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; - ret = 0; - goto out; - } - if (!lookup) { - ret = -EACCES; - goto out; + return 0; } + if (!lookup) + return -EACCES; /* We don't have the key and we are doing a lookup; decode the * user-supplied name @@ -431,19 +428,17 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, if (iname->name[0] == '_') bigname = 1; if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) { - ret = -ENOENT; - } + (!bigname && (iname->len > 43))) + return -ENOENT; + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); - if (fname->crypto_buf.name == NULL) { - ret = -ENOMEM; - goto out; - } + if (fname->crypto_buf.name == NULL) + return -ENOMEM; ret = digest_decode(iname->name + bigname, iname->len - bigname, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; - goto out; + goto errout; } fname->crypto_buf.len = ret; if (bigname) { @@ -453,8 +448,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; } - ret = 0; -out: + return 0; +errout: + 
kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; return ret; } -- cgit v0.10.2 From 4d3c4e5b8cae3bb45ba933a22670504239958aa1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:35:32 -0400 Subject: ext4 crypto: allocate the right amount of memory for the on-disk symlink Previously we were not taking the required padding into account when allocating space for the on-disk symlink. This caused a buffer overrun which could trigger a kernel crash when running fsstress. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 23af41f..7dc4eb5 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -262,8 +262,20 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) return ((size+blksize-1)/blksize)*blksize; } -/** - * ext4_fname_crypto_alloc_obuff() - +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + int padding = 32; + + if (ci) + padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + if (ilen < EXT4_CRYPTO_BLOCK_SIZE) + ilen = EXT4_CRYPTO_BLOCK_SIZE; + return ext4_fname_crypto_round_up(ilen, padding); +} + +/* + * ext4_fname_crypto_alloc_buffer() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
@@ -271,15 +283,8 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str) { - unsigned int olen; - int padding = 16; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + unsigned int olen = ext4_fname_encrypted_size(inode, ilen); - if (ci) - padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); - if (padding < EXT4_CRYPTO_BLOCK_SIZE) - padding = EXT4_CRYPTO_BLOCK_SIZE; - olen = ext4_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bd8d32d..730c88d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2090,6 +2090,7 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) /* crypto_fname.c */ bool ext4_valid_filenames_enc_mode(uint32_t mode); u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen); int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str); int _ext4_fname_disk_to_usr(struct inode *inode, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 401b099..bda4a5d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3039,10 +3039,23 @@ static int ext4_symlink(struct inode *dir, encryption_required = (ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); - if (encryption_required) - disk_link.len = encrypted_symlink_data_len(len) + 1; - if (disk_link.len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (encryption_required) { + err = ext4_get_encryption_info(dir); + if (err) + return err; + if (ext4_encryption_info(dir) == NULL) + return -EPERM; + disk_link.len = (ext4_fname_encrypted_size(dir, len) + + sizeof(struct ext4_encrypted_symlink_data)); + sd = kzalloc(disk_link.len, GFP_KERNEL); + if (!sd) + return -ENOMEM; + } + + if (disk_link.len > 
dir->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto err_free_sd; + } dquot_initialize(dir); @@ -3073,18 +3086,14 @@ static int ext4_symlink(struct inode *dir, if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); - return PTR_ERR(inode); + err = PTR_ERR(inode); + goto err_free_sd; } if (encryption_required) { struct qstr istr; struct ext4_str ostr; - sd = kzalloc(disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto err_drop_inode; - } istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; @@ -3156,10 +3165,11 @@ static int ext4_symlink(struct inode *dir, err_drop_inode: if (handle) ext4_journal_stop(handle); - kfree(sd); clear_nlink(inode); unlock_new_inode(inode); iput(inode); +err_free_sd: + kfree(sd); return err; } -- cgit v0.10.2 From abdd438b26b409eaccf9c847fcf9c3ab52f1959e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 May 2015 13:35:39 -0400 Subject: ext4 crypto: handle unexpected lack of encryption keys Fix up attempts by users to try to write to a file when they don't have access to the encryption key. 
Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1c9a8c4..efcb7c0 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -104,7 +104,8 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) unsigned long flags; struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - BUG_ON(ci == NULL); + if (ci == NULL) + return ERR_PTR(-ENOKEY); /* * We first try getting the ctx from a free list because in diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a1d434d..02c4e5d 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -183,7 +183,8 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) if (res < 0) return res; ci = EXT4_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); + if (ci == NULL) + return -ENOKEY; ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 875ca6b..ac517f1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -226,6 +226,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) int err = ext4_get_encryption_info(inode); if (err) return 0; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; } file_accessed(file); if (IS_DAX(file_inode(file))) { @@ -278,6 +280,13 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_journal_stop(handle); } } + if (ext4_encrypted_inode(inode)) { + ret = ext4_get_encryption_info(inode); + if (ret) + return -EACCES; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -287,13 +296,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } - ret = dquot_file_open(inode, filp); - if (!ret && ext4_encrypted_inode(inode)) { - ret = ext4_get_encryption_info(inode); - if (ret) - ret = -EACCES; - } - return ret; + return 
dquot_file_open(inode, filp); } /* -- cgit v0.10.2 From e298e73bd766768707a7af440691ce2f418f5acc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 31 May 2015 13:37:35 -0400 Subject: ext4 crypto: release crypto resource on module exit Crypto resource should be released when ext4 module exits, otherwise it will cause a memory leak. Signed-off-by: Chao Yu Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 56bfc2f..31e85be 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5645,6 +5645,7 @@ out7: static void __exit ext4_exit_fs(void) { + ext4_exit_crypto(); ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); -- cgit v0.10.2 From 3dbb5eb9a3aa04f40e551338eee5e8d06f352fe8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Jun 2015 09:32:39 -0400 Subject: ext4 crypto: allocate bounce pages using GFP_NOWAIT Previously we allocated bounce pages using a combination of alloc_page() and mempool_alloc() with the __GFP_WAIT bit set. Instead, use mempool_alloc() with GFP_NOWAIT. The mempool_alloc() function will try using alloc_pages() initially, and then only use the mempool reserve of pages if alloc_pages() is unable to fulfill the request. This minimizes the impact on the mm layer when we need to do a large amount of writeback of encrypted files, as Jaeguk Kim had reported that under a heavy fio workload on a system with restricted amounts of memory (which unfortunately, includes many mobile handsets), he had observed the OOM killer getting triggered several times. With GFP_NOWAIT, if the mempool_alloc() function fails, we will retry the page writeback at a later time; the function of the mempool is to ensure that we can writeback at least 32 pages at a time, so we can more efficiently dispatch I/O under high memory pressure situations.
In the future we should make this be a tunable so we can determine the best tradeoff between permanently sequestering memory and the ability to quickly launder pages so we can free up memory quickly when necessary. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index efcb7c0..f5c82e8 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -71,12 +71,8 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) { unsigned long flags; - if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) { - if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) - __free_page(ctx->w.bounce_page); - else - mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); - } + if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) + mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); ctx->w.bounce_page = NULL; ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { @@ -317,22 +313,11 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) { - struct page *ciphertext_page = alloc_page(GFP_NOFS); - - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. 
*/ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (ciphertext_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } + ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); ctx->flags |= EXT4_WRITE_PATH_FL; - ctx->w.bounce_page = ciphertext_page; - return ciphertext_page; + return ctx->w.bounce_page; } /** diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 34e0d24..ac7d4e8 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -83,8 +83,7 @@ struct ext4_crypt_info { }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 -#define EXT4_WRITE_PATH_FL 0x00000004 +#define EXT4_WRITE_PATH_FL 0x00000002 struct ext4_crypto_ctx { union { -- cgit v0.10.2 From 6ccaf3e2f302b6af8d9e17ce4e7f0af26b6baa0e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jun 2015 10:53:10 -0400 Subject: jbd2: revert must-not-fail allocation loops back to GFP_NOFAIL This basically reverts 47def82672b3 (jbd2: Remove __GFP_NOFAIL from jbd2 layer). The deprecation of __GFP_NOFAIL was a bad choice because it led to open coding the endless loop around the allocator rather than removing the dependency on the non failing allocation. So the deprecation was a clear failure and the reality tells us that __GFP_NOFAIL is not even close to going away. It is still true that __GFP_NOFAIL allocations are generally discouraged and new uses should be evaluated and an alternative (pre-allocations or reservations) should be considered but it doesn't make any sense to lie to the allocator about the requirements. The allocator can take steps to help make progress if it knows the requirements.
Signed-off-by: Michal Hocko Signed-off-by: Theodore Ts'o Acked-by: David Rientjes diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b96bd80..0bc333b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); -retry_alloc: - new_bh = alloc_buffer_head(GFP_NOFS); - if (!new_bh) { - /* - * Failure is not an option, but __GFP_NOFAIL is going - * away; so we retry ourselves here. - */ - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry_alloc; - } + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ff2f2e6..799242c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { + /* + * If __GFP_FS is not present, then we may be being called from + * inside the fs writeback layer, so we MUST NOT fail. + */ + if ((gfp_mask & __GFP_FS) == 0) + gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); - if (!new_transaction) { - /* - * If __GFP_FS is not present, then we may be - * being called from inside the fs writeback - * layer, so we MUST NOT fail. Since - * __GFP_NOFAIL is going away, we will arrange - * to retry the allocation ourselves. 
- */ - if ((gfp_mask & __GFP_FS) == 0) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; - } + if (!new_transaction) return -ENOMEM; - } } jbd_debug(3, "New handle %p going live.\n", handle); -- cgit v0.10.2 From 41e5b7ed3e9597ccc46b6affc81872e6370936d9 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 8 Jun 2015 11:18:52 -0400 Subject: ext4: verify block bitmap even after fresh initialization If we want to rely on the buffer_verified() flag of the block bitmap buffer, we have to set it consistently. However currently if we're initializing uninitialized block bitmap in ext4_read_block_bitmap_nowait() we're not going to set buffer verified at all. We can do this by simply setting the flag on the buffer, but I think it's actually better to run ext4_validate_block_bitmap() to make sure that what we did in the ext4_init_block_bitmap() is right. So run ext4_validate_block_bitmap() even after the block bitmap initialization. Also bail out early from ext4_validate_block_bitmap() if we see corrupt bitmap, since we already know it's corrupt and we do not need to verify that. 
Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 955bf49..cd6ea29 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -369,7 +369,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (buffer_verified(bh)) + if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return; ext4_lock_group(sb, block_group); @@ -446,7 +446,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); if (err) ext4_error(sb, "Checksum bad for grp %u", block_group); - return bh; + goto verify; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { -- cgit v0.10.2 From bbdc322f2c600667e3d23dfadf8bbaad08f7edd3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 8 Jun 2015 11:38:37 -0400 Subject: ext4: try to initialize all groups we can in case of failure on ppc64 Currently on the machines with page size > block size when initializing block group buddy cache we initialize it for all the block group bitmaps in the page. However in the case of read error, checksum error, or if a single bitmap is in any way corrupted we would fail to initialize all of the bitmaps. This is problematic because we will not have access to the other allocation groups even though those might be perfectly fine and usable. Fix this by reading all the bitmaps instead of error out on the first problem and simply skip the bitmaps which were either not read properly, or are not valid. 
Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d1e602..df02951 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -882,10 +882,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) err = -EIO; - goto out; - } } first_block = page->index * blocks_per_page; @@ -898,6 +896,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* skip initialized uptodate buddy */ continue; + if (!buffer_verified(bh[group - first_group])) + /* Skip faulty bitmaps */ + continue; + err = 0; + /* * data carry information regarding this * particular group in the format specified -- cgit v0.10.2 From 42ac1848eac58875ceb081c2ed915d6f07ec1f30 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 8 Jun 2015 11:40:40 -0400 Subject: ext4: return error code from ext4_mb_good_group() Currently ext4_mb_good_group() only returns 0 or 1 depending on whether the allocation group is suitable for use or not. However we might get various errors and fail while initializing new group including -EIO which would never get propagated up the call chain. This might lead to an endless loop at writeback when we're trying to find a good group to allocate from and we fail to initialize new group (read error for example). Fix this by returning proper error code from ext4_mb_good_group() and using it in ext4_mb_regular_allocator(). In ext4_mb_regular_allocator() we will always return only the first occurred error from ext4_mb_good_group() and we only propagate it back to the caller if we do not get any other errors and we fail to allocate any blocks. 
Note that with other modes than errors=continue, we will fail immediately in ext4_mb_good_group() in case of error, however with errors=continue we should try to continue using the file system, that's why we're not going to fail immediately when we see an error from ext4_mb_good_group(), but rather when we fail to find a suitable block group to allocate from due to a problem in group initialization. Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Reviewed-by: Darrick J. Wong diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index df02951..78e58f7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2011,7 +2011,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } -/* This is now called BEFORE we load the buddy bitmap. */ +/* + * This is now called BEFORE we load the buddy bitmap. + * Returns either 1 or 0 indicating that the group is either suitable + * for the allocation or not. In addition it can also return negative + * error code when something goes wrong.
+ */ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { @@ -2034,7 +2039,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); if (ret) - return 0; + return ret; } fragments = grp->bb_fragments; @@ -2081,7 +2086,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; int cr; - int err = 0; + int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -2148,6 +2153,7 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { + int ret = 0; cond_resched(); /* * Artificially restricted ngroups for non-extent @@ -2157,8 +2163,12 @@ repeat: group = 0; /* This now checks without needing the buddy page */ - if (!ext4_mb_good_group(ac, group, cr)) + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { + if (!first_err) + first_err = ret; continue; + } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) @@ -2170,9 +2180,12 @@ repeat: * We need to check again after locking the * block group */ - if (!ext4_mb_good_group(ac, group, cr)) { + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); + if (!first_err) + first_err = ret; continue; } @@ -2219,6 +2232,8 @@ repeat: } } out: + if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) + err = first_err; return err; } -- cgit v0.10.2 From ad0a0ce894d554b112afab6a48fd500e636686a6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Jun 2015 11:54:56 -0400 Subject: ext4 crypto: fix ext4_get_crypto_ctx()'s calling convention in ext4_decrypt_one Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index f5c82e8..4573155 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -396,8 +396,8 @@ int ext4_decrypt_one(struct inode *inode, struct page *page) struct 
ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); ret = ext4_decrypt(ctx, page); ext4_release_crypto_ctx(ctx); return ret; -- cgit v0.10.2 From 8bc3b1e6e8fdc1c605c06c027d999b5cca434779 Mon Sep 17 00:00:00 2001 From: David Moore Date: Mon, 8 Jun 2015 11:59:12 -0400 Subject: ext4: BUG_ON assertion repeated for inode1, not done for inode2 During a source code review of fs/ext4/extents.c I noted identical consecutive lines. An assertion is repeated for inode1 and never done for inode2. This is not in keeping with the rest of the code in the ext4_swap_extents function and appears to be a bug. Assert that the inode2 mutex is not locked. Signed-off-by: David Moore Signed-off-by: Theodore Ts'o Reviewed-by: Eric Sandeen diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e003a1e..f38a6d6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5542,7 +5542,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode2->i_mutex)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) -- cgit v0.10.2 From b4ab9e29820bf2e8842281a6b5e645e59c9992a5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 8 Jun 2015 12:23:21 -0400 Subject: ext4 crypto: fix sparse warnings in fs/ext4/ioctl.c [ Added another sparse fix for EXT4_IOC_GET_ENCRYPTION_POLICY while we're at it. 
--tytso ] Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2cb9e17..7ce8582 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -675,8 +675,8 @@ encryption_policy_out: if (err) return err; } - if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt, - 16)) + if (copy_to_user((void __user *) arg, + sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; } @@ -690,7 +690,7 @@ encryption_policy_out: err = ext4_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((void *)arg, &policy, sizeof(policy))) + if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; #else -- cgit v0.10.2 From ee57aba159a5c329dc78c181a3ae0549e59f0925 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 8 Jun 2015 12:39:07 -0400 Subject: jbd2: simplify code flow in do_get_write_access() needs_copy is set only in one place in do_get_write_access(), just move the frozen buffer copying into that place and factor it out to a separate function to make do_get_write_access() slightly more readable. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 799242c..2bcb43d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -755,6 +755,30 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } +/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. 
*/ +static void jbd2_freeze_jh_data(struct journal_head *jh) +{ + struct page *page; + int offset; + char *source; + struct buffer_head *bh = jh2bh(jh); + + J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); + page = bh->b_page; + offset = offset_in_page(bh->b_data); + source = kmap_atomic(page); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); + memcpy(jh->b_frozen_data, source + offset, bh->b_size); + kunmap_atomic(source); + + /* + * Now that the frozen data is saved off, we need to store any matching + * triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; +} + /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -774,7 +798,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, journal_t *journal; int error; char *frozen_buffer = NULL; - int need_copy = 0; unsigned long start_lock, time_lock; if (is_handle_aborted(handle)) @@ -931,7 +954,7 @@ repeat: } jh->b_frozen_data = frozen_buffer; frozen_buffer = NULL; - need_copy = 1; + jbd2_freeze_jh_data(jh); } jh->b_next_transaction = transaction; } @@ -952,28 +975,6 @@ repeat: } done: - if (need_copy) { - struct page *page; - int offset; - char *source; - - J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), - "Possible IO failure.\n"); - page = jh2bh(jh)->b_page; - offset = offset_in_page(jh2bh(jh)->b_data); - source = kmap_atomic(page); - /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, - jh->b_triggers); - memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source); - - /* - * Now that the frozen data is saved off, we need to store - * any matching triggers. 
- */ - jh->b_frozen_triggers = jh->b_triggers; - } jbd_unlock_bh_state(bh); /* -- cgit v0.10.2 From d012aa5965160a39b24a4b41139a322f681cdfd7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 8 Jun 2015 12:40:39 -0400 Subject: jbd2: simplify error path on allocation failure in do_get_write_access() We were acquiring bh_state_lock when allocation of buffer failed in do_get_write_access() only to be able to jump to a label that releases the lock and does all other checks that don't make sense for this error path. Just jump into the right label instead. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2bcb43d..3b2e617 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -947,8 +947,7 @@ repeat: __func__); JBUFFER_TRACE(jh, "oom!"); error = -ENOMEM; - jbd_lock_bh_state(bh); - goto done; + goto out; } goto repeat; } -- cgit v0.10.2 From 8b00f400eedf91d074f831077003c0d4d9147377 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 8 Jun 2015 12:44:21 -0400 Subject: jbd2: more simplifications in do_get_write_access() Check for the simple case of unjournaled buffer first, handle it and bail out. This allows us to remove one if and unindent the difficult case by one tab. The result is easier to read. 
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3b2e617..1bbcf86 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -884,6 +884,20 @@ repeat: jh->b_modified = 0; /* + * If the buffer is not journaled right now, we need to make sure it + * doesn't get written to disk before the caller actually commits the + * new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + goto done; + } + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ @@ -894,84 +908,58 @@ repeat: goto done; } - /* Is there data here we need to preserve? */ + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); - if (jh->b_transaction && jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); + /* + * There is one case we have to be very careful about. If the + * committing transaction is currently writing this buffer out to disk + * and has NOT made a copy-out, then we cannot modify the buffer + * contents at all right now. The essence of copy-out is that it is + * the extra copy, not the primary copy, which gets journaled. If the + * primary copy is already going to disk then we cannot do copy-out + * here. + */ + if (buffer_shadow(bh)) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* There is one case we have to be very careful about. 
- * If the committing transaction is currently writing - * this buffer out to disk and has NOT made a copy-out, - * then we cannot modify the buffer contents at all - * right now. The essence of copy-out is that it is the - * extra copy, not the primary copy, which gets - * journaled. If the primary copy is already going to - * disk then we cannot do copy-out here. */ - - if (buffer_shadow(bh)) { - JBUFFER_TRACE(jh, "on shadow: sleep"); + /* + * Only do the copy if the currently-owning transaction still needs it. + * If buffer isn't on BJ_Metadata list, the committing transaction is + * past that stage (here we use the fact that BH_Shadow is set under + * bh_state lock together with refiling to BJ_Shadow list and at this + * point we know the buffer doesn't have BH_Shadow set). + * + * Subtle point, though: if this is a get_undo_access, then we will be + * relying on the frozen_data to contain the new value of the + * committed_data record after the transaction, so we HAVE to force the + * frozen_data copy in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - wait_on_bit_io(&bh->b_state, BH_Shadow, - TASK_UNINTERRUPTIBLE); - goto repeat; - } - - /* - * Only do the copy if the currently-owning transaction still - * needs it. If buffer isn't on BJ_Metadata list, the - * committing transaction is past that stage (here we use the - * fact that BH_Shadow is set under bh_state lock together with - * refiling to BJ_Shadow list and at this point we know the - * buffer doesn't have BH_Shadow set). - * - * Subtle point, though: if this is a get_undo_access, - * then we will be relying on the frozen_data to contain - * the new value of the committed_data record after the - * transaction, so we HAVE to force the frozen_data copy - * in that case. 
- */ - if (jh->b_jlist == BJ_Metadata || force_copy) { - JBUFFER_TRACE(jh, "generate frozen data"); + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); - frozen_buffer = - jbd2_alloc(jh2bh(jh)->b_size, - GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR - "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - goto out; - } - goto repeat; + printk(KERN_ERR "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + goto out; } - jh->b_frozen_data = frozen_buffer; - frozen_buffer = NULL; - jbd2_freeze_jh_data(jh); + goto repeat; } - jh->b_next_transaction = transaction; - } - - - /* - * Finally, if the buffer is not journaled right now, we need to make - * sure it doesn't get written to disk before the caller actually - * commits the new data - */ - if (!jh->b_transaction) { - JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - spin_lock(&journal->j_list_lock); - __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); - spin_unlock(&journal->j_list_lock); + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + jbd2_freeze_jh_data(jh); } + jh->b_next_transaction = transaction; done: jbd_unlock_bh_state(bh); -- cgit v0.10.2 From de92c8caf16ca84926fa31b7a5590c0fb9c0d5ca Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 8 Jun 2015 12:46:37 -0400 Subject: jbd2: speedup jbd2_journal_get_[write|undo]_access() jbd2_journal_get_write_access() and jbd2_journal_get_create_access() are frequently called for buffers that are already part of the running transaction - most frequently it is the case for bitmaps, inode table blocks, and superblock. Since in such cases we have nothing to do, it is unfortunate we still grab reference to journal head, lock the bh, lock bh_state only to find out there's nothing to do. 
Improving this is a bit subtle though since until we find out journal head is attached to the running transaction, it can disappear from under us because checkpointing / commit decided it's no longer needed. We deal with this by protecting journal_head slab with RCU. We still have to be careful about journal head being freed & reallocated within slab and about exposing journal head in consistent state (in particular b_modified and b_frozen_data must be in correct state before we allow user to touch the buffer). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0bc333b..303ccd9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2321,7 +2321,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY, /* flags */ + SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 1bbcf86..f3d0617 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -892,6 +892,12 @@ repeat: JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); JBUFFER_TRACE(jh, "file as BJ_Reserved"); + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are + * visible before attaching it to the running transaction. 
+ * Paired with barrier in jbd2_write_access_granted() + */ + smp_wmb(); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); @@ -904,8 +910,7 @@ repeat: if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - jh->b_next_transaction = transaction; - goto done; + goto attach_next; } JBUFFER_TRACE(jh, "owned by older transaction"); @@ -959,6 +964,13 @@ repeat: frozen_buffer = NULL; jbd2_freeze_jh_data(jh); } +attach_next: + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are visible + * before attaching it to the running transaction. Paired with barrier + * in jbd2_write_access_granted() + */ + smp_wmb(); jh->b_next_transaction = transaction; done: @@ -978,6 +990,55 @@ out: return error; } +/* Fast check whether buffer is already attached to the required transaction */ +static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh; + bool ret = false; + + /* Dirty buffers require special handling... */ + if (buffer_dirty(bh)) + return false; + + /* + * RCU protects us from dereferencing freed pages. So the checks we do + * are guaranteed not to oops. However the jh slab object can get freed + * & reallocated while we work with it. So we have to be careful. When + * we see jh attached to the running transaction, we know it must stay + * so until the transaction is committed. Thus jh won't be freed and + * will be attached to the same bh while we run. However it can + * happen jh gets freed, reallocated, and attached to the transaction + * just after we get pointer to it from bh. So we have to be careful + * and recheck jh still belongs to our bh before we return success. 
+ */ + rcu_read_lock(); + if (!buffer_jbd(bh)) + goto out; + /* This should be bh2jh() but that doesn't work with inline functions */ + jh = READ_ONCE(bh->b_private); + if (!jh) + goto out; + if (jh->b_transaction != handle->h_transaction && + jh->b_next_transaction != handle->h_transaction) + goto out; + /* + * There are two reasons for the barrier here: + * 1) Make sure to fetch b_bh after we did previous checks so that we + * detect when jh went through free, realloc, attach to transaction + * while we were checking. Paired with implicit barrier in that path. + * 2) So that access to bh done after jbd2_write_access_granted() + * doesn't get reordered and see inconsistent state of concurrent + * do_get_write_access(). + */ + smp_mb(); + if (unlikely(jh->b_bh != bh)) + goto out; + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /** * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to @@ -991,9 +1052,13 @@ out: int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; int rc; + if (jbd2_write_access_granted(handle, bh)) + return 0; + + jh = jbd2_journal_add_journal_head(bh); /* We do not want to get caught playing with fields which the * log thread also manipulates. Make sure that the buffer * completes any outstanding IO before proceeding. 
*/ @@ -1123,11 +1188,14 @@ out: int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) { int err; - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; char *committed_data = NULL; JBUFFER_TRACE(jh, "entry"); + if (jbd2_write_access_granted(handle, bh)) + return 0; + jh = jbd2_journal_add_journal_head(bh); /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done -- cgit v0.10.2 From 331573febb6a224bc50322e3670da326cb7f4cfc Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 9 Jun 2015 01:55:03 -0400 Subject: ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate This patch implements fallocate's FALLOC_FL_INSERT_RANGE for Ext4. 1) Make sure that both offset and len are block size aligned. 2) Update the i_size of inode by len bytes. 3) Compute the file's logical block number against offset. If the computed block number is not the starting block of the extent, split the extent such that the block number is the starting block of the extent. 4) Shift all the extents which are lying between [offset, last allocated extent] towards right by len bytes. This step will make a hole of len bytes at offset. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 730c88d..3ab7cd8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -90,6 +90,11 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + /* * Flags used in mballoc's allocation_context flags field. 
* @@ -2947,6 +2952,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f38a6d6..08f5afc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4912,12 +4912,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * bug we should fix.... */ if (ext4_encrypted_inode(inode) && - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))) + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4930,6 +4932,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_INSERT_RANGE) + return ext4_insert_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); @@ -5224,13 +5229,13 @@ ext4_access_path(handle_t *handle, struct inode *inode, /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext - * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift - * from starting block for each extent. + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. 
@SHIFT tells + * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, - ext4_lblk_t *start) + enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; @@ -5252,19 +5257,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = le32_to_cpu(ex_last->ee_block) + - ext4_ext_get_actual_len(ex_last); - while (ex_start <= ex_last) { - le32_add_cpu(&ex_start->ee_block, -shift); - /* Try to merge to the left. */ - if ((ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) && - ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) + if (SHIFT == SHIFT_LEFT) { + le32_add_cpu(&ex_start->ee_block, + -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) + && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } else { + le32_add_cpu(&ex_last->ee_block, shift); + ext4_ext_try_to_merge_right(inode, path, + ex_last); ex_last--; - else - ex_start++; + } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -5279,7 +5290,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + if (SHIFT == SHIFT_LEFT) + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + else + le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5297,19 +5311,20 @@ out: /* * ext4_ext_shift_extents: - * All the extents which lies in the range from start to the last allocated - * block for the file are shifted downwards by shift blocks. 
+ * All the extents which lies in the range from @start to the last allocated + * block for the @inode are shifted either towards left or right (depending + * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, - ext4_lblk_t start, ext4_lblk_t shift) + ext4_lblk_t start, ext4_lblk_t shift, + enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; - ext4_lblk_t stop_block; - ext4_lblk_t ex_start, ex_end; + ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); @@ -5321,58 +5336,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop_block = le32_to_cpu(extent->ee_block) + + stop = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); - /* Nothing to shift, if hole is at the end of file */ - if (start >= stop_block) - goto out; + /* + * In case of left shift, Don't start shifting extents until we make + * sure the hole is big enough to accommodate the shift. + */ + if (SHIFT == SHIFT_LEFT) { + path = ext4_find_extent(inode, start - 1, &path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } - /* - * Don't start shifting extents until we make sure the hole is big - * enough to accomodate the shift. 
- */ - path = ext4_find_extent(inode, start - 1, &path, 0); - if (IS_ERR(path)) - return PTR_ERR(path); - depth = path->p_depth; - extent = path[depth].p_ext; - if (extent) { - ex_start = le32_to_cpu(extent->ee_block); - ex_end = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); - } else { - ex_start = 0; - ex_end = 0; + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) { + ext4_ext_drop_refs(path); + kfree(path); + return -EINVAL; + } } - if ((start == ex_start && shift > ex_start) || - (shift > start - ex_end)) - return -EINVAL; + /* + * In case of left shift, iterator points to start and it is increased + * till we reach stop. In case of right shift, iterator points to stop + * and it is decreased till we reach start. + */ + if (SHIFT == SHIFT_LEFT) + iterator = &start; + else + iterator = &stop; /* Its safe to start updating extents */ - while (start < stop_block) { - path = ext4_find_extent(inode, start, &path, 0); + while (start < stop) { + path = ext4_find_extent(inode, *iterator, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) start); + (unsigned long) *iterator); return -EIO; } - if (start > le32_to_cpu(extent->ee_block)) { + if (SHIFT == SHIFT_LEFT && *iterator > + le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { - start = ext4_ext_next_allocated_block(path); + *iterator = ext4_ext_next_allocated_block(path); continue; } } + + if (SHIFT == SHIFT_LEFT) { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + extent = EXT_FIRST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) > 0 ? 
+ le32_to_cpu(extent->ee_block) - 1 : 0; + /* Update path extent in case we need to stop */ + while (le32_to_cpu(extent->ee_block) < start) + extent++; + path[depth].p_ext = extent; + } ret = ext4_ext_shift_path_extents(path, shift, inode, - handle, &start); + handle, SHIFT); if (ret) break; } @@ -5485,7 +5526,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start); + punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; @@ -5510,6 +5551,174 @@ out_mutex: return ret; } +/* + * ext4_insert_range: + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. + * The data blocks starting from @offset to the EOF are shifted by @len + * towards right to create a hole in the @inode. Inode size is increased + * by len bytes. + * Returns 0 on success, error otherwise. + */ +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + struct ext4_ext_path *path; + struct ext4_extent *extent; + ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + unsigned int credits, ee_len; + int ret = 0, depth, split_flag = 0; + loff_t ioffset; + + /* + * We need to test this early because xfstests assumes that an + * insert range of (0, 1) will return EOPNOTSUPP if the file + * system does not support insert range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + + /* Insert range works only on fs block size aligned offsets. 
*/ + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_insert_range(inode, offset, len); + + offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); + len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + /* Check for wrap through zero */ + if (inode->i_size + len > inode->i_sb->s_maxbytes) { + ret = -EFBIG; + goto out_mutex; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + /* Expand file to avoid data loss if there is error while shifting */ + inode->i_size += len; + EXT4_I(inode)->i_disksize += len; + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + path = ext4_find_extent(inode, offset_lblk, NULL, 0); + if 
(IS_ERR(path)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + depth = ext_depth(inode); + extent = path[depth].p_ext; + if (extent) { + ee_start_lblk = le32_to_cpu(extent->ee_block); + ee_len = ext4_ext_get_actual_len(extent); + + /* + * If offset_lblk is not the starting block of extent, split + * the extent @offset_lblk + */ + if ((offset_lblk > ee_start_lblk) && + (offset_lblk < (ee_start_lblk + ee_len))) { + if (ext4_ext_is_unwritten(extent)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + ret = ext4_split_extent_at(handle, inode, &path, + offset_lblk, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } + + ext4_ext_drop_refs(path); + kfree(path); + if (ret < 0) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + } + + ret = ext4_es_remove_extent(inode, offset_lblk, + EXT_MAX_BLOCKS - offset_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + /* + * if offset_lblk lies in a hole which is at start of file, use + * ee_start_lblk to shift extents + */ + ret = ext4_ext_shift_extents(inode, handle, + ee_start_lblk > offset_lblk ? 
ee_start_lblk : offset_lblk, + len_lblk, SHIFT_RIGHT); + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + /** * ext4_swap_extents - Swap extents between two inodes * diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 08ec3dd..0faf570 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2478,6 +2478,31 @@ TRACE_EVENT(ext4_collapse_range, __entry->offset, __entry->len) ); +TRACE_EVENT(ext4_insert_range, + TP_PROTO(struct inode *inode, loff_t offset, loff_t len), + + TP_ARGS(inode, offset, len), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, offset) + __field(loff_t, len) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu offset %lld len %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->offset, __entry->len) +); + TRACE_EVENT(ext4_es_shrink, TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, int nr_skipped, int retried), -- cgit v0.10.2 From 1cb767cd4a79703105f4f3774c76896d621fdc54 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 12 Jun 2015 23:44:33 -0400 Subject: ext4 crypto: fail the mount if blocksize != pagesize We currently don't correctly handle the case where blocksize != pagesize, so disallow the mount in those cases. 
Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 31e85be..e13fe40 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4065,7 +4065,15 @@ no_journal: } } - if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) && + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) && + (blocksize != PAGE_CACHE_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Unsupported blocksize for fs encryption"); + goto failed_mount_wq; + } + + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); -- cgit v0.10.2 From bdf96838aea6a265f2ae6cbcfb12a778c84a0b8e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 12 Jun 2015 23:45:33 -0400 Subject: ext4: fix race between truncate and __ext4_journalled_writepage() The commit cf108bca465d: "ext4: Invert the locking order of page_lock and transaction start" caused __ext4_journalled_writepage() to drop the page lock before the page was written back, as part of changing the locking order to jbd2_journal_start -> page_lock. However, this introduced a potential race if there was a truncate racing with the data=journalled writeback mode. Fix this by grabbing the page lock after starting the journal handle, and then checking to see if page had gotten truncated out from under us. This fixes a number of different warnings or BUG_ON's when running xfstests generic/086 in data=journalled mode, including: jbd2_journal_dirty_metadata: vdc-8: bad jh for block 115643: transaction (ee3fe7 c0, 164), jh->b_transaction ( (null), 0), jh->b_next_transaction ( (null), 0), jlist 0 - and - kernel BUG at /usr/projects/linux/ext4/fs/jbd2/transaction.c:2200! ... Call Trace: [] ? __ext4_journalled_invalidatepage+0x117/0x117 [] __ext4_journalled_invalidatepage+0x10f/0x117 [] ? __ext4_journalled_invalidatepage+0x117/0x117 [] ? 
lock_buffer+0x36/0x36 [] ext4_journalled_invalidatepage+0xd/0x22 [] do_invalidatepage+0x22/0x26 [] truncate_inode_page+0x5b/0x85 [] truncate_inode_pages_range+0x156/0x38c [] truncate_inode_pages+0x11/0x15 [] truncate_pagecache+0x55/0x71 [] ext4_setattr+0x4a9/0x560 [] ? current_kernel_time+0x10/0x44 [] notify_change+0x1c7/0x2be [] do_truncate+0x65/0x85 [] ? file_ra_state_init+0x12/0x29 - and - WARNING: CPU: 1 PID: 1331 at /usr/projects/linux/ext4/fs/jbd2/transaction.c:1396 irty_metadata+0x14a/0x1ae() ... Call Trace: [] ? console_unlock+0x3a1/0x3ce [] dump_stack+0x48/0x60 [] warn_slowpath_common+0x89/0xa0 [] ? jbd2_journal_dirty_metadata+0x14a/0x1ae [] warn_slowpath_null+0x14/0x18 [] jbd2_journal_dirty_metadata+0x14a/0x1ae [] __ext4_handle_dirty_metadata+0xd4/0x19d [] write_end_fn+0x40/0x53 [] ext4_walk_page_buffers+0x4e/0x6a [] ext4_writepage+0x354/0x3b8 [] ? mpage_release_unused_pages+0xd4/0xd4 [] ? wait_on_buffer+0x2c/0x2c [] ? ext4_writepage+0x3b8/0x3b8 [] __writepage+0x10/0x2e [] write_cache_pages+0x22d/0x32c [] ? ext4_writepage+0x3b8/0x3b8 [] ext4_writepages+0x102/0x607 [] ? sched_clock_local+0x10/0x10e [] ? __lock_is_held+0x2e/0x44 [] ? lock_is_held+0x43/0x51 [] do_writepages+0x1c/0x29 [] __writeback_single_inode+0xc3/0x545 [] writeback_sb_inodes+0x21f/0x36d ... Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0554b0b..263a46c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1701,19 +1701,32 @@ static int __ext4_journalled_writepage(struct page *page, ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); } - /* As soon as we unlock the page, it can go away, but we have - * references to buffers so we are safe */ + /* + * We need to release the page lock before we start the + * journal, so grab a reference so the page won't disappear + * out from under us. 
+ */ + get_page(page); unlock_page(page); handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out; + put_page(page); + goto out_no_pagelock; } - BUG_ON(!ext4_handle_valid(handle)); + lock_page(page); + put_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + ext4_journal_stop(handle); + ret = 0; + goto out; + } + if (inline_data) { BUFFER_TRACE(inode_bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode_bh); @@ -1739,6 +1752,8 @@ static int __ext4_journalled_writepage(struct page *page, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: + unlock_page(page); +out_no_pagelock: brelse(inode_bh); return ret; } -- cgit v0.10.2 From 4b7e2db5c08ca3d9f3b75b2a46f53c358ea55540 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 12 Jun 2015 23:46:33 -0400 Subject: ext4: use swap() in memswap() Use kernel.h macro definition. Thanks to Julia Lawall for Coccinelle scripting support. Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7ce8582..cb84512 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -31,14 +31,11 @@ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; - unsigned char tmp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { - tmp = *ap; - *ap = *bp; - *bp = tmp; + swap(*ap, *bp); ap++; bp++; } -- cgit v0.10.2 From bf86546760502b24e16fad75e3affde61efb5e2c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 12 Jun 2015 23:47:33 -0400 Subject: ext4: use swap() in mext_page_double_lock() Use kernel.h macro definition. Thanks to Julia Lawall for Coccinelle scripting support. 
Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 370420b..8c04afb 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -166,12 +166,9 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, */ wait_on_page_writeback(page[0]); wait_on_page_writeback(page[1]); - if (inode1 > inode2) { - struct page *tmp; - tmp = page[0]; - page[0] = page[1]; - page[1] = tmp; - } + if (inode1 > inode2) + swap(page[0], page[1]); + return 0; } -- cgit v0.10.2 From b4f1afcd068f6e533230dfed00782cd8a907f96b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 15 Jun 2015 00:18:02 -0400 Subject: jbd2: use GFP_NOFS in jbd2_cleanup_journal_tail() jbd2_cleanup_journal_tail() can be invoked by jbd2__journal_start() So allocations should be done with GFP_NOFS [Full stack trace snipped from 3.10-rh7] [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x61/0x80 [] warn_slowpath_null+0x1a/0x20 [] slab_pre_alloc_hook.isra.31.part.32+0x15/0x17 [] kmem_cache_alloc+0x55/0x210 [] ? mempool_alloc_slab+0x15/0x20 [] mempool_alloc_slab+0x15/0x20 [] mempool_alloc+0x69/0x170 [] ? _raw_spin_unlock_irq+0xe/0x20 [] ? finish_task_switch+0x5d/0x150 [] bio_alloc_bioset+0x1be/0x2e0 [] blkdev_issue_flush+0x99/0x120 [] jbd2_cleanup_journal_tail+0x93/0xa0 [jbd2] -->GFP_KERNEL [] jbd2_log_do_checkpoint+0x221/0x4a0 [jbd2] [] __jbd2_log_wait_for_space+0xa7/0x1e0 [jbd2] [] start_this_handle+0x2d8/0x550 [jbd2] [] ? __memcg_kmem_put_cache+0x29/0x30 [] ? kmem_cache_alloc+0x130/0x210 [] jbd2__journal_start+0xba/0x190 [jbd2] [] ? lru_cache_add+0xe/0x10 [] ? ext4_da_write_begin+0xf9/0x330 [ext4] [] __ext4_journal_start_sb+0x77/0x160 [ext4] [] ext4_da_write_begin+0xf9/0x330 [ext4] [] generic_file_buffered_write_iter+0x10c/0x270 [] __generic_file_write_iter+0x178/0x390 [] __generic_file_aio_write+0x8b/0xb0 [] generic_file_aio_write+0x5d/0xc0 [] ext4_file_write+0xa9/0x450 [ext4] [] ? 
pipe_read+0x379/0x4f0 [] do_sync_write+0x90/0xe0 [] vfs_write+0xbd/0x1e0 [] SyS_write+0x58/0xb0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 988b32e..6b7b73a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -405,7 +405,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); __jbd2_update_log_tail(journal, first_tid, blocknr); return 0; -- cgit v0.10.2 From 4134f5c88dcd5b00e4a5f37c3842b2b831a61ee1 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 15 Jun 2015 00:20:46 -0400 Subject: ext4: recalculate journal credits as inode depth changes Currently in ext4_alloc_file_blocks() the number of credits is calculated only once before we enter the allocation loop. However within the allocation loop the extent tree depth can change, hence the number of credits needed can increase potentially exceeding the number of credits reserved in the handle which can cause journal failures. Fix this by recalculating the number of credits when the inode depth changes. Note that even though ext4_alloc_file_blocks() is only currently used by extent based inodes we will avoid recalculating the number of credits unnecessarily in the case of indirect based inodes.
Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 08f5afc..fc76bda 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4663,6 +4663,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int ret = 0; int ret2 = 0; int retries = 0; + int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; @@ -4681,9 +4682,24 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); + /* + * We can only call ext_depth() on extent based inodes + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + depth = ext_depth(inode); + else + depth = -1; retry: while (ret >= 0 && len) { + /* + * Recalculate credits when extent tree depth changes. + */ + if (depth >= 0 && depth != ext_depth(inode)) { + credits = ext4_chunk_trans_blocks(inode, len); + depth = ext_depth(inode); + } + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { -- cgit v0.10.2 From 0d306dcf86e8f065dff42a4a934ae9d99af35ba5 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 15 Jun 2015 00:23:53 -0400 Subject: ext4: wait for existing dio workers in ext4_alloc_file_blocks() Currently existing dio workers can jump in and potentially increase extent tree depth while we're allocating blocks in ext4_alloc_file_blocks(). This may cause us to underestimate the number of credits needed for the transaction because the extent tree depth can change after our estimation. Fix this by waiting for all the existing dio workers in the same way as we do it in ext4_punch_hole. We've seen errors caused by this in xfstest generic/299, however it's really hard to reproduce. 
Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fc76bda..1ba8b4a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4678,6 +4678,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* * credits to insert 1 extent into extent tree */ @@ -4741,6 +4745,8 @@ retry: goto retry; } + ext4_inode_resume_unlocked_dio(inode); + return ret > 0 ? ret2 : ret; } -- cgit v0.10.2 From 97b4af2f7672476eedeb5cf5000b927ed4b516a4 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 15 Jun 2015 00:32:58 -0400 Subject: ext4: mballoc: avoid 20-argument function call Making a function call with 20 arguments is rather expensive in both stack and .text. In this case, doing the formatting manually doesn't make it any less readable, so we might as well save 155 bytes of .text and 112 bytes of stack. 
Signed-off-by: Rasmus Villemoes diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 78e58f7..1c535fa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2275,12 +2275,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) group--; if (group == 0) - seq_printf(seq, "#%-5s: %-5s %-5s %-5s " - "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " - "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", - "group", "free", "frags", "first", - "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", - "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + seq_puts(seq, "#group: free frags first [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); -- cgit v0.10.2 From 6f6a6fda294506dfe0e3e0a253bb2d2923f28f0a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 15 Jun 2015 14:36:01 -0400 Subject: jbd2: fix ocfs2 corrupt when updating journal superblock fails If updating the journal superblock fails after the journal data has been flushed, the error is ignored and the caller is misled into treating this as the normal case. In ocfs2, the checkpoint will then be treated as successful and the other node can get the lock to update. Since sb_start is still pointing to the old log block, the journal data will be rewritten during journal recovery by the other node. Thus the new updates will be overwritten and ocfs2 is corrupted. So in the above case we have to return the error; ocfs2_commit_cache will then take care of the error and prevent the other node from updating first. Only after recovering the journal can it do the new updates. The issue discussion mail can be found at: https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html http://comments.gmane.org/gmane.comp.file-systems.ext4/48841 [ Fixed bug in patch which allowed a non-negative error return from jbd2_cleanup_journal_tail() to leak out of jbd2_journal_flush(); this was causing xfstests ext4/306 to fail.
-- Ted ] Reported-by: Yiwen Jiang Signed-off-by: Joseph Qi Signed-off-by: Theodore Ts'o Tested-by: Yiwen Jiang Cc: Junxiao Bi Cc: stable@vger.kernel.org diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6b7b73a..4227dc4 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) unsigned long blocknr; if (is_journal_aborted(journal)) - return 1; + return -EIO; if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; @@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); - __jbd2_update_log_tail(journal, first_tid, blocknr); - return 0; + return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 303ccd9..5804466 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -876,9 +876,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, * * Requires j_checkpoint_mutex */ -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { unsigned long freed; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); @@ -888,7 +889,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. 
*/ - jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + if (ret) + goto out; + write_lock(&journal->j_state_lock); freed = block - journal->j_tail; if (block < journal->j_tail) @@ -904,6 +908,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) journal->j_tail_sequence = tid; journal->j_tail = block; write_unlock(&journal->j_state_lock); + +out: + return ret; } /* @@ -1322,7 +1329,7 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -static void jbd2_write_superblock(journal_t *journal, int write_op) +static int jbd2_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; @@ -1361,7 +1368,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) printk(KERN_ERR "JBD2: Error %d detected when updating " "journal superblock for %s.\n", ret, journal->j_devname); + jbd2_journal_abort(journal, ret); } + + return ret; } /** @@ -1374,10 +1384,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. 
*/ -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, unsigned long tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1386,13 +1397,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); - jbd2_write_superblock(journal, write_op); + ret = jbd2_write_superblock(journal, write_op); + if (ret) + goto out; /* Log is no longer empty */ write_lock(&journal->j_state_lock); WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); + +out: + return ret; } /** @@ -1941,7 +1957,14 @@ int jbd2_journal_flush(journal_t *journal) return -EIO; mutex_lock(&journal->j_checkpoint_mutex); - jbd2_cleanup_journal_tail(journal); + if (!err) { + err = jbd2_cleanup_journal_tail(journal); + if (err < 0) { + mutex_unlock(&journal->j_checkpoint_mutex); + goto out; + } + err = 0; + } /* Finally, mark the journal as really needing no recovery. 
* This sets s_start==0 in the underlying superblock, which is @@ -1957,7 +1980,8 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); write_unlock(&journal->j_state_lock); - return 0; +out: + return err; } /** diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 20e7f78..edb640a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ @@ -1157,7 +1157,7 @@ extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); -extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, +extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); extern void __jbd2_journal_abort_hard (journal_t *); extern void jbd2_journal_abort (journal_t *, int); -- cgit v0.10.2 From b03a2f7eb21cc06b541142684abf7eed6aaccf3e Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Mon, 15 Jun 2015 14:50:26 -0400 Subject: ext4: improve warning directory handling messages Several ext4_warning() messages in the directory handling code do not report the inode number of the (potentially corrupt) directory where a problem is seen, and others report this in an ad-hoc manner. Add an ext4_warning_inode() helper to print the inode number and command name consistent with ext4_error_inode(). 
Consolidate the place in ext4.h that these macros are defined. Clean up some other directory error and warning messages to print the calling function name. Minor code style fixes in nearby lines. Signed-off-by: Andreas Dilger Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3ab7cd8..02a5762 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -69,15 +69,6 @@ #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -#define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) - -#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) - -#define EXT4_ERROR_FILE(file, block, fmt, a...) \ - ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) - /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -2405,6 +2396,9 @@ void __ext4_abort(struct super_block *, const char *, unsigned int, extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, @@ -2415,6 +2409,15 @@ void __ext4_grp_locked_error(const char *, unsigned int, unsigned long, ext4_fsblk_t, const char *, ...); +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) 
\ @@ -2427,6 +2430,8 @@ void __ext4_grp_locked_error(const char *, unsigned int, __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ @@ -2462,6 +2467,11 @@ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bda4a5d..5e7676f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -84,12 +84,13 @@ typedef enum { } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ - __ext4_read_dirblock((inode), (block), (type), __LINE__) + __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, - ext4_lblk_t block, - dirblock_type_t type, - unsigned int line) + ext4_lblk_t block, + dirblock_type_t type, + const char *func, + unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; @@ -97,15 +98,17 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { - __ext4_warning(inode->i_sb, __func__, line, - "error %ld reading directory block " - "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, - (unsigned long) block); + __ext4_warning(inode->i_sb, func, line, + "inode #%lu: lblock %lu: comm %s: " + "error %ld reading directory block", + inode->i_ino, (unsigned long)block, + current->comm, PTR_ERR(bh)); return bh; } if (!bh) { - ext4_error_inode(inode, __func__, 
line, block, "Directory hole found"); + ext4_error_inode(inode, func, line, block, + "Directory hole found"); return ERR_PTR(-EIO); } dirent = (struct ext4_dir_entry *) bh->b_data; @@ -119,7 +122,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, is_dx_block = 1; } if (!is_dx_block && type == INDEX) { - ext4_error_inode(inode, __func__, line, block, + ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); return ERR_PTR(-EIO); } @@ -136,8 +139,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dx_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory index failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -146,8 +149,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dirent_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory block failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -327,10 +330,14 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } -static void warn_no_space_for_csum(struct inode *inode) +#define warn_no_space_for_csum(inode) \ + __warn_no_space_for_csum((inode), __func__, __LINE__) + +static void __warn_no_space_for_csum(struct inode *inode, const char *func, + unsigned int line) { - ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " - "checksum. Please run e2fsck -D.", inode->i_ino); + __ext4_warning_inode(inode, func, line, + "No space for directory leaf checksum. 
Please run e2fsck -D."); } int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) @@ -738,8 +745,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", - root->info.hash_version); + ext4_warning_inode(dir, "Unrecognised inode hash code %u", + root->info.hash_version); goto fail; } if (fname) @@ -753,23 +760,26 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", - root->info.unused_flags); + ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", + root->info.unused_flags); goto fail; } - if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); + indirect = root->info.indirect_levels; + if (indirect > 1) { + ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", + root->info.indirect_levels); goto fail; } - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); + entries = (struct dx_entry *)(((char *)&root->info) + + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), + dx_root_limit(dir, root->info.info_length)); goto fail; } @@ -777,15 +787,16 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, - "dx entry: no count or count > limit"); + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = 
entries + count - 1; while (p <= q) { - m = p + (q - p)/2; + m = p + (q - p) / 2; dxtrace(printk(".")); if (dx_get_hash(m) > hash) q = m - 1; @@ -809,7 +820,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at), + dx_get_block(at))); frame->entries = entries; frame->at = at; if (!indirect--) @@ -823,9 +835,10 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } entries = ((struct dx_node *) frame->bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, - "dx entry: limit != node limit"); + if (dx_get_limit(entries) != dx_node_limit(dir)) { + ext4_warning_inode(dir, + "dx entry: limit %u != node limit %u", + dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } @@ -836,18 +849,17 @@ fail: } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) - ext4_warning(dir->i_sb, - "Corrupt dir inode %lu, running e2fsck is " - "recommended.", dir->i_ino); + ext4_warning_inode(dir, + "Corrupt directory, running e2fsck is recommended"); return ret_err; } -static void dx_release (struct dx_frame *frames) +static void dx_release(struct dx_frame *frames) { if (frames[0].bh == NULL) return; - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) brelse(frames[1].bh); brelse(frames[0].bh); } @@ -1524,9 +1536,9 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, - "error %d reading index page in directory #%lu", - retval, dir->i_ino); + ext4_warning_inode(dir, + "error %d reading directory index block", + retval); bh = ERR_PTR(retval); goto errout; } @@ -2187,7 +2199,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, if (levels 
&& (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, "Directory index full!"); + ext4_warning_inode(dir, "Directory index full!"); err = -ENOSPC; goto cleanup; } @@ -2678,12 +2690,9 @@ int ext4_empty_dir(struct inode *inode) de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || - strcmp(".", de->name) || - strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); + le32_to_cpu(de1->inode) == 0 || + strcmp(".", de->name) || strcmp("..", de1->name)) { + ext4_warning_inode(inode, "directory missing '.' and/or '..'"); brelse(bh); return 1; } @@ -2936,8 +2945,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, - "empty directory has too many links (%d)", + ext4_warning_inode(inode, + "empty directory '%.*s' has too many links (%u)", + dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode->i_version++; clear_nlink(inode); @@ -2997,10 +3007,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (!inode->i_nlink) { - ext4_warning(inode->i_sb, - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + if (inode->i_nlink == 0) { + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); @@ -3385,9 +3394,9 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, } if (retval) { - ext4_warning(ent->dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - ent->dir->i_ino, ent->dir->i_nlink, retval); + ext4_warning_inode(ent->dir, + "Deleting old file: nlink %d, error=%d", + ent->dir->i_nlink, retval); } } diff --git a/fs/ext4/super.c 
b/fs/ext4/super.c index e13fe40..fdac076 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -591,14 +591,17 @@ void __ext4_msg(struct super_block *sb, va_end(args); } +#define ext4_warning_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ + "EXT4-fs warning") + void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; - if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), - "EXT4-fs warning")) + if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); @@ -609,6 +612,24 @@ void __ext4_warning(struct super_block *sb, const char *function, va_end(args); } +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (!ext4_warning_ratelimit(inode->i_sb)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, + function, line, inode->i_ino, current->comm, &vaf); + va_end(args); +} + void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, -- cgit v0.10.2 From 7b506b1035326543b7cd2d768449ccbd1ef3f368 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 15 Jun 2015 15:45:58 -0400 Subject: jbd2: get rid of open coded allocation retry loop insert_revoke_hash does an open coded endless allocation loop if journal_oom_retry is true. It doesn't implement any allocation fallback strategy between the retries, though. The memory allocator doesn't know about the never fail requirement so it cannot potentially help to move on with the allocation (e.g. use memory reserves). Get rid of the retry loop and use __GFP_NOFAIL instead. We will lose the debugging message but I am not sure it is anyhow helpful. 
Do the same for journal_alloc_journal_head which is doing a similar thing. Signed-off-by: Michal Hocko Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5804466..179d7d8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2377,10 +2377,8 @@ static struct journal_head *journal_alloc_journal_head(void) if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); - while (!ret) { - yield(); - ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); - } + ret = kmem_cache_zalloc(jbd2_journal_head_cache, + GFP_NOFS | __GFP_NOFAIL); } return ret; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 14214da..0abf2e7 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, { struct list_head *hash_list; struct jbd2_revoke_record_s *record; + gfp_t gfp_mask = GFP_NOFS; -repeat: - record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if (journal_oom_retry) + gfp_mask |= __GFP_NOFAIL; + record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) - goto oom; + return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; @@ -154,13 +156,6 @@ repeat: list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; - -oom: - if (!journal_oom_retry) - return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); - yield(); - goto repeat; } /* Find a revoke record in the journal's hash table. */ -- cgit v0.10.2 From 2143c1965a761332ae417b22fd477b636e4f54ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 20 Jun 2015 21:44:17 -0400 Subject: jbd2: speedup jbd2_journal_dirty_metadata() It is often the case that we mark buffer as having dirty metadata when the buffer is already in that state (frequent for bitmaps, inode table blocks, superblock). 
Thus it is unnecessary to contend on grabbing journal head reference and bh_state lock. Avoid that by checking whether any modification to the buffer is needed before grabbing any locks or references. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f3d0617..cbe8b3a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1280,8 +1280,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, triggers->t_abort(triggers, jh2bh(jh)); } - - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1314,12 +1312,36 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - journal = transaction->t_journal; - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) { + if (!buffer_jbd(bh)) { ret = -EUCLEAN; goto out; } + /* + * We don't grab jh reference here since the buffer must be part + * of the running transaction. + */ + jh = bh2jh(bh); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_next_transaction == transaction); + if (jh->b_modified == 1) { + /* + * If it's in our transaction it must be in BJ_Metadata list. + * The assertion is unreliable since we may see jh in + * inconsistent state unless we grab bh_state lock. But this + * is crutial to catch bugs so let's do a reliable check until + * the lockless handling is fully proven. 
+ */ + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction != transaction || + jh->b_jlist == BJ_Metadata); + jbd_unlock_bh_state(bh); + } + goto out; + } + + journal = transaction->t_journal; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); @@ -1410,7 +1432,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); - jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); return ret; -- cgit v0.10.2 From 89d96a6f8e6491f24fc8f99fd6ae66820e85c6c1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Jun 2015 22:50:33 -0400 Subject: ext4: call sync_blockdev() before invalidate_bdev() in put_super() Normally all of the buffers will have been forced out to disk before we call invalidate_bdev(), but there will be some cases, where a file system operation was aborted due to an ext4_error(), where there may still be some dirty buffers in the buffer cache for the device. So try to force them out to disk before calling invalidate_bdev(). 
This fixes a warning triggered by generic/081: WARNING: CPU: 1 PID: 3473 at /usr/projects/linux/ext4/fs/block_dev.c:56 __blkdev_put+0xb5/0x16f() Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fdac076..2858ac0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -828,6 +828,7 @@ static void ext4_put_super(struct super_block *sb) dump_orphan_list(sb, sbi); J_ASSERT(list_empty(&sbi->s_orphan)); + sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { /* -- cgit v0.10.2 From c5e298ae53dc2eb69f2f7153be03454c8a33c658 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 21 Jun 2015 01:25:29 -0400 Subject: ext4: prevent ext4_quota_write() from failing due to ENOSPC In order to prevent quota block tracking from becoming inaccurate when ext4_quota_write() fails with ENOSPC, we make two changes. The quota file can now use the reserved block (since the quota file is arguably file system metadata), and ext4_quota_write() now uses ext4_should_retry_alloc() to retry the block allocation after a commit has completed and released some blocks for allocation. 
This fixes failures of xfstests generic/270: Quota error (device vdc): write_blk: dquota write failed Quota error (device vdc): qtree_write_dquot: Error -28 occurred while creating quota Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1ba8b4a..d86d262 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4456,6 +4456,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 9588240..9962d57 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -576,6 +576,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 263a46c..e8a67b8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -731,18 +731,18 @@ int ext4_get_block(struct inode *inode, sector_t iblock, * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; + int create = map_flags & EXT4_GET_BLOCKS_CREATE; int err; J_ASSERT(handle != NULL || create == 0); map.m_lblk = block; map.m_len = 1; - err = ext4_map_blocks(handle, inode, &map, - create ? EXT4_GET_BLOCKS_CREATE : 0); + err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? 
ERR_PTR(-ENOSPC) : NULL; @@ -788,11 +788,11 @@ errout: } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct buffer_head *bh; - bh = ext4_getblk(handle, inode, block, create); + bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || buffer_uptodate(bh)) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5e7676f..e230b31 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -61,7 +61,7 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1); + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2858ac0..bd4df9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5438,6 +5438,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err, offset = off & (sb->s_blocksize - 1); + int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -5458,7 +5459,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - bh = ext4_bread(handle, inode, blk, 1); + do { + bh = ext4_bread(handle, inode, blk, + EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) -- cgit v0.10.2 From 292db1bc6c105d86111e858859456bcb11f90f91 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 21 Jun 2015 21:10:51 -0400 Subject: ext4: don't retry file block mapping on bigalloc fs with non-extent file ext4 isn't willing to map clusters to a non-extent file. 
Don't signal this with an out of space error, since the FS will retry the allocation (which didn't fail) forever. Instead, return EUCLEAN so that the operation will fail immediately all the way back to userspace. (The fix is either to run e2fsck -E bmap2extent, or to chattr +e the file.) Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 9962d57..4f6ac49 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -565,7 +565,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); - return -ENOSPC; + return -EUCLEAN; } /* Set up for the direct block allocation */ -- cgit v0.10.2 From c27e43a10c9755231f8a1c618efc1ac299dd5007 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Sun, 21 Jun 2015 21:37:05 -0400 Subject: ext4: minor cleanup of ext4_da_reserve_space() Remove outdated comments and dead code from ext4_da_reserve_space. Clean up its trace point, and relocate it to make it more useful. While we're at it, fix a nearby conditional used to determine if we have a non-bigalloc file system. It doesn't match usage elsewhere in the code, and misleadingly suggests that an s_cluster_ratio value of 0 would be legal. 
Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e8a67b8..ae93f0b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1261,13 +1261,12 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single cluster located at lblock + * Reserve space for a single cluster */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +static int ext4_da_reserve_space(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; int ret; /* @@ -1279,25 +1278,14 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) if (ret) return ret; - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ spin_lock(&ei->i_block_reservation_lock); - /* - * ext4_calc_metadata_amount() has side effects, which we have - * to be prepared undo if we fail to claim space. - */ - md_needed = 0; - trace_ext4_da_reserve_space(inode, 0); - if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; + trace_ext4_da_reserve_space(inode); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1566,9 +1554,9 @@ add_delayed: * then we don't need to reserve it again. However we still need * to reserve metadata for every block we're going to write. 
*/ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || !ext4_find_delalloc_cluster(inode, map->m_lblk)) { - ret = ext4_da_reserve_space(inode, iblock); + ret = ext4_da_reserve_space(inode); if (ret) { /* not enough space to reserve */ retval = ret; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0faf570..594b4b2 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1185,15 +1185,14 @@ TRACE_EVENT(ext4_da_update_reserve_space, ); TRACE_EVENT(ext4_da_reserve_space, - TP_PROTO(struct inode *inode, int md_needed), + TP_PROTO(struct inode *inode), - TP_ARGS(inode, md_needed), + TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) - __field( int, md_needed ) __field( int, reserved_data_blocks ) __field( int, reserved_meta_blocks ) __field( __u16, mode ) @@ -1203,18 +1202,17 @@ TRACE_EVENT(ext4_da_reserve_space, __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; - __entry->md_needed = md_needed; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu " "reserved_data_blocks %d reserved_meta_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, - __entry->md_needed, __entry->reserved_data_blocks, + __entry->reserved_data_blocks, __entry->reserved_meta_blocks) ); -- cgit v0.10.2 From 04e22412f420ade46dbf792a10e7f0d26ae55359 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Sun, 21 Jun 2015 21:38:03 -0400 Subject: ext4: make online defrag error reporting consistent Make the error reporting behavior resulting from the unsupported use of online defrag on files with data 
journaling enabled consistent with that implemented for bigalloc file systems. Difference found with ext4/308. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o Reviewed-by: Darrick J. Wong diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8c04afb..fb6f117 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -571,12 +571,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - /* TODO: This is non obvious task to swap blocks for inodes with full - jornaling enabled */ + + /* TODO: it's not obvious how to swap blocks for inodes with full + journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { - return -EINVAL; + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported with data journaling"); + return -EOPNOTSUPP; } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); -- cgit v0.10.2 From 3da40c7b089810ac9cf2bb1e59633f619f3a7312 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Jun 2015 00:31:26 -0400 Subject: ext4: only call ext4_truncate when size <= isize At LSF we decided that if we truncate up from isize we shouldn't trim fallocated blocks that were fallocated with KEEP_SIZE and are past the new i_size. This patch fixes ext4 to do this. [ Completely reworked patch so that i_disksize would actually get set when truncating up. Also reworked the code for handling truncate so that it's easier to handle. 
-- tytso ] Signed-off-by: Josef Bacik Signed-off-by: Theodore Ts'o Reviewed-by: Lukas Czerner diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ae93f0b..e057c6f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4681,8 +4681,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; + loff_t oldsize = inode->i_size; + int shrink = (attr->ia_size <= inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4690,24 +4692,26 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + if (!S_ISREG(inode->i_mode)) + return -EINVAL; if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) inode_inc_iversion(inode); - if (S_ISREG(inode->i_mode) && + if (ext4_should_order_data(inode) && (attr->ia_size < inode->i_size)) { - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) - goto err_out; - } + if (error) + goto err_out; + } + if (attr->ia_size != inode->i_size) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } @@ -4726,15 +4730,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) { - ext4_orphan_del(NULL, inode); + if (orphan) + ext4_orphan_del(NULL, inode); goto err_out; } - } else { - loff_t oldsize = inode->i_size; - - i_size_write(inode, attr->ia_size); - pagecache_isize_extended(inode, oldsize, inode->i_size); } + if (!shrink) + pagecache_isize_extended(inode, oldsize, 
inode->i_size); /* * Blocks are going to be removed from the inode. Wait @@ -4754,13 +4756,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); + if (shrink) + ext4_truncate(inode); } - /* - * We want to call ext4_truncate() even if attr->ia_size == - * inode->i_size for cases like truncation of fallocated space - */ - if (attr->ia_valid & ATTR_SIZE) - ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); -- cgit v0.10.2 From a2fd66d069d86d793e9d39d4079b96f46d13f237 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 Jun 2015 11:03:54 -0400 Subject: ext4: set lazytime on remount if MS_LAZYTIME is set by mount Newer versions of mount parse the lazytime feature and pass it via the flags field of the mount system call, removing the lazytime string from the mount options list. So we need to check for the presence of MS_LAZYTIME and set it in sb->s_flags in order for this flag to be set on a remount. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bd4df9d..90ec13f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4971,6 +4971,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } + if (*flags & MS_LAZYTIME) + sb->s_flags |= MS_LAZYTIME; + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; -- cgit v0.10.2