From f96c450dabf5497794af8c45f589d44b4549d1fc Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sun, 21 Feb 2016 18:31:41 -0500 Subject: ext4: make sure to revoke all the freeable blocks in ext4_free_blocks Now, ext4_free_blocks() doesn't revoke data blocks of per-file data journalled inode and it can cause file data inconsistency problems. Even though data blocks of per-file data journalled inode are already forgotten by jbd2_journal_invalidatepage() in advance of invoking ext4_free_blocks(), we still need to revoke the data blocks here. Moreover some of the metadata blocks, which are not found by sb_find_get_block(), are still needed to be revoked, but this is also missing here. Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4424b7b..f6ff478 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4695,16 +4695,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } /* - * We need to make sure we don't reuse the freed block until - * after the transaction is committed, which we can do by - * treating the block as metadata, below. We make an - * exception if the inode is to be written in writeback mode - * since writeback mode has weak data consistency guarantees. - */ - if (!ext4_should_writeback_data(inode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free @@ -4738,14 +4728,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; + int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); - bh = sb_find_get_block(inode->i_sb, block + i); - if (!bh) - continue; - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, - inode, bh, block + i); + if (is_metadata) + bh = sb_find_get_block(inode->i_sb, block + i); + ext4_forget(handle, is_metadata, inode, bh, block + i); } } @@ -4819,12 +4808,17 @@ do_more: if (err) goto error_return; - if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + /* + * We need to make sure we don't reuse the freed block until after the + * transaction is committed. We make an exception if the inode is to be + * written in writeback mode since writeback mode has weak data + * consistency guarantees. + */ + if (ext4_handle_valid(handle) && + ((flags & EXT4_FREE_BLOCKS_METADATA) || + !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* - * blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed - * * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ -- cgit v0.10.2 From 87f9a031af48defee9f34c6aaf06d6f1988c244d Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Sun, 21 Feb 2016 18:38:44 -0500 Subject: ext4: iterate over buffer heads correctly in move_extent_per_page() In commit bcff24887d00 ("ext4: don't read blocks from disk after extents being swapped") bh is not updated correctly in the for loop and wrong data has been written to disk. generic/324 catches this on sub-page block size ext4. Fixes: bcff24887d00 ("ext4: don't read blocks from disk after extentsbeing swapped") Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e032a04..4098acc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -390,6 +390,7 @@ data_copy: *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) break; + bh = bh->b_this_page; } if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); -- cgit v0.10.2 From f9a61eb4e2471c56a63cd804c7474128138c38ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:49:09 -0500 Subject: mbcache2: reimplement mbcache Original mbcache was designed to have more features than what ext? filesystems ended up using. It supported entry being in more hashes, it had a home-grown rwlocking of each entry, and one cache could cache entries from multiple filesystems. This genericity also resulted in more complex locking, larger cache entries, and generally more code complexity. This is reimplementation of the mbcache functionality to exactly fit the purpose ext? filesystems use it for. Cache entries are now considerably smaller (7 instead of 13 longs), the code is considerably smaller as well (414 vs 913 lines of code), and IMO also simpler. The new code is also much more lightweight. I have measured the speed using artificial xattr-bench benchmark, which spawns P processes, each process sets xattr for F different files, and the value of xattr is randomly chosen from a pool of V values. Averages of runtimes for 5 runs for various combinations of parameters are below. The first value in each cell is old mbache, the second value is the new mbcache. V=10 F\P 1 2 4 8 16 32 64 10 0.158,0.157 0.208,0.196 0.500,0.277 0.798,0.400 3.258,0.584 13.807,1.047 61.339,2.803 100 0.172,0.167 0.279,0.222 0.520,0.275 0.825,0.341 2.981,0.505 12.022,1.202 44.641,2.943 1000 0.185,0.174 0.297,0.239 0.445,0.283 0.767,0.340 2.329,0.480 6.342,1.198 16.440,3.888 V=100 F\P 1 2 4 8 16 32 64 10 0.162,0.153 0.200,0.186 0.362,0.257 0.671,0.496 1.433,0.943 3.801,1.345 7.938,2.501 100 0.153,0.160 0.221,0.199 0.404,0.264 0.945,0.379 1.556,0.485 3.761,1.156 7.901,2.484 1000 0.215,0.191 0.303,0.246 0.471,0.288 0.960,0.347 1.647,0.479 3.916,1.176 8.058,3.160 V=1000 F\P 1 2 4 8 16 32 64 10 0.151,0.129 0.210,0.163 0.326,0.245 0.685,0.521 1.284,0.859 3.087,2.251 6.451,4.801 100 0.154,0.153 0.211,0.191 0.276,0.282 0.687,0.506 1.202,0.877 3.259,1.954 8.738,2.887 1000 0.145,0.179 0.202,0.222 0.449,0.319 0.899,0.333 1.577,0.524 4.221,1.240 9.782,3.579 V=10000 F\P 1 2 4 8 16 32 64 10 0.161,0.154 0.198,0.190 0.296,0.256 0.662,0.480 1.192,0.818 2.989,2.200 6.362,4.746 100 0.176,0.174 0.236,0.203 0.326,0.255 0.696,0.511 1.183,0.855 4.205,3.444 19.510,17.760 1000 0.199,0.183 0.240,0.227 1.159,1.014 2.286,2.154 6.023,6.039 ---,10.933 ---,36.620 V=100000 F\P 1 2 4 8 16 32 64 10 0.171,0.162 0.204,0.198 0.285,0.230 0.692,0.500 1.225,0.881 2.990,2.243 6.379,4.771 100 0.151,0.171 0.220,0.210 0.295,0.255 0.720,0.518 1.226,0.844 3.423,2.831 19.234,17.544 1000 0.192,0.189 0.249,0.225 1.162,1.043 2.257,2.093 5.853,4.997 ---,10.399 ---,32.198 We see that the new code is faster in pretty much all the cases and starting from 4 processes there are significant gains with the new code resulting in upto 20-times shorter runtimes. Also for large numbers of cached entries all values for the old code could not be measured as the kernel started hitting softlockups and died before the test completed. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/Makefile b/fs/Makefile index 79f5225..15b3d6c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o -obj-$(CONFIG_FS_MBCACHE) += mbcache.o +obj-$(CONFIG_FS_MBCACHE) += mbcache.o mbcache2.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_COREDUMP) += coredump.o diff --git a/fs/mbcache2.c b/fs/mbcache2.c new file mode 100644 index 0000000..5c3e1a8 --- /dev/null +++ b/fs/mbcache2.c @@ -0,0 +1,359 @@ +#include +#include +#include +#include +#include +#include +#include + +/* + * Mbcache is a simple key-value store. Keys need not be unique, however + * key-value pairs are expected to be unique (we use this fact in + * mb2_cache_entry_delete_block()). + * + * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. + * They use hash of a block contents as a key and block number as a value. + * That's why keys need not be unique (different xattr blocks may end up having + * the same hash). However block number always uniquely identifies a cache + * entry. + * + * We provide functions for creation and removal of entries, search by key, + * and a special "delete entry with given key-value pair" operation. Fixed + * size hash table is used for fast key lookups. + */ + +struct mb2_cache { + /* Hash table of entries */ + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; + /* Protects c_lru_list, c_entry_count */ + spinlock_t c_lru_list_lock; + struct list_head c_lru_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; +}; + +static struct kmem_cache *mb2_entry_cache; + +/* + * mb2_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created + * @mask - gfp mask with which the entry should be allocated + * @key - key of the entry + * @block - block that contains data + * + * Creates entry in @cache with key @key and records that data is stored in + * block @block. The function returns -EBUSY if entry with the same key + * and for the same block already exists in cache. Otherwise 0 is returned. + */ +int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, + sector_t block) +{ + struct mb2_cache_entry *entry, *dup; + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + + entry = kmem_cache_alloc(mb2_entry_cache, mask); + if (!entry) + return -ENOMEM; + + INIT_LIST_HEAD(&entry->e_lru_list); + /* One ref for hash, one ref returned */ + atomic_set(&entry->e_refcnt, 1); + entry->e_key = key; + entry->e_block = block; + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + entry->e_hash_list_head = head; + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { + hlist_bl_unlock(head); + kmem_cache_free(mb2_entry_cache, entry); + return -EBUSY; + } + } + hlist_bl_add_head(&entry->e_hash_list, head); + hlist_bl_unlock(head); + + spin_lock(&cache->c_lru_list_lock); + list_add_tail(&entry->e_lru_list, &cache->c_lru_list); + /* Grab ref for LRU list */ + atomic_inc(&entry->e_refcnt); + cache->c_entry_count++; + spin_unlock(&cache->c_lru_list_lock); + + return 0; +} +EXPORT_SYMBOL(mb2_cache_entry_create); + +void __mb2_cache_entry_free(struct mb2_cache_entry *entry) +{ + kmem_cache_free(mb2_entry_cache, entry); +} +EXPORT_SYMBOL(__mb2_cache_entry_free); + +static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, + struct mb2_cache_entry *entry, + u32 key) +{ + struct mb2_cache_entry *old_entry = entry; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + + if (entry) + head = entry->e_hash_list_head; + else + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) + node = entry->e_hash_list.next; + else + node = hlist_bl_first(head); + while (node) { + entry = hlist_bl_entry(node, struct mb2_cache_entry, + e_hash_list); + if (entry->e_key == key) { + atomic_inc(&entry->e_refcnt); + goto out; + } + node = node->next; + } + entry = NULL; +out: + hlist_bl_unlock(head); + if (old_entry) + mb2_cache_entry_put(cache, old_entry); + + return entry; +} + +/* + * mb2_cache_entry_find_first - find the first entry in cache with given key + * @cache: cache where we should search + * @key: key to look for + * + * Search in @cache for entry with key @key. Grabs reference to the first + * entry found and returns the entry. + */ +struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, + u32 key) +{ + return __entry_find(cache, NULL, key); +} +EXPORT_SYMBOL(mb2_cache_entry_find_first); + +/* + * mb2_cache_entry_find_next - find next entry in cache with the same + * @cache: cache where we should search + * @entry: entry to start search from + * + * Finds next entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races + * with the search), finds the first entry in the hash chain. The function + * drops reference to @entry and returns with a reference to the found entry. + */ +struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + return __entry_find(cache, entry, entry->e_key); +} +EXPORT_SYMBOL(mb2_cache_entry_find_next); + +/* mb2_cache_entry_delete_block - remove information about block from cache + * @cache - cache we work with + * @key - key of the entry to remove + * @block - block containing data for @key + * + * Remove entry from cache @cache with key @key with data stored in @block. + */ +void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, + sector_t block) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb2_cache_entry *entry; + + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + /* We keep hash list reference to keep entry alive */ + hlist_bl_del_init(&entry->e_hash_list); + hlist_bl_unlock(head); + spin_lock(&cache->c_lru_list_lock); + if (!list_empty(&entry->e_lru_list)) { + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + atomic_dec(&entry->e_refcnt); + } + spin_unlock(&cache->c_lru_list_lock); + mb2_cache_entry_put(cache, entry); + return; + } + } + hlist_bl_unlock(head); +} +EXPORT_SYMBOL(mb2_cache_entry_delete_block); + +/* mb2_cache_entry_touch - cache entry got used + * @cache - cache the entry belongs to + * @entry - entry that got used + * + * Move entry in lru list to reflect the fact that it was used. + */ +void mb2_cache_entry_touch(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + spin_lock(&cache->c_lru_list_lock); + if (!list_empty(&entry->e_lru_list)) + list_move_tail(&cache->c_lru_list, &entry->e_lru_list); + spin_unlock(&cache->c_lru_list_lock); +} +EXPORT_SYMBOL(mb2_cache_entry_touch); + +static unsigned long mb2_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct mb2_cache *cache = container_of(shrink, struct mb2_cache, + c_shrink); + + return cache->c_entry_count; +} + +/* Shrink number of entries in cache */ +static unsigned long mb2_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + struct mb2_cache *cache = container_of(shrink, struct mb2_cache, + c_shrink); + struct mb2_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; + + spin_lock(&cache->c_lru_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { + entry = list_first_entry(&cache->c_lru_list, + struct mb2_cache_entry, e_lru_list); + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + /* + * We keep LRU list reference so that entry doesn't go away + * from under us. + */ + spin_unlock(&cache->c_lru_list_lock); + head = entry->e_hash_list_head; + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } + hlist_bl_unlock(head); + if (mb2_cache_entry_put(cache, entry)) + shrunk++; + cond_resched(); + spin_lock(&cache->c_lru_list_lock); + } + spin_unlock(&cache->c_lru_list_lock); + + return shrunk; +} + +/* + * mb2_cache_create - create cache + * @bucket_bits: log2 of the hash table size + * + * Create cache for keys with 2^bucket_bits hash entries. + */ +struct mb2_cache *mb2_cache_create(int bucket_bits) +{ + struct mb2_cache *cache; + int bucket_count = 1 << bucket_bits; + int i; + + if (!try_module_get(THIS_MODULE)) + return NULL; + + cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL); + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; + INIT_LIST_HEAD(&cache->c_lru_list); + spin_lock_init(&cache->c_lru_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), + GFP_KERNEL); + if (!cache->c_hash) { + kfree(cache); + goto err_out; + } + for (i = 0; i < bucket_count; i++) + INIT_HLIST_BL_HEAD(&cache->c_hash[i]); + + cache->c_shrink.count_objects = mb2_cache_count; + cache->c_shrink.scan_objects = mb2_cache_scan; + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); + + return cache; + +err_out: + module_put(THIS_MODULE); + return NULL; +} +EXPORT_SYMBOL(mb2_cache_create); + +/* + * mb2_cache_destroy - destroy cache + * @cache: the cache to destroy + * + * Free all entries in cache and cache itself. Caller must make sure nobody + * (except shrinker) can reach @cache when calling this. + */ +void mb2_cache_destroy(struct mb2_cache *cache) +{ + struct mb2_cache_entry *entry, *next; + + unregister_shrinker(&cache->c_shrink); + + /* + * We don't bother with any locking. Cache must not be used at this + * point. + */ + list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) { + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } else + WARN_ON(1); + list_del(&entry->e_lru_list); + WARN_ON(atomic_read(&entry->e_refcnt) != 1); + mb2_cache_entry_put(cache, entry); + } + kfree(cache->c_hash); + kfree(cache); + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(mb2_cache_destroy); + +static int __init mb2cache_init(void) +{ + mb2_entry_cache = kmem_cache_create("mbcache", + sizeof(struct mb2_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + BUG_ON(!mb2_entry_cache); + return 0; +} + +static void __exit mb2cache_exit(void) +{ + kmem_cache_destroy(mb2_entry_cache); +} + +module_init(mb2cache_init) +module_exit(mb2cache_exit) + +MODULE_AUTHOR("Jan Kara "); +MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h new file mode 100644 index 0000000..b6f160f --- /dev/null +++ b/include/linux/mbcache2.h @@ -0,0 +1,50 @@ +#ifndef _LINUX_MB2CACHE_H +#define _LINUX_MB2CACHE_H + +#include +#include +#include +#include +#include + +struct mb2_cache; + +struct mb2_cache_entry { + /* LRU list - protected by cache->c_lru_list_lock */ + struct list_head e_lru_list; + /* Hash table list - protected by bitlock in e_hash_list_head */ + struct hlist_bl_node e_hash_list; + atomic_t e_refcnt; + /* Key in hash - stable during lifetime of the entry */ + u32 e_key; + /* Block number of hashed block - stable during lifetime of the entry */ + sector_t e_block; + /* Head of hash list (for list bit lock) - stable */ + struct hlist_bl_head *e_hash_list_head; +}; + +struct mb2_cache *mb2_cache_create(int bucket_bits); +void mb2_cache_destroy(struct mb2_cache *cache); + +int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, + sector_t block); +void __mb2_cache_entry_free(struct mb2_cache_entry *entry); +static inline int mb2_cache_entry_put(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + if (!atomic_dec_and_test(&entry->e_refcnt)) + return 0; + __mb2_cache_entry_free(entry); + return 1; +} + +void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, + sector_t block); +struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, + u32 key); +struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, + struct mb2_cache_entry *entry); +void mb2_cache_entry_touch(struct mb2_cache *cache, + struct mb2_cache_entry *entry); + +#endif /* _LINUX_MB2CACHE_H */ -- cgit v0.10.2 From 82939d7999dfc1f1998c4b1c12e2f19edbdff272 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:50:13 -0500 Subject: ext4: convert to mbcache2 The conversion is generally straightforward. The only tricky part is that xattr block corresponding to found mbcache entry can get freed before we get buffer lock for that block. So we have to check whether the entry is still valid after getting buffer lock. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 157b458..9ac9e62 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1468,7 +1468,7 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb_cache *s_mb_cache; + struct mb2_cache *s_mb_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ed01ec..ecc37e1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -844,7 +844,6 @@ static void ext4_put_super(struct super_block *sb) ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { ext4_clear_feature_journal_needs_recovery(sb); @@ -3797,7 +3796,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) no_journal: if (ext4_mballoc_ready) { - sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + sbi->s_mb_cache = ext4_xattr_create_cache(); if (!sbi->s_mb_cache) { ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); goto failed_mount_wq; @@ -4027,6 +4026,10 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a95151e..fe9f8d6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include "ext4_jbd2.h" #include "ext4.h" @@ -78,10 +78,10 @@ # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, - struct mb_cache_entry **); + struct mb2_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); static int ext4_xattr_list(struct dentry *dentry, char *buffer, @@ -276,7 +276,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -428,7 +428,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -545,11 +545,8 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - struct mb_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) @@ -557,9 +554,15 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, lock_buffer(bh); if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ + mb2_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -567,8 +570,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect @@ -781,17 +782,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search *s = &bs->s; - struct mb_cache_entry *ce = NULL; + struct mb2_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -799,10 +798,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } + __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect modified + * block + */ + mb2_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s); if (!error) { @@ -826,10 +830,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } ea_bdebug(bs->bh, "cloning"); s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; @@ -884,6 +884,31 @@ inserted: if (error) goto cleanup_dquot; lock_buffer(new_bh); + /* + * We have to be careful about races with + * freeing or rehashing of xattr block. Once we + * hold buffer lock xattr block's state is + * stable so we can check whether the block got + * freed / rehashed or not. Since we unhash + * mbcache entry under buffer lock when freeing + * / rehashing xattr block, checking whether + * entry is still hashed is reliable. + */ + if (hlist_bl_unhashed(&ce->e_hash_list)) { + /* + * Undo everything and check mbcache + * again. + */ + unlock_buffer(new_bh); + dquot_free_block(inode, + EXT4_C2B(EXT4_SB(sb), + 1)); + brelse(new_bh); + mb2_cache_entry_put(ext4_mb_cache, ce); + ce = NULL; + new_bh = NULL; + goto inserted; + } le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); @@ -894,7 +919,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_release(ce); + mb2_cache_entry_touch(ext4_mb_cache, ce); + mb2_cache_entry_put(ext4_mb_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -959,7 +985,7 @@ getblk_failed: cleanup: if (ce) - mb_cache_entry_release(ce); + mb2_cache_entry_put(ext4_mb_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1512,17 +1538,6 @@ cleanup: } /* - * ext4_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext4_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - -/* * ext4_xattr_cache_insert() * * Create a new entry in the extended attribute cache, and insert @@ -1531,28 +1546,18 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb2_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + error = mb2_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + bh->b_blocknr); if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { + if (error == -EBUSY) ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { + } else ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } } /* @@ -1605,26 +1610,19 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, */ static struct buffer_head * ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) + struct mb2_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache_entry *ce; + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, - hash); + ce = mb2_cache_entry_find_first(ext4_mb_cache, hash); while (ce) { struct buffer_head *bh; - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", @@ -1640,7 +1638,7 @@ again: return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb2_cache_entry_find_next(ext4_mb_cache, ce); } return NULL; } @@ -1715,15 +1713,15 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #define HASH_BUCKET_BITS 10 -struct mb_cache * -ext4_xattr_create_cache(char *name) +struct mb2_cache * +ext4_xattr_create_cache(void) { - return mb_cache_create(name, HASH_BUCKET_BITS); + return mb2_cache_create(HASH_BUCKET_BITS); } -void ext4_xattr_destroy_cache(struct mb_cache *cache) +void ext4_xattr_destroy_cache(struct mb2_cache *cache) { if (cache) - mb_cache_destroy(cache); + mb2_cache_destroy(cache); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ddc0957..10b0f73 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern void ext4_xattr_delete_inode(handle_t *, struct inode *); -extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -124,8 +123,8 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern struct mb_cache *ext4_xattr_create_cache(char *name); -extern void ext4_xattr_destroy_cache(struct mb_cache *); +extern struct mb2_cache *ext4_xattr_create_cache(void); +extern void ext4_xattr_destroy_cache(struct mb2_cache *); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, -- cgit v0.10.2 From be0726d33cb8f411945884664924bed3cb8c70ee Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:56:38 -0500 Subject: ext2: convert to mbcache2 The conversion is generally straightforward. We convert filesystem from a global cache to per-fs one. Similarly to ext4 the tricky part is that xattr block corresponding to found mbcache entry can get freed before we get buffer lock for that block. So we have to check whether the entry is still valid after getting the buffer lock. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 4c69c94..f98ce7e 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -61,6 +61,8 @@ struct ext2_block_alloc_info { #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end +struct mb2_cache; + /* * second extended-fs super-block data in memory */ @@ -111,6 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; + struct mb2_cache *s_mb_cache; }; static inline spinlock_t * diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 2a18841..b78caf2 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - ext2_xattr_put_super(sb); + if (sbi->s_mb_cache) { + ext2_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } + +#ifdef CONFIG_EXT2_FS_XATTR + sbi->s_mb_cache = ext2_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount3; + } +#endif /* * set up enough so that it can read an inode */ @@ -1149,6 +1160,8 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: + if (sbi->s_mb_cache) + ext2_xattr_destroy_cache(sbi->s_mb_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2"); static int __init init_ext2_fs(void) { - int err = init_ext2_xattr(); - if (err) - return err; + int err; + err = init_inodecache(); if (err) - goto out1; + return err; err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); -out1: - exit_ext2_xattr(); return err; } @@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void) { unregister_filesystem(&ext2_fs_type); destroy_inodecache(); - exit_ext2_xattr(); } MODULE_AUTHOR("Remy Card and others"); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index f57a7ab..7162b48 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include #include @@ -90,14 +90,12 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *, struct ext2_xattr_header *); -static int ext2_xattr_cache_insert(struct buffer_head *); +static int ext2_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); static struct buffer_head *ext2_xattr_cache_find(struct inode *, struct ext2_xattr_header *); static void ext2_xattr_rehash(struct ext2_xattr_header *, struct ext2_xattr_entry *); -static struct mb_cache *ext2_xattr_cache; - static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -196,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; goto cleanup; @@ -209,7 +208,7 @@ found: le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) goto bad_block; - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); if (buffer) { error = -ERANGE; @@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -281,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", goto bad_block; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); /* list the attribute names */ @@ -483,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set", /* Here we know that we can set the new attribute. */ if (header) { - struct mb_cache_entry *ce; - /* assert(header == HDR(bh)); */ - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, - bh->b_blocknr); lock_buffer(bh); if (header->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(header->h_hash); + ea_bdebug(bh, "modifying in-place"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect modified block + */ + mb2_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, + hash, bh->b_blocknr); + /* keep the buffer locked while modifying it. */ } else { int offset; - if (ce) - mb_cache_entry_release(ce); unlock_buffer(bh); ea_bdebug(bh, "cloning"); header = kmalloc(bh->b_size, GFP_KERNEL); @@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, don't need to change the reference count. */ new_bh = old_bh; get_bh(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); } else { /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, @@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, memcpy(new_bh->b_data, header, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); ext2_xattr_update_super_block(sb); } @@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, error = 0; if (old_bh && old_bh != new_bh) { - struct mb_cache_entry *ce; - /* * If there was an old block and we are no longer using it, * release the old block. */ - ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, - old_bh->b_blocknr); lock_buffer(old_bh); if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect freed block + */ + mb2_cache_entry_delete_block(ext2_mb_cache, + hash, old_bh->b_blocknr); /* Free the old block. */ - if (ce) - mb_cache_entry_free(ce); ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); mark_inode_dirty(inode); @@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, } else { /* Decrement the refcount only. */ le32_add_cpu(&HDR(old_bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); dquot_free_block_nodirty(inode, 1); mark_inode_dirty(inode); mark_buffer_dirty(old_bh); @@ -757,7 +759,6 @@ void ext2_xattr_delete_inode(struct inode *inode) { struct buffer_head *bh = NULL; - struct mb_cache_entry *ce; down_write(&EXT2_I(inode)->xattr_sem); if (!EXT2_I(inode)->i_file_acl) @@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode) EXT2_I(inode)->i_file_acl); goto cleanup; } - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); lock_buffer(bh); if (HDR(bh)->h_refcount == cpu_to_le32(1)) { - if (ce) - mb_cache_entry_free(ce); + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + + /* + * This must happen under buffer lock for ext2_xattr_set2() to + * reliably detect freed block + */ + mb2_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, + hash, bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); unlock_buffer(bh); } else { le32_add_cpu(&HDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount)); unlock_buffer(bh); @@ -806,18 +810,6 @@ cleanup: } /* - * ext2_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext2_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - - -/* * ext2_xattr_cache_insert() * * Create a new entry in the extended attribute cache, and insert @@ -826,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static int -ext2_xattr_cache_insert(struct buffer_head *bh) +ext2_xattr_cache_insert(struct mb2_cache *cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(HDR(bh)->h_hash); - struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); - if (!ce) - return -ENOMEM; - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + error = mb2_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); if (error) { - mb_cache_entry_free(ce); if (error == -EBUSY) { ea_bdebug(bh, "already in cache (%d cache entries)", atomic_read(&ext2_xattr_cache->c_entry_count)); error = 0; } - } else { - ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, - atomic_read(&ext2_xattr_cache->c_entry_count)); - mb_cache_entry_release(ce); - } + } else + ea_bdebug(bh, "inserting [%x]", (int)hash); return error; } @@ -903,23 +887,17 @@ static struct buffer_head * ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; + struct mb2_cache_entry *ce; + struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, - hash); + ce = mb2_cache_entry_find_first(ext2_mb_cache, hash); while (ce) { struct buffer_head *bh; - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } - bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_cache_find", @@ -927,7 +905,21 @@ again: inode->i_ino, (unsigned long) ce->e_block); } else { lock_buffer(bh); - if (le32_to_cpu(HDR(bh)->h_refcount) > + /* + * We have to be careful about races with freeing or + * rehashing of xattr block. Once we hold buffer lock + * xattr block's state is stable so we can check + * whether the block got freed / rehashed or not. + * Since we unhash mbcache entry under buffer lock when + * freeing / rehashing xattr block, checking whether + * entry is still hashed is reliable. + */ + if (hlist_bl_unhashed(&ce->e_hash_list)) { + mb2_cache_entry_put(ext2_mb_cache, ce); + unlock_buffer(bh); + brelse(bh); + goto again; + } else if (le32_to_cpu(HDR(bh)->h_refcount) > EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", (unsigned long) ce->e_block, @@ -936,13 +928,14 @@ again: } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb_cache_entry_release(ce); + mb2_cache_entry_touch(ext2_mb_cache, ce); + mb2_cache_entry_put(ext2_mb_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb2_cache_entry_find_next(ext2_mb_cache, ce); } return NULL; } @@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -init_ext2_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb2_cache *ext2_xattr_create_cache(void) { - ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); - if (!ext2_xattr_cache) - return -ENOMEM; - return 0; + return mb2_cache_create(HASH_BUCKET_BITS); } -void -exit_ext2_xattr(void) +void ext2_xattr_destroy_cache(struct mb2_cache *cache) { - mb_cache_destroy(ext2_xattr_cache); + if (cache) + mb2_cache_destroy(cache); } diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 60edf29..6ea38aa 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -53,6 +53,8 @@ struct ext2_xattr_entry { #define EXT2_XATTR_SIZE(size) \ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) +struct mb2_cache; + # ifdef CONFIG_EXT2_FS_XATTR extern const struct xattr_handler ext2_xattr_user_handler; @@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern void ext2_xattr_delete_inode(struct inode *); -extern void ext2_xattr_put_super(struct super_block *); -extern int init_ext2_xattr(void); -extern void exit_ext2_xattr(void); +extern struct mb2_cache *ext2_xattr_create_cache(void); +extern void ext2_xattr_destroy_cache(struct mb2_cache *cache); extern const struct xattr_handler *ext2_xattr_handlers[]; @@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) { } -static inline void -ext2_xattr_put_super(struct super_block *sb) -{ -} - -static inline int -init_ext2_xattr(void) -{ - return 0; -} - -static inline void -exit_ext2_xattr(void) +static inline void ext2_xattr_destroy_cache(struct mb2_cache *cache) { } -- cgit v0.10.2 From ecd1e64412d5242b8afdef58a714bab3c5464f79 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 12:21:14 -0500 Subject: mbcache: remove mbcache Both ext2 and ext4 are now converted to mbcache2. Remove the old mbcache code. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/Makefile b/fs/Makefile index 15b3d6c..59b8440 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o -obj-$(CONFIG_FS_MBCACHE) += mbcache.o mbcache2.o +obj-$(CONFIG_FS_MBCACHE) += mbcache2.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_COREDUMP) += coredump.o diff --git a/fs/mbcache.c b/fs/mbcache.c deleted file mode 100644 index 187477d..0000000 --- a/fs/mbcache.c +++ /dev/null @@ -1,858 +0,0 @@ -/* - * linux/fs/mbcache.c - * (C) 2001-2002 Andreas Gruenbacher, - */ - -/* - * Filesystem Meta Information Block Cache (mbcache) - * - * The mbcache caches blocks of block devices that need to be located - * by their device/block number, as well as by other criteria (such - * as the block's contents). - * - * There can only be one cache entry in a cache per device and block number. - * Additional indexes need not be unique in this sense. The number of - * additional indexes (=other criteria) can be hardwired at compile time - * or specified at cache create time. - * - * Each cache entry is of fixed size. An entry may be `valid' or `invalid' - * in the cache. A valid entry is in the main hash tables of the cache, - * and may also be in the lru list. An invalid entry is not in any hashes - * or lists. - * - * A valid cache entry is only in the lru list if no handles refer to it. - * Invalid cache entries will be freed when the last handle to the cache - * entry is released. Entries that cannot be freed immediately are put - * back on the lru list. - */ - -/* - * Lock descriptions and usage: - * - * Each hash chain of both the block and index hash tables now contains - * a built-in lock used to serialize accesses to the hash chain. - * - * Accesses to global data structures mb_cache_list and mb_cache_lru_list - * are serialized via the global spinlock mb_cache_spinlock. - * - * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize - * accesses to its local data, such as e_used and e_queued. - * - * Lock ordering: - * - * Each block hash chain's lock has the highest lock order, followed by an - * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's - * lock), and mb_cach_spinlock, with the lowest order. While holding - * either a block or index hash chain lock, a thread can acquire an - * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. - * - * Synchronization: - * - * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and - * index hash chian, it needs to lock the corresponding hash chain. For each - * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to - * prevent either any simultaneous release or free on the entry and also - * to serialize accesses to either the e_used or e_queued member of the entry. - * - * To avoid having a dangling reference to an already freed - * mb_cache_entry, an mb_cache_entry is only freed when it is not on a - * block hash chain and also no longer being referenced, both e_used, - * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is - * first removed from a block hash chain. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef MB_CACHE_DEBUG -# define mb_debug(f...) do { \ - printk(KERN_DEBUG f); \ - printk("\n"); \ - } while (0) -#define mb_assert(c) do { if (!(c)) \ - printk(KERN_ERR "assertion " #c " failed\n"); \ - } while(0) -#else -# define mb_debug(f...) do { } while(0) -# define mb_assert(c) do { } while(0) -#endif -#define mb_error(f...) do { \ - printk(KERN_ERR f); \ - printk("\n"); \ - } while(0) - -#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) - -#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) -#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ - (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) - -static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); -static struct blockgroup_lock *mb_cache_bg_lock; -static struct kmem_cache *mb_cache_kmem_cache; - -MODULE_AUTHOR("Andreas Gruenbacher "); -MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -MODULE_LICENSE("GPL"); - -EXPORT_SYMBOL(mb_cache_create); -EXPORT_SYMBOL(mb_cache_shrink); -EXPORT_SYMBOL(mb_cache_destroy); -EXPORT_SYMBOL(mb_cache_entry_alloc); -EXPORT_SYMBOL(mb_cache_entry_insert); -EXPORT_SYMBOL(mb_cache_entry_release); -EXPORT_SYMBOL(mb_cache_entry_free); -EXPORT_SYMBOL(mb_cache_entry_get); -#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -EXPORT_SYMBOL(mb_cache_entry_find_first); -EXPORT_SYMBOL(mb_cache_entry_find_next); -#endif - -/* - * Global data: list of all mbcache's, lru list, and a spinlock for - * accessing cache data structures on SMP machines. The lru list is - * global across all mbcaches. - */ - -static LIST_HEAD(mb_cache_list); -static LIST_HEAD(mb_cache_lru_list); -static DEFINE_SPINLOCK(mb_cache_spinlock); - -static inline void -__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) -{ - spin_lock(bgl_lock_ptr(mb_cache_bg_lock, - MB_CACHE_ENTRY_LOCK_INDEX(ce))); -} - -static inline void -__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) -{ - spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, - MB_CACHE_ENTRY_LOCK_INDEX(ce))); -} - -static inline int -__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) -{ - return !hlist_bl_unhashed(&ce->e_block_list); -} - - -static inline void -__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) -{ - if (__mb_cache_entry_is_block_hashed(ce)) - hlist_bl_del_init(&ce->e_block_list); -} - -static inline int -__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) -{ - return !hlist_bl_unhashed(&ce->e_index.o_list); -} - -static inline void -__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) -{ - if (__mb_cache_entry_is_index_hashed(ce)) - hlist_bl_del_init(&ce->e_index.o_list); -} - -/* - * __mb_cache_entry_unhash_unlock() - * - * This function is called to unhash both the block and index hash - * chain. - * It assumes both the block and index hash chain is locked upon entry. - * It also unlock both hash chains both exit - */ -static inline void -__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) -{ - __mb_cache_entry_unhash_index(ce); - hlist_bl_unlock(ce->e_index_hash_p); - __mb_cache_entry_unhash_block(ce); - hlist_bl_unlock(ce->e_block_hash_p); -} - -static void -__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) -{ - struct mb_cache *cache = ce->e_cache; - - mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); - kmem_cache_free(cache->c_entry_cache, ce); - atomic_dec(&cache->c_entry_count); -} - -static void -__mb_cache_entry_release(struct mb_cache_entry *ce) -{ - /* First lock the entry to serialize access to its local data. */ - __spin_lock_mb_cache_entry(ce); - /* Wake up all processes queuing for this cache entry. */ - if (ce->e_queued) - wake_up_all(&mb_cache_queue); - if (ce->e_used >= MB_CACHE_WRITER) - ce->e_used -= MB_CACHE_WRITER; - /* - * Make sure that all cache entries on lru_list have - * both e_used and e_qued of 0s. - */ - ce->e_used--; - if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { - if (!__mb_cache_entry_is_block_hashed(ce)) { - __spin_unlock_mb_cache_entry(ce); - goto forget; - } - /* - * Need access to lru list, first drop entry lock, - * then reacquire the lock in the proper order. - */ - spin_lock(&mb_cache_spinlock); - if (list_empty(&ce->e_lru_list)) - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); - spin_unlock(&mb_cache_spinlock); - } - __spin_unlock_mb_cache_entry(ce); - return; -forget: - mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_forget(ce, GFP_KERNEL); -} - -/* - * mb_cache_shrink_scan() memory pressure callback - * - * This function is called by the kernel memory management when memory - * gets low. - * - * @shrink: (ignored) - * @sc: shrink_control passed from reclaim - * - * Returns the number of objects freed. - */ -static unsigned long -mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - LIST_HEAD(free_list); - struct mb_cache_entry *entry, *tmp; - int nr_to_scan = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - unsigned long freed = 0; - - mb_debug("trying to free %d entries", nr_to_scan); - spin_lock(&mb_cache_spinlock); - while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { - struct mb_cache_entry *ce = - list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* Prevent any find or get operation on the entry */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - spin_lock(&mb_cache_spinlock); - continue; - } - __mb_cache_entry_unhash_unlock(ce); - list_add_tail(&ce->e_lru_list, &free_list); - spin_lock(&mb_cache_spinlock); - } - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(entry, gfp_mask); - freed++; - } - return freed; -} - -static unsigned long -mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -{ - struct mb_cache *cache; - unsigned long count = 0; - - spin_lock(&mb_cache_spinlock); - list_for_each_entry(cache, &mb_cache_list, c_cache_list) { - mb_debug("cache %s (%d)", cache->c_name, - atomic_read(&cache->c_entry_count)); - count += atomic_read(&cache->c_entry_count); - } - spin_unlock(&mb_cache_spinlock); - - return vfs_pressure_ratio(count); -} - -static struct shrinker mb_cache_shrinker = { - .count_objects = mb_cache_shrink_count, - .scan_objects = mb_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - -/* - * mb_cache_create() create a new cache - * - * All entries in one cache are equal size. Cache entries may be from - * multiple devices. If this is the first mbcache created, registers - * the cache with kernel memory management. Returns NULL if no more - * memory was available. - * - * @name: name of the cache (informal) - * @bucket_bits: log2(number of hash buckets) - */ -struct mb_cache * -mb_cache_create(const char *name, int bucket_bits) -{ - int n, bucket_count = 1 << bucket_bits; - struct mb_cache *cache = NULL; - - if (!mb_cache_bg_lock) { - mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), - GFP_KERNEL); - if (!mb_cache_bg_lock) - return NULL; - bgl_lock_init(mb_cache_bg_lock); - } - - cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); - if (!cache) - return NULL; - cache->c_name = name; - atomic_set(&cache->c_entry_count, 0); - cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * - sizeof(struct hlist_bl_head), GFP_KERNEL); - if (!cache->c_block_hash) - goto fail; - for (n=0; nc_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * - sizeof(struct hlist_bl_head), GFP_KERNEL); - if (!cache->c_index_hash) - goto fail; - for (n=0; nc_index_hash[n]); - if (!mb_cache_kmem_cache) { - mb_cache_kmem_cache = kmem_cache_create(name, - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!mb_cache_kmem_cache) - goto fail2; - } - cache->c_entry_cache = mb_cache_kmem_cache; - - /* - * Set an upper limit on the number of cache entries so that the hash - * chains won't grow too long. - */ - cache->c_max_entries = bucket_count << 4; - - spin_lock(&mb_cache_spinlock); - list_add(&cache->c_cache_list, &mb_cache_list); - spin_unlock(&mb_cache_spinlock); - return cache; - -fail2: - kfree(cache->c_index_hash); - -fail: - kfree(cache->c_block_hash); - kfree(cache); - return NULL; -} - - -/* - * mb_cache_shrink() - * - * Removes all cache entries of a device from the cache. All cache entries - * currently in use cannot be freed, and thus remain in the cache. All others - * are freed. - * - * @bdev: which device's cache entries to shrink - */ -void -mb_cache_shrink(struct block_device *bdev) -{ - LIST_HEAD(free_list); - struct list_head *l; - struct mb_cache_entry *ce, *tmp; - - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - while (!list_is_last(l, &mb_cache_lru_list)) { - l = l->next; - ce = list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_bdev == bdev) { - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* - * Prevent any find or get operation on the entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - continue; - } - __mb_cache_entry_unhash_unlock(ce); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - list_add_tail(&ce->e_lru_list, &free_list); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - } - } - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(ce, GFP_KERNEL); - } -} - - -/* - * mb_cache_destroy() - * - * Shrinks the cache to its minimum possible size (hopefully 0 entries), - * and then destroys it. If this was the last mbcache, un-registers the - * mbcache from kernel memory management. - */ -void -mb_cache_destroy(struct mb_cache *cache) -{ - LIST_HEAD(free_list); - struct mb_cache_entry *ce, *tmp; - - spin_lock(&mb_cache_spinlock); - list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { - if (ce->e_cache == cache) - list_move_tail(&ce->e_lru_list, &free_list); - } - list_del(&cache->c_cache_list); - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { - list_del_init(&ce->e_lru_list); - /* - * Prevent any find or get operation on the entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - __mb_cache_entry_unhash_unlock(ce); - __mb_cache_entry_forget(ce, GFP_KERNEL); - } - - if (atomic_read(&cache->c_entry_count) > 0) { - mb_error("cache %s: %d orphaned entries", - cache->c_name, - atomic_read(&cache->c_entry_count)); - } - - if (list_empty(&mb_cache_list)) { - kmem_cache_destroy(mb_cache_kmem_cache); - mb_cache_kmem_cache = NULL; - } - kfree(cache->c_index_hash); - kfree(cache->c_block_hash); - kfree(cache); -} - -/* - * mb_cache_entry_alloc() - * - * Allocates a new cache entry. The new entry will not be valid initially, - * and thus cannot be looked up yet. It should be filled with data, and - * then inserted into the cache using mb_cache_entry_insert(). Returns NULL - * if no more memory was available. - */ -struct mb_cache_entry * -mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) -{ - struct mb_cache_entry *ce; - - if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { - struct list_head *l; - - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - while (!list_is_last(l, &mb_cache_lru_list)) { - l = l->next; - ce = list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_cache == cache) { - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* - * Prevent any find or get operation on the - * entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - continue; - } - mb_assert(list_empty(&ce->e_lru_list)); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - __mb_cache_entry_unhash_unlock(ce); - goto found; - } - } - spin_unlock(&mb_cache_spinlock); - } - - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (!ce) - return NULL; - atomic_inc(&cache->c_entry_count); - INIT_LIST_HEAD(&ce->e_lru_list); - INIT_HLIST_BL_NODE(&ce->e_block_list); - INIT_HLIST_BL_NODE(&ce->e_index.o_list); - ce->e_cache = cache; - ce->e_queued = 0; - atomic_set(&ce->e_refcnt, 0); -found: - ce->e_block_hash_p = &cache->c_block_hash[0]; - ce->e_index_hash_p = &cache->c_index_hash[0]; - ce->e_used = 1 + MB_CACHE_WRITER; - return ce; -} - - -/* - * mb_cache_entry_insert() - * - * Inserts an entry that was allocated using mb_cache_entry_alloc() into - * the cache. After this, the cache entry can be looked up, but is not yet - * in the lru list as the caller still holds a handle to it. Returns 0 on - * success, or -EBUSY if a cache entry for that device + inode exists - * already (this may happen after a failed lookup, but when another process - * has inserted the same cache entry in the meantime). - * - * @bdev: device the cache entry belongs to - * @block: block number - * @key: lookup key - */ -int -mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, - sector_t block, unsigned int key) -{ - struct mb_cache *cache = ce->e_cache; - unsigned int bucket; - struct hlist_bl_node *l; - struct hlist_bl_head *block_hash_p; - struct hlist_bl_head *index_hash_p; - struct mb_cache_entry *lce; - - mb_assert(ce); - bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), - cache->c_bucket_bits); - block_hash_p = &cache->c_block_hash[bucket]; - hlist_bl_lock(block_hash_p); - hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { - if (lce->e_bdev == bdev && lce->e_block == block) { - hlist_bl_unlock(block_hash_p); - return -EBUSY; - } - } - mb_assert(!__mb_cache_entry_is_block_hashed(ce)); - __mb_cache_entry_unhash_block(ce); - __mb_cache_entry_unhash_index(ce); - ce->e_bdev = bdev; - ce->e_block = block; - ce->e_block_hash_p = block_hash_p; - ce->e_index.o_key = key; - hlist_bl_add_head(&ce->e_block_list, block_hash_p); - hlist_bl_unlock(block_hash_p); - bucket = hash_long(key, cache->c_bucket_bits); - index_hash_p = &cache->c_index_hash[bucket]; - hlist_bl_lock(index_hash_p); - ce->e_index_hash_p = index_hash_p; - hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); - hlist_bl_unlock(index_hash_p); - return 0; -} - - -/* - * mb_cache_entry_release() - * - * Release a handle to a cache entry. When the last handle to a cache entry - * is released it is either freed (if it is invalid) or otherwise inserted - * in to the lru list. - */ -void -mb_cache_entry_release(struct mb_cache_entry *ce) -{ - __mb_cache_entry_release(ce); -} - - -/* - * mb_cache_entry_free() - * - */ -void -mb_cache_entry_free(struct mb_cache_entry *ce) -{ - mb_assert(ce); - mb_assert(list_empty(&ce->e_lru_list)); - hlist_bl_lock(ce->e_index_hash_p); - __mb_cache_entry_unhash_index(ce); - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_lock(ce->e_block_hash_p); - __mb_cache_entry_unhash_block(ce); - hlist_bl_unlock(ce->e_block_hash_p); - __mb_cache_entry_release(ce); -} - - -/* - * mb_cache_entry_get() - * - * Get a cache entry by device / block number. (There can only be one entry - * in the cache per device and block.) Returns NULL if no such cache entry - * exists. The returned cache entry is locked for exclusive access ("single - * writer"). - */ -struct mb_cache_entry * -mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, - sector_t block) -{ - unsigned int bucket; - struct hlist_bl_node *l; - struct mb_cache_entry *ce; - struct hlist_bl_head *block_hash_p; - - bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), - cache->c_bucket_bits); - block_hash_p = &cache->c_block_hash[bucket]; - /* First serialize access to the block corresponding hash chain. */ - hlist_bl_lock(block_hash_p); - hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { - mb_assert(ce->e_block_hash_p == block_hash_p); - if (ce->e_bdev == bdev && ce->e_block == block) { - /* - * Prevent a free from removing the entry. - */ - atomic_inc(&ce->e_refcnt); - hlist_bl_unlock(block_hash_p); - __spin_lock_mb_cache_entry(ce); - atomic_dec(&ce->e_refcnt); - if (ce->e_used > 0) { - DEFINE_WAIT(wait); - while (ce->e_used > 0) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - __spin_unlock_mb_cache_entry(ce); - schedule(); - __spin_lock_mb_cache_entry(ce); - ce->e_queued--; - } - finish_wait(&mb_cache_queue, &wait); - } - ce->e_used += 1 + MB_CACHE_WRITER; - __spin_unlock_mb_cache_entry(ce); - - if (!list_empty(&ce->e_lru_list)) { - spin_lock(&mb_cache_spinlock); - list_del_init(&ce->e_lru_list); - spin_unlock(&mb_cache_spinlock); - } - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release(ce); - return NULL; - } - return ce; - } - } - hlist_bl_unlock(block_hash_p); - return NULL; -} - -#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) - -static struct mb_cache_entry * -__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, - struct block_device *bdev, unsigned int key) -{ - - /* The index hash chain is alredy acquire by caller. */ - while (l != NULL) { - struct mb_cache_entry *ce = - hlist_bl_entry(l, struct mb_cache_entry, - e_index.o_list); - mb_assert(ce->e_index_hash_p == head); - if (ce->e_bdev == bdev && ce->e_index.o_key == key) { - /* - * Prevent a free from removing the entry. - */ - atomic_inc(&ce->e_refcnt); - hlist_bl_unlock(head); - __spin_lock_mb_cache_entry(ce); - atomic_dec(&ce->e_refcnt); - ce->e_used++; - /* Incrementing before holding the lock gives readers - priority over writers. */ - if (ce->e_used >= MB_CACHE_WRITER) { - DEFINE_WAIT(wait); - - while (ce->e_used >= MB_CACHE_WRITER) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - __spin_unlock_mb_cache_entry(ce); - schedule(); - __spin_lock_mb_cache_entry(ce); - ce->e_queued--; - } - finish_wait(&mb_cache_queue, &wait); - } - __spin_unlock_mb_cache_entry(ce); - if (!list_empty(&ce->e_lru_list)) { - spin_lock(&mb_cache_spinlock); - list_del_init(&ce->e_lru_list); - spin_unlock(&mb_cache_spinlock); - } - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release(ce); - return ERR_PTR(-EAGAIN); - } - return ce; - } - l = l->next; - } - hlist_bl_unlock(head); - return NULL; -} - - -/* - * mb_cache_entry_find_first() - * - * Find the first cache entry on a given device with a certain key in - * an additional index. Additional matches can be found with - * mb_cache_entry_find_next(). Returns NULL if no match was found. The - * returned cache entry is locked for shared access ("multiple readers"). - * - * @cache: the cache to search - * @bdev: the device the cache entry should belong to - * @key: the key in the index - */ -struct mb_cache_entry * -mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, - unsigned int key) -{ - unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct hlist_bl_node *l; - struct mb_cache_entry *ce = NULL; - struct hlist_bl_head *index_hash_p; - - index_hash_p = &cache->c_index_hash[bucket]; - hlist_bl_lock(index_hash_p); - if (!hlist_bl_empty(index_hash_p)) { - l = hlist_bl_first(index_hash_p); - ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - } else - hlist_bl_unlock(index_hash_p); - return ce; -} - - -/* - * mb_cache_entry_find_next() - * - * Find the next cache entry on a given device with a certain key in an - * additional index. Returns NULL if no match could be found. The previous - * entry is atomatically released, so that mb_cache_entry_find_next() can - * be called like this: - * - * entry = mb_cache_entry_find_first(); - * while (entry) { - * ... - * entry = mb_cache_entry_find_next(entry, ...); - * } - * - * @prev: The previous match - * @bdev: the device the cache entry should belong to - * @key: the key in the index - */ -struct mb_cache_entry * -mb_cache_entry_find_next(struct mb_cache_entry *prev, - struct block_device *bdev, unsigned int key) -{ - struct mb_cache *cache = prev->e_cache; - unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct hlist_bl_node *l; - struct mb_cache_entry *ce; - struct hlist_bl_head *index_hash_p; - - index_hash_p = &cache->c_index_hash[bucket]; - mb_assert(prev->e_index_hash_p == index_hash_p); - hlist_bl_lock(index_hash_p); - mb_assert(!hlist_bl_empty(index_hash_p)); - l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - __mb_cache_entry_release(prev); - return ce; -} - -#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ - -static int __init init_mbcache(void) -{ - register_shrinker(&mb_cache_shrinker); - return 0; -} - -static void __exit exit_mbcache(void) -{ - unregister_shrinker(&mb_cache_shrinker); -} - -module_init(init_mbcache) -module_exit(exit_mbcache) - diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h deleted file mode 100644 index 6a392e7..0000000 --- a/include/linux/mbcache.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - File: linux/mbcache.h - - (C) 2001 by Andreas Gruenbacher, -*/ -struct mb_cache_entry { - struct list_head e_lru_list; - struct mb_cache *e_cache; - unsigned short e_used; - unsigned short e_queued; - atomic_t e_refcnt; - struct block_device *e_bdev; - sector_t e_block; - struct hlist_bl_node e_block_list; - struct { - struct hlist_bl_node o_list; - unsigned int o_key; - } e_index; - struct hlist_bl_head *e_block_hash_p; - struct hlist_bl_head *e_index_hash_p; -}; - -struct mb_cache { - struct list_head c_cache_list; - const char *c_name; - atomic_t c_entry_count; - int c_max_entries; - int c_bucket_bits; - struct kmem_cache *c_entry_cache; - struct hlist_bl_head *c_block_hash; - struct hlist_bl_head *c_index_hash; -}; - -/* Functions on caches */ - -struct mb_cache *mb_cache_create(const char *, int); -void mb_cache_shrink(struct block_device *); -void mb_cache_destroy(struct mb_cache *); - -/* Functions on cache entries */ - -struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t); -int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *, - sector_t, unsigned int); -void mb_cache_entry_release(struct mb_cache_entry *); -void mb_cache_entry_free(struct mb_cache_entry *); -struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, - struct block_device *, - sector_t); -struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, - struct block_device *, - unsigned int); -struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, - struct block_device *, - unsigned int); -- cgit v0.10.2 From c2f3140fe2eceb3a6c1615b2648b9471544881c6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 12:33:03 -0500 Subject: mbcache2: limit cache size So far number of entries in mbcache is limited only by the pressure from the shrinker. Since too many entries degrade the hash table and generally we expect that caching more entries has diminishing returns, limit number of entries the same way as in the old mbcache to 16 * hash table size. Once we exceed the desired maximum number of entries, we schedule a backround work to reclaim entries. If the background work cannot keep up and the number of entries exceeds two times the desired maximum, we reclaim some entries directly when allocating a new entry. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/mbcache2.c b/fs/mbcache2.c index 5c3e1a8..3e3198d 100644 --- a/fs/mbcache2.c +++ b/fs/mbcache2.c @@ -4,6 +4,7 @@ #include #include #include +#include #include /* @@ -27,16 +28,29 @@ struct mb2_cache { struct hlist_bl_head *c_hash; /* log2 of hash table size */ int c_bucket_bits; + /* Maximum entries in cache to avoid degrading hash too much */ + int c_max_entries; /* Protects c_lru_list, c_entry_count */ spinlock_t c_lru_list_lock; struct list_head c_lru_list; /* Number of entries in cache */ unsigned long c_entry_count; struct shrinker c_shrink; + /* Work for shrinking when the cache has too many entries */ + struct work_struct c_shrink_work; }; static struct kmem_cache *mb2_entry_cache; +static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + unsigned int nr_to_scan); + +/* + * Number of entries to reclaim synchronously when there are too many entries + * in cache + */ +#define SYNC_SHRINK_BATCH 64 + /* * mb2_cache_entry_create - create entry in cache * @cache - cache where the entry should be created @@ -55,6 +69,13 @@ int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, struct hlist_bl_node *dup_node; struct hlist_bl_head *head; + /* Schedule background reclaim if there are too many entries */ + if (cache->c_entry_count >= cache->c_max_entries) + schedule_work(&cache->c_shrink_work); + /* Do some sync reclaim if background reclaim cannot keep up */ + if (cache->c_entry_count >= 2*cache->c_max_entries) + mb2_cache_shrink(cache, SYNC_SHRINK_BATCH); + entry = kmem_cache_alloc(mb2_entry_cache, mask); if (!entry) return -ENOMEM; @@ -223,12 +244,9 @@ static unsigned long mb2_cache_count(struct shrinker *shrink, } /* Shrink number of entries in cache */ -static unsigned long mb2_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + unsigned int nr_to_scan) { - int nr_to_scan = sc->nr_to_scan; - struct mb2_cache *cache = container_of(shrink, struct mb2_cache, - c_shrink); struct mb2_cache_entry *entry; struct hlist_bl_head *head; unsigned int shrunk = 0; @@ -261,6 +279,25 @@ static unsigned long mb2_cache_scan(struct shrinker *shrink, return shrunk; } +static unsigned long mb2_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + struct mb2_cache *cache = container_of(shrink, struct mb2_cache, + c_shrink); + return mb2_cache_shrink(cache, nr_to_scan); +} + +/* We shrink 1/X of the cache when we have too many entries in it */ +#define SHRINK_DIVISOR 16 + +static void mb2_cache_shrink_worker(struct work_struct *work) +{ + struct mb2_cache *cache = container_of(work, struct mb2_cache, + c_shrink_work); + mb2_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); +} + /* * mb2_cache_create - create cache * @bucket_bits: log2 of the hash table size @@ -280,6 +317,7 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) if (!cache) goto err_out; cache->c_bucket_bits = bucket_bits; + cache->c_max_entries = bucket_count << 4; INIT_LIST_HEAD(&cache->c_lru_list); spin_lock_init(&cache->c_lru_list_lock); cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), @@ -296,6 +334,8 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) cache->c_shrink.seeks = DEFAULT_SEEKS; register_shrinker(&cache->c_shrink); + INIT_WORK(&cache->c_shrink_work, mb2_cache_shrink_worker); + return cache; err_out: -- cgit v0.10.2 From f0c8b46238db9d51ef9ea0858259958d0c601cec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 18:23:47 -0500 Subject: mbcache2: Use referenced bit instead of LRU Currently we maintain perfect LRU list by moving entry to the tail of the list when it gets used. However these operations on cache-global list are relatively expensive. In this patch we switch to lazy updates of LRU list. Whenever entry gets used, we set a referenced bit in it. When reclaiming entries, we give referenced entries another round in the LRU. Since the list is not a real LRU anymore, rename it to just 'list'. In my testing this logic gives about 30% boost to workloads with mostly unique xattr blocks (e.g. xattr-bench with 10 files and 10000 unique xattr values). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/mbcache2.c b/fs/mbcache2.c index 3e3198d..49f7a6f 100644 --- a/fs/mbcache2.c +++ b/fs/mbcache2.c @@ -30,9 +30,9 @@ struct mb2_cache { int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ int c_max_entries; - /* Protects c_lru_list, c_entry_count */ - spinlock_t c_lru_list_lock; - struct list_head c_lru_list; + /* Protects c_list, c_entry_count */ + spinlock_t c_list_lock; + struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; struct shrinker c_shrink; @@ -45,6 +45,29 @@ static struct kmem_cache *mb2_entry_cache; static unsigned long mb2_cache_shrink(struct mb2_cache *cache, unsigned int nr_to_scan); +static inline bool mb2_cache_entry_referenced(struct mb2_cache_entry *entry) +{ + return entry->_e_hash_list_head & 1; +} + +static inline void mb2_cache_entry_set_referenced(struct mb2_cache_entry *entry) +{ + entry->_e_hash_list_head |= 1; +} + +static inline void mb2_cache_entry_clear_referenced( + struct mb2_cache_entry *entry) +{ + entry->_e_hash_list_head &= ~1; +} + +static inline struct hlist_bl_head *mb2_cache_entry_head( + struct mb2_cache_entry *entry) +{ + return (struct hlist_bl_head *) + (entry->_e_hash_list_head & ~1); +} + /* * Number of entries to reclaim synchronously when there are too many entries * in cache @@ -80,13 +103,13 @@ int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, if (!entry) return -ENOMEM; - INIT_LIST_HEAD(&entry->e_lru_list); + INIT_LIST_HEAD(&entry->e_list); /* One ref for hash, one ref returned */ atomic_set(&entry->e_refcnt, 1); entry->e_key = key; entry->e_block = block; head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - entry->e_hash_list_head = head; + entry->_e_hash_list_head = (unsigned long)head; hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_block == block) { @@ -98,12 +121,12 @@ int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, hlist_bl_add_head(&entry->e_hash_list, head); hlist_bl_unlock(head); - spin_lock(&cache->c_lru_list_lock); - list_add_tail(&entry->e_lru_list, &cache->c_lru_list); + spin_lock(&cache->c_list_lock); + list_add_tail(&entry->e_list, &cache->c_list); /* Grab ref for LRU list */ atomic_inc(&entry->e_refcnt); cache->c_entry_count++; - spin_unlock(&cache->c_lru_list_lock); + spin_unlock(&cache->c_list_lock); return 0; } @@ -124,7 +147,7 @@ static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, struct hlist_bl_head *head; if (entry) - head = entry->e_hash_list_head; + head = mb2_cache_entry_head(entry); else head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; hlist_bl_lock(head); @@ -203,13 +226,13 @@ void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, /* We keep hash list reference to keep entry alive */ hlist_bl_del_init(&entry->e_hash_list); hlist_bl_unlock(head); - spin_lock(&cache->c_lru_list_lock); - if (!list_empty(&entry->e_lru_list)) { - list_del_init(&entry->e_lru_list); + spin_lock(&cache->c_list_lock); + if (!list_empty(&entry->e_list)) { + list_del_init(&entry->e_list); cache->c_entry_count--; atomic_dec(&entry->e_refcnt); } - spin_unlock(&cache->c_lru_list_lock); + spin_unlock(&cache->c_list_lock); mb2_cache_entry_put(cache, entry); return; } @@ -222,15 +245,12 @@ EXPORT_SYMBOL(mb2_cache_entry_delete_block); * @cache - cache the entry belongs to * @entry - entry that got used * - * Move entry in lru list to reflect the fact that it was used. + * Marks entry as used to give hit higher chances of surviving in cache. */ void mb2_cache_entry_touch(struct mb2_cache *cache, struct mb2_cache_entry *entry) { - spin_lock(&cache->c_lru_list_lock); - if (!list_empty(&entry->e_lru_list)) - list_move_tail(&cache->c_lru_list, &entry->e_lru_list); - spin_unlock(&cache->c_lru_list_lock); + mb2_cache_entry_set_referenced(entry); } EXPORT_SYMBOL(mb2_cache_entry_touch); @@ -251,18 +271,23 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, struct hlist_bl_head *head; unsigned int shrunk = 0; - spin_lock(&cache->c_lru_list_lock); - while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { - entry = list_first_entry(&cache->c_lru_list, - struct mb2_cache_entry, e_lru_list); - list_del_init(&entry->e_lru_list); + spin_lock(&cache->c_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_list)) { + entry = list_first_entry(&cache->c_list, + struct mb2_cache_entry, e_list); + if (mb2_cache_entry_referenced(entry)) { + mb2_cache_entry_clear_referenced(entry); + list_move_tail(&cache->c_list, &entry->e_list); + continue; + } + list_del_init(&entry->e_list); cache->c_entry_count--; /* * We keep LRU list reference so that entry doesn't go away * from under us. */ - spin_unlock(&cache->c_lru_list_lock); - head = entry->e_hash_list_head; + spin_unlock(&cache->c_list_lock); + head = mb2_cache_entry_head(entry); hlist_bl_lock(head); if (!hlist_bl_unhashed(&entry->e_hash_list)) { hlist_bl_del_init(&entry->e_hash_list); @@ -272,9 +297,9 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, if (mb2_cache_entry_put(cache, entry)) shrunk++; cond_resched(); - spin_lock(&cache->c_lru_list_lock); + spin_lock(&cache->c_list_lock); } - spin_unlock(&cache->c_lru_list_lock); + spin_unlock(&cache->c_list_lock); return shrunk; } @@ -318,8 +343,8 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) goto err_out; cache->c_bucket_bits = bucket_bits; cache->c_max_entries = bucket_count << 4; - INIT_LIST_HEAD(&cache->c_lru_list); - spin_lock_init(&cache->c_lru_list_lock); + INIT_LIST_HEAD(&cache->c_list); + spin_lock_init(&cache->c_list_lock); cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_hash) { @@ -361,13 +386,13 @@ void mb2_cache_destroy(struct mb2_cache *cache) * We don't bother with any locking. Cache must not be used at this * point. */ - list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) { + list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { if (!hlist_bl_unhashed(&entry->e_hash_list)) { hlist_bl_del_init(&entry->e_hash_list); atomic_dec(&entry->e_refcnt); } else WARN_ON(1); - list_del(&entry->e_lru_list); + list_del(&entry->e_list); WARN_ON(atomic_read(&entry->e_refcnt) != 1); mb2_cache_entry_put(cache, entry); } diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h index b6f160f..c934843 100644 --- a/include/linux/mbcache2.h +++ b/include/linux/mbcache2.h @@ -10,8 +10,8 @@ struct mb2_cache; struct mb2_cache_entry { - /* LRU list - protected by cache->c_lru_list_lock */ - struct list_head e_lru_list; + /* List of entries in cache - protected by cache->c_list_lock */ + struct list_head e_list; /* Hash table list - protected by bitlock in e_hash_list_head */ struct hlist_bl_node e_hash_list; atomic_t e_refcnt; @@ -19,8 +19,11 @@ struct mb2_cache_entry { u32 e_key; /* Block number of hashed block - stable during lifetime of the entry */ sector_t e_block; - /* Head of hash list (for list bit lock) - stable */ - struct hlist_bl_head *e_hash_list_head; + /* + * Head of hash list (for list bit lock) - stable. Combined with + * referenced bit of entry + */ + unsigned long _e_hash_list_head; }; struct mb2_cache *mb2_cache_create(int bucket_bits); -- cgit v0.10.2 From 7a2508e1b657cfc7e1371550f88c7a7bc4288f32 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 22:35:22 -0500 Subject: mbcache2: rename to mbcache Since old mbcache code is gone, let's rename new code to mbcache since number 2 is now meaningless. This is just a mechanical replacement. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/Makefile b/fs/Makefile index 59b8440..79f5225 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o -obj-$(CONFIG_FS_MBCACHE) += mbcache2.o +obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_COREDUMP) += coredump.o diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index f98ce7e..170939f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -61,7 +61,7 @@ struct ext2_block_alloc_info { #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end -struct mb2_cache; +struct mb_cache; /* * second extended-fs super-block data in memory @@ -113,7 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; - struct mb2_cache *s_mb_cache; + struct mb_cache *s_mb_cache; }; static inline spinlock_t * diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 7162b48..71d58c2 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include #include @@ -90,7 +90,7 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *, struct ext2_xattr_header *); -static int ext2_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); +static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext2_xattr_cache_find(struct inode *, struct ext2_xattr_header *); static void ext2_xattr_rehash(struct ext2_xattr_header *, @@ -150,7 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; - struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -246,7 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; - struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -493,8 +493,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set", * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect modified block */ - mb2_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, + hash, bh->b_blocknr); /* keep the buffer locked while modifying it. */ } else { @@ -627,7 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; - struct mb2_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; + struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect freed block */ - mb2_cache_entry_delete_block(ext2_mb_cache, - hash, old_bh->b_blocknr); + mb_cache_entry_delete_block(ext2_mb_cache, + hash, old_bh->b_blocknr); /* Free the old block. */ ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); @@ -786,8 +786,8 @@ ext2_xattr_delete_inode(struct inode *inode) * This must happen under buffer lock for ext2_xattr_set2() to * reliably detect freed block */ - mb2_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, + hash, bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); @@ -818,12 +818,12 @@ cleanup: * Returns 0, or a negative error number on failure. */ static int -ext2_xattr_cache_insert(struct mb2_cache *cache, struct buffer_head *bh) +ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(HDR(bh)->h_hash); int error; - error = mb2_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); + error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); if (error) { if (error == -EBUSY) { ea_bdebug(bh, "already in cache (%d cache entries)", @@ -887,14 +887,14 @@ static struct buffer_head * ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb2_cache_entry *ce; - struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache_entry *ce; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb2_cache_entry_find_first(ext2_mb_cache, hash); + ce = mb_cache_entry_find_first(ext2_mb_cache, hash); while (ce) { struct buffer_head *bh; @@ -915,7 +915,7 @@ again: * entry is still hashed is reliable. */ if (hlist_bl_unhashed(&ce->e_hash_list)) { - mb2_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_put(ext2_mb_cache, ce); unlock_buffer(bh); brelse(bh); goto again; @@ -928,14 +928,14 @@ again: } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb2_cache_entry_touch(ext2_mb_cache, ce); - mb2_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_touch(ext2_mb_cache, ce); + mb_cache_entry_put(ext2_mb_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb2_cache_entry_find_next(ext2_mb_cache, ce); + ce = mb_cache_entry_find_next(ext2_mb_cache, ce); } return NULL; } @@ -1010,13 +1010,13 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, #define HASH_BUCKET_BITS 10 -struct mb2_cache *ext2_xattr_create_cache(void) +struct mb_cache *ext2_xattr_create_cache(void) { - return mb2_cache_create(HASH_BUCKET_BITS); + return mb_cache_create(HASH_BUCKET_BITS); } -void ext2_xattr_destroy_cache(struct mb2_cache *cache) +void ext2_xattr_destroy_cache(struct mb_cache *cache) { if (cache) - mb2_cache_destroy(cache); + mb_cache_destroy(cache); } diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 6ea38aa..6f82ab1 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -53,7 +53,7 @@ struct ext2_xattr_entry { #define EXT2_XATTR_SIZE(size) \ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) -struct mb2_cache; +struct mb_cache; # ifdef CONFIG_EXT2_FS_XATTR @@ -68,8 +68,8 @@ extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_ extern void ext2_xattr_delete_inode(struct inode *); -extern struct mb2_cache *ext2_xattr_create_cache(void); -extern void ext2_xattr_destroy_cache(struct mb2_cache *cache); +extern struct mb_cache *ext2_xattr_create_cache(void); +extern void ext2_xattr_destroy_cache(struct mb_cache *cache); extern const struct xattr_handler *ext2_xattr_handlers[]; @@ -94,7 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) { } -static inline void ext2_xattr_destroy_cache(struct mb2_cache *cache) +static inline void ext2_xattr_destroy_cache(struct mb_cache *cache) { } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9ac9e62..157b458 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1468,7 +1468,7 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb2_cache *s_mb_cache; + struct mb_cache *s_mb_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fe9f8d6..c6af8a7 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include "ext4_jbd2.h" #include "ext4.h" @@ -78,10 +78,10 @@ # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, - struct mb2_cache_entry **); + struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); static int ext4_xattr_list(struct dentry *dentry, char *buffer, @@ -276,7 +276,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -428,7 +428,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -561,8 +561,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb2_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, - bh->b_blocknr); + mb_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -782,9 +782,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search *s = &bs->s; - struct mb2_cache_entry *ce = NULL; + struct mb_cache_entry *ce = NULL; int error = 0; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) @@ -805,8 +805,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb2_cache_entry_delete_block(ext4_mb_cache, hash, - bs->bh->b_blocknr); + mb_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s); if (!error) { @@ -904,7 +904,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); - mb2_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); ce = NULL; new_bh = NULL; goto inserted; @@ -919,8 +919,8 @@ inserted: if (error) goto cleanup_dquot; } - mb2_cache_entry_touch(ext4_mb_cache, ce); - mb2_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_touch(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -985,7 +985,7 @@ getblk_failed: cleanup: if (ce) - mb2_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1546,13 +1546,13 @@ cleanup: * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct mb2_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); int error; - error = mb2_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, - bh->b_blocknr); + error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + bh->b_blocknr); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); @@ -1610,16 +1610,16 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, */ static struct buffer_head * ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb2_cache_entry **pce) + struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb2_cache_entry *ce; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); - ce = mb2_cache_entry_find_first(ext4_mb_cache, hash); + ce = mb_cache_entry_find_first(ext4_mb_cache, hash); while (ce) { struct buffer_head *bh; @@ -1638,7 +1638,7 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, return bh; } brelse(bh); - ce = mb2_cache_entry_find_next(ext4_mb_cache, ce); + ce = mb_cache_entry_find_next(ext4_mb_cache, ce); } return NULL; } @@ -1713,15 +1713,15 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #define HASH_BUCKET_BITS 10 -struct mb2_cache * +struct mb_cache * ext4_xattr_create_cache(void) { - return mb2_cache_create(HASH_BUCKET_BITS); + return mb_cache_create(HASH_BUCKET_BITS); } -void ext4_xattr_destroy_cache(struct mb2_cache *cache) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { if (cache) - mb2_cache_destroy(cache); + mb_cache_destroy(cache); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 10b0f73..69dd3e6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -123,8 +123,8 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern struct mb2_cache *ext4_xattr_create_cache(void); -extern void ext4_xattr_destroy_cache(struct mb2_cache *); +extern struct mb_cache *ext4_xattr_create_cache(void); +extern void ext4_xattr_destroy_cache(struct mb_cache *); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, diff --git a/fs/mbcache.c b/fs/mbcache.c new file mode 100644 index 0000000..4241b63 --- /dev/null +++ b/fs/mbcache.c @@ -0,0 +1,424 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mbcache is a simple key-value store. Keys need not be unique, however + * key-value pairs are expected to be unique (we use this fact in + * mb_cache_entry_delete_block()). + * + * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. + * They use hash of a block contents as a key and block number as a value. + * That's why keys need not be unique (different xattr blocks may end up having + * the same hash). However block number always uniquely identifies a cache + * entry. + * + * We provide functions for creation and removal of entries, search by key, + * and a special "delete entry with given key-value pair" operation. Fixed + * size hash table is used for fast key lookups. + */ + +struct mb_cache { + /* Hash table of entries */ + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; + /* Maximum entries in cache to avoid degrading hash too much */ + int c_max_entries; + /* Protects c_list, c_entry_count */ + spinlock_t c_list_lock; + struct list_head c_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; + /* Work for shrinking when the cache has too many entries */ + struct work_struct c_shrink_work; +}; + +static struct kmem_cache *mb_entry_cache; + +static unsigned long mb_cache_shrink(struct mb_cache *cache, + unsigned int nr_to_scan); + +static inline bool mb_cache_entry_referenced(struct mb_cache_entry *entry) +{ + return entry->_e_hash_list_head & 1; +} + +static inline void mb_cache_entry_set_referenced(struct mb_cache_entry *entry) +{ + entry->_e_hash_list_head |= 1; +} + +static inline void mb_cache_entry_clear_referenced( + struct mb_cache_entry *entry) +{ + entry->_e_hash_list_head &= ~1; +} + +static inline struct hlist_bl_head *mb_cache_entry_head( + struct mb_cache_entry *entry) +{ + return (struct hlist_bl_head *) + (entry->_e_hash_list_head & ~1); +} + +/* + * Number of entries to reclaim synchronously when there are too many entries + * in cache + */ +#define SYNC_SHRINK_BATCH 64 + +/* + * mb_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created + * @mask - gfp mask with which the entry should be allocated + * @key - key of the entry + * @block - block that contains data + * + * Creates entry in @cache with key @key and records that data is stored in + * block @block. The function returns -EBUSY if entry with the same key + * and for the same block already exists in cache. Otherwise 0 is returned. + */ +int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, + sector_t block) +{ + struct mb_cache_entry *entry, *dup; + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + + /* Schedule background reclaim if there are too many entries */ + if (cache->c_entry_count >= cache->c_max_entries) + schedule_work(&cache->c_shrink_work); + /* Do some sync reclaim if background reclaim cannot keep up */ + if (cache->c_entry_count >= 2*cache->c_max_entries) + mb_cache_shrink(cache, SYNC_SHRINK_BATCH); + + entry = kmem_cache_alloc(mb_entry_cache, mask); + if (!entry) + return -ENOMEM; + + INIT_LIST_HEAD(&entry->e_list); + /* One ref for hash, one ref returned */ + atomic_set(&entry->e_refcnt, 1); + entry->e_key = key; + entry->e_block = block; + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + entry->_e_hash_list_head = (unsigned long)head; + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { + hlist_bl_unlock(head); + kmem_cache_free(mb_entry_cache, entry); + return -EBUSY; + } + } + hlist_bl_add_head(&entry->e_hash_list, head); + hlist_bl_unlock(head); + + spin_lock(&cache->c_list_lock); + list_add_tail(&entry->e_list, &cache->c_list); + /* Grab ref for LRU list */ + atomic_inc(&entry->e_refcnt); + cache->c_entry_count++; + spin_unlock(&cache->c_list_lock); + + return 0; +} +EXPORT_SYMBOL(mb_cache_entry_create); + +void __mb_cache_entry_free(struct mb_cache_entry *entry) +{ + kmem_cache_free(mb_entry_cache, entry); +} +EXPORT_SYMBOL(__mb_cache_entry_free); + +static struct mb_cache_entry *__entry_find(struct mb_cache *cache, + struct mb_cache_entry *entry, + u32 key) +{ + struct mb_cache_entry *old_entry = entry; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + + if (entry) + head = mb_cache_entry_head(entry); + else + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) + node = entry->e_hash_list.next; + else + node = hlist_bl_first(head); + while (node) { + entry = hlist_bl_entry(node, struct mb_cache_entry, + e_hash_list); + if (entry->e_key == key) { + atomic_inc(&entry->e_refcnt); + goto out; + } + node = node->next; + } + entry = NULL; +out: + hlist_bl_unlock(head); + if (old_entry) + mb_cache_entry_put(cache, old_entry); + + return entry; +} + +/* + * mb_cache_entry_find_first - find the first entry in cache with given key + * @cache: cache where we should search + * @key: key to look for + * + * Search in @cache for entry with key @key. Grabs reference to the first + * entry found and returns the entry. + */ +struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, + u32 key) +{ + return __entry_find(cache, NULL, key); +} +EXPORT_SYMBOL(mb_cache_entry_find_first); + +/* + * mb_cache_entry_find_next - find next entry in cache with the same + * @cache: cache where we should search + * @entry: entry to start search from + * + * Finds next entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races + * with the search), finds the first entry in the hash chain. The function + * drops reference to @entry and returns with a reference to the found entry. + */ +struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, + struct mb_cache_entry *entry) +{ + return __entry_find(cache, entry, entry->e_key); +} +EXPORT_SYMBOL(mb_cache_entry_find_next); + +/* mb_cache_entry_delete_block - remove information about block from cache + * @cache - cache we work with + * @key - key of the entry to remove + * @block - block containing data for @key + * + * Remove entry from cache @cache with key @key with data stored in @block. + */ +void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, + sector_t block) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb_cache_entry *entry; + + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + /* We keep hash list reference to keep entry alive */ + hlist_bl_del_init(&entry->e_hash_list); + hlist_bl_unlock(head); + spin_lock(&cache->c_list_lock); + if (!list_empty(&entry->e_list)) { + list_del_init(&entry->e_list); + cache->c_entry_count--; + atomic_dec(&entry->e_refcnt); + } + spin_unlock(&cache->c_list_lock); + mb_cache_entry_put(cache, entry); + return; + } + } + hlist_bl_unlock(head); +} +EXPORT_SYMBOL(mb_cache_entry_delete_block); + +/* mb_cache_entry_touch - cache entry got used + * @cache - cache the entry belongs to + * @entry - entry that got used + * + * Marks entry as used to give hit higher chances of surviving in cache. + */ +void mb_cache_entry_touch(struct mb_cache *cache, + struct mb_cache_entry *entry) +{ + mb_cache_entry_set_referenced(entry); +} +EXPORT_SYMBOL(mb_cache_entry_touch); + +static unsigned long mb_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct mb_cache *cache = container_of(shrink, struct mb_cache, + c_shrink); + + return cache->c_entry_count; +} + +/* Shrink number of entries in cache */ +static unsigned long mb_cache_shrink(struct mb_cache *cache, + unsigned int nr_to_scan) +{ + struct mb_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; + + spin_lock(&cache->c_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_list)) { + entry = list_first_entry(&cache->c_list, + struct mb_cache_entry, e_list); + if (mb_cache_entry_referenced(entry)) { + mb_cache_entry_clear_referenced(entry); + list_move_tail(&cache->c_list, &entry->e_list); + continue; + } + list_del_init(&entry->e_list); + cache->c_entry_count--; + /* + * We keep LRU list reference so that entry doesn't go away + * from under us. + */ + spin_unlock(&cache->c_list_lock); + head = mb_cache_entry_head(entry); + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } + hlist_bl_unlock(head); + if (mb_cache_entry_put(cache, entry)) + shrunk++; + cond_resched(); + spin_lock(&cache->c_list_lock); + } + spin_unlock(&cache->c_list_lock); + + return shrunk; +} + +static unsigned long mb_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + struct mb_cache *cache = container_of(shrink, struct mb_cache, + c_shrink); + return mb_cache_shrink(cache, nr_to_scan); +} + +/* We shrink 1/X of the cache when we have too many entries in it */ +#define SHRINK_DIVISOR 16 + +static void mb_cache_shrink_worker(struct work_struct *work) +{ + struct mb_cache *cache = container_of(work, struct mb_cache, + c_shrink_work); + mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); +} + +/* + * mb_cache_create - create cache + * @bucket_bits: log2 of the hash table size + * + * Create cache for keys with 2^bucket_bits hash entries. + */ +struct mb_cache *mb_cache_create(int bucket_bits) +{ + struct mb_cache *cache; + int bucket_count = 1 << bucket_bits; + int i; + + if (!try_module_get(THIS_MODULE)) + return NULL; + + cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; + cache->c_max_entries = bucket_count << 4; + INIT_LIST_HEAD(&cache->c_list); + spin_lock_init(&cache->c_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), + GFP_KERNEL); + if (!cache->c_hash) { + kfree(cache); + goto err_out; + } + for (i = 0; i < bucket_count; i++) + INIT_HLIST_BL_HEAD(&cache->c_hash[i]); + + cache->c_shrink.count_objects = mb_cache_count; + cache->c_shrink.scan_objects = mb_cache_scan; + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); + + INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); + + return cache; + +err_out: + module_put(THIS_MODULE); + return NULL; +} +EXPORT_SYMBOL(mb_cache_create); + +/* + * mb_cache_destroy - destroy cache + * @cache: the cache to destroy + * + * Free all entries in cache and cache itself. Caller must make sure nobody + * (except shrinker) can reach @cache when calling this. + */ +void mb_cache_destroy(struct mb_cache *cache) +{ + struct mb_cache_entry *entry, *next; + + unregister_shrinker(&cache->c_shrink); + + /* + * We don't bother with any locking. Cache must not be used at this + * point. + */ + list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } else + WARN_ON(1); + list_del(&entry->e_list); + WARN_ON(atomic_read(&entry->e_refcnt) != 1); + mb_cache_entry_put(cache, entry); + } + kfree(cache->c_hash); + kfree(cache); + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(mb_cache_destroy); + +static int __init mbcache_init(void) +{ + mb_entry_cache = kmem_cache_create("mbcache", + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + BUG_ON(!mb_entry_cache); + return 0; +} + +static void __exit mbcache_exit(void) +{ + kmem_cache_destroy(mb_entry_cache); +} + +module_init(mbcache_init) +module_exit(mbcache_exit) + +MODULE_AUTHOR("Jan Kara "); +MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +MODULE_LICENSE("GPL"); diff --git a/fs/mbcache2.c b/fs/mbcache2.c deleted file mode 100644 index 49f7a6f..0000000 --- a/fs/mbcache2.c +++ /dev/null @@ -1,424 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Mbcache is a simple key-value store. Keys need not be unique, however - * key-value pairs are expected to be unique (we use this fact in - * mb2_cache_entry_delete_block()). - * - * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. - * They use hash of a block contents as a key and block number as a value. - * That's why keys need not be unique (different xattr blocks may end up having - * the same hash). However block number always uniquely identifies a cache - * entry. - * - * We provide functions for creation and removal of entries, search by key, - * and a special "delete entry with given key-value pair" operation. Fixed - * size hash table is used for fast key lookups. - */ - -struct mb2_cache { - /* Hash table of entries */ - struct hlist_bl_head *c_hash; - /* log2 of hash table size */ - int c_bucket_bits; - /* Maximum entries in cache to avoid degrading hash too much */ - int c_max_entries; - /* Protects c_list, c_entry_count */ - spinlock_t c_list_lock; - struct list_head c_list; - /* Number of entries in cache */ - unsigned long c_entry_count; - struct shrinker c_shrink; - /* Work for shrinking when the cache has too many entries */ - struct work_struct c_shrink_work; -}; - -static struct kmem_cache *mb2_entry_cache; - -static unsigned long mb2_cache_shrink(struct mb2_cache *cache, - unsigned int nr_to_scan); - -static inline bool mb2_cache_entry_referenced(struct mb2_cache_entry *entry) -{ - return entry->_e_hash_list_head & 1; -} - -static inline void mb2_cache_entry_set_referenced(struct mb2_cache_entry *entry) -{ - entry->_e_hash_list_head |= 1; -} - -static inline void mb2_cache_entry_clear_referenced( - struct mb2_cache_entry *entry) -{ - entry->_e_hash_list_head &= ~1; -} - -static inline struct hlist_bl_head *mb2_cache_entry_head( - struct mb2_cache_entry *entry) -{ - return (struct hlist_bl_head *) - (entry->_e_hash_list_head & ~1); -} - -/* - * Number of entries to reclaim synchronously when there are too many entries - * in cache - */ -#define SYNC_SHRINK_BATCH 64 - -/* - * mb2_cache_entry_create - create entry in cache - * @cache - cache where the entry should be created - * @mask - gfp mask with which the entry should be allocated - * @key - key of the entry - * @block - block that contains data - * - * Creates entry in @cache with key @key and records that data is stored in - * block @block. The function returns -EBUSY if entry with the same key - * and for the same block already exists in cache. Otherwise 0 is returned. - */ -int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, - sector_t block) -{ - struct mb2_cache_entry *entry, *dup; - struct hlist_bl_node *dup_node; - struct hlist_bl_head *head; - - /* Schedule background reclaim if there are too many entries */ - if (cache->c_entry_count >= cache->c_max_entries) - schedule_work(&cache->c_shrink_work); - /* Do some sync reclaim if background reclaim cannot keep up */ - if (cache->c_entry_count >= 2*cache->c_max_entries) - mb2_cache_shrink(cache, SYNC_SHRINK_BATCH); - - entry = kmem_cache_alloc(mb2_entry_cache, mask); - if (!entry) - return -ENOMEM; - - INIT_LIST_HEAD(&entry->e_list); - /* One ref for hash, one ref returned */ - atomic_set(&entry->e_refcnt, 1); - entry->e_key = key; - entry->e_block = block; - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - entry->_e_hash_list_head = (unsigned long)head; - hlist_bl_lock(head); - hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { - if (dup->e_key == key && dup->e_block == block) { - hlist_bl_unlock(head); - kmem_cache_free(mb2_entry_cache, entry); - return -EBUSY; - } - } - hlist_bl_add_head(&entry->e_hash_list, head); - hlist_bl_unlock(head); - - spin_lock(&cache->c_list_lock); - list_add_tail(&entry->e_list, &cache->c_list); - /* Grab ref for LRU list */ - atomic_inc(&entry->e_refcnt); - cache->c_entry_count++; - spin_unlock(&cache->c_list_lock); - - return 0; -} -EXPORT_SYMBOL(mb2_cache_entry_create); - -void __mb2_cache_entry_free(struct mb2_cache_entry *entry) -{ - kmem_cache_free(mb2_entry_cache, entry); -} -EXPORT_SYMBOL(__mb2_cache_entry_free); - -static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, - struct mb2_cache_entry *entry, - u32 key) -{ - struct mb2_cache_entry *old_entry = entry; - struct hlist_bl_node *node; - struct hlist_bl_head *head; - - if (entry) - head = mb2_cache_entry_head(entry); - else - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - hlist_bl_lock(head); - if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) - node = entry->e_hash_list.next; - else - node = hlist_bl_first(head); - while (node) { - entry = hlist_bl_entry(node, struct mb2_cache_entry, - e_hash_list); - if (entry->e_key == key) { - atomic_inc(&entry->e_refcnt); - goto out; - } - node = node->next; - } - entry = NULL; -out: - hlist_bl_unlock(head); - if (old_entry) - mb2_cache_entry_put(cache, old_entry); - - return entry; -} - -/* - * mb2_cache_entry_find_first - find the first entry in cache with given key - * @cache: cache where we should search - * @key: key to look for - * - * Search in @cache for entry with key @key. Grabs reference to the first - * entry found and returns the entry. - */ -struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, - u32 key) -{ - return __entry_find(cache, NULL, key); -} -EXPORT_SYMBOL(mb2_cache_entry_find_first); - -/* - * mb2_cache_entry_find_next - find next entry in cache with the same - * @cache: cache where we should search - * @entry: entry to start search from - * - * Finds next entry in the hash chain which has the same key as @entry. - * If @entry is unhashed (which can happen when deletion of entry races - * with the search), finds the first entry in the hash chain. The function - * drops reference to @entry and returns with a reference to the found entry. - */ -struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, - struct mb2_cache_entry *entry) -{ - return __entry_find(cache, entry, entry->e_key); -} -EXPORT_SYMBOL(mb2_cache_entry_find_next); - -/* mb2_cache_entry_delete_block - remove information about block from cache - * @cache - cache we work with - * @key - key of the entry to remove - * @block - block containing data for @key - * - * Remove entry from cache @cache with key @key with data stored in @block. - */ -void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, - sector_t block) -{ - struct hlist_bl_node *node; - struct hlist_bl_head *head; - struct mb2_cache_entry *entry; - - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - hlist_bl_lock(head); - hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { - /* We keep hash list reference to keep entry alive */ - hlist_bl_del_init(&entry->e_hash_list); - hlist_bl_unlock(head); - spin_lock(&cache->c_list_lock); - if (!list_empty(&entry->e_list)) { - list_del_init(&entry->e_list); - cache->c_entry_count--; - atomic_dec(&entry->e_refcnt); - } - spin_unlock(&cache->c_list_lock); - mb2_cache_entry_put(cache, entry); - return; - } - } - hlist_bl_unlock(head); -} -EXPORT_SYMBOL(mb2_cache_entry_delete_block); - -/* mb2_cache_entry_touch - cache entry got used - * @cache - cache the entry belongs to - * @entry - entry that got used - * - * Marks entry as used to give hit higher chances of surviving in cache. - */ -void mb2_cache_entry_touch(struct mb2_cache *cache, - struct mb2_cache_entry *entry) -{ - mb2_cache_entry_set_referenced(entry); -} -EXPORT_SYMBOL(mb2_cache_entry_touch); - -static unsigned long mb2_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct mb2_cache *cache = container_of(shrink, struct mb2_cache, - c_shrink); - - return cache->c_entry_count; -} - -/* Shrink number of entries in cache */ -static unsigned long mb2_cache_shrink(struct mb2_cache *cache, - unsigned int nr_to_scan) -{ - struct mb2_cache_entry *entry; - struct hlist_bl_head *head; - unsigned int shrunk = 0; - - spin_lock(&cache->c_list_lock); - while (nr_to_scan-- && !list_empty(&cache->c_list)) { - entry = list_first_entry(&cache->c_list, - struct mb2_cache_entry, e_list); - if (mb2_cache_entry_referenced(entry)) { - mb2_cache_entry_clear_referenced(entry); - list_move_tail(&cache->c_list, &entry->e_list); - continue; - } - list_del_init(&entry->e_list); - cache->c_entry_count--; - /* - * We keep LRU list reference so that entry doesn't go away - * from under us. - */ - spin_unlock(&cache->c_list_lock); - head = mb2_cache_entry_head(entry); - hlist_bl_lock(head); - if (!hlist_bl_unhashed(&entry->e_hash_list)) { - hlist_bl_del_init(&entry->e_hash_list); - atomic_dec(&entry->e_refcnt); - } - hlist_bl_unlock(head); - if (mb2_cache_entry_put(cache, entry)) - shrunk++; - cond_resched(); - spin_lock(&cache->c_list_lock); - } - spin_unlock(&cache->c_list_lock); - - return shrunk; -} - -static unsigned long mb2_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - int nr_to_scan = sc->nr_to_scan; - struct mb2_cache *cache = container_of(shrink, struct mb2_cache, - c_shrink); - return mb2_cache_shrink(cache, nr_to_scan); -} - -/* We shrink 1/X of the cache when we have too many entries in it */ -#define SHRINK_DIVISOR 16 - -static void mb2_cache_shrink_worker(struct work_struct *work) -{ - struct mb2_cache *cache = container_of(work, struct mb2_cache, - c_shrink_work); - mb2_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); -} - -/* - * mb2_cache_create - create cache - * @bucket_bits: log2 of the hash table size - * - * Create cache for keys with 2^bucket_bits hash entries. - */ -struct mb2_cache *mb2_cache_create(int bucket_bits) -{ - struct mb2_cache *cache; - int bucket_count = 1 << bucket_bits; - int i; - - if (!try_module_get(THIS_MODULE)) - return NULL; - - cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL); - if (!cache) - goto err_out; - cache->c_bucket_bits = bucket_bits; - cache->c_max_entries = bucket_count << 4; - INIT_LIST_HEAD(&cache->c_list); - spin_lock_init(&cache->c_list_lock); - cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), - GFP_KERNEL); - if (!cache->c_hash) { - kfree(cache); - goto err_out; - } - for (i = 0; i < bucket_count; i++) - INIT_HLIST_BL_HEAD(&cache->c_hash[i]); - - cache->c_shrink.count_objects = mb2_cache_count; - cache->c_shrink.scan_objects = mb2_cache_scan; - cache->c_shrink.seeks = DEFAULT_SEEKS; - register_shrinker(&cache->c_shrink); - - INIT_WORK(&cache->c_shrink_work, mb2_cache_shrink_worker); - - return cache; - -err_out: - module_put(THIS_MODULE); - return NULL; -} -EXPORT_SYMBOL(mb2_cache_create); - -/* - * mb2_cache_destroy - destroy cache - * @cache: the cache to destroy - * - * Free all entries in cache and cache itself. Caller must make sure nobody - * (except shrinker) can reach @cache when calling this. - */ -void mb2_cache_destroy(struct mb2_cache *cache) -{ - struct mb2_cache_entry *entry, *next; - - unregister_shrinker(&cache->c_shrink); - - /* - * We don't bother with any locking. Cache must not be used at this - * point. - */ - list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { - if (!hlist_bl_unhashed(&entry->e_hash_list)) { - hlist_bl_del_init(&entry->e_hash_list); - atomic_dec(&entry->e_refcnt); - } else - WARN_ON(1); - list_del(&entry->e_list); - WARN_ON(atomic_read(&entry->e_refcnt) != 1); - mb2_cache_entry_put(cache, entry); - } - kfree(cache->c_hash); - kfree(cache); - module_put(THIS_MODULE); -} -EXPORT_SYMBOL(mb2_cache_destroy); - -static int __init mb2cache_init(void) -{ - mb2_entry_cache = kmem_cache_create("mbcache", - sizeof(struct mb2_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - BUG_ON(!mb2_entry_cache); - return 0; -} - -static void __exit mb2cache_exit(void) -{ - kmem_cache_destroy(mb2_entry_cache); -} - -module_init(mb2cache_init) -module_exit(mb2cache_exit) - -MODULE_AUTHOR("Jan Kara "); -MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h new file mode 100644 index 0000000..a74a1f3 --- /dev/null +++ b/include/linux/mbcache.h @@ -0,0 +1,53 @@ +#ifndef _LINUX_MBCACHE_H +#define _LINUX_MBCACHE_H + +#include +#include +#include +#include +#include + +struct mb_cache; + +struct mb_cache_entry { + /* List of entries in cache - protected by cache->c_list_lock */ + struct list_head e_list; + /* Hash table list - protected by bitlock in e_hash_list_head */ + struct hlist_bl_node e_hash_list; + atomic_t e_refcnt; + /* Key in hash - stable during lifetime of the entry */ + u32 e_key; + /* Block number of hashed block - stable during lifetime of the entry */ + sector_t e_block; + /* + * Head of hash list (for list bit lock) - stable. Combined with + * referenced bit of entry + */ + unsigned long _e_hash_list_head; +}; + +struct mb_cache *mb_cache_create(int bucket_bits); +void mb_cache_destroy(struct mb_cache *cache); + +int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, + sector_t block); +void __mb_cache_entry_free(struct mb_cache_entry *entry); +static inline int mb_cache_entry_put(struct mb_cache *cache, + struct mb_cache_entry *entry) +{ + if (!atomic_dec_and_test(&entry->e_refcnt)) + return 0; + __mb_cache_entry_free(entry); + return 1; +} + +void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, + sector_t block); +struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, + u32 key); +struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, + struct mb_cache_entry *entry); +void mb_cache_entry_touch(struct mb_cache *cache, + struct mb_cache_entry *entry); + +#endif /* _LINUX_MBCACHE_H */ diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h deleted file mode 100644 index c934843..0000000 --- a/include/linux/mbcache2.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef _LINUX_MB2CACHE_H -#define _LINUX_MB2CACHE_H - -#include -#include -#include -#include -#include - -struct mb2_cache; - -struct mb2_cache_entry { - /* List of entries in cache - protected by cache->c_list_lock */ - struct list_head e_list; - /* Hash table list - protected by bitlock in e_hash_list_head */ - struct hlist_bl_node e_hash_list; - atomic_t e_refcnt; - /* Key in hash - stable during lifetime of the entry */ - u32 e_key; - /* Block number of hashed block - stable during lifetime of the entry */ - sector_t e_block; - /* - * Head of hash list (for list bit lock) - stable. Combined with - * referenced bit of entry - */ - unsigned long _e_hash_list_head; -}; - -struct mb2_cache *mb2_cache_create(int bucket_bits); -void mb2_cache_destroy(struct mb2_cache *cache); - -int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, - sector_t block); -void __mb2_cache_entry_free(struct mb2_cache_entry *entry); -static inline int mb2_cache_entry_put(struct mb2_cache *cache, - struct mb2_cache_entry *entry) -{ - if (!atomic_dec_and_test(&entry->e_refcnt)) - return 0; - __mb2_cache_entry_free(entry); - return 1; -} - -void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, - sector_t block); -struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, - u32 key); -struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, - struct mb2_cache_entry *entry); -void mb2_cache_entry_touch(struct mb2_cache *cache, - struct mb2_cache_entry *entry); - -#endif /* _LINUX_MB2CACHE_H */ -- cgit v0.10.2 From 2335d05f3a83f5290ec28c1ed30c1c742a37edc9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 22 Feb 2016 22:41:05 -0500 Subject: ext4: kill ext4_mballoc_ready This variable, introduced in commit 9c191f70, is unnecessary: it is set once the module has been initialized correctly, and ext4_fill_super cannot run unless the module has been initialized correctly. Signed-off-by: Andreas Gruenbacher Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ecc37e1..2f55051 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -55,7 +55,6 @@ static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; -static int ext4_mballoc_ready; static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, @@ -3795,12 +3794,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - if (ext4_mballoc_ready) { - sbi->s_mb_cache = ext4_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); - goto failed_mount_wq; - } + sbi->s_mb_cache = ext4_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && @@ -5361,8 +5358,6 @@ static int __init ext4_init_fs(void) err = ext4_init_mballoc(); if (err) goto out2; - else - ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5378,7 +5373,6 @@ out: unregister_as_ext3(); destroy_inodecache(); out1: - ext4_mballoc_ready = 0; ext4_exit_mballoc(); out2: ext4_exit_sysfs(); -- cgit v0.10.2 From dc8d5e565f00c9442fa1cbf9acc115475628527c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 22 Feb 2016 22:42:05 -0500 Subject: mbcache: get rid of _e_hash_list_head Get rid of field _e_hash_list_head in cache entries and add bit field e_referenced instead. Signed-off-by: Andreas Gruenbacher Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/mbcache.c b/fs/mbcache.c index 4241b63..903be15 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -45,27 +45,10 @@ static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned int nr_to_scan); -static inline bool mb_cache_entry_referenced(struct mb_cache_entry *entry) +static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, + u32 key) { - return entry->_e_hash_list_head & 1; -} - -static inline void mb_cache_entry_set_referenced(struct mb_cache_entry *entry) -{ - entry->_e_hash_list_head |= 1; -} - -static inline void mb_cache_entry_clear_referenced( - struct mb_cache_entry *entry) -{ - entry->_e_hash_list_head &= ~1; -} - -static inline struct hlist_bl_head *mb_cache_entry_head( - struct mb_cache_entry *entry) -{ - return (struct hlist_bl_head *) - (entry->_e_hash_list_head & ~1); + return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; } /* @@ -108,8 +91,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, atomic_set(&entry->e_refcnt, 1); entry->e_key = key; entry->e_block = block; - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - entry->_e_hash_list_head = (unsigned long)head; + head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_block == block) { @@ -146,10 +128,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct hlist_bl_node *node; struct hlist_bl_head *head; - if (entry) - head = mb_cache_entry_head(entry); - else - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) node = entry->e_hash_list.next; @@ -219,7 +198,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, struct hlist_bl_head *head; struct mb_cache_entry *entry; - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { if (entry->e_key == key && entry->e_block == block) { @@ -250,7 +229,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_block); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { - mb_cache_entry_set_referenced(entry); + entry->e_referenced = 1; } EXPORT_SYMBOL(mb_cache_entry_touch); @@ -275,8 +254,8 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); - if (mb_cache_entry_referenced(entry)) { - mb_cache_entry_clear_referenced(entry); + if (entry->e_referenced) { + entry->e_referenced = 0; list_move_tail(&cache->c_list, &entry->e_list); continue; } @@ -287,7 +266,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, * from under us. */ spin_unlock(&cache->c_list_lock); - head = mb_cache_entry_head(entry); + head = mb_cache_entry_head(cache, entry->e_key); hlist_bl_lock(head); if (!hlist_bl_unhashed(&entry->e_hash_list)) { hlist_bl_del_init(&entry->e_hash_list); diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index a74a1f3..607e696 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -12,18 +12,14 @@ struct mb_cache; struct mb_cache_entry { /* List of entries in cache - protected by cache->c_list_lock */ struct list_head e_list; - /* Hash table list - protected by bitlock in e_hash_list_head */ + /* Hash table list - protected by hash chain bitlock */ struct hlist_bl_node e_hash_list; atomic_t e_refcnt; /* Key in hash - stable during lifetime of the entry */ u32 e_key; + u32 e_referenced:1; /* Block number of hashed block - stable during lifetime of the entry */ sector_t e_block; - /* - * Head of hash list (for list bit lock) - stable. Combined with - * referenced bit of entry - */ - unsigned long _e_hash_list_head; }; struct mb_cache *mb_cache_create(int bucket_bits); -- cgit v0.10.2 From 3fd164629d25b04f291a79a013dcc7ce1a301269 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 22:43:04 -0500 Subject: ext4: shortcut setting of xattr to the same value When someone tried to set xattr to the same value (i.e., not changing anything) we did all the work of removing original xattr, possibly breaking references to shared xattr block, inserting new xattr, and merging xattr blocks again. Since this is not so rare operation and it is relatively cheap for us to detect this case, check for this and shortcut xattr setting in that case. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c6af8a7..b661ae8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1096,6 +1096,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, return 0; } +static int ext4_xattr_value_same(struct ext4_xattr_search *s, + struct ext4_xattr_info *i) +{ + void *value; + + if (le32_to_cpu(s->here->e_value_size) != i->value_len) + return 0; + value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); + return !memcmp(value, i->value, i->value_len); +} + /* * ext4_xattr_set_handle() * @@ -1172,6 +1183,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { + error = 0; + /* Xattr value did not change? Save us some work and bail out */ + if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) + goto cleanup; + if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) + goto cleanup; + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; -- cgit v0.10.2 From 6048c64b26097a0ffbd966866b599f990e674e9b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 22 Feb 2016 22:44:04 -0500 Subject: mbcache: add reusable flag to cache entries To reduce amount of damage caused by single bad block, we limit number of inodes sharing an xattr block to 1024. Thus there can be more xattr blocks with the same contents when there are lots of files with the same extended attributes. These xattr blocks naturally result in hash collisions and can form long hash chains and we unnecessarily check each such block only to find out we cannot use it because it is already shared by too many inodes. Add a reusable flag to cache entries which is cleared when a cache entry has reached its maximum refcount. Cache entries which are not marked reusable are skipped by mb_cache_entry_find_{first,next}. This significantly speeds up mbcache when there are many same xattr blocks. For example for xattr-bench with 5 values and each process handling 20000 files, the run for 64 processes is 25x faster with this patch. Even for 8 processes the speedup is almost 3x. We have also verified that for situations where there is only one xattr block of each kind, the patch doesn't have a measurable cost. [JK: Remove handling of setting the same value since it is not needed anymore, check for races in e_reusable setting, improve changelog, add measurements] Signed-off-by: Andreas Gruenbacher Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 71d58c2..1a5e3bf 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -823,7 +823,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) __u32 hash = le32_to_cpu(HDR(bh)->h_hash); int error; - error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); + error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1); if (error) { if (error == -EBUSY) { ea_bdebug(bh, "already in cache (%d cache entries)", diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b661ae8..0441e05 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -545,6 +545,8 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); @@ -553,23 +555,34 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, goto out; lock_buffer(bh); - if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - + hash = le32_to_cpu(BHDR(bh)->h_hash); + ref = le32_to_cpu(BHDR(bh)->h_refcount); + if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, - bh->b_blocknr); + mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { - le32_add_cpu(&BHDR(bh)->h_refcount, -1); + ref--; + BHDR(bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { + struct mb_cache_entry *ce; + + ce = mb_cache_entry_get(ext4_mb_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ext4_mb_cache, ce); + } + } + /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect @@ -872,6 +885,8 @@ inserted: if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { + u32 ref; + /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -886,15 +901,18 @@ inserted: lock_buffer(new_bh); /* * We have to be careful about races with - * freeing or rehashing of xattr block. Once we - * hold buffer lock xattr block's state is - * stable so we can check whether the block got - * freed / rehashed or not. Since we unhash - * mbcache entry under buffer lock when freeing - * / rehashing xattr block, checking whether - * entry is still hashed is reliable. + * freeing, rehashing or adding references to + * xattr block. Once we hold buffer lock xattr + * block's state is stable so we can check + * whether the block got freed / rehashed or + * not. Since we unhash mbcache entry under + * buffer lock when freeing / rehashing xattr + * block, checking whether entry is still + * hashed is reliable. Same rules hold for + * e_reusable handling. */ - if (hlist_bl_unhashed(&ce->e_hash_list)) { + if (hlist_bl_unhashed(&ce->e_hash_list) || + !ce->e_reusable) { /* * Undo everything and check mbcache * again. @@ -909,9 +927,12 @@ inserted: new_bh = NULL; goto inserted; } - le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + BHDR(new_bh)->h_refcount = cpu_to_le32(ref); + if (ref >= EXT4_XATTR_REFCOUNT_MAX) + ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", - le32_to_cpu(BHDR(new_bh)->h_refcount)); + ref); unlock_buffer(new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, @@ -1566,11 +1587,14 @@ cleanup: static void ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct ext4_xattr_header *header = BHDR(bh); + __u32 hash = le32_to_cpu(header->h_hash); + int reusable = le32_to_cpu(header->h_refcount) < + EXT4_XATTR_REFCOUNT_MAX; int error; error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, - bh->b_blocknr); + bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); @@ -1645,12 +1669,6 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long) ce->e_block); - } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= - EXT4_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %lu refcount %d>=%d", - (unsigned long) ce->e_block, - le32_to_cpu(BHDR(bh)->h_refcount), - EXT4_XATTR_REFCOUNT_MAX); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; diff --git a/fs/mbcache.c b/fs/mbcache.c index 903be15..eccda3a 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -63,13 +63,14 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, * @mask - gfp mask with which the entry should be allocated * @key - key of the entry * @block - block that contains data + * @reusable - is the block reusable by other inodes? * * Creates entry in @cache with key @key and records that data is stored in * block @block. The function returns -EBUSY if entry with the same key * and for the same block already exists in cache. Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block) + sector_t block, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; @@ -91,6 +92,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, atomic_set(&entry->e_refcnt, 1); entry->e_key = key; entry->e_block = block; + entry->e_reusable = reusable; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { @@ -137,7 +139,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); - if (entry->e_key == key) { + if (entry->e_key == key && entry->e_reusable) { atomic_inc(&entry->e_refcnt); goto out; } @@ -184,10 +186,38 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, } EXPORT_SYMBOL(mb_cache_entry_find_next); +/* + * mb_cache_entry_get - get a cache entry by block number (and key) + * @cache - cache we work with + * @key - key of block number @block + * @block - block number + */ +struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, + sector_t block) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb_cache_entry *entry; + + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + atomic_inc(&entry->e_refcnt); + goto out; + } + } + entry = NULL; +out: + hlist_bl_unlock(head); + return entry; +} +EXPORT_SYMBOL(mb_cache_entry_get); + /* mb_cache_entry_delete_block - remove information about block from cache * @cache - cache we work with - * @key - key of the entry to remove - * @block - block containing data for @key + * @key - key of block @block + * @block - block number * * Remove entry from cache @cache with key @key with data stored in @block. */ diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 607e696..86c9a8b 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -18,6 +18,7 @@ struct mb_cache_entry { /* Key in hash - stable during lifetime of the entry */ u32 e_key; u32 e_referenced:1; + u32 e_reusable:1; /* Block number of hashed block - stable during lifetime of the entry */ sector_t e_block; }; @@ -26,7 +27,7 @@ struct mb_cache *mb_cache_create(int bucket_bits); void mb_cache_destroy(struct mb_cache *cache); int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block); + sector_t block, bool reusable); void __mb_cache_entry_free(struct mb_cache_entry *entry); static inline int mb_cache_entry_put(struct mb_cache *cache, struct mb_cache_entry *entry) @@ -39,6 +40,8 @@ static inline int mb_cache_entry_put(struct mb_cache *cache, void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, sector_t block); +struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, + sector_t block); struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key); struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, -- cgit v0.10.2 From 29c6eaffc8868ef6fa71997d0ea507a02c52712c Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 22 Feb 2016 22:58:55 -0500 Subject: ext4: trim unused parameter from convert_initialized_extent() The flags parameter is also unused. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3753ceb..73a48c1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3927,7 +3927,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, int flags, + struct ext4_ext_path **ppath, unsigned int allocated) { struct ext4_ext_path *path = *ppath; @@ -4347,7 +4347,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = convert_initialized_extent( handle, inode, map, &path, - flags, allocated); + allocated); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; -- cgit v0.10.2 From 9bcf976cb8b86eb40e0c0b495a14e4cb967b9c6e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 23:07:30 -0500 Subject: jbd2: remove unnecessary arguments of jbd2_journal_write_revoke_records jbd2_journal_write_revoke_records() takes journal pointer and write_op, although journal can be obtained from the passed transaction and write_op is always WRITE_SYNC. Remove these superfluous arguments. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 36345fe..ae4402d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -554,8 +554,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_abort(journal, err); blk_start_plug(&plug); - jbd2_journal_write_revoke_records(journal, commit_transaction, - &log_bufs, WRITE_SYNC); + jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); jbd_debug(3, "JBD2: commit phase 2b\n"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 705ae57..c839332 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -122,11 +122,11 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ -static void write_one_revoke_record(journal_t *, transaction_t *, +static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, - struct jbd2_revoke_record_s *, int); -static void flush_descriptor(journal_t *, struct buffer_head *, int, int); + struct jbd2_revoke_record_s *); +static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ @@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ -void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction, - struct list_head *log_bufs, - int write_op) +void jbd2_journal_write_revoke_records(transaction_t *transaction, + struct list_head *log_bufs) { + journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; @@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal, while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; - write_one_revoke_record(journal, transaction, log_bufs, - &descriptor, &offset, - record, write_op); + write_one_revoke_record(transaction, log_bufs, + &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset, write_op); + flush_descriptor(journal, descriptor, offset); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -562,14 +560,13 @@ void jbd2_journal_write_revoke_records(journal_t *journal, * block if the old one is full or if we have not already created one. */ -static void write_one_revoke_record(journal_t *journal, - transaction_t *transaction, +static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, - struct jbd2_revoke_record_s *record, - int write_op) + struct jbd2_revoke_record_s *record) { + journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; @@ -597,7 +594,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { - flush_descriptor(journal, descriptor, offset, write_op); + flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } @@ -654,7 +651,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, - int offset, int write_op) + int offset) { jbd2_journal_revoke_header_t *header; @@ -670,7 +667,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, write_op); + write_dirty_buffer(descriptor, WRITE_SYNC); } #endif diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 65407f6..5716d95 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1327,10 +1327,8 @@ extern int jbd2_journal_init_revoke_caches(void); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); -extern void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction, - struct list_head *log_bufs, - int write_op); +extern void jbd2_journal_write_revoke_records(transaction_t *transaction, + struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); -- cgit v0.10.2 From 32ab671599a89534f37e97d146c27690e371b661 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 23:17:15 -0500 Subject: jbd2: factor out common descriptor block initialization Descriptor block header is initialized in several places. Factor out the common code into jbd2_journal_get_descriptor_buffer(). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ae4402d..cf221f3 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal, if (is_journal_aborted(journal)) return 0; - bh = jbd2_journal_get_descriptor_buffer(journal); + bh = jbd2_journal_get_descriptor_buffer(commit_transaction, + JBD2_COMMIT_BLOCK); if (!bh) return 1; tmp = (struct commit_header *)bh->b_data; - tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); tmp->h_commit_sec = cpu_to_be64(now.tv_sec); tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); @@ -379,7 +377,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) ktime_t start_time; u64 commit_time; char *tagp = NULL; - journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; @@ -615,7 +612,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(4, "JBD2: get descriptor\n"); - descriptor = jbd2_journal_get_descriptor_buffer(journal); + descriptor = jbd2_journal_get_descriptor_buffer( + commit_transaction, + JBD2_DESCRIPTOR_BLOCK); if (!descriptor) { jbd2_journal_abort(journal, -EIO); continue; @@ -624,11 +623,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)descriptor->b_blocknr, descriptor->b_data); - header = (journal_header_t *)descriptor->b_data; - header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - tagp = &descriptor->b_data[sizeof(journal_header_t)]; space_left = descriptor->b_size - sizeof(journal_header_t); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 81e6226..28d05bd 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ -struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +struct buffer_head * +jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) { + journal_t *journal = transaction->t_journal; struct buffer_head *bh; unsigned long long blocknr; + journal_header_t *header; int err; err = jbd2_journal_next_log_block(journal, &blocknr); @@ -821,6 +824,10 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); + header = (journal_header_t *)bh->b_data; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(type); + header->h_sequence = cpu_to_be32(transaction->t_tid); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "return this buffer"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index c839332..d1ebb1d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -570,7 +570,6 @@ static void write_one_revoke_record(transaction_t *transaction, int csum_size = 0; struct buffer_head *descriptor; int sz, offset; - journal_header_t *header; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in @@ -600,13 +599,10 @@ static void write_one_revoke_record(transaction_t *transaction, } if (!descriptor) { - descriptor = jbd2_journal_get_descriptor_buffer(journal); + descriptor = jbd2_journal_get_descriptor_buffer(transaction, + JBD2_REVOKE_BLOCK); if (!descriptor) return; - header = (journal_header_t *)descriptor->b_data; - header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); - header->h_sequence = cpu_to_be32(transaction->t_tid); /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5716d95..3649cb8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1137,7 +1137,7 @@ static inline void jbd2_unfile_log_bh(struct buffer_head *bh) } /* Log buffer allocation */ -struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); +struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -- cgit v0.10.2 From 1101cd4d13ba2f42c5da4c1b9081f35a73b5df31 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 23:19:09 -0500 Subject: jbd2: unify revoke and tag block checksum handling Revoke and tag descriptor blocks are just different kinds of descriptor blocks and thus have checksum in the same place. Unify computation and checking of checksums for these. Reviewed-by: Darrick J. Wong Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index cf221f3..ae832cd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -317,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag, tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } -static void jbd2_descr_block_csum_set(journal_t *j, - struct buffer_head *bh) -{ - struct jbd2_journal_block_tail *tail; - __u32 csum; - - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - - sizeof(struct jbd2_journal_block_tail)); - tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); - tail->t_checksum = cpu_to_be32(csum); -} - static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { @@ -714,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - jbd2_descr_block_csum_set(journal, descriptor); + jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 28d05bd..defa962 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -834,6 +834,21 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) return bh; } +void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) +{ + struct jbd2_journal_block_tail *tail; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + tail->t_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + tail->t_checksum = cpu_to_be32(csum); +} + /* * Return tid of the oldest transaction in the journal and block in the journal * where the transaction starts. diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 7f277e4..08a456b 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return 0; } -static int jbd2_descr_block_csum_verify(journal_t *j, - void *buf) +static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) { struct jbd2_journal_block_tail *tail; __be32 provided; @@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal, descr_csum_size = sizeof(struct jbd2_journal_block_tail); if (descr_csum_size > 0 && - !jbd2_descr_block_csum_verify(journal, - bh->b_data)) { + !jbd2_descriptor_block_csum_verify(journal, + bh->b_data)) { printk(KERN_ERR "JBD2: Invalid checksum " "recovering block %lu in log\n", next_log_block); @@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal, return err; } -static int jbd2_revoke_block_csum_verify(journal_t *j, - void *buf) -{ - struct jbd2_journal_revoke_tail *tail; - __be32 provided; - __u32 calculated; - - if (!jbd2_journal_has_csum_v2or3(j)) - return 1; - - tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - - sizeof(struct jbd2_journal_revoke_tail)); - provided = tail->r_checksum; - tail->r_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); - tail->r_checksum = provided; - - return provided == cpu_to_be32(calculated); -} - /* Scan a revoke record, marking all blocks mentioned as revoked. */ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, @@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, offset = sizeof(jbd2_journal_revoke_header_t); rcount = be32_to_cpu(header->r_count); - if (!jbd2_revoke_block_csum_verify(journal, header)) + if (!jbd2_descriptor_block_csum_verify(journal, header)) return -EFSBADCRC; if (jbd2_journal_has_csum_v2or3(journal)) - csum_size = sizeof(struct jbd2_journal_revoke_tail); + csum_size = sizeof(struct jbd2_journal_block_tail); if (rcount > journal->j_blocksize - csum_size) return -EINVAL; max = rcount; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index d1ebb1d..91171dc 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -583,7 +583,7 @@ static void write_one_revoke_record(transaction_t *transaction, /* Do we need to leave space at the end for a checksum? */ if (jbd2_journal_has_csum_v2or3(journal)) - csum_size = sizeof(struct jbd2_journal_revoke_tail); + csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; @@ -623,21 +623,6 @@ static void write_one_revoke_record(transaction_t *transaction, *offsetp = offset; } -static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) -{ - struct jbd2_journal_revoke_tail *tail; - __u32 csum; - - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - - sizeof(struct jbd2_journal_revoke_tail)); - tail->r_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); - tail->r_checksum = cpu_to_be32(csum); -} - /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to @@ -658,7 +643,7 @@ static void flush_descriptor(journal_t *journal, header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); - jbd2_revoke_csum_set(journal, descriptor); + jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 3649cb8..fd1083c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -200,7 +200,7 @@ typedef struct journal_block_tag_s __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; -/* Tail of descriptor block, for checksumming */ +/* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; @@ -215,11 +215,6 @@ typedef struct jbd2_journal_revoke_header_s __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; -/* Tail of revoke block, for checksumming */ -struct jbd2_journal_revoke_tail { - __be32 r_checksum; /* crc32c(uuid+revoke_block) */ -}; - /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ @@ -1138,6 +1133,7 @@ static inline void jbd2_unfile_log_bh(struct buffer_head *bh) /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); +void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -- cgit v0.10.2 From cb0d9d47a39decbdfaeaa5c063467ed21b2c70b3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 23:20:30 -0500 Subject: jbd2: save some atomic ops in __JI_COMMIT_RUNNING handling Currently we used atomic bit operations to manipulate __JI_COMMIT_RUNNING bit. However this is unnecessary as i_flags are always written and read under j_list_lock. So just change the operations to standard bit operations. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ae832cd..517f2de 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -220,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { mapping = jinode->i_vfs_inode->i_mapping; - set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); /* * submit the inode data buffers. We use writepage @@ -234,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_atomic(); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } spin_unlock(&journal->j_list_lock); @@ -256,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { - set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); if (err) { @@ -272,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, ret = err; } spin_lock(&journal->j_list_lock); - clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_atomic(); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index defa962..7bf1683 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2587,7 +2587,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal, restart: spin_lock(&journal->j_list_lock); /* Is commit writing out inode - we have to wait */ - if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { + if (jinode->i_flags & JI_COMMIT_RUNNING) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); -- cgit v0.10.2 From 3bd6ad7b688e200ac7633b16affa164d7cd5ef07 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 22:26:39 -0500 Subject: ext4: pack ioend structure better On 64-bit architectures we have two 4-byte holes in struct ext4_io_end. Order entries better to avoid this and thus make the structure occupy 64 instead of 72 bytes for 64-bit architectures. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 157b458..5035dfe 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -182,9 +182,9 @@ typedef struct ext4_io_end { struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { -- cgit v0.10.2 From e142d05263a4beedefd331d445c394f4397e9f03 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 22:44:50 -0500 Subject: ext4: use i_mutex to serialize unaligned AIO DIO Currently we've used hashed aio_mutex to serialize unaligned AIO DIO. However the code cleanups that happened after 2011 when the lock was introduced made aio_mutex acquired at almost the same places where we already have exclusion using i_mutex. So just use i_mutex for the exclusion of unaligned AIO DIO. The change moves waiting for pending unwritten extent conversion under i_mutex. That makes special handling of O_APPEND writes unnecessary and also avoids possible livelocking of unaligned AIO DIO with aligned one (nothing was preventing contiguous stream of aligned AIO DIOs to let unaligned AIO DIO wait forever). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5035dfe..f4e9521 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3285,10 +3285,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ EXT4_WQ_HASH_SZ]) -#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; #define EXT4_RESIZING 0 extern int ext4_resize_begin(struct super_block *sb); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 474f1a4..4a11535 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); - struct mutex *aio_mutex = NULL; struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; + int unaligned_aio = 0; int overwrite = 0; ssize_t ret; + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + /* - * Unaligned direct AIO must be serialized; see comment above - * In the case of O_APPEND, assume that we must always serialize + * Unaligned direct AIO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned AIOs can result in data + * corruption. */ - if (o_direct && - ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && - (iocb->ki_flags & IOCB_APPEND || - ext4_unaligned_aio(inode, from, iocb->ki_pos))) { - aio_mutex = ext4_aio_mutex(inode); - mutex_lock(aio_mutex); + ext4_unaligned_aio(inode, from, iocb->ki_pos)) { + unaligned_aio = 1; ext4_unwritten_wait(inode); } - inode_lock(inode); - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto out; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !aio_mutex && + if (ext4_should_dioread_nolock(inode) && !unaligned_aio && !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; @@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) blk_finish_plug(&plug); - if (aio_mutex) - mutex_unlock(aio_mutex); return ret; out: inode_unlock(inode); - if (aio_mutex) - mutex_unlock(aio_mutex); return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2f55051..4d57560 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5321,7 +5321,6 @@ MODULE_ALIAS_FS("ext4"); /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; static int __init ext4_init_fs(void) { @@ -5334,10 +5333,8 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency */ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { - mutex_init(&ext4__aio_mutex[i]); + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) init_waitqueue_head(&ext4__ioend_wq[i]); - } err = ext4_init_es(); if (err) -- cgit v0.10.2 From 705965bd6dfadc3b2e0241da1423ef660bdd04c8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:08:10 -0500 Subject: ext4: rename and split get blocks functions Rename ext4_get_blocks_write() to ext4_get_blocks_unwritten() to better describe what it does. Also split out get blocks functions for direct IO. Later we move functionality from _ext4_get_blocks() there. There's no functional change in this patch. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f4e9521..d60db18 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2506,12 +2506,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 355ef9c..1655ff1 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -693,21 +693,21 @@ retry: } if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, offset, - ext4_get_block, NULL, 0); + ext4_dio_get_block, NULL, 0); else ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, ext4_get_block, NULL, - NULL, 0); + offset, ext4_dio_get_block, + NULL, NULL, 0); inode_dio_end(inode); } else { locked: if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, offset, - ext4_get_block, NULL, DIO_LOCKING); + ext4_dio_get_block, NULL, DIO_LOCKING); else ret = blockdev_direct_IO(iocb, inode, iter, offset, - ext4_get_block); + ext4_dio_get_block); if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index dfe3b9b..36d8cc9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -581,9 +581,10 @@ retry: if (ret) goto out; - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, from, to, ext4_get_block_write); - else + if (ext4_should_dioread_nolock(inode)) { + ret = __block_write_begin(page, from, to, + ext4_get_block_unwritten); + } else ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9cc57c3..bf545d0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -769,6 +769,60 @@ int ext4_get_block(struct inode *inode, sector_t iblock, } /* + * Get block function used when preparing for buffered write if we require + * creating an unwritten extent if blocks haven't been allocated. The extent + * will be converted to written after the IO is complete. + */ +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +/* Get block function for DIO reads and writes to inodes without extents */ +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + +/* + * Get block function for DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete. + */ +static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + + ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", + inode->i_ino, create); + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. + */ + WARN_ON_ONCE(!buffer_mapped(bh_result)); + + return ret; +} + + +/* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, @@ -1079,13 +1133,14 @@ retry_journal: #ifdef CONFIG_EXT4_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, - ext4_get_block_write); + ext4_get_block_unwritten); else ret = ext4_block_write_begin(page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, ext4_get_block_write); + ret = __block_write_begin(page, pos, len, + ext4_get_block_unwritten); else ret = __block_write_begin(page, pos, len, ext4_get_block); #endif @@ -3084,37 +3139,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -/* - * ext4_get_block used when preparing for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after the IO is complete. - */ -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); -} - -static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result)); - - return ret; -} - #ifdef CONFIG_FS_DAX int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -3282,7 +3306,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ iocb->private = NULL; if (overwrite) { - get_block_func = ext4_get_block_overwrite; + get_block_func = ext4_dio_get_block_overwrite; } else { ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { @@ -3304,7 +3328,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ ext4_inode_aio_set(inode, io_end); } - get_block_func = ext4_get_block_write; + get_block_func = ext4_dio_get_block_unwritten; dio_flags = DIO_LOCKING; } #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -5498,7 +5522,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unlock_page(page); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) - get_block = ext4_get_block_write; + get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: -- cgit v0.10.2 From efe70c29511544b0468723fe92c1847b3b0ca046 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:35:46 -0500 Subject: ext4: move trans handling and completion deferal out of _ext4_get_block There is no need to handle starting of a transaction and deferal of DIO completion in _ext4_get_block() function. We can move this out to get block functions for direct IO that need it. That way we can add stricter checks verifying things work as we expect. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf545d0..aea67d9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -714,16 +714,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) cmpxchg(&bh->b_state, old_state, new_state) != old_state)); } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 - static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { - handle_t *handle = ext4_journal_current_handle(); struct ext4_map_blocks map; - int ret = 0, started = 0; - int dio_credits; + int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; @@ -731,33 +726,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !handle) { - /* Direct IO write... */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - started = 1; - } - - ret = ext4_map_blocks(handle, inode, &map, flags); + ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, + flags); if (ret > 0) { - ext4_io_end_t *io_end = ext4_inode_aio(inode); - map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); - if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) - set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } - if (started) - ext4_journal_stop(handle); return ret; } @@ -782,12 +758,42 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +static handle_t *start_dio_trans(struct inode *inode, + struct buffer_head *bh_result) +{ + int dio_credits; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) + bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; + dio_credits = ext4_chunk_trans_blocks(inode, + bh_result->b_size >> inode->i_blkbits); + return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); +} + /* Get block function for DIO reads and writes to inodes without extents */ int ext4_dio_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - return _ext4_get_block(inode, iblock, bh, - create ? EXT4_GET_BLOCKS_CREATE : 0); + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + if (create) { + handle = start_dio_trans(inode, bh); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + ret = _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); + if (create) + ext4_journal_stop(handle); + return ret; } /* @@ -798,10 +804,28 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock, static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + handle_t *handle; + int ret; + ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + if (!ret && buffer_unwritten(bh_result)) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + + set_buffer_defer_completion(bh_result); + WARN_ON_ONCE(io_end && !(io_end->flag & EXT4_IO_END_UNWRITTEN)); + } + + return ret; } static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, @@ -811,12 +835,15 @@ static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", inode->i_ino, create); + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + ret = _ext4_get_block(inode, iblock, bh_result, 0); /* * Blocks should have been preallocated! ext4_file_write_iter() checks * that. */ - WARN_ON_ONCE(!buffer_mapped(bh_result)); + WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); return ret; } -- cgit v0.10.2 From 109811c20fb8ec46e2ed01750214a32a9163d164 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:36:46 -0500 Subject: ext4: simplify io_end handling for AIO DIO When mapping blocks for direct IO, we allocate io_end structure before mapping blocks and store pointer to it in the inode. This creates a requirement that any AIO DIO using io_end must be protected by i_mutex. This created problems in the past with dioread_nolock mode which was corrupting io_end pointers. Also io_end is allocated unnecessarily in case where we don't need to convert any extents (which is a common case for example when overwriting file). We fix the problem by allocating io_end only once we return unwritten extent from block mapping function for AIO DIO (so we can save some pointless io_end allocations) and we pass pointer to it in bh->b_private which generic DIO code later passes to our end IO callback. That way we remove any need for global pointer to io_end structure and thus fix the races. The downside of this change is that the checking for unwritten IO in flight in ext4_extents_can_be_merged() is more racy since we now increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after dropping i_data_sem. However the check has been racy already before because ext4_writepages() already increment i_unwritten after dropping i_data_sem and reserved blocks save us from hitting ENOSPC in the worst case. Signed-off-by: Jan Kara diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d60db18..9f3156c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1513,16 +1513,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, } } -static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) -{ - return inode->i_private; -} - -static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) -{ - inode->i_private = io; -} - /* * Inode dynamic state flags */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 73a48c1..f1b3c25 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + /* + * The check for IO to unwritten extent is somewhat racy as we + * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after + * dropping i_data_sem. But reserved blocks should save us in that + * case. + */ if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || @@ -4007,7 +4013,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; - ext4_io_end_t *io = ext4_inode_aio(inode); ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", @@ -4030,15 +4035,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; - /* - * Flag the inode(non aio case) or end_io struct (aio case) - * that this IO needs to conversion to written when IO is - * completed - */ - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } @@ -4283,9 +4279,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; - int set_unwritten = 0; bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", @@ -4482,15 +4476,6 @@ got_allocated_blocks: if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; - /* - * io_end structure was created for every IO write to an - * unwritten extent. To avoid unnecessary conversion, - * here we flag the IO that really needs the conversion. - * For non asycn direct IO case, flag the inode state - * that we need to perform conversion when IO is done. - */ - if (flags & EXT4_GET_BLOCKS_PRE_IO) - set_unwritten = 1; } err = 0; @@ -4501,14 +4486,6 @@ got_allocated_blocks: err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); - if (!err && set_unwritten) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } - if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index aea67d9..c67c16e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -797,18 +797,16 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock, } /* - * Get block function for DIO writes when we create unwritten extent if + * Get block function for AIO DIO writes when we create unwritten extent if * blocks are not allocated yet. The extent will be converted to written * after IO is complete. */ -static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int ext4_dio_get_block_unwritten_async(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle; int ret; - ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n", - inode->i_ino, create); /* We don't expect handle for direct IO */ WARN_ON_ONCE(ext4_journal_current_handle()); @@ -818,16 +816,62 @@ static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_IO_CREATE_EXT); ext4_journal_stop(handle); - if (!ret && buffer_unwritten(bh_result)) { - ext4_io_end_t *io_end = ext4_inode_aio(inode); + /* + * When doing DIO using unwritten extents, we need io_end to convert + * unwritten extents to written on IO completion. We allocate io_end + * once we spot unwritten extent and store it in b_private. Generic + * DIO code keeps b_private set and furthermore passes the value to + * our completion callback in 'private' argument. + */ + if (!ret && buffer_unwritten(bh_result)) { + if (!bh_result->b_private) { + ext4_io_end_t *io_end; + + io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!io_end) + return -ENOMEM; + bh_result->b_private = io_end; + ext4_set_io_unwritten_flag(inode, io_end); + } set_buffer_defer_completion(bh_result); - WARN_ON_ONCE(io_end && !(io_end->flag & EXT4_IO_END_UNWRITTEN)); } return ret; } +/* + * Get block function for non-AIO DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete from ext4_ext_direct_IO() function. + */ +static int ext4_dio_get_block_unwritten_sync(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + + /* + * Mark inode as having pending DIO writes to unwritten extents. + * ext4_ext_direct_IO() checks this flag and converts extents to + * written. + */ + if (!ret && buffer_unwritten(bh_result)) + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + + return ret; +} + static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -3243,7 +3287,7 @@ out: static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { - ext4_io_end_t *io_end = iocb->private; + ext4_io_end_t *io_end = private; /* if not async direct IO just return */ if (!io_end) @@ -3251,10 +3295,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - iocb->private, io_end->inode->i_ino, iocb, offset, - size); + io_end, io_end->inode->i_ino, iocb, offset, size); - iocb->private = NULL; io_end->offset = offset; io_end->size = size; ext4_put_io_end(io_end); @@ -3290,7 +3332,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; - ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) @@ -3315,16 +3356,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* * We could direct write to holes and fallocate. * - * Allocated blocks to fill the hole are marked as - * unwritten to prevent parallel buffered read to expose - * the stale data before DIO complete the data IO. + * Allocated blocks to fill the hole are marked as unwritten to prevent + * parallel buffered read to expose the stale data before DIO complete + * the data IO. * - * As to previously fallocated extents, ext4 get_block will - * just simply mark the buffer mapped but still keep the - * extents unwritten. + * As to previously fallocated extents, ext4 get_block will just simply + * mark the buffer mapped but still keep the extents unwritten. * - * For non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. + * For non AIO case, we will convert those unwritten extents to written + * after return back from blockdev_direct_IO. That way we save us from + * allocating io_end structure and also the overhead of offloading + * the extent convertion to a workqueue. * * For async DIO, the conversion needs to be deferred when the * IO is completed. The ext4 end_io callback function will be @@ -3332,30 +3374,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * case, we allocate an io_end structure to hook to the iocb. */ iocb->private = NULL; - if (overwrite) { + if (overwrite) get_block_func = ext4_dio_get_block_overwrite; + else if (is_sync_kiocb(iocb)) { + get_block_func = ext4_dio_get_block_unwritten_sync; + dio_flags = DIO_LOCKING; } else { - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - /* - * Grab reference for DIO. Will be dropped in - * ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - get_block_func = ext4_dio_get_block_unwritten; + get_block_func = ext4_dio_get_block_unwritten_async; dio_flags = DIO_LOCKING; } #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -3370,27 +3395,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); - /* - * Put our reference to io_end. This can free the io_end structure e.g. - * in sync IO case or in case of error. It can even perform extent - * conversion if all bios we submitted finished before we got here. - * Note that in that case iocb->private can be already set to NULL - * here. - */ - if (io_end) { - ext4_inode_aio_set(inode, NULL); - ext4_put_io_end(io_end); - /* - * When no IO was submitted ext4_end_io_dio() was not - * called so we have to put iocb's reference. - */ - if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { - WARN_ON(iocb->private != io_end); - WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - ext4_put_io_end(io_end); - iocb->private = NULL; - } - } if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; @@ -3405,7 +3409,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } -retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ -- cgit v0.10.2 From 600be30a8bc1d405f791e01dbef84679e14529b8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:39:21 -0500 Subject: ext4: remove i_ioend_count Remove counter of pending io ends as it is unused. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9f3156c..70b8e04 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1024,13 +1024,8 @@ struct ext4_inode_info { * transaction reserved */ struct list_head i_rsv_conversion_list; - /* - * Completed IOs that need unwritten extents handling and don't have - * transaction reserved - */ - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c67c16e..971892d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode) } truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); - /* * Protect us against freezing - iput() caller didn't have to have any * protection against it diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 090b349..349d7aa 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); WARN_ON(io_end->handle); - if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io_end->inode)); - for (bio = io_end->bio; bio; bio = next_bio) { next_bio = bio->bi_private; ext4_finish_bio(bio); @@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); if (io) { - atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); atomic_set(&io->count, 1); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4d57560..de02a9e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -942,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION -- cgit v0.10.2 From 87d8a74b56746d4b6125602365b10995116afd00 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 22:26:55 -0500 Subject: ext4: fix setting of referenced bit in ext4_es_lookup_extent() We were setting referenced bit on the extent structure we return from ext4_es_lookup_extent() which is just a private structure on stack. Thus setting had no effect. Set the bit in the structure in the status tree instead. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ac748b3..e38b987a 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -823,8 +823,8 @@ out: es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; - if (!ext4_es_is_referenced(es)) - ext4_es_set_referenced(es); + if (!ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es1); stats->es_stats_cache_hits++; } else { stats->es_stats_cache_misses++; -- cgit v0.10.2 From 140a52508a68387e22486659ff9eaa89a24f6e40 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 22:46:57 -0500 Subject: ext4: factor out determining of hole size ext4_ext_put_gap_in_cache() determines hole size in the extent tree, then trims this with possible delayed allocated blocks, and inserts the result into the extent status tree. Factor out determination of the size of the hole in the extent tree as we will need this information in ext4_ext_map_blocks() as well. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f1b3c25..7858fc1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2299,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } /* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap + * ext4_ext_determine_hole - determine hole around given block + * @inode: inode we lookup in + * @path: path in extent tree to @lblk + * @lblk: pointer to logical block around which we want to determine hole + * + * Determine hole length (and start if easily possible) around given logical + * block. We don't try too hard to find the beginning of the hole but @path + * actually points to extent before @lblk, we provide it. + * + * The function returns the length of a hole starting at @lblk. We update @lblk + * to the beginning of the hole if we managed to find it. */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t block) +static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) { int depth = ext_depth(inode); - ext4_lblk_t len; - ext4_lblk_t lblock; struct ext4_extent *ex; - struct extent_status es; + ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ - lblock = 0; + *lblk = 0; len = EXT_MAX_BLOCKS; - ext_debug("cache gap(whole file):"); - } else if (block < le32_to_cpu(ex->ee_block)) { - lblock = block; - len = le32_to_cpu(ex->ee_block) - block; - ext_debug("cache gap(before): %u [%u:%u]", - block, - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); - } else if (block >= le32_to_cpu(ex->ee_block) + } else if (*lblk < le32_to_cpu(ex->ee_block)) { + len = le32_to_cpu(ex->ee_block) - *lblk; + } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; - lblock = le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex); + *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); - ext_debug("cache gap(after): [%u:%u] %u", - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex), - block); - BUG_ON(next == lblock); - len = next - lblock; + BUG_ON(next == *lblk); + len = next - *lblk; } else { BUG(); } + return len; +} - ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, + ext4_lblk_t hole_len) +{ + struct extent_status es; + + ext4_es_find_delayed_extent_range(inode, hole_start, + hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ - if (es.es_lblk <= lblock) + if (es.es_lblk <= hole_start) return; - len = min(es.es_lblk - lblock, len); + hole_len = min(es.es_lblk - hole_start, hole_len); } - ext_debug(" -> %u:%u\n", lblock, len); - ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); + ext_debug(" -> %u:%u\n", hole_start, hole_len); + ext4_es_insert_extent(inode, hole_start, hole_len, ~0, + EXTENT_STATUS_HOLE); } /* @@ -4362,11 +4372,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + ext4_lblk_t hole_start, hole_len; + /* * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + hole_start = map->m_lblk; + hole_len = ext4_ext_determine_hole(inode, path, &hole_start); + ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); goto out2; } -- cgit v0.10.2 From facab4d9711e7aa3532cb82643803e8f1b9518e8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 22:54:00 -0500 Subject: ext4: return hole from ext4_map_blocks() Currently, ext4_map_blocks() just returns 0 when it finds a hole and allocation is not requested. However we have all the information available to tell how large the hole actually is and there are callers of ext4_map_blocks() which would save some block-by-block hole iteration if they knew this information. So fill in struct ext4_map_blocks even for holes with the information we have. We keep returning 0 for holes to maintain backward compatibility of the function. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7858fc1..2730039 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4374,13 +4374,20 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t hole_start, hole_len; + hole_start = map->m_lblk; + hole_len = ext4_ext_determine_hole(inode, path, &hole_start); /* * put just found gap into cache to speed up * subsequent requests */ - hole_start = map->m_lblk; - hole_len = ext4_ext_determine_hole(inode, path, &hole_start); ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + + /* Update hole_len to reflect hole size after map->m_lblk */ + if (hole_start != map->m_lblk) + hole_len -= map->m_lblk - hole_start; + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, hole_len); + goto out2; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 1655ff1..3027fa6 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, goto got_it; } - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + /* Next simple case - plain lookup failed */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); + int i; + + /* Count number blocks in a subtree under 'partial' */ + count = 1; + for (i = 0; partial + i != chain + depth - 1; i++) + count *= epb; + /* Fill in size of a hole we found */ + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, count); + goto cleanup; + } + + /* Failed read of indirect block */ + if (err == -EIO) goto cleanup; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 971892d..1a156ec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -455,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. - * if create==0 and the blocks are pre-allocated and unwritten block, - * the result buffer head is unmapped. If the create ==1, it will make sure - * the buffer head is mapped. + * On success, it returns the number of blocks being mapped or allocated. if + * create==0 and the blocks are pre-allocated and unwritten, the resulting @map + * is marked as unwritten. If the create == 1, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that case, buffer head is unmapped + * that case, @map is returned as unmapped but we still do fill map->m_len to + * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ @@ -504,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + map->m_pblk = 0; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; retval = 0; } else { BUG_ON(1); -- cgit v0.10.2 From e3fb8eb14eafd2847c04cf48b52a705c36f4db98 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 23:03:27 -0500 Subject: ext4: cleanup handling of bh->b_state in DAX mmap ext4_dax_mmap_get_block() updates bh->b_state directly instead of using ext4_update_bh_state(). This is mostly a cosmetic issue since DAX code always passes on-stack buffer_head but clean this up to make code more uniform. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1a156ec..fddc6dd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3272,13 +3272,12 @@ out: WARN_ON_ONCE(ret == 0 && create); if (ret > 0) { map_bh(bh_result, inode->i_sb, map.m_pblk); - bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | - map.m_flags; /* * At least for now we have to clear BH_New so that DAX code * doesn't attempt to zero blocks again in a racy way. */ - bh_result->b_state &= ~(1 << BH_New); + map.m_flags &= ~EXT4_MAP_NEW; + ext4_update_bh_state(bh_result, map.m_flags); bh_result->b_size = map.m_len << inode->i_blkbits; ret = 0; } -- cgit v0.10.2 From 2d90c160e5f1d784e180f1e1458d56eee4d7f4f4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 23:11:13 -0500 Subject: ext4: more efficient SEEK_DATA implementation Using SEEK_DATA in a huge sparse file can easily lead to sotflockups as ext4_seek_data() iterates hole block-by-block. Fix the problem by using returned hole size from ext4_map_blocks() and thus skip the hole in one go. Update also SEEK_HOLE implementation to follow the same pattern as SEEK_DATA to make future maintenance easier. Furthermore we add cond_resched() to both ext4_seek_data() and ext4_seek_hole() to avoid softlockups in case evil user creates huge fragmented file and we have to go through lots of extents. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 70b8e04..5623eec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2546,6 +2546,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, + struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4a11535..e93a7ef 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -426,7 +426,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) */ static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, - struct ext4_map_blocks *map, + ext4_lblk_t end_blk, loff_t *offset) { struct pagevec pvec; @@ -441,7 +441,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -559,12 +559,11 @@ out: static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t dataoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -581,41 +580,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = (loff_t)last << blkbits; - break; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret <= 0) { + /* No extent found -> no data */ + if (ret == 0) + ret = -ENXIO; + inode_unlock(inode); + return ret; } - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + last = es.es_lblk; + if (last != start) + dataoff = (loff_t)last << blkbits; + if (!ext4_es_is_unwritten(&es)) break; - } /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } - - last++; + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, + es.es_lblk + es.es_len, &dataoff)) + break; + last += es.es_len; dataoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); @@ -632,12 +622,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t holeoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -654,44 +643,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) holeoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret < 0) { + inode_unlock(inode); + return ret; } - - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - last = es.es_lblk + es.es_len; - holeoff = (loff_t)last << blkbits; - continue; + /* Found a hole? */ + if (ret == 0 || es.es_lblk > last) { + if (last != start) + holeoff = (loff_t)last << blkbits; + break; } - /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; - } - } + if (ext4_es_is_unwritten(&es) && + ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + last + es.es_len, &holeoff)) + break; - /* find a hole */ - break; + last += es.es_len; + holeoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fddc6dd..ce2c4c6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5596,3 +5596,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return err; } + +/* + * Find the first extent at or after @lblk in an inode that is not a hole. + * Search for @map_len blocks at most. The extent is returned in @result. + * + * The function returns 1 if we found an extent. The function returns 0 in + * case there is no extent at or after @lblk and in that case also sets + * @result->es_len to 0. In case of error, the error code is returned. + */ +int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, struct extent_status *result) +{ + struct ext4_map_blocks map; + struct extent_status es = {}; + int ret; + + map.m_lblk = lblk; + map.m_len = map_len; + + /* + * For non-extent based files this loop may iterate several times since + * we do not determine full hole size. + */ + while (map.m_len > 0) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + /* There's extent covering m_lblk? Just return it. */ + if (ret > 0) { + int status; + + ext4_es_store_pblock(result, map.m_pblk); + result->es_lblk = map.m_lblk; + result->es_len = map.m_len; + if (map.m_flags & EXT4_MAP_UNWRITTEN) + status = EXTENT_STATUS_UNWRITTEN; + else + status = EXTENT_STATUS_WRITTEN; + ext4_es_store_status(result, status); + return 1; + } + ext4_es_find_delayed_extent_range(inode, map.m_lblk, + map.m_lblk + map.m_len - 1, + &es); + /* Is delalloc data before next block in extent tree? */ + if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { + ext4_lblk_t offset = 0; + + if (es.es_lblk < lblk) + offset = lblk - es.es_lblk; + result->es_lblk = es.es_lblk + offset; + ext4_es_store_pblock(result, + ext4_es_pblock(&es) + offset); + result->es_len = es.es_len - offset; + ext4_es_store_status(result, ext4_es_status(&es)); + + return 1; + } + /* There's a hole at m_lblk, advance us after it */ + map.m_lblk += map.m_len; + map_len -= map.m_len; + map.m_len = map_len; + cond_resched(); + } + result->es_len = 0; + return 0; +} -- cgit v0.10.2 From c0a2ad9b50dd80eeccd73d9ff962234590d5ec93 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Wed, 9 Mar 2016 23:47:25 -0500 Subject: jbd2: fix FS corruption possibility in jbd2_journal_destroy() on umount path On umount path, jbd2_journal_destroy() writes latest transaction ID (->j_tail_sequence) to be used at next mount. The bug is that ->j_tail_sequence is not holding latest transaction ID in some cases. So, at next mount, there is chance to conflict with remaining (not overwritten yet) transactions. mount (id=10) write transaction (id=11) write transaction (id=12) umount (id=10) <= the bug doesn't write latest ID mount (id=10) write transaction (id=11) crash mount [recovery process] transaction (id=11) transaction (id=12) <= valid transaction ID, but old commit must not replay Like above, this bug become the cause of recovery failure, or FS corruption. So why ->j_tail_sequence doesn't point latest ID? Because if checkpoint transactions was reclaimed by memory pressure (i.e. bdev_try_to_free_page()), then ->j_tail_sequence is not updated. (And another case is, __jbd2_journal_clean_checkpoint_list() is called with empty transaction.) So in above cases, ->j_tail_sequence is not pointing latest transaction ID at umount path. Plus, REQ_FLUSH for checkpoint is not done too. So, to fix this problem with minimum changes, this patch updates ->j_tail_sequence, and issue REQ_FLUSH. (With more complex changes, some optimizations would be possible to avoid unnecessary REQ_FLUSH for example though.) BTW, journal->j_tail_sequence = ++journal->j_transaction_sequence; Increment of ->j_transaction_sequence seems to be unnecessary, but ext3 does this. Signed-off-by: OGAWA Hirofumi Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7bf1683..de73a95 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1430,11 +1430,12 @@ out: /** * jbd2_mark_journal_empty() - Mark on disk journal as empty. * @journal: The journal to update. + * @write_op: With which operation should we write the journal sb * * Update a journal's dynamic superblock fields to show that journal is empty. * Write updated superblock to disk waiting for IO to complete. */ -static void jbd2_mark_journal_empty(journal_t *journal) +static void jbd2_mark_journal_empty(journal_t *journal, int write_op) { journal_superblock_t *sb = journal->j_superblock; @@ -1452,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, write_op); /* Log is no longer empty */ write_lock(&journal->j_state_lock); @@ -1738,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + + write_lock(&journal->j_state_lock); + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + write_unlock(&journal->j_state_lock); + + jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1997,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2043,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } -- cgit v0.10.2 From b8a07463c8c5fd7c609590c7cd9eda897a1b6cd6 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Wed, 9 Mar 2016 23:49:05 -0500 Subject: ext4: fix misspellings in comments. Signed-off-by: Adam Buchbinder Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 3c93815..8ecf84b 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2730039..95bf467 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -15,7 +15,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index acc0ad5..237b877 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, sbi = EXT4_SB(sb); /* - * Initalize owners and quota early so that we don't have to account + * Initialize owners and quota early so that we don't have to account * for quota initialization worst case in standard inode creating * transaction */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f6ff478..0ca2f4c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a465189..364ea4d 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * blocks. * * While converting to extents we need not - * update the orignal inode i_blocks for extent blocks + * update the original inode i_blocks for extent blocks * via quota APIs. The quota update happened via tmp_inode already. */ spin_lock(&inode->i_lock); -- cgit v0.10.2 From a8ed9b8695065d37d45efc446efbde5654f9f7c2 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Mar 2016 00:18:57 -0500 Subject: ext4: drop unneeded BUFFER_TRACE in ext4_delete_inline_entry() BUFFER_TRACE info "call ext4_handle_dirty_metadata" doesn't match the code, so drop it. Signed-off-by: Geliang Tang Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 36d8cc9..7cbdd375 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1697,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) goto out; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_mark_inode_dirty(handle, dir); if (unlikely(err)) goto out; -- cgit v0.10.2 From 5e1021f2b6dff1a86a468a1424d59faae2bc63c1 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Sat, 12 Mar 2016 21:40:32 -0500 Subject: ext4: fix NULL pointer dereference in ext4_mark_inode_dirty() ext4_reserve_inode_write() in ext4_mark_inode_dirty() could fail on error (e.g. EIO) and iloc.bh can be NULL in this case. But the error is ignored in the following "if" condition and ext4_expand_extra_isize() might be called with NULL iloc.bh set, which triggers NULL pointer dereference. This is uncovered by commit 8b4953e13f4c ("ext4: reserve code points for the project quota feature"), which enlarges the ext4_inode size, and run the following script on new kernel but with old mke2fs: #/bin/bash mnt=/mnt/ext4 devname=ext4-error dev=/dev/mapper/$devname fsimg=/home/fs.img trap cleanup 0 1 2 3 9 15 cleanup() { umount $mnt >/dev/null 2>&1 dmsetup remove $devname losetup -d $backend_dev rm -f $fsimg exit 0 } rm -f $fsimg fallocate -l 1g $fsimg backend_dev=`losetup -f --show $fsimg` devsize=`blockdev --getsz $backend_dev` good_tab="0 $devsize linear $backend_dev 0" error_tab="0 $devsize error $backend_dev 0" dmsetup create $devname --table "$good_tab" mkfs -t ext4 $dev mount -t ext4 -o errors=continue,strictatime $dev $mnt dmsetup load $devname --table "$error_tab" && dmsetup resume $devname echo 3 > /proc/sys/vm/drop_caches ls -l $mnt exit 0 [ Patch changed to simplify the function a tiny bit. -- Ted ] Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce2c4c6..b41432e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5312,6 +5312,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { @@ -5342,9 +5344,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) } } } - if (!err) - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - return err; + return ext4_mark_iloc_dirty(handle, inode, &iloc); } /* -- cgit v0.10.2 From 7915a861c01839a05eb7346023741742c4d2135e Mon Sep 17 00:00:00 2001 From: Ales Novak Date: Sat, 12 Mar 2016 21:55:50 -0500 Subject: ext4: print ext4 mount option data_err=abort correctly If data_err=abort option is specified for an ext3/ext4 mount, /proc/mounts does show it as "(null)". This is caused by token2str() returning NULL for Opt_data_err_abort (due to its pattern containing '='). Signed-off-by: Ales Novak Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/super.c b/fs/ext4/super.c index de02a9e..99996e9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1422,9 +1422,9 @@ static const struct mount_opts { {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2 | MOPT_SET}, + MOPT_NO_EXT2}, {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2 | MOPT_CLEAR}, + MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, @@ -1702,6 +1702,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, ext4_msg(sb, KERN_INFO, "dax option not supported"); return -1; #endif + } else if (token == Opt_data_err_abort) { + sbi->s_mount_opt |= m->mount_opt; + } else if (token == Opt_data_err_ignore) { + sbi->s_mount_opt &= ~m->mount_opt; } else { if (!args->from) arg = 1; @@ -1911,6 +1915,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); + if (test_opt(sb, DATA_ERR_ABORT)) + SEQ_OPTS_PUTS("data_err=abort"); ext4_show_quota_options(seq, sb); return 0; -- cgit v0.10.2 From a2821e34df141ac6019552618b57fa09227ff0dd Mon Sep 17 00:00:00 2001 From: Aihua Zhang Date: Sun, 13 Mar 2016 17:18:12 -0400 Subject: ext4: fix compile error while opening the macro DOUBLE_CHECK the error is: fs/ext4/mballoc.c:475:43: error: 'struct ext4_group_info' has no member named 'bb_bitmap'. so, the definition of macro DOUBLE_CHECK should before 'struct ext4_group_info', I fixed it, and I moved the macro AGGRESSIVE_CHECK together, because I think they shoule be together. Signed-off-by: Aihua Zhang Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5623eec..393689d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -42,6 +42,18 @@ */ /* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* * Define EXT4FS_DEBUG to produce debug messages */ #undef EXT4FS_DEBUG diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d634e18..3ef1df6 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,18 +23,6 @@ #include "ext4.h" /* - * with AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* */ #ifdef CONFIG_EXT4_DEBUG extern ushort ext4_mballoc_debug; -- cgit v0.10.2 From adb7ef600cc9d9d15ecc934cc26af5c1379777df Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 13 Mar 2016 17:29:06 -0400 Subject: ext4: use __GFP_NOFAIL in ext4_free_blocks() This might be unexpected but pages allocated for sbi->s_buddy_cache are charged to current memory cgroup. So, GFP_NOFS allocation could fail if current task has been killed by OOM or if current memory cgroup has no free memory left. Block allocator cannot handle such failures here yet. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0ca2f4c..50e05df 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * for this page; do not hold this lock when calling this routine! */ -static int ext4_mb_init_cache(struct page *page, char *incore) +static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; int blocksize; @@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; - bh = kzalloc(i, GFP_NOFS); + bh = kzalloc(i, gfp); if (bh == NULL) { err = -ENOMEM; goto out; @@ -983,7 +983,7 @@ out: * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, - ext4_group_t group, struct ext4_buddy *e4b) + ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; @@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block++; pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) * calling this routine! */ static noinline_for_stack -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; @@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group @@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } /* init buddy cache */ page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1109,8 +1109,8 @@ err: * calling this routine! */ static noinline_for_stack int -ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; @@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * we need full data about the group * to make a good selection */ - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } @@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * wait for it to initialize. */ page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { unlock_page(page); goto err; @@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + gfp); if (ret) { unlock_page(page); goto err; @@ -1247,6 +1248,12 @@ err: return ret; } +static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); +} + static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) @@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group); + int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); if (ret) return ret; } @@ -4804,7 +4811,9 @@ do_more: #endif trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); - err = ext4_mb_load_buddy(sb, block_group, &e4b); + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); if (err) goto error_return; @@ -5211,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) grp = ext4_get_group_info(sb, group); /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } -- cgit v0.10.2 From 490c1b444ce653d0784a9a5fa4d11287029feeb9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sun, 13 Mar 2016 17:38:20 -0400 Subject: jbd2: do not fail journal because of frozen_buffer allocation failure Journal transaction might fail prematurely because the frozen_buffer is allocated by GFP_NOFS request: [ 72.440013] do_get_write_access: OOM for frozen_buffer [ 72.440014] EXT4-fs: ext4_reserve_inode_write:4729: aborting transaction: Out of memory in __ext4_journal_get_write_access [ 72.440015] EXT4-fs error (device sda1) in ext4_reserve_inode_write:4735: Out of memory (...snipped....) [ 72.495559] do_get_write_access: OOM for frozen_buffer [ 72.495560] EXT4-fs: ext4_reserve_inode_write:4729: aborting transaction: Out of memory in __ext4_journal_get_write_access [ 72.496839] do_get_write_access: OOM for frozen_buffer [ 72.496841] EXT4-fs: ext4_reserve_inode_write:4729: aborting transaction: Out of memory in __ext4_journal_get_write_access [ 72.505766] Aborting journal on device sda1-8. [ 72.505851] EXT4-fs (sda1): Remounting filesystem read-only This wasn't a problem until "mm: page_alloc: do not lock up GFP_NOFS allocations upon OOM" because small GPF_NOFS allocations never failed. This allocation seems essential for the journal and GFP_NOFS is too restrictive to the memory allocator so let's use __GFP_NOFAIL here to emulate the previous behavior. Signed-off-by: Michal Hocko Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 081dff0..01e4652d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -966,14 +966,8 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - goto out; - } + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS | __GFP_NOFAIL); goto repeat; } jh->b_frozen_data = frozen_buffer; @@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) goto out; repeat: - if (!jh->b_committed_data) { - committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); - if (!committed_data) { - printk(KERN_ERR "%s: No memory for committed data\n", - __func__); - err = -ENOMEM; - goto out; - } - } + if (!jh->b_committed_data) + committed_data = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS|__GFP_NOFAIL); jbd_lock_bh_state(bh); if (!jh->b_committed_data) { -- cgit v0.10.2 From 0304688676bdfc8159e165313d71da19c118ba27 Mon Sep 17 00:00:00 2001 From: "vikram.jadhav07" Date: Sun, 13 Mar 2016 17:56:52 -0400 Subject: ext4: clean up error handling in the MMP support There is memory leak as both caller function kmmpd() and callee read_mmp_block() not releasing bh_check (i.e buffer_head). Given patch fixes this problem. [ Additional changes suggested by Andreas Dilger -- TYT ] Signed-off-by: Jadhav Vikram Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0a512aa..2444527 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; ret = -EIO; goto warn_exit; } - mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) { ret = -EFSCORRUPTED; - else if (!ext4_mmp_csum_verify(sb, mmp)) + goto warn_exit; + } + if (!ext4_mmp_csum_verify(sb, mmp)) { ret = -EFSBADCRC; - else - return 0; - + goto warn_exit; + } + return 0; warn_exit: + brelse(*bh); + *bh = NULL; ext4_warning(sb, "Error %d while reading MMP block %llu", ret, mmp_block); return ret; @@ -181,15 +182,13 @@ static int kmmpd(void *data) EXT4_FEATURE_INCOMPAT_MMP)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } if (sb->s_flags & MS_RDONLY) { ext4_warning(sb, "kmmpd being stopped since filesystem " "has been remounted as readonly."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } diff = jiffies - last_update_time; @@ -211,9 +210,7 @@ static int kmmpd(void *data) if (retval) { ext4_error(sb, "error reading MMP data: %d", retval); - - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } mmp_check = (struct mmp_struct *)(bh_check->b_data); @@ -225,7 +222,9 @@ static int kmmpd(void *data) "The filesystem seems to have been" " multiply mounted."); ext4_error(sb, "abort"); - goto failed; + put_bh(bh_check); + retval = -EBUSY; + goto exit_thread; } put_bh(bh_check); } @@ -248,7 +247,8 @@ static int kmmpd(void *data) retval = write_mmp_block(sb, bh); -failed: +exit_thread: + EXT4_SB(sb)->s_mmp_tsk = NULL; kfree(data); brelse(bh); return retval; -- cgit v0.10.2