diff options
author | Ingo Molnar <mingo@kernel.org> | 2013-08-15 08:00:09 (GMT) |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2013-08-15 08:00:09 (GMT) |
commit | c9572f010d369d9905309f63e31180f291b66a8a (patch) | |
tree | fbca2a0576c4223790e23a3c1d5f50e2bdf64e10 /fs | |
parent | 58cea307432e3376293e6b2be88d1f6e6e99274a (diff) | |
parent | d4e4ab86bcba5a72779c43dc1459f71fea3d89c8 (diff) | |
download | linux-fsl-qoriq-c9572f010d369d9905309f63e31180f291b66a8a.tar.xz |
Merge tag 'v3.11-rc5' into perf/core
Merge Linux 3.11-rc5, to sync up with the latest upstream fixes since -rc1.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs')
41 files changed, 414 insertions, 333 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index eaf1333..8bc5e8c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -36,16 +36,23 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, u64 extent_item_pos, struct extent_inode_elem **eie) { - u64 data_offset; - u64 data_len; + u64 offset = 0; struct extent_inode_elem *e; - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); + if (!btrfs_file_extent_compression(eb, fi) && + !btrfs_file_extent_encryption(eb, fi) && + !btrfs_file_extent_other_encoding(eb, fi)) { + u64 data_offset; + u64 data_len; - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - return 1; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + offset = extent_item_pos - data_offset; + } e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) @@ -53,7 +60,7 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + (extent_item_pos - data_offset); + e->offset = key->offset + offset; *eie = e; return 0; @@ -189,7 +196,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct extent_inode_elem *eie = NULL; + struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; if (level != 0) { @@ -223,6 +230,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; + old = NULL; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -230,18 +238,20 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (ret < 0) break; } - if (!ret) { - ret = ulist_add(parents, eb->start, - (uintptr_t)eie, GFP_NOFS); - if (ret < 0) - break; - if (!extent_item_pos) { - ret = btrfs_next_old_leaf(root, path, - time_seq); - continue; - } + if (ret > 0) + goto next; + ret = ulist_add_merge(parents, eb->start, + (uintptr_t)eie, + (u64 *)&old, GFP_NOFS); + if (ret < 0) + break; + if (!ret && extent_item_pos) { + while (old->next) + old = old->next; + old->next = eie; } } +next: ret = btrfs_next_old_item(root, path, time_seq); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5bf4c39..ed50460 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1271,7 +1271,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, BUG_ON(!eb_rewin); } - extent_buffer_get(eb_rewin); btrfs_tree_read_unlock(eb); free_extent_buffer(eb); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0236de7..1204c8e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7466,6 +7466,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int err = 0; int ret; int level; + bool root_dropped = false; path = btrfs_alloc_path(); if (!path) { @@ -7523,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, while (1) { btrfs_tree_lock(path->nodes[level]); btrfs_set_lock_blocking(path->nodes[level]); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, path->nodes[level]->start, @@ -7538,6 +7540,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, break; btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; WARN_ON(wc->refs[level] != 1); level--; } @@ -7552,11 +7555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { - if (!for_reloc && btrfs_need_cleaner_sleep(root)) { - pr_debug("btrfs: drop snapshot early exit\n"); - err = -EAGAIN; - goto out_end_trans; - } ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { @@ -7584,7 +7582,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } BUG_ON(wc->level == 0); - if (btrfs_should_end_transaction(trans, tree_root)) { + if (btrfs_should_end_transaction(trans, tree_root) || + (!for_reloc && btrfs_need_cleaner_sleep(root))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); @@ -7595,6 +7594,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } btrfs_end_transaction_throttle(trans, tree_root); + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + pr_debug("btrfs: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_free; + } + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -7639,12 +7644,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); btrfs_put_fs_root(root); } + root_dropped = true; out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); btrfs_free_path(path); out: + /* + * So if we need to stop dropping the snapshot for whatever reason we + * need to make sure to add it back to the dead root list so that we + * keep trying to do the work later. This also cleans up roots if we + * don't have it in the radix (like when we recover after a power fail + * or unmount) so we don't leak memory. + */ + if (root_dropped == false) + btrfs_add_dead_root(root); if (err) btrfs_std_error(root->fs_info, err); return err; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 583d98b..fe443fe 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4048,7 +4048,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - u64 offset_in_extent; + u64 offset_in_extent = 0; /* break if the extent we found is outside the range */ if (em->start >= max || extent_map_end(em) < off) @@ -4064,9 +4064,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* * record the offset from the start of the extent - * for adjusting the disk offset below + * for adjusting the disk offset below. Only do this if the + * extent isn't compressed since our in ram offset may be past + * what we have actually allocated on disk. */ - offset_in_extent = em_start - em->start; + if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; emflags = em->flags; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a005fe2..8e686a4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -596,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (no_splits) goto next; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { + if (em->start < start) { split->start = em->start; split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->ram_bytes = em->ram_bytes; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -620,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = split2; split2 = NULL; } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { + if (testend && em->start + em->len > start + len) { u64 diff = start + len - em->start; split->start = start + len; @@ -630,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; - split->orig_block_len = max(em->block_len, + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, em->orig_block_len); - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; - split->orig_start = em->orig_start; + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; } ret = add_extent_mapping(em_tree, split, modified); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6d1b93c..021694c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2166,16 +2166,23 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) continue; - extent_offset = btrfs_file_extent_offset(leaf, extent); - if (key.offset - extent_offset != offset) + /* + * 'offset' refers to the exact key.offset, + * NOT the 'offset' field in btrfs_extent_data_ref, ie. + * (key.offset - extent_offset). + */ + if (key.offset != offset) continue; + extent_offset = btrfs_file_extent_offset(leaf, extent); num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + old->len || extent_offset + num_bytes <= old->extent_offset + old->offset) continue; + ret = 0; break; } @@ -2187,7 +2194,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, backref->root_id = root_id; backref->inum = inum; - backref->file_pos = offset + extent_offset; + backref->file_pos = offset; backref->num_bytes = num_bytes; backref->extent_offset = extent_offset; backref->generation = btrfs_file_extent_generation(leaf, extent); @@ -2210,7 +2217,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, new->path = path; list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr, fs_info, + ret = iterate_inodes_from_logical(old->bytenr + + old->extent_offset, fs_info, path, record_one_backref, old); BUG_ON(ret < 0 && ret != -ENOENT); @@ -4391,9 +4399,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) int mask = attr->ia_valid; int ret; - if (newsize == oldsize) - return 0; - /* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having @@ -5165,14 +5170,31 @@ next: } /* Reached end of directory/root. Bump pos past the last item. */ - if (key_type == BTRFS_DIR_INDEX_KEY) - /* - * 32-bit glibc will use getdents64, but then strtol - - * so the last number we can serve is this. - */ - ctx->pos = 0x7fffffff; - else - ctx->pos++; + ctx->pos++; + + /* + * Stop new entries from being returned after we return the last + * entry. + * + * New directory entries are assigned a strictly increasing + * offset. This means that new entries created during readdir + * are *guaranteed* to be seen in the future by that readdir. + * This has broken buggy programs which operate on names as + * they're returned by readdir. Until we re-use freed offsets + * we have this hack to stop new entries from being returned + * under the assumption that they'll never reach this huge + * offset. + * + * This is being careful not to overflow 32bit loff_t unless the + * last entry requires it because doing so has broken 32bit apps + * in the past. + */ + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; + } nopos: ret = 0; err: diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4ba2a69..64a157b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2495,7 +2495,7 @@ again: ret = scrub_extent(sctx, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, - extent_physical); + extent_logical - logical + physical); if (ret) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d58cce7..af1931a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -983,12 +983,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root) +void btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - list_add_tail(&root->root_list, &root->fs_info->dead_roots); + if (list_empty(&root->root_list)) + list_add_tail(&root->root_list, &root->fs_info->dead_roots); spin_unlock(&root->fs_info->trans_lock); - return 0; } /* @@ -1925,7 +1925,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); - list_del(&root->root_list); + list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); pr_debug("btrfs: cleaner removing %llu\n", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 005b037..defbc42 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -143,7 +143,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root); +void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2c67914..ff60d89 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3746,8 +3746,9 @@ next_slot: } log_extents: + btrfs_release_path(path); + btrfs_release_path(dst_path); if (fast_search) { - btrfs_release_path(dst_path); ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; @@ -3764,8 +3765,6 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - btrfs_release_path(path); - btrfs_release_path(dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); if (ret) { err = ret; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 4888cb3..c7c83ff 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove); */ void debugfs_remove_recursive(struct dentry *dentry) { - struct dentry *child; - struct dentry *parent; + struct dentry *child, *next, *parent; if (IS_ERR_OR_NULL(dentry)) return; @@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry) return; parent = dentry; + down: mutex_lock(&parent->d_inode->i_mutex); + list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { + if (!debugfs_positive(child)) + continue; - while (1) { - /* - * When all dentries under "parent" has been removed, - * walk up the tree until we reach our starting point. - */ - if (list_empty(&parent->d_subdirs)) { - mutex_unlock(&parent->d_inode->i_mutex); - if (parent == dentry) - break; - parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); - } - child = list_entry(parent->d_subdirs.next, struct dentry, - d_u.d_child); - next_sibling: - - /* - * If "child" isn't empty, walk down the tree and - * remove all its descendants first. - */ + /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { mutex_unlock(&parent->d_inode->i_mutex); parent = child; - mutex_lock(&parent->d_inode->i_mutex); - continue; + goto down; } - __debugfs_remove(child, parent); - if (parent->d_subdirs.next == &child->d_u.d_child) { - /* - * Try the next sibling. - */ - if (child->d_u.d_child.next != &parent->d_subdirs) { - child = list_entry(child->d_u.d_child.next, - struct dentry, - d_u.d_child); - goto next_sibling; - } - - /* - * Avoid infinite loop if we fail to remove - * one dentry. - */ - mutex_unlock(&parent->d_inode->i_mutex); - break; - } - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + up: + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } - parent = dentry->d_parent; + mutex_unlock(&parent->d_inode->i_mutex); + child = parent; + parent = parent->d_parent; mutex_lock(&parent->d_inode->i_mutex); - __debugfs_remove(dentry, parent); + + if (child != dentry) { + next = list_entry(child->d_u.d_child.next, struct dentry, + d_u.d_child); + goto up; + } + + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); mutex_unlock(&parent->d_inode->i_mutex); - simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 911649a..8121491 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -686,7 +686,6 @@ static int device_close(struct inode *inode, struct file *file) device_remove_lockspace() */ sigprocmask(SIG_SETMASK, &tmpsig, NULL); - recalc_sigpending(); return 0; } diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 998ea11..1194b1f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1780,11 +1780,11 @@ retry: inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; ext3_set_aops(inode); + d_tmpfile(dentry, inode); err = ext3_orphan_add(handle, inode); if (err) goto err_drop_inode; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); unlock_new_inode(inode); } ext3_journal_stop(handle); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7097b0f..72ba470 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2835,6 +2835,9 @@ again: err = -EIO; break; } + /* Yield here to deal with large extent trees. + * Should be a no-op if we did IO above. */ + cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EIO; break; @@ -4261,8 +4264,8 @@ got_allocated_blocks: /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), fb_flags); + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), fb_flags); goto out2; } @@ -4382,8 +4385,9 @@ out2: } out3: - trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); - + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); + ext4_es_lru_add(inode); return err ? err : allocated; } @@ -4405,9 +4409,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); +retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (err) { + ext4_std_error(inode->i_sb, err); + return; + } err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + ext4_std_error(inode->i_sb, err); } static void ext4_falloc_update_inode(struct inode *inode, diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 4b8df7f..91cb110 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -148,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, int nr_to_scan); +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); int __init ext4_init_es(void) { @@ -665,7 +667,13 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, err = __es_remove_extent(inode, lblk, end); if (err != 0) goto error; +retry: err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; error: write_unlock(&EXT4_I(inode)->i_es_lock); @@ -744,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; - int err = 0; + int err; +retry: + err = 0; es = __es_tree_search(&tree->root, lblk); if (!es) goto out; @@ -780,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (err) { es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && + __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; goto out; } } else { @@ -889,22 +903,14 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, return -1; } -static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); struct ext4_inode_info *ei; struct list_head *cur, *tmp; LIST_HEAD(skiped); - int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk = 0; - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); - - if (!nr_to_scan) - return ret; - spin_lock(&sbi->s_es_lru_lock); /* @@ -933,7 +939,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) continue; } - if (ei->i_es_lru_nr == 0) + if (ei->i_es_lru_nr == 0 || ei == locked_ei) continue; write_lock(&ei->i_es_lock); @@ -952,6 +958,27 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) list_splice_tail(&skiped, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); + if (locked_ei && nr_shrunk == 0) + nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + + return nr_shrunk; +} + +static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + + nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); return ret; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f03598c..8bf5999 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -734,11 +734,8 @@ repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) { - if (++group == ngroups) - group = 0; - continue; - } + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); @@ -769,6 +766,9 @@ repeat_in_this_group: goto got; /* we grabbed the inode! */ if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; +next_group: + if (++group == ngroups) + group = 0; } err = -ENOSPC; goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 98b9bff..dd32a2e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -514,10 +514,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); - ext4_es_lru_add(inode); - /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lru_add(inode); if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -556,14 +555,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -657,14 +655,13 @@ found: int ret; unsigned long long status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (allocation)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif /* * If the extent has been zeroed out, we don't need to update @@ -1529,11 +1526,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, "logical block %lu\n", inode->i_ino, map->m_len, (unsigned long) map->m_lblk); - ext4_es_lru_add(inode); - /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { - + ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; down_read((&EXT4_I(inode)->i_data_sem)); @@ -1640,14 +1635,13 @@ add_delayed: int ret; unsigned long long status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 234b834..35f55a0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2316,11 +2316,11 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); + d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_drop_inode; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); unlock_new_inode(inode); } if (handle) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bca26f3..36b141e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5481,6 +5481,7 @@ static void __exit ext4_exit_fs(void) kset_unregister(ext4_kset); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_es(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); @@ -730,14 +730,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH + __FMODE_EXEC | O_PATH | __O_TMPFILE )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0eda527..72a5d5b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1223,30 +1223,46 @@ static int fuse_direntplus_link(struct file *file, if (name.name[1] == '.' && name.len == 2) return 0; } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (!fuse_valid_type(o->attr.mode)) + return -EIO; + fc = get_fuse_conn(dir); name.hash = full_name_hash(name.name, name.len); dentry = d_lookup(parent, &name); - if (dentry && dentry->d_inode) { + if (dentry) { inode = dentry->d_inode; - if (get_node_id(inode) == o->nodeid) { + if (!inode) { + d_drop(dentry); + } else if (get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + err = d_invalidate(dentry); + if (err) + goto out; + } else if (is_bad_inode(inode)) { + err = -EIO; + goto out; + } else { struct fuse_inode *fi; fi = get_fuse_inode(inode); spin_lock(&fc->lock); fi->nlookup++; spin_unlock(&fc->lock); + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* * The other branch to 'found' comes via fuse_iget() * which bumps nlookup inside */ goto found; } - err = d_invalidate(dentry); - if (err) - goto out; dput(dentry); - dentry = NULL; } dentry = d_alloc(parent, &name); @@ -1259,25 +1275,30 @@ static int fuse_direntplus_link(struct file *file, if (!inode) goto out; - alias = d_materialise_unique(dentry, inode); - err = PTR_ERR(alias); - if (IS_ERR(alias)) - goto out; + if (S_ISDIR(inode->i_mode)) { + mutex_lock(&fc->inst_mutex); + alias = fuse_d_add_directory(dentry, inode); + mutex_unlock(&fc->inst_mutex); + err = PTR_ERR(alias); + if (IS_ERR(alias)) { + iput(inode); + goto out; + } + } else { + alias = d_splice_alias(inode, dentry); + } + if (alias) { dput(dentry); dentry = alias; } found: - fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), - attr_version); - fuse_change_entry_timeout(dentry, o); err = 0; out: - if (dentry) - dput(dentry); + dput(dentry); return err; } diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 01bfe76..41e491b 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -64,12 +64,17 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) nlm_init->protocol, nlm_version, nlm_init->hostname, nlm_init->noresvport, nlm_init->net); - if (host == NULL) { - lockd_down(nlm_init->net); - return ERR_PTR(-ENOLCK); - } + if (host == NULL) + goto out_nohost; + if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) + goto out_nobind; return host; +out_nobind: + nlmclnt_release_host(host); +out_nohost: + lockd_down(nlm_init->net); + return ERR_PTR(-ENOLCK); } EXPORT_SYMBOL_GPL(nlmclnt_init); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 9760ecb..acd3947 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -125,14 +125,15 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) { struct nlm_args *argp = &req->a_args; struct nlm_lock *lock = &argp->lock; + char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); - lock->caller = utsname()->nodename; + lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", (unsigned int)fl->fl_u.nfs_fl.owner->pid, - utsname()->nodename); + nodename); lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; @@ -3671,15 +3671,11 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; /* - * To use null names we require CAP_DAC_READ_SEARCH - * This ensures that not everyone will be able to create - * handlink using the passed filedescriptor. + * Using empty names is equivalent to using AT_SYMLINK_FOLLOW + * on /proc/self/fd/<fd>. */ - if (flags & AT_EMPTY_PATH) { - if (!capable(CAP_DAC_READ_SEARCH)) - return -ENOENT; + if (flags & AT_EMPTY_PATH) how = LOOKUP_EMPTY; - } if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index af6e806..941246f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -463,7 +463,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - nfs_setsecurity(inode, fattr, label); dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -963,9 +962,15 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); - + int ret; + if (mapping->nrpages != 0) { - int ret = invalidate_inode_pages2(mapping); + if (S_ISREG(inode->i_mode)) { + ret = nfs_sync_mapping(mapping); + if (ret < 0) + return ret; + } + ret = invalidate_inode_pages2(mapping); if (ret < 0) return ret; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf11799..108a774 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3071,15 +3071,13 @@ struct rpc_clnt * nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { + struct rpc_clnt *client = NFS_CLIENT(dir); int status; - struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir)); status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); - if (status < 0) { - rpc_shutdown_client(client); + if (status < 0) return ERR_PTR(status); - } - return client; + return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; } static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0abfb846..3850b01 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -999,6 +999,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, __be32 *p; __be32 *q; int len; + uint32_t bmval_len = 2; uint32_t bmval0 = 0; uint32_t bmval1 = 0; uint32_t bmval2 = 0; @@ -1010,7 +1011,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, * = 40 bytes, plus any contribution from variable-length fields * such as owner/group. */ - len = 20; + len = 8; /* Sigh */ if (iap->ia_valid & ATTR_SIZE) @@ -1040,8 +1041,6 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, } len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } - if (label) - len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); if (iap->ia_valid & ATTR_ATIME_SET) len += 16; else if (iap->ia_valid & ATTR_ATIME) @@ -1050,15 +1049,22 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, len += 16; else if (iap->ia_valid & ATTR_MTIME) len += 4; + if (label) { + len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); + bmval_len = 3; + } + + len += bmval_len << 2; p = reserve_space(xdr, len); /* * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval_len); q = p; - p += 4; + /* Skip bitmap entries + attrlen */ + p += bmval_len + 1; if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; @@ -1112,10 +1118,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, len, ((char *)p - (char *)q) + 4); BUG(); } - len = (char *)p - (char *)q - 16; *q++ = htonl(bmval0); *q++ = htonl(bmval1); - *q++ = htonl(bmval2); + if (bmval_len == 3) + *q++ = htonl(bmval2); + len = (char *)p - (char *)(q + 1); *q = htonl(len); /* out: */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 71fdc0d..f6db66d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2478,6 +2478,10 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, if (server->flags & NFS_MOUNT_NOAC) sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) + if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); if (IS_ERR(s)) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0d4c410..419572f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1524,7 +1524,7 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ - 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ 2 + /*eir_server_owner.so_minor_id */\ /* eir_server_owner.so_major_id<> */\ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 280acef..43f4229 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1264,6 +1264,8 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) struct svc_cred *cr = &rqstp->rq_cred; u32 service; + if (!cr->cr_gss_mech) + return false; service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); return service == RPC_GSS_SVC_INTEGRITY || service == RPC_GSS_SVC_PRIVACY; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0c0f3ea9..c2a4701 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3360,7 +3360,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + - 4 /* spr_how (SP4_NONE) */ + + 4 /* spr_how */ + + 8 /* spo_must_enforce, spo_must_allow */ + 8 /* so_minor_id */ + 4 /* so_major_id.len */ + (XDR_QUADLEN(major_id_sz) * 4) + @@ -3372,8 +3373,6 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, WRITE32(exid->seqid); WRITE32(exid->flags); - /* state_protect4_r. Currently only support SP4_NONE */ - BUG_ON(exid->spa_how != SP4_NONE); WRITE32(exid->spa_how); switch (exid->spa_how) { case SP4_NONE: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8ff6a00..c827acb 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -830,9 +830,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, flags = O_WRONLY|O_LARGEFILE; } *filp = dentry_open(&path, flags, current_cred()); - if (IS_ERR(*filp)) + if (IS_ERR(*filp)) { host_err = PTR_ERR(*filp); - else { + *filp = NULL; + } else { host_err = ima_file_check(*filp, may_flags); if (may_flags & NFSD_MAY_64BIT_COOKIE) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 998b17e..9f6b96a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2965,6 +2965,11 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, to = map_end & (PAGE_CACHE_SIZE - 1); page = find_or_create_page(mapping, page_index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + break; + } /* * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page @@ -823,7 +823,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode; - if (flags & O_CREAT) + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else op->mode = 0; @@ -844,6 +844,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if ((flags & O_TMPFILE_MASK) != O_TMPFILE) return -EINVAL; acc_mode = MAY_OPEN | ACC_MODE(flags); + if (!(acc_mode & MAY_WRITE)) + return -EINVAL; } else if (flags & O_PATH) { /* * If we have O_PATH in the open flag. Then we diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 2850317..a1a16eb 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -223,7 +223,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz) * regions in the 1st kernel pointed to by PT_LOAD entries) into * virtually contiguous user-space in ELF layout. */ -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && !defined(CONFIG_S390) static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 33532f7..a958444 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -19,12 +19,13 @@ /* * LOCKING: * - * We rely on new Alexander Viro's super-block locking. + * These guys are evicted from procfs as the very first step in ->kill_sb(). * */ -static int show_version(struct seq_file *m, struct super_block *sb) +static int show_version(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; char *format; if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { @@ -66,8 +67,9 @@ static int show_version(struct seq_file *m, struct super_block *sb) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) -static int show_super(struct seq_file *m, struct super_block *sb) +static int show_super(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" @@ -128,8 +130,9 @@ static int show_super(struct seq_file *m, struct super_block *sb) return 0; } -static int show_per_level(struct seq_file *m, struct super_block *sb) +static int show_per_level(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); int level; @@ -186,8 +189,9 @@ static int show_per_level(struct seq_file *m, struct super_block *sb) return 0; } -static int show_bitmap(struct seq_file *m, struct super_block *sb) +static int show_bitmap(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "free_block: %lu\n" @@ -218,8 +222,9 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb) return 0; } -static int show_on_disk_super(struct seq_file *m, struct super_block *sb) +static int show_on_disk_super(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; int hash_code = DFL(s_hash_function_code); @@ -261,8 +266,9 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb) return 0; } -static int show_oidmap(struct seq_file *m, struct super_block *sb) +static int show_oidmap(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); @@ -291,8 +297,9 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb) return 0; } -static int show_journal(struct seq_file *m, struct super_block *sb) +static int show_journal(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; @@ -383,92 +390,24 @@ static int show_journal(struct seq_file *m, struct super_block *sb) return 0; } -/* iterator */ -static int test_sb(struct super_block *sb, void *data) -{ - return data == sb; -} - -static int set_sb(struct super_block *sb, void *data) -{ - return -ENOENT; -} - -struct reiserfs_seq_private { - struct super_block *sb; - int (*show) (struct seq_file *, struct super_block *); -}; - -static void *r_start(struct seq_file *m, loff_t * pos) -{ - struct reiserfs_seq_private *priv = m->private; - loff_t l = *pos; - - if (l) - return NULL; - - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb))) - return NULL; - - up_write(&priv->sb->s_umount); - return priv->sb; -} - -static void *r_next(struct seq_file *m, void *v, loff_t * pos) -{ - ++*pos; - if (v) - deactivate_super(v); - return NULL; -} - -static void r_stop(struct seq_file *m, void *v) -{ - if (v) - deactivate_super(v); -} - -static int r_show(struct seq_file *m, void *v) -{ - struct reiserfs_seq_private *priv = m->private; - return priv->show(m, v); -} - -static const struct seq_operations r_ops = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = r_show, -}; - static int r_open(struct inode *inode, struct file *file) { - struct reiserfs_seq_private *priv; - int ret = seq_open_private(file, &r_ops, - sizeof(struct reiserfs_seq_private)); - - if (!ret) { - struct seq_file *m = file->private_data; - priv = m->private; - priv->sb = proc_get_parent_data(inode); - priv->show = PDE_DATA(inode); - } - return ret; + return single_open(file, PDE_DATA(inode), + proc_get_parent_data(inode)); } static const struct file_operations r_file_operations = { .open = r_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, - .owner = THIS_MODULE, + .release = single_release, }; static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, - int (*func) (struct seq_file *, struct super_block *)) + int (*func) (struct seq_file *, void *)) { proc_create_data(name, 0, REISERFS_SB(sb)->procdir, &r_file_operations, func); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f8a23c3..e2e202a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -499,6 +499,7 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { + reiserfs_proc_info_done(s); /* * Force any pending inode evictions to occur now. Any * inodes to be removed that have extended attributes @@ -554,8 +555,6 @@ static void reiserfs_put_super(struct super_block *s) REISERFS_SB(s)->reserved_blocks); } - reiserfs_proc_info_done(s); - reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); kfree(s->s_fs_info); @@ -336,19 +336,19 @@ EXPORT_SYMBOL(deactivate_super); * and want to turn it into a full-blown active reference. grab_super() * is called with sb_lock held and drops it. Returns 1 in case of * success, 0 if we had failed (superblock contents was already dead or - * dying when grab_super() had been called). + * dying when grab_super() had been called). Note that this is only + * called for superblocks not in rundown mode (== ones still on ->fs_supers + * of their type), so increment of ->s_count is OK here. */ static int grab_super(struct super_block *s) __releases(sb_lock) { - if (atomic_inc_not_zero(&s->s_active)) { - spin_unlock(&sb_lock); - return 1; - } - /* it's going away */ s->s_count++; spin_unlock(&sb_lock); - /* wait for it to die */ down_write(&s->s_umount); + if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) { + put_super(s); + return 1; + } up_write(&s->s_umount); put_super(s); return 0; @@ -463,11 +463,6 @@ retry: destroy_super(s); s = NULL; } - down_write(&old->s_umount); - if (unlikely(!(old->s_flags & MS_BORN))) { - deactivate_locked_super(old); - goto retry; - } return old; } } @@ -660,10 +655,10 @@ restart: if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { - if (grab_super(sb)) /* drops sb_lock */ - return sb; - else + if (!grab_super(sb)) goto restart; + up_write(&sb->s_umount); + return sb; } } spin_unlock(&sb_lock); diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 07d735a..e5869b5 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -39,6 +39,9 @@ typedef struct xfs_timestamp { * There is a very similar struct icdinode in xfs_inode which matches the * layout of the first 96 bytes of this structure, but is kept in native * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. */ typedef struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b78481f..bb262c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -896,7 +896,6 @@ xfs_dinode_to_disk( to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = cpu_to_be16(from->di_flushiter); to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); @@ -924,6 +923,9 @@ xfs_dinode_to_disk( to->di_lsn = cpu_to_be64(from->di_lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -1029,10 +1031,14 @@ xfs_dinode_calc_crc( /* * Read the disk inode attributes into the in-core inode structure. * - * If we are initialising a new inode and we are not utilising the - * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core - * with a random generation number. If we are keeping inodes around, we need to - * read the inode cluster to get the existing generation number off disk. + * For version 5 superblocks, if we are initialising a new inode and we are not + * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new + * inode core with a random generation number. If we are keeping inodes around, + * we need to read the inode cluster to get the existing generation number off + * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode + * format) then log recovery is dependent on the di_flushiter field being + * initialised from the current on-disk value and hence we must also read the + * inode off disk. */ int xfs_iread( @@ -1054,6 +1060,7 @@ xfs_iread( /* shortcut IO on inode allocation if possible */ if ((iget_flags & XFS_IGET_CREATE) && + xfs_sb_version_hascrc(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); @@ -2882,12 +2889,18 @@ xfs_iflush_int( __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } + /* - * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. This is redundant as we now - * log every change and hence this can't happen. Still, it doesn't hurt. + * Inode item log recovery for v1/v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. */ - ip->i_d.di_flushiter++; + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; /* * Copy the dirty parts of the inode into the on-disk diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6fcc910a..7681b19 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2592,8 +2592,16 @@ xlog_recover_inode_pass2( goto error; } - /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers @@ -2608,6 +2616,7 @@ xlog_recover_inode_pass2( goto error; } } + /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; |