Merge tag 'v4.0-rc5' into linux-next

Merge the upstream -rc5 tag because we needed a more up-to-date base our further work.
author: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> 2015-03-25 09:03:07 (GMT)
committer: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> 2015-03-25 09:03:07 (GMT)
commit: 3527a86b7ae17c949307d00e1eb7087604bca1b4 (patch)
tree: 8e8eab905e7e73a3bf2c4d47abd356166533c9db /fs
parent: b388e6a7a6ba988998ddd83919ae8d3debf1a13d (diff)
parent: bc465aa9d045feb0e13b4a8f32cc33c1943f62d6 (diff)
download: linux-3527a86b7ae17c949307d00e1eb7087604bca1b4.tar.xz
121 files changed, 2373 insertions, 1118 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9ee5343..3662f1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1127,7 +1127,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	}
 
 	/* Write all dirty data */
-	if (S_ISREG(dentry->d_inode->i_mode))
+	if (d_is_reg(dentry))
 		filemap_write_and_wait(dentry->d_inode->i_mapping);
 
 	retval = p9_client_wstat(fid, &wstat);
diff --git a/fs/aio.c b/fs/aio.c
index 118a2e0..f8e52a1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1285,7 +1285,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 
 	ret = -EINVAL;
 	if (unlikely(ctx || nr_events == 0)) {
-		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
+		pr_debug("EINVAL: ctx %lu nr_events %u\n",
 		         ctx, nr_events);
 		goto out;
 	}
@@ -1333,7 +1333,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 
 		return ret;
 	}
-	pr_debug("EINVAL: io_destroy: invalid context id\n");
+	pr_debug("EINVAL: invalid context id\n");
 	return -EINVAL;
 }
 
@@ -1515,7 +1515,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
 	    ((ssize_t)iocb->aio_nbytes < 0)
 	   )) {
-		pr_debug("EINVAL: io_submit: overflow check\n");
+		pr_debug("EINVAL: overflow check\n");
 		return -EINVAL;
 	}
 
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index aaf96cb..ac7d921 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  */
 static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
-	struct autofs_dev_ioctl tmp;
+	struct autofs_dev_ioctl tmp, *res;
 
 	if (copy_from_user(&tmp, in, sizeof(tmp)))
 		return ERR_PTR(-EFAULT);
@@ -106,7 +106,11 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
 	if (tmp.size > (PATH_MAX + sizeof(tmp)))
 		return ERR_PTR(-ENAMETOOLONG);
 
-	return memdup_user(in, tmp.size);
+	res = memdup_user(in, tmp.size);
+	if (!IS_ERR(res))
+		res->size = tmp.size;
+
+	return res;
 }
 
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index bfdbaba..11dd118 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -374,7 +374,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 		return NULL;
 	}
 
-	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+	if (dentry->d_inode && d_is_symlink(dentry)) {
 		DPRINTK("checking symlink %p %pd", dentry, dentry);
 		/*
 		 * A symlink can't be "busy" in the usual sense so
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index dbb5b72..7e44fdd 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
-	DPRINTK("file=%p dentry=%p %pD", file, dentry, dentry);
+	DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
 
 	if (autofs4_oz_mode(sbi))
 		goto out;
@@ -371,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	 * having d_mountpoint() true, so there's no need to call back
 	 * to the daemon.
 	 */
-	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+	if (dentry->d_inode && d_is_symlink(dentry)) {
 		spin_unlock(&sbi->fs_lock);
 		goto done;
 	}
@@ -485,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 		 * an incorrect ELOOP error return.
 		 */
 		if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
-		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+		    (dentry->d_inode && d_is_symlink(dentry)))
 			status = -EISDIR;
 	}
 	spin_unlock(&sbi->fs_lock);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index afd2b44..861b1e1 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -15,161 +15,14 @@
 #include <linux/namei.h>
 #include <linux/poll.h>
 
-
-static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
-{
-	return -EIO;
-}
-
-static ssize_t bad_file_read(struct file *filp, char __user *buf,
-			size_t size, loff_t *ppos)
-{
-        return -EIO;
-}
-
-static ssize_t bad_file_write(struct file *filp, const char __user *buf,
-			size_t siz, loff_t *ppos)
-{
-        return -EIO;
-}
-
-static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	return -EIO;
-}
-
-static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	return -EIO;
-}
-
-static int bad_file_readdir(struct file *file, struct dir_context *ctx)
-{
-	return -EIO;
-}
-
-static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
-{
-	return POLLERR;
-}
-
-static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
-			unsigned long arg)
-{
-	return -EIO;
-}
-
-static long bad_file_compat_ioctl(struct file *file, unsigned int cmd,
-			unsigned long arg)
-{
-	return -EIO;
-}
-
-static int bad_file_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	return -EIO;
-}
-
 static int bad_file_open(struct inode *inode, struct file *filp)
 {
 	return -EIO;
 }
 
-static int bad_file_flush(struct file *file, fl_owner_t id)
-{
-	return -EIO;
-}
-
-static int bad_file_release(struct inode *inode, struct file *filp)
-{
-	return -EIO;
-}
-
-static int bad_file_fsync(struct file *file, loff_t start, loff_t end,
-			  int datasync)
-{
-	return -EIO;
-}
-
-static int bad_file_aio_fsync(struct kiocb *iocb, int datasync)
-{
-	return -EIO;
-}
-
-static int bad_file_fasync(int fd, struct file *filp, int on)
-{
-	return -EIO;
-}
-
-static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl)
-{
-	return -EIO;
-}
-
-static ssize_t bad_file_sendpage(struct file *file, struct page *page,
-			int off, size_t len, loff_t *pos, int more)
-{
-	return -EIO;
-}
-
-static unsigned long bad_file_get_unmapped_area(struct file *file,
-				unsigned long addr, unsigned long len,
-				unsigned long pgoff, unsigned long flags)
-{
-	return -EIO;
-}
-
-static int bad_file_check_flags(int flags)
-{
-	return -EIO;
-}
-
-static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
-{
-	return -EIO;
-}
-
-static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe,
-			struct file *out, loff_t *ppos, size_t len,
-			unsigned int flags)
-{
-	return -EIO;
-}
-
-static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos,
-			struct pipe_inode_info *pipe, size_t len,
-			unsigned int flags)
-{
-	return -EIO;
-}
-
 static const struct file_operations bad_file_ops =
 {
-	.llseek		= bad_file_llseek,
-	.read		= bad_file_read,
-	.write		= bad_file_write,
-	.aio_read	= bad_file_aio_read,
-	.aio_write	= bad_file_aio_write,
-	.iterate	= bad_file_readdir,
-	.poll		= bad_file_poll,
-	.unlocked_ioctl	= bad_file_unlocked_ioctl,
-	.compat_ioctl	= bad_file_compat_ioctl,
-	.mmap		= bad_file_mmap,
 	.open		= bad_file_open,
-	.flush		= bad_file_flush,
-	.release	= bad_file_release,
-	.fsync		= bad_file_fsync,
-	.aio_fsync	= bad_file_aio_fsync,
-	.fasync		= bad_file_fasync,
-	.lock		= bad_file_lock,
-	.sendpage	= bad_file_sendpage,
-	.get_unmapped_area = bad_file_get_unmapped_area,
-	.check_flags	= bad_file_check_flags,
-	.flock		= bad_file_flock,
-	.splice_write	= bad_file_splice_write,
-	.splice_read	= bad_file_splice_read,
 };
 
 static int bad_inode_create (struct inode *dir, struct dentry *dentry,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 02b1691..995986b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -645,11 +645,12 @@ out:
 
 static unsigned long randomize_stack_top(unsigned long stack_top)
 {
-	unsigned int random_variable = 0;
+	unsigned long random_variable = 0;
 
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		random_variable = get_random_int() & STACK_RND_MASK;
+		random_variable = (unsigned long) get_random_int();
+		random_variable &= STACK_RND_MASK;
 		random_variable <<= PAGE_SHIFT;
 	}
 #ifdef CONFIG_STACK_GROWSUP
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9936421..6d67f32 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1645,14 +1645,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = root->nodesize;
-	end_slot = parent_nritems;
+	end_slot = parent_nritems - 1;
 
-	if (parent_nritems == 1)
+	if (parent_nritems <= 1)
 		return 0;
 
 	btrfs_set_lock_blocking(parent);
 
-	for (i = start_slot; i < end_slot; i++) {
+	for (i = start_slot; i <= end_slot; i++) {
 		int close = 1;
 
 		btrfs_node_key(parent, &disk_key, i);
@@ -1669,7 +1669,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			other = btrfs_node_blockptr(parent, i - 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
-		if (!close && i < end_slot - 2) {
+		if (!close && i < end_slot) {
 			other = btrfs_node_blockptr(parent, i + 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84c3b00..f9c89ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3387,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root);
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
@@ -3909,6 +3911,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 				    loff_t actual_len, u64 *alloc_hint);
 int btrfs_inode_check_errors(struct inode *inode);
 extern const struct dentry_operations btrfs_dentry_operations;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
+#endif
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f79f385..639f266 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3921,7 +3921,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	}
 	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
 			+ sizeof(struct btrfs_chunk)) {
-		printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+		printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
 				btrfs_super_sys_array_size(sb),
 				sizeof(struct btrfs_disk_key)
 				+ sizeof(struct btrfs_chunk));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 571f402..8b353ad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3208,6 +3208,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 		return 0;
 	}
 
+	if (trans->aborted)
+		return 0;
 again:
 	inode = lookup_free_space_inode(root, block_group, path);
 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -3243,6 +3245,20 @@ again:
 	 */
 	BTRFS_I(inode)->generation = 0;
 	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		/*
+		 * So theoretically we could recover from this, simply set the
+		 * super cache generation to 0 so we know to invalidate the
+		 * cache, but then we'd have to keep track of the block groups
+		 * that fail this way so we know we _have_ to reset this cache
+		 * before the next commit or risk reading stale cache.  So to
+		 * limit our exposure to horrible edge cases lets just abort the
+		 * transaction, this only happens in really bad situations
+		 * anyway.
+		 */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_put;
+	}
 	WARN_ON(ret);
 
 	if (i_size_read(inode) > 0) {
@@ -3309,6 +3325,32 @@ out:
 	return ret;
 }
 
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache, *tmp;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct btrfs_path *path;
+
+	if (list_empty(&cur_trans->dirty_bgs) ||
+	    !btrfs_test_opt(root, SPACE_CACHE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* Could add new block groups, use _safe just in case */
+	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+				 dirty_list) {
+		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+			cache_save_setup(cache, trans, path);
+	}
+
+	btrfs_free_path(path);
+	return 0;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
@@ -5094,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 
 	spin_lock(&BTRFS_I(inode)->lock);
-	BTRFS_I(inode)->outstanding_extents++;
+	nr_extents = (unsigned)div64_u64(num_bytes +
+					 BTRFS_MAX_EXTENT_SIZE - 1,
+					 BTRFS_MAX_EXTENT_SIZE);
+	BTRFS_I(inode)->outstanding_extents += nr_extents;
+	nr_extents = 0;
 
 	if (BTRFS_I(inode)->outstanding_extents >
 	    BTRFS_I(inode)->reserved_extents)
@@ -5239,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	if (dropped > 0)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+	if (btrfs_test_is_dummy_root(root))
+		return;
+
 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
 				      btrfs_ino(inode), to_free, 0);
 	if (root->fs_info->quota_enabled) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c7233ff..d688cfe 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4968,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb)
 
 		/* Should be safe to release our pages at this point */
 		btrfs_release_extent_buffer_page(eb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+		if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
+			__free_extent_buffer(eb);
+			return 1;
+		}
+#endif
 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 		return 1;
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b78bbba..30982bb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	mutex_unlock(&inode->i_mutex);
 
 	/*
-	 * we want to make sure fsync finds this change
-	 * but we haven't joined a transaction running right now.
-	 *
-	 * Later on, someone is sure to update the inode and get the
-	 * real transid recorded.
-	 *
-	 * We set last_trans now to the fs_info generation + 1,
-	 * this will either be one more than the running transaction
-	 * or the generation used for the next transaction if there isn't
-	 * one running right now.
-	 *
 	 * We also have to set last_sub_trans to the current log transid,
 	 * otherwise subsequent syncs to a file that's been synced in this
 	 * transaction will appear to have already occured.
 	 */
-	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0) {
 		err = generic_write_sync(file, pos, num_written);
@@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	atomic_inc(&root->log_batch);
 
 	/*
-	 * check the transaction that last modified this inode
-	 * and see if its already been committed
-	 */
-	if (!BTRFS_I(inode)->last_trans) {
-		mutex_unlock(&inode->i_mutex);
-		goto out;
-	}
-
-	/*
-	 * if the last transaction that changed this file was before
-	 * the current transaction, we can bail out now without any
-	 * syncing
+	 * If the last transaction that changed this file was before the current
+	 * transaction and we have the full sync flag set in our inode, we can
+	 * bail out now without any syncing.
+	 *
+	 * Note that we can't bail out if the full sync flag isn't set. This is
+	 * because when the full sync flag is set we start all ordered extents
+	 * and wait for them to fully complete - when they complete they update
+	 * the inode's last_trans field through:
+	 *
+	 *     btrfs_finish_ordered_io() ->
+	 *         btrfs_update_inode_fallback() ->
+	 *             btrfs_update_inode() ->
+	 *                 btrfs_set_inode_last_trans()
+	 *
+	 * So we are sure that last_trans is up to date and can do this check to
+	 * bail out safely. For the fast path, when the full sync flag is not
+	 * set in our inode, we can not do it because we start only our ordered
+	 * extents and don't wait for them to complete (that is when
+	 * btrfs_finish_ordered_io runs), so here at this point their last_trans
+	 * value might be less than or equals to fs_info->last_trans_committed,
+	 * and setting a speculative last_trans for an inode when a buffered
+	 * write is made (such as fs_info->generation + 1 for example) would not
+	 * be reliable since after setting the value and before fsync is called
+	 * any number of transactions can start and commit (transaction kthread
+	 * commits the current transaction periodically), and a transaction
+	 * commit does not start nor waits for ordered extents to complete.
 	 */
 	smp_mb();
 	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
-	    BTRFS_I(inode)->last_trans <=
-	    root->fs_info->last_trans_committed) {
-		BTRFS_I(inode)->last_trans = 0;
-
+	    (full_sync && BTRFS_I(inode)->last_trans <=
+	     root->fs_info->last_trans_committed)) {
 		/*
 		 * We'v had everything committed since the last time we were
 		 * modified so clear this flag in case it was set for whatever
@@ -2275,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	bool same_page;
 	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 	u64 ino_size;
+	bool truncated_page = false;
+	bool updated_inode = false;
 
 	ret = btrfs_wait_ordered_range(inode, offset, len);
 	if (ret)
@@ -2306,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	 * entire page.
 	 */
 	if (same_page && len < PAGE_CACHE_SIZE) {
-		if (offset < ino_size)
+		if (offset < ino_size) {
+			truncated_page = true;
 			ret = btrfs_truncate_page(inode, offset, len, 0);
+		} else {
+			ret = 0;
+		}
 		goto out_only_mutex;
 	}
 
 	/* zero back part of the first page */
 	if (offset < ino_size) {
+		truncated_page = true;
 		ret = btrfs_truncate_page(inode, offset, 0, 0);
 		if (ret) {
 			mutex_unlock(&inode->i_mutex);
@@ -2348,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		if (!ret) {
 			/* zero the front end of the last page */
 			if (tail_start + tail_len < ino_size) {
+				truncated_page = true;
 				ret = btrfs_truncate_page(inode,
 						tail_start + tail_len, 0, 1);
 				if (ret)
@@ -2357,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	}
 
 	if (lockend < lockstart) {
-		mutex_unlock(&inode->i_mutex);
-		return 0;
+		ret = 0;
+		goto out_only_mutex;
 	}
 
 	while (1) {
@@ -2506,6 +2514,7 @@ out_trans:
 
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	ret = btrfs_update_inode(trans, root, inode);
+	updated_inode = true;
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 out_free:
@@ -2515,6 +2524,22 @@ out:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			     &cached_state, GFP_NOFS);
 out_only_mutex:
+	if (!updated_inode && truncated_page && !ret && !err) {
+		/*
+		 * If we only end up zeroing part of a page, we still need to
+		 * update the inode item, so that all the time fields are
+		 * updated as well as the necessary btrfs inode in memory fields
+		 * for detecting, at fsync time, if the inode isn't yet in the
+		 * log tree or it's there but not up to date.
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+		} else {
+			err = btrfs_update_inode(trans, root, inode);
+			ret = btrfs_end_transaction(trans, root);
+		}
+	}
 	mutex_unlock(&inode->i_mutex);
 	if (ret && !err)
 		err = ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a85c23d..d2e732d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 
 static int btrfs_dirty_inode(struct inode *inode);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode)
+{
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+}
+#endif
+
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode,  struct inode *dir,
 				     const struct qstr *qstr)
@@ -1542,30 +1549,17 @@ static void btrfs_split_extent_hook(struct inode *inode,
 		u64 new_size;
 
 		/*
-		 * We need the largest size of the remaining extent to see if we
-		 * need to add a new outstanding extent.  Think of the following
-		 * case
-		 *
-		 * [MEAX_EXTENT_SIZEx2 - 4k][4k]
-		 *
-		 * The new_size would just be 4k and we'd think we had enough
-		 * outstanding extents for this if we only took one side of the
-		 * split, same goes for the other direction.  We need to see if
-		 * the larger size still is the same amount of extents as the
-		 * original size, because if it is we need to add a new
-		 * outstanding extent.  But if we split up and the larger size
-		 * is less than the original then we are good to go since we've
-		 * already accounted for the extra extent in our original
-		 * accounting.
+		 * See the explanation in btrfs_merge_extent_hook, the same
+		 * applies here, just in reverse.
 		 */
 		new_size = orig->end - split + 1;
-		if ((split - orig->start) > new_size)
-			new_size = split - orig->start;
-
-		num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+		num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
 					BTRFS_MAX_EXTENT_SIZE);
-		if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-			      BTRFS_MAX_EXTENT_SIZE) < num_extents)
+		new_size = split - orig->start;
+		num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+					BTRFS_MAX_EXTENT_SIZE);
+		if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+			      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
 			return;
 	}
 
@@ -1591,8 +1585,10 @@ static void btrfs_merge_extent_hook(struct inode *inode,
 	if (!(other->state & EXTENT_DELALLOC))
 		return;
 
-	old_size = other->end - other->start + 1;
-	new_size = old_size + (new->end - new->start + 1);
+	if (new->start > other->start)
+		new_size = new->end - other->start + 1;
+	else
+		new_size = other->end - new->start + 1;
 
 	/* we're not bigger than the max, unreserve the space and go */
 	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
@@ -1603,13 +1599,32 @@ static void btrfs_merge_extent_hook(struct inode *inode,
 	}
 
 	/*
-	 * If we grew by another max_extent, just return, we want to keep that
-	 * reserved amount.
+	 * We have to add up either side to figure out how many extents were
+	 * accounted for before we merged into one big extent.  If the number of
+	 * extents we accounted for is <= the amount we need for the new range
+	 * then we can return, otherwise drop.  Think of it like this
+	 *
+	 * [ 4k][MAX_SIZE]
+	 *
+	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
+	 * need 2 outstanding extents, on one side we have 1 and the other side
+	 * we have 1 so they are == and we can return.  But in this case
+	 *
+	 * [MAX_SIZE+4k][MAX_SIZE+4k]
+	 *
+	 * Each range on their own accounts for 2 extents, but merged together
+	 * they are only 3 extents worth of accounting, so we need to drop in
+	 * this case.
 	 */
+	old_size = other->end - other->start + 1;
 	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
 				BTRFS_MAX_EXTENT_SIZE);
+	old_size = new->end - new->start + 1;
+	num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+				 BTRFS_MAX_EXTENT_SIZE);
+
 	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-		      BTRFS_MAX_EXTENT_SIZE) > num_extents)
+		      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
 		return;
 
 	spin_lock(&BTRFS_I(inode)->lock);
@@ -1686,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode,
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
+		/* For sanity tests */
+		if (btrfs_test_is_dummy_root(root))
+			return;
+
 		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
 				     root->fs_info->delalloc_batch);
 		spin_lock(&BTRFS_I(inode)->lock);
@@ -1741,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		    root != root->fs_info->tree_root)
 			btrfs_delalloc_release_metadata(inode, len);
 
+		/* For sanity tests. */
+		if (btrfs_test_is_dummy_root(root))
+			return;
+
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
 		    && do_list && !(state->state & EXTENT_NORESERVE))
 			btrfs_free_reserved_data_space(inode, len);
@@ -7213,7 +7236,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	u64 start = iblock << inode->i_blkbits;
 	u64 lockstart, lockend;
 	u64 len = bh_result->b_size;
-	u64 orig_len = len;
+	u64 *outstanding_extents = NULL;
 	int unlock_bits = EXTENT_LOCKED;
 	int ret = 0;
 
@@ -7225,6 +7248,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	lockstart = start;
 	lockend = start + len - 1;
 
+	if (current->journal_info) {
+		/*
+		 * Need to pull our outstanding extents and set journal_info to NULL so
+		 * that anything that needs to check if there's a transction doesn't get
+		 * confused.
+		 */
+		outstanding_extents = current->journal_info;
+		current->journal_info = NULL;
+	}
+
 	/*
 	 * If this errors out it's because we couldn't invalidate pagecache for
 	 * this range and we need to fallback to buffered.
@@ -7285,7 +7318,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 	     em->block_start != EXTENT_MAP_HOLE)) {
 		int type;
-		int ret;
 		u64 block_start, orig_start, orig_block_len, ram_bytes;
 
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -7349,11 +7381,20 @@ unlock:
 		if (start + len > i_size_read(inode))
 			i_size_write(inode, start + len);
 
-		if (len < orig_len) {
+		/*
+		 * If we have an outstanding_extents count still set then we're
+		 * within our reservation, otherwise we need to adjust our inode
+		 * counter appropriately.
+		 */
+		if (*outstanding_extents) {
+			(*outstanding_extents)--;
+		} else {
 			spin_lock(&BTRFS_I(inode)->lock);
 			BTRFS_I(inode)->outstanding_extents++;
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
+
+		current->journal_info = outstanding_extents;
 		btrfs_free_reserved_data_space(inode, len);
 	}
 
@@ -7377,6 +7418,8 @@ unlock:
 unlock_err:
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	if (outstanding_extents)
+		current->journal_info = outstanding_extents;
 	return ret;
 }
 
@@ -8076,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	u64 outstanding_extents = 0;
 	size_t count = 0;
 	int flags = 0;
 	bool wakeup = true;
@@ -8113,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 		ret = btrfs_delalloc_reserve_space(inode, count);
 		if (ret)
 			goto out;
+		outstanding_extents = div64_u64(count +
+						BTRFS_MAX_EXTENT_SIZE - 1,
+						BTRFS_MAX_EXTENT_SIZE);
+
+		/*
+		 * We need to know how many extents we reserved so that we can
+		 * do the accounting properly if we go over the number we
+		 * originally calculated.  Abuse current->journal_info for this.
+		 */
+		current->journal_info = &outstanding_extents;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 				     &BTRFS_I(inode)->runtime_flags)) {
 		inode_dio_done(inode);
@@ -8125,6 +8179,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			iter, offset, btrfs_get_blocks_direct, NULL,
 			btrfs_submit_direct, flags);
 	if (rw & WRITE) {
+		current->journal_info = NULL;
 		if (ret < 0 && ret != -EIOCBQUEUED)
 			btrfs_delalloc_release_space(inode, count);
 		else if (ret >= 0 && (size_t)ret < count)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d49fe8a..74609b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -776,11 +776,11 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
 		return -EPERM;
 	if (isdir) {
-		if (!S_ISDIR(victim->d_inode->i_mode))
+		if (!d_is_dir(victim))
 			return -ENOTDIR;
 		if (IS_ROOT(victim))
 			return -EBUSY;
-	} else if (S_ISDIR(victim->d_inode->i_mode))
+	} else if (d_is_dir(victim))
 		return -EISDIR;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 534544e..157cc54 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -452,9 +452,7 @@ void btrfs_get_logged_extents(struct inode *inode,
 			continue;
 		if (entry_end(ordered) <= start)
 			break;
-		if (!list_empty(&ordered->log_list))
-			continue;
-		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+		if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
 			continue;
 		list_add(&ordered->log_list, logged_list);
 		atomic_inc(&ordered->refs);
@@ -511,8 +509,7 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
 
-		if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
-			list_add_tail(&ordered->trans_list, &trans->ordered);
+		list_add_tail(&ordered->trans_list, &trans->ordered);
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 97159a8..058c79e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
 	if (oper1->seq < oper2->seq)
 		return -1;
 	if (oper1->seq > oper2->seq)
-		return -1;
+		return 1;
 	if (oper1->ref_root < oper2->ref_root)
 		return -1;
 	if (oper1->ref_root > oper2->ref_root)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fe58572..d6033f5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -230,6 +230,7 @@ struct pending_dir_move {
 	u64 parent_ino;
 	u64 ino;
 	u64 gen;
+	bool is_orphan;
 	struct list_head update_refs;
 };
 
@@ -2984,7 +2985,8 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 				u64 ino_gen,
 				u64 parent_ino,
 				struct list_head *new_refs,
-				struct list_head *deleted_refs)
+				struct list_head *deleted_refs,
+				const bool is_orphan)
 {
 	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
 	struct rb_node *parent = NULL;
@@ -2999,6 +3001,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 	pm->parent_ino = parent_ino;
 	pm->ino = ino;
 	pm->gen = ino_gen;
+	pm->is_orphan = is_orphan;
 	INIT_LIST_HEAD(&pm->list);
 	INIT_LIST_HEAD(&pm->update_refs);
 	RB_CLEAR_NODE(&pm->node);
@@ -3131,16 +3134,20 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 	rmdir_ino = dm->rmdir_ino;
 	free_waiting_dir_move(sctx, dm);
 
-	ret = get_first_ref(sctx->parent_root, pm->ino,
-			    &parent_ino, &parent_gen, name);
-	if (ret < 0)
-		goto out;
-
-	ret = get_cur_path(sctx, parent_ino, parent_gen,
-			   from_path);
-	if (ret < 0)
-		goto out;
-	ret = fs_path_add_path(from_path, name);
+	if (pm->is_orphan) {
+		ret = gen_unique_name(sctx, pm->ino,
+				      pm->gen, from_path);
+	} else {
+		ret = get_first_ref(sctx->parent_root, pm->ino,
+				    &parent_ino, &parent_gen, name);
+		if (ret < 0)
+			goto out;
+		ret = get_cur_path(sctx, parent_ino, parent_gen,
+				   from_path);
+		if (ret < 0)
+			goto out;
+		ret = fs_path_add_path(from_path, name);
+	}
 	if (ret < 0)
 		goto out;
 
@@ -3150,7 +3157,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 		LIST_HEAD(deleted_refs);
 		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
 		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
-					   &pm->update_refs, &deleted_refs);
+					   &pm->update_refs, &deleted_refs,
+					   pm->is_orphan);
 		if (ret < 0)
 			goto out;
 		if (rmdir_ino) {
@@ -3283,6 +3291,127 @@ out:
 	return ret;
 }
 
+/*
+ * We might need to delay a directory rename even when no ancestor directory
+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
+ * renamed. This happens when we rename a directory to the old name (the name
+ * in the parent root) of some other unrelated directory that got its rename
+ * delayed due to some ancestor with higher number that got renamed.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ * .                                       (ino 256)
+ * |---- a/                                (ino 257)
+ * |     |---- file                        (ino 260)
+ * |
+ * |---- b/                                (ino 258)
+ * |---- c/                                (ino 259)
+ *
+ * Send snapshot:
+ * .                                       (ino 256)
+ * |---- a/                                (ino 258)
+ * |---- x/                                (ino 259)
+ *       |---- y/                          (ino 257)
+ *             |----- file                 (ino 260)
+ *
+ * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
+ * must issue is:
+ *
+ * 1 - rename 259 from 'c' to 'x'
+ * 2 - rename 257 from 'a' to 'x/y'
+ * 3 - rename 258 from 'b' to 'a'
+ *
+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
+ * be done right away and < 0 on error.
+ */
+static int wait_for_dest_dir_move(struct send_ctx *sctx,
+				  struct recorded_ref *parent_ref,
+				  const bool is_orphan)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key di_key;
+	struct btrfs_dir_item *di;
+	u64 left_gen;
+	u64 right_gen;
+	int ret = 0;
+
+	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
+		return 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = parent_ref->dir;
+	key.type = BTRFS_DIR_ITEM_KEY;
+	key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
+
+	ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+
+	di = btrfs_match_dir_item_name(sctx->parent_root, path,
+				       parent_ref->name, parent_ref->name_len);
+	if (!di) {
+		ret = 0;
+		goto out;
+	}
+	/*
+	 * di_key.objectid has the number of the inode that has a dentry in the
+	 * parent directory with the same name that sctx->cur_ino is being
+	 * renamed to. We need to check if that inode is in the send root as
+	 * well and if it is currently marked as an inode with a pending rename,
+	 * if it is, we need to delay the rename of sctx->cur_ino as well, so
+	 * that it happens after that other inode is renamed.
+	 */
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
+	if (di_key.type != BTRFS_INODE_ITEM_KEY) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
+			     &left_gen, NULL, NULL, NULL, NULL);
+	if (ret < 0)
+		goto out;
+	ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
+			     &right_gen, NULL, NULL, NULL, NULL);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+	/* Different inode, no need to delay the rename of sctx->cur_ino */
+	if (right_gen != left_gen) {
+		ret = 0;
+		goto out;
+	}
+
+	if (is_waiting_for_move(sctx, di_key.objectid)) {
+		ret = add_pending_dir_move(sctx,
+					   sctx->cur_ino,
+					   sctx->cur_inode_gen,
+					   di_key.objectid,
+					   &sctx->new_refs,
+					   &sctx->deleted_refs,
+					   is_orphan);
+		if (!ret)
+			ret = 1;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static int wait_for_parent_move(struct send_ctx *sctx,
 				struct recorded_ref *parent_ref)
 {
@@ -3349,7 +3478,8 @@ out:
 					   sctx->cur_inode_gen,
 					   ino,
 					   &sctx->new_refs,
-					   &sctx->deleted_refs);
+					   &sctx->deleted_refs,
+					   false);
 		if (!ret)
 			ret = 1;
 	}
@@ -3372,6 +3502,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 	int did_overwrite = 0;
 	int is_orphan = 0;
 	u64 last_dir_ino_rm = 0;
+	bool can_rename = true;
 
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
@@ -3490,12 +3621,22 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			}
 		}
 
+		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
+			ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
+			if (ret < 0)
+				goto out;
+			if (ret == 1) {
+				can_rename = false;
+				*pending_move = 1;
+			}
+		}
+
 		/*
 		 * link/move the ref to the new place. If we have an orphan
 		 * inode, move it and update valid_path. If not, link or move
 		 * it depending on the inode mode.
 		 */
-		if (is_orphan) {
+		if (is_orphan && can_rename) {
 			ret = send_rename(sctx, valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
@@ -3503,7 +3644,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			ret = fs_path_copy(valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
-		} else {
+		} else if (can_rename) {
 			if (S_ISDIR(sctx->cur_inode_mode)) {
 				/*
 				 * Dirs can't be linked, so move it. For moved
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index a116b55..054fc0d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -911,6 +911,197 @@ out:
 	return ret;
 }
 
+static int test_extent_accounting(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	int ret = -ENOMEM;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	BTRFS_I(inode)->root = root;
+	btrfs_test_inode_set_ops(inode);
+
+	/* [BTRFS_MAX_EXTENT_SIZE] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
+					NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 1) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 1, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE][4k] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+					BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 2) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 2, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+			       BTRFS_MAX_EXTENT_SIZE >> 1,
+			       (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+			       EXTENT_DELALLOC | EXTENT_DIRTY |
+			       EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
+			       NULL, GFP_NOFS);
+	if (ret) {
+		test_msg("clear_extent_bit returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 2) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 2, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE][4K] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+					(BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+					NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 2) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 2, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/*
+	 * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+	 *
+	 * I'm artificially adding 2 to outstanding_extents because in the
+	 * buffered IO case we'd add things up as we go, but I don't feel like
+	 * doing that here, this isn't the interesting case we want to test.
+	 */
+	BTRFS_I(inode)->outstanding_extents += 2;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
+					(BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
+					NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 4) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 4, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+					BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 3) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 3, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
+	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+			       BTRFS_MAX_EXTENT_SIZE+4096,
+			       BTRFS_MAX_EXTENT_SIZE+8191,
+			       EXTENT_DIRTY | EXTENT_DELALLOC |
+			       EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+			       NULL, GFP_NOFS);
+	if (ret) {
+		test_msg("clear_extent_bit returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 4) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 4, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/*
+	 * Refill the hole again just for good measure, because I thought it
+	 * might fail and I'd rather satisfy my paranoia at this point.
+	 */
+	BTRFS_I(inode)->outstanding_extents++;
+	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+					BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+	if (ret) {
+		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents != 3) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 3, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+
+	/* Empty */
+	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+			       EXTENT_DIRTY | EXTENT_DELALLOC |
+			       EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+			       NULL, GFP_NOFS);
+	if (ret) {
+		test_msg("clear_extent_bit returned %d\n", ret);
+		goto out;
+	}
+	if (BTRFS_I(inode)->outstanding_extents) {
+		ret = -EINVAL;
+		test_msg("Miscount, wanted 0, got %u\n",
+			 BTRFS_I(inode)->outstanding_extents);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (ret)
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+				 NULL, GFP_NOFS);
+	iput(inode);
+	btrfs_free_dummy_root(root);
+	return ret;
+}
+
 int btrfs_test_inodes(void)
 {
 	int ret;
@@ -924,5 +1115,9 @@ int btrfs_test_inodes(void)
 	if (ret)
 		return ret;
 	test_msg("Running hole first btrfs_get_extent test\n");
-	return test_hole_first();
+	ret = test_hole_first();
+	if (ret)
+		return ret;
+	test_msg("Running outstanding_extents tests\n");
+	return test_extent_accounting();
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7e80f32..8be4278 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1023,17 +1023,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	u64 old_root_bytenr;
 	u64 old_root_used;
 	struct btrfs_root *tree_root = root->fs_info->tree_root;
-	bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
 
 	old_root_used = btrfs_root_used(&root->root_item);
-	btrfs_write_dirty_block_groups(trans, root);
 
 	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start &&
-		    old_root_used == btrfs_root_used(&root->root_item) &&
-		    (!extent_root ||
-		     list_empty(&trans->transaction->dirty_bgs)))
+		    old_root_used == btrfs_root_used(&root->root_item))
 			break;
 
 		btrfs_set_root_node(&root->root_item, root->node);
@@ -1044,17 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 			return ret;
 
 		old_root_used = btrfs_root_used(&root->root_item);
-		if (extent_root) {
-			ret = btrfs_write_dirty_block_groups(trans, root);
-			if (ret)
-				return ret;
-		}
-		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-		if (ret)
-			return ret;
-		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-		if (ret)
-			return ret;
 	}
 
 	return 0;
@@ -1071,6 +1056,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
 	struct list_head *next;
 	struct extent_buffer *eb;
 	int ret;
@@ -1098,11 +1084,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
+	ret = btrfs_setup_space_cache(trans, root);
+	if (ret)
+		return ret;
+
 	/* run_qgroups might have added some more refs */
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 	if (ret)
 		return ret;
-
+again:
 	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
 		list_del_init(next);
@@ -1115,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 		ret = update_cowonly_root(trans, root);
 		if (ret)
 			return ret;
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		if (ret)
+			return ret;
 	}
 
+	while (!list_empty(dirty_bgs)) {
+		ret = btrfs_write_dirty_block_groups(trans, root);
+		if (ret)
+			return ret;
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		if (ret)
+			return ret;
+	}
+
+	if (!list_empty(&fs_info->dirty_cowonly_roots))
+		goto again;
+
 	list_add_tail(&fs_info->extent_root->dirty_list,
 		      &trans->transaction->switch_commits);
 	btrfs_after_dev_replace_commit(fs_info);
@@ -1814,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		wait_for_commit(root, cur_trans);
 
+		if (unlikely(cur_trans->aborted))
+			ret = cur_trans->aborted;
+
 		btrfs_put_transaction(cur_trans);
 
 		return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9a37f8b..c5b8ba3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1012,7 +1012,7 @@ again:
 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
 
 		while (cur_offset < item_size) {
-			extref = (struct btrfs_inode_extref *)base + cur_offset;
+			extref = (struct btrfs_inode_extref *)(base + cur_offset);
 
 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cd4d131..8222f6f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4903,10 +4903,17 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
 {
 	struct btrfs_bio *bbio = kzalloc(
+		 /* the size of the btrfs_bio */
 		sizeof(struct btrfs_bio) +
+		/* plus the variable array for the stripes */
 		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+		/* plus the variable array for the tgt dev */
 		sizeof(int) * (real_stripes) +
-		sizeof(u64) * (real_stripes),
+		/*
+		 * plus the raid_map, which includes both the tgt dev
+		 * and the stripes
+		 */
+		sizeof(u64) * (total_stripes),
 		GFP_NOFS);
 	if (!bbio)
 		return NULL;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 47b1946..883b936 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -111,6 +111,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 					name, name_len, -1);
 		if (!di && (flags & XATTR_REPLACE))
 			ret = -ENODATA;
+		else if (IS_ERR(di))
+			ret = PTR_ERR(di);
 		else if (di)
 			ret = btrfs_delete_one_dir_name(trans, root, path, di);
 		goto out;
@@ -127,10 +129,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 		ASSERT(mutex_is_locked(&inode->i_mutex));
 		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
 					name, name_len, 0);
-		if (!di) {
+		if (!di)
 			ret = -ENODATA;
+		else if (IS_ERR(di))
+			ret = PTR_ERR(di);
+		if (ret)
 			goto out;
-		}
 		btrfs_release_path(path);
 		di = NULL;
 	}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index ce1b115..f601def 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -574,7 +574,7 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 	/* extract the directory dentry from the cwd */
 	get_fs_pwd(current->fs, &path);
 
-	if (!S_ISDIR(path.dentry->d_inode->i_mode))
+	if (!d_can_lookup(path.dentry))
 		goto notdir;
 
 	cachefiles_begin_secure(cache, &saved_cred);
@@ -646,7 +646,7 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 	/* extract the directory dentry from the cwd */
 	get_fs_pwd(current->fs, &path);
 
-	if (!S_ISDIR(path.dentry->d_inode->i_mode))
+	if (!d_can_lookup(path.dentry))
 		goto notdir;
 
 	cachefiles_begin_secure(cache, &saved_cred);
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1c7293c..2324262 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -437,7 +437,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 	if (!object->backer)
 		return -ENOBUFS;
 
-	ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+	ASSERT(d_is_reg(object->backer));
 
 	fscache_set_store_limit(&object->fscache, ni_size);
 
@@ -501,7 +501,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op)
 	       op->object->debug_id, (unsigned long long)ni_size);
 
 	if (object->backer) {
-		ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+		ASSERT(d_is_reg(object->backer));
 
 		fscache_set_store_limit(&object->fscache, ni_size);
 
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7f8e83f..1e51714e 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -277,7 +277,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 	_debug("remove %p from %p", rep, dir);
 
 	/* non-directories can just be unlinked */
-	if (!S_ISDIR(rep->d_inode->i_mode)) {
+	if (!d_is_dir(rep)) {
 		_debug("unlink stale object");
 
 		path.mnt = cache->mnt;
@@ -323,7 +323,7 @@ try_again:
 		return 0;
 	}
 
-	if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
+	if (!d_can_lookup(cache->graveyard)) {
 		unlock_rename(cache->graveyard, dir);
 		cachefiles_io_error(cache, "Graveyard no longer a directory");
 		return -EIO;
@@ -475,7 +475,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
 	ASSERT(parent->dentry);
 	ASSERT(parent->dentry->d_inode);
 
-	if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
+	if (!(d_is_dir(parent->dentry))) {
 		// TODO: convert file to dir
 		_leave("looking up in none directory");
 		return -ENOBUFS;
@@ -539,7 +539,7 @@ lookup_again:
 			_debug("mkdir -> %p{%p{ino=%lu}}",
 			       next, next->d_inode, next->d_inode->i_ino);
 
-		} else if (!S_ISDIR(next->d_inode->i_mode)) {
+		} else if (!d_can_lookup(next)) {
 			pr_err("inode %lu is not a directory\n",
 			       next->d_inode->i_ino);
 			ret = -ENOBUFS;
@@ -568,8 +568,8 @@ lookup_again:
 			_debug("create -> %p{%p{ino=%lu}}",
 			       next, next->d_inode, next->d_inode->i_ino);
 
-		} else if (!S_ISDIR(next->d_inode->i_mode) &&
-			   !S_ISREG(next->d_inode->i_mode)
+		} else if (!d_can_lookup(next) &&
+			   !d_is_reg(next)
 			   ) {
 			pr_err("inode %lu is not a file or directory\n",
 			       next->d_inode->i_ino);
@@ -642,7 +642,7 @@ lookup_again:
 
 	/* open a file interface onto a data file */
 	if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
-		if (S_ISREG(object->dentry->d_inode->i_mode)) {
+		if (d_is_reg(object->dentry)) {
 			const struct address_space_operations *aops;
 
 			ret = -EPERM;
@@ -763,7 +763,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	/* we need to make sure the subdir is a directory */
 	ASSERT(subdir->d_inode);
 
-	if (!S_ISDIR(subdir->d_inode->i_mode)) {
+	if (!d_can_lookup(subdir)) {
 		pr_err("%s is not a directory\n", dirname);
 		ret = -EIO;
 		goto check_error;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 616db0e7..c6cd8d7 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -900,7 +900,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 		return -ENOBUFS;
 	}
 
-	ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+	ASSERT(d_is_reg(object->backer));
 
 	cache = container_of(object->fscache.cache,
 			     struct cachefiles_cache, cache);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0411dbb..83e9976 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -904,7 +904,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
 		dout("unlink/rmdir dir %p dn %p inode %p\n",
 		     dir, dentry, inode);
-		op = S_ISDIR(dentry->d_inode->i_mode) ?
+		op = d_is_dir(dentry) ?
 			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
 	} else
 		goto out;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a3d774b..d533075 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -292,7 +292,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	}
 	if (err)
 		goto out_req;
-	if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
+	if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) {
 		/* make vfs retry on splice, ENOENT, or symlink */
 		dout("atomic_open finish_no_open on dn %p\n", dn);
 		err = finish_no_open(file, dn);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 281ee01..60cb88c 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -304,7 +304,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 			     (const char *) old_name, (const char *)new_name);
 	if (!error) {
 		if (new_dentry->d_inode) {
-			if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+			if (d_is_dir(new_dentry)) {
 				coda_dir_drop_nlink(old_dir);
 				coda_dir_inc_nlink(new_dir);
 			}
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index a315677..b65d1ef 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -69,14 +69,13 @@ extern struct kmem_cache *configfs_dir_cachep;
 extern int configfs_is_root(struct config_item *item);
 
 extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
-extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
+extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
 				struct dentry *, void *, umode_t, int);
 extern int configfs_dirent_is_ready(struct configfs_dirent *);
 
-extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
 extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
 
 extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index c9c298b..cf0db00 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -240,60 +240,26 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
 	return 0;
 }
 
-static int init_dir(struct inode * inode)
+static void init_dir(struct inode * inode)
 {
 	inode->i_op = &configfs_dir_inode_operations;
 	inode->i_fop = &configfs_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
 	inc_nlink(inode);
-	return 0;
 }
 
-static int configfs_init_file(struct inode * inode)
+static void configfs_init_file(struct inode * inode)
 {
 	inode->i_size = PAGE_SIZE;
 	inode->i_fop = &configfs_file_operations;
-	return 0;
 }
 
-static int init_symlink(struct inode * inode)
+static void init_symlink(struct inode * inode)
 {
 	inode->i_op = &configfs_symlink_inode_operations;
-	return 0;
-}
-
-static int create_dir(struct config_item *k, struct dentry *d)
-{
-	int error;
-	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
-	struct dentry *p = d->d_parent;
-
-	BUG_ON(!k);
-
-	error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
-	if (!error)
-		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
-					     CONFIGFS_DIR | CONFIGFS_USET_CREATING);
-	if (!error) {
-		configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
-		error = configfs_create(d, mode, init_dir);
-		if (!error) {
-			inc_nlink(p->d_inode);
-		} else {
-			struct configfs_dirent *sd = d->d_fsdata;
-			if (sd) {
-				spin_lock(&configfs_dirent_lock);
-				list_del_init(&sd->s_sibling);
-				spin_unlock(&configfs_dirent_lock);
-				configfs_put(sd);
-			}
-		}
-	}
-	return error;
 }
 
-
 /**
  *	configfs_create_dir - create a directory for an config_item.
  *	@item:		config_itemwe're creating directory for.
@@ -303,11 +269,37 @@ static int create_dir(struct config_item *k, struct dentry *d)
  *	until it is validated by configfs_dir_set_ready()
  */
 
-static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
 {
-	int error = create_dir(item, dentry);
-	if (!error)
+	int error;
+	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+	struct dentry *p = dentry->d_parent;
+
+	BUG_ON(!item);
+
+	error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name);
+	if (unlikely(error))
+		return error;
+
+	error = configfs_make_dirent(p->d_fsdata, dentry, item, mode,
+				     CONFIGFS_DIR | CONFIGFS_USET_CREATING);
+	if (unlikely(error))
+		return error;
+
+	configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata);
+	error = configfs_create(dentry, mode, init_dir);
+	if (!error) {
+		inc_nlink(p->d_inode);
 		item->ci_dentry = dentry;
+	} else {
+		struct configfs_dirent *sd = dentry->d_fsdata;
+		if (sd) {
+			spin_lock(&configfs_dirent_lock);
+			list_del_init(&sd->s_sibling);
+			spin_unlock(&configfs_dirent_lock);
+			configfs_put(sd);
+		}
+	}
 	return error;
 }
 
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 1d1c41f..56d2cdc 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -313,21 +313,6 @@ const struct file_operations configfs_file_operations = {
 	.release	= configfs_release,
 };
 
-
-int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
-{
-	struct configfs_dirent * parent_sd = dir->d_fsdata;
-	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
-	int error = 0;
-
-	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
-	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
-	mutex_unlock(&dir->d_inode->i_mutex);
-
-	return error;
-}
-
-
 /**
  *	configfs_create_file - create an attribute file for an item.
  *	@item:	item we're creating for.
@@ -336,9 +321,16 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
 
 int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
 {
-	BUG_ON(!item || !item->ci_dentry || !attr);
+	struct dentry *dir = item->ci_dentry;
+	struct configfs_dirent *parent_sd = dir->d_fsdata;
+	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+	int error = 0;
 
-	return configfs_add_file(item->ci_dentry, attr,
-				 CONFIGFS_ITEM_ATTR);
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
+	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
+				     CONFIGFS_ITEM_ATTR);
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+	return error;
 }
 
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 65af861..5423a6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -176,7 +176,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
 
 #endif /* CONFIG_LOCKDEP */
 
-int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *))
+int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct inode *))
 {
 	int error = 0;
 	struct inode *inode = NULL;
@@ -198,13 +198,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct ino
 	p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
 	configfs_set_inode_lock_class(sd, inode);
 
-	if (init) {
-		error = init(inode);
-		if (error) {
-			iput(inode);
-			return error;
-		}
-	}
+	init(inode);
 	d_instantiate(dentry, inode);
 	if (S_ISDIR(mode) || S_ISLNK(mode))
 		dget(dentry);  /* pin link and directory dentries in core */
@@ -242,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 
 	if (dentry) {
 		spin_lock(&dentry->d_lock);
-		if (!(d_unhashed(dentry) && dentry->d_inode)) {
+		if (!d_unhashed(dentry) && dentry->d_inode) {
 			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
diff --git a/fs/coredump.c b/fs/coredump.c
index b5c86ff..f319926 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -572,7 +572,7 @@ void do_coredump(const siginfo_t *siginfo)
 			 *
 			 * Normally core limits are irrelevant to pipes, since
 			 * we're not writing to the file system, but we use
-			 * cprm.limit of 1 here as a speacial value, this is a
+			 * cprm.limit of 1 here as a special value, this is a
 			 * consistent way to catch recursive crashes.
 			 * We can still crash if the core_pattern binary sets
 			 * RLIM_CORE = !1, but it runs as root, and can do
diff --git a/fs/dcache.c b/fs/dcache.c
index dc400fd..c71e373 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1659,9 +1659,25 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 }
 EXPORT_SYMBOL(d_set_d_op);
 
+
+/*
+ * d_set_fallthru - Mark a dentry as falling through to a lower layer
+ * @dentry - The dentry to mark
+ *
+ * Mark a dentry as falling through to the lower layer (as set with
+ * d_pin_lower()).  This flag may be recorded on the medium.
+ */
+void d_set_fallthru(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_FALLTHRU;
+	spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_set_fallthru);
+
 static unsigned d_flags_for_inode(struct inode *inode)
 {
-	unsigned add_flags = DCACHE_FILE_TYPE;
+	unsigned add_flags = DCACHE_REGULAR_TYPE;
 
 	if (!inode)
 		return DCACHE_MISS_TYPE;
@@ -1674,13 +1690,21 @@ static unsigned d_flags_for_inode(struct inode *inode)
 			else
 				inode->i_opflags |= IOP_LOOKUP;
 		}
-	} else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
-		if (unlikely(inode->i_op->follow_link))
+		goto type_determined;
+	}
+
+	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+		if (unlikely(inode->i_op->follow_link)) {
 			add_flags = DCACHE_SYMLINK_TYPE;
-		else
-			inode->i_opflags |= IOP_NOFOLLOW;
+			goto type_determined;
+		}
+		inode->i_opflags |= IOP_NOFOLLOW;
 	}
 
+	if (unlikely(!S_ISREG(inode->i_mode)))
+		add_flags = DCACHE_SPECIAL_TYPE;
+
+type_determined:
 	if (unlikely(IS_AUTOMOUNT(inode)))
 		add_flags |= DCACHE_NEED_AUTOMOUNT;
 	return add_flags;
@@ -1691,7 +1715,8 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	unsigned add_flags = d_flags_for_inode(inode);
 
 	spin_lock(&dentry->d_lock);
-	__d_set_type(dentry, add_flags);
+	dentry->d_flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+	dentry->d_flags |= add_flags;
 	if (inode)
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
 	dentry->d_inode = inode;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 45b18a5..96400ab 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -169,10 +169,19 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
+static void debugfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	if (S_ISLNK(inode->i_mode))
+		kfree(inode->i_private);
+}
+
 static const struct super_operations debugfs_super_operations = {
 	.statfs		= simple_statfs,
 	.remount_fs	= debugfs_remount,
 	.show_options	= debugfs_show_options,
+	.evict_inode	= debugfs_evict_inode,
 };
 
 static struct vfsmount *debugfs_automount(struct path *path)
@@ -511,23 +520,14 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 	int ret = 0;
 
 	if (debugfs_positive(dentry)) {
-		if (dentry->d_inode) {
-			dget(dentry);
-			switch (dentry->d_inode->i_mode & S_IFMT) {
-			case S_IFDIR:
-				ret = simple_rmdir(parent->d_inode, dentry);
-				break;
-			case S_IFLNK:
-				kfree(dentry->d_inode->i_private);
-				/* fall through */
-			default:
-				simple_unlink(parent->d_inode, dentry);
-				break;
-			}
-			if (!ret)
-				d_delete(dentry);
-			dput(dentry);
-		}
+		dget(dentry);
+		if (S_ISDIR(dentry->d_inode->i_mode))
+			ret = simple_rmdir(parent->d_inode, dentry);
+		else
+			simple_unlink(parent->d_inode, dentry);
+		if (!ret)
+			d_delete(dentry);
+		dput(dentry);
 	}
 	return ret;
 }
@@ -690,7 +690,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
 	}
 	d_move(old_dentry, dentry);
 	fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
-		S_ISDIR(old_dentry->d_inode->i_mode),
+		d_is_dir(old_dentry),
 		NULL, old_dentry);
 	fsnotify_oldname_free(old_name);
 	unlock_rename(new_dir, old_dir);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 90d1882..5ba029e 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -124,7 +124,7 @@ ecryptfs_get_key_payload_data(struct key *key)
 }
 
 #define ECRYPTFS_MAX_KEYSET_SIZE 1024
-#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
+#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31
 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64
 #define ECRYPTFS_MAX_IV_BYTES 16	/* 128 bits */
 #define ECRYPTFS_SALT_BYTES 2
@@ -237,7 +237,7 @@ struct ecryptfs_crypt_stat {
 	struct crypto_ablkcipher *tfm;
 	struct crypto_hash *hash_tfm; /* Crypto context for generating
 				       * the initialization vectors */
-	unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+	unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
 	unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
 	unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
 	struct list_head keysig_list;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 6f4e659..fd39bad 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -230,7 +230,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 	}
 	ecryptfs_set_file_lower(
 		file, ecryptfs_inode_to_private(inode)->lower_file);
-	if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+	if (d_is_dir(ecryptfs_dentry)) {
 		ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
 		mutex_lock(&crypt_stat->cs_mutex);
 		crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
@@ -303,9 +303,22 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct file *lower_file = ecryptfs_file_to_lower(file);
 	long rc = -ENOTTY;
 
-	if (lower_file->f_op->unlocked_ioctl)
+	if (!lower_file->f_op->unlocked_ioctl)
+		return rc;
+
+	switch (cmd) {
+	case FITRIM:
+	case FS_IOC_GETFLAGS:
+	case FS_IOC_SETFLAGS:
+	case FS_IOC_GETVERSION:
+	case FS_IOC_SETVERSION:
 		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
-	return rc;
+		fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+		return rc;
+	default:
+		return rc;
+	}
 }
 
 #ifdef CONFIG_COMPAT
@@ -315,9 +328,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct file *lower_file = ecryptfs_file_to_lower(file);
 	long rc = -ENOIOCTLCMD;
 
-	if (lower_file->f_op->compat_ioctl)
+	if (!lower_file->f_op->compat_ioctl)
+		return rc;
+
+	switch (cmd) {
+	case FITRIM:
+	case FS_IOC32_GETFLAGS:
+	case FS_IOC32_SETFLAGS:
+	case FS_IOC32_GETVERSION:
+	case FS_IOC32_SETVERSION:
 		rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
-	return rc;
+		fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+		return rc;
+	default:
+		return rc;
+	}
 }
 #endif
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 34b36a5..b08b518 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -907,9 +907,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 	lower_inode = ecryptfs_inode_to_lower(inode);
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	mutex_lock(&crypt_stat->cs_mutex);
-	if (S_ISDIR(dentry->d_inode->i_mode))
+	if (d_is_dir(dentry))
 		crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
-	else if (S_ISREG(dentry->d_inode->i_mode)
+	else if (d_is_reg(dentry)
 		 && (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)
 		     || !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) {
 		struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 917bd5c..6bd67e2 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -891,7 +891,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
 	struct blkcipher_desc desc;
 	char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
 	char iv[ECRYPTFS_MAX_IV_BYTES];
-	char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+	char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
 };
 
 /**
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1895d60..c095d32 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -407,7 +407,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
 	if (!cipher_name_set) {
 		int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
 
-		BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+		BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE);
 		strcpy(mount_crypt_stat->global_default_cipher_name,
 		       ECRYPTFS_DEFAULT_CIPHER);
 	}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index fdfd206..714cd37 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -429,7 +429,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	if (IS_ERR(result))
 		return result;
 
-	if (S_ISDIR(result->d_inode->i_mode)) {
+	if (d_is_dir(result)) {
 		/*
 		 * This request is for a directory.
 		 *
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 982d934..f63c3d5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -364,7 +364,8 @@ struct flex_groups {
 #define EXT4_DIRTY_FL			0x00000100
 #define EXT4_COMPRBLK_FL		0x00000200 /* One or more compressed clusters */
 #define EXT4_NOCOMPR_FL			0x00000400 /* Don't compress */
-#define EXT4_ECOMPR_FL			0x00000800 /* Compression error */
+	/* nb: was previously EXT2_ECOMPR_FL */
+#define EXT4_ENCRYPT_FL			0x00000800 /* encrypted file */
 /* End compression flags --- maybe not all used */
 #define EXT4_INDEX_FL			0x00001000 /* hash-indexed directory */
 #define EXT4_IMAGIC_FL			0x00002000 /* AFS directory */
@@ -421,7 +422,7 @@ enum {
 	EXT4_INODE_DIRTY	= 8,
 	EXT4_INODE_COMPRBLK	= 9,	/* One or more compressed clusters */
 	EXT4_INODE_NOCOMPR	= 10,	/* Don't compress */
-	EXT4_INODE_ECOMPR	= 11,	/* Compression error */
+	EXT4_INODE_ENCRYPT	= 11,	/* Compression error */
 /* End compression flags --- maybe not all used */
 	EXT4_INODE_INDEX	= 12,	/* hash-indexed directory */
 	EXT4_INODE_IMAGIC	= 13,	/* AFS directory */
@@ -466,7 +467,7 @@ static inline void ext4_check_flag_values(void)
 	CHECK_FLAG_VALUE(DIRTY);
 	CHECK_FLAG_VALUE(COMPRBLK);
 	CHECK_FLAG_VALUE(NOCOMPR);
-	CHECK_FLAG_VALUE(ECOMPR);
+	CHECK_FLAG_VALUE(ENCRYPT);
 	CHECK_FLAG_VALUE(INDEX);
 	CHECK_FLAG_VALUE(IMAGIC);
 	CHECK_FLAG_VALUE(JOURNAL_DATA);
@@ -1048,6 +1049,12 @@ extern void ext4_set_bits(void *bm, int cur, int len);
 /* Metadata checksum algorithm codes */
 #define EXT4_CRC32C_CHKSUM		1
 
+/* Encryption algorithms */
+#define EXT4_ENCRYPTION_MODE_INVALID		0
+#define EXT4_ENCRYPTION_MODE_AES_256_XTS	1
+#define EXT4_ENCRYPTION_MODE_AES_256_GCM	2
+#define EXT4_ENCRYPTION_MODE_AES_256_CBC	3
+
 /*
  * Structure of the super block
  */
@@ -1161,7 +1168,8 @@ struct ext4_super_block {
 	__le32	s_grp_quota_inum;	/* inode for tracking group quota */
 	__le32	s_overhead_clusters;	/* overhead blocks/clusters in fs */
 	__le32	s_backup_bgs[2];	/* groups with sparse_super2 SBs */
-	__le32	s_reserved[106];	/* Padding to the end of the block */
+	__u8	s_encrypt_algos[4];	/* Encryption algorithms in use  */
+	__le32	s_reserved[105];	/* Padding to the end of the block */
 	__le32	s_checksum;		/* crc32c(superblock) */
 };
 
@@ -1527,6 +1535,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
  * GDT_CSUM bits are mutually exclusive.
  */
 #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
+#define EXT4_FEATURE_RO_COMPAT_READONLY		0x1000
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1542,6 +1551,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
 #define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_ENCRYPT		0x10000
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 6b9878a..45fe924 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1401,10 +1401,7 @@ end_range:
 				 * to free. Everything was covered by the start
 				 * of the range.
 				 */
-				return 0;
-			} else {
-				/* Shared branch grows from an indirect block */
-				partial2--;
+				goto do_indirects;
 			}
 		} else {
 			/*
@@ -1435,56 +1432,96 @@ end_range:
 	/* Punch happened within the same level (n == n2) */
 	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
 	partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
-	/*
-	 * ext4_find_shared returns Indirect structure which
-	 * points to the last element which should not be
-	 * removed by truncate. But this is end of the range
-	 * in punch_hole so we need to point to the next element
-	 */
-	partial2->p++;
-	while ((partial > chain) || (partial2 > chain2)) {
-		/* We're at the same block, so we're almost finished */
-		if ((partial->bh && partial2->bh) &&
-		    (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
-			if ((partial > chain) && (partial2 > chain2)) {
+
+	/* Free top, but only if partial2 isn't its subtree. */
+	if (nr) {
+		int level = min(partial - chain, partial2 - chain2);
+		int i;
+		int subtree = 1;
+
+		for (i = 0; i <= level; i++) {
+			if (offsets[i] != offsets2[i]) {
+				subtree = 0;
+				break;
+			}
+		}
+
+		if (!subtree) {
+			if (partial == chain) {
+				/* Shared branch grows from the inode */
+				ext4_free_branches(handle, inode, NULL,
+						   &nr, &nr+1,
+						   (chain+n-1) - partial);
+				*partial->p = 0;
+			} else {
+				/* Shared branch grows from an indirect block */
+				BUFFER_TRACE(partial->bh, "get_write_access");
 				ext4_free_branches(handle, inode, partial->bh,
-						   partial->p + 1,
-						   partial2->p,
+						   partial->p,
+						   partial->p+1,
 						   (chain+n-1) - partial);
-				BUFFER_TRACE(partial->bh, "call brelse");
-				brelse(partial->bh);
-				BUFFER_TRACE(partial2->bh, "call brelse");
-				brelse(partial2->bh);
 			}
-			return 0;
 		}
+	}
+
+	if (!nr2) {
 		/*
-		 * Clear the ends of indirect blocks on the shared branch
-		 * at the start of the range
+		 * ext4_find_shared returns Indirect structure which
+		 * points to the last element which should not be
+		 * removed by truncate. But this is end of the range
+		 * in punch_hole so we need to point to the next element
 		 */
-		if (partial > chain) {
+		partial2->p++;
+	}
+
+	while (partial > chain || partial2 > chain2) {
+		int depth = (chain+n-1) - partial;
+		int depth2 = (chain2+n2-1) - partial2;
+
+		if (partial > chain && partial2 > chain2 &&
+		    partial->bh->b_blocknr == partial2->bh->b_blocknr) {
+			/*
+			 * We've converged on the same block. Clear the range,
+			 * then we're done.
+			 */
 			ext4_free_branches(handle, inode, partial->bh,
-				   partial->p + 1,
-				   (__le32 *)partial->bh->b_data+addr_per_block,
-				   (chain+n-1) - partial);
+					   partial->p + 1,
+					   partial2->p,
+					   (chain+n-1) - partial);
 			BUFFER_TRACE(partial->bh, "call brelse");
 			brelse(partial->bh);
-			partial--;
+			BUFFER_TRACE(partial2->bh, "call brelse");
+			brelse(partial2->bh);
+			return 0;
 		}
+
 		/*
-		 * Clear the ends of indirect blocks on the shared branch
-		 * at the end of the range
+		 * The start and end partial branches may not be at the same
+		 * level even though the punch happened within one level. So, we
+		 * give them a chance to arrive at the same level, then walk
+		 * them in step with each other until we converge on the same
+		 * block.
 		 */
-		if (partial2 > chain2) {
+		if (partial > chain && depth <= depth2) {
+			ext4_free_branches(handle, inode, partial->bh,
+					   partial->p + 1,
+					   (__le32 *)partial->bh->b_data+addr_per_block,
+					   (chain+n-1) - partial);
+			BUFFER_TRACE(partial->bh, "call brelse");
+			brelse(partial->bh);
+			partial--;
+		}
+		if (partial2 > chain2 && depth2 <= depth) {
 			ext4_free_branches(handle, inode, partial2->bh,
 					   (__le32 *)partial2->bh->b_data,
 					   partial2->p,
-					   (chain2+n-1) - partial2);
+					   (chain2+n2-1) - partial2);
 			BUFFER_TRACE(partial2->bh, "call brelse");
 			brelse(partial2->bh);
 			partial2--;
 		}
 	}
+	return 0;
 
 do_indirects:
 	/* Kill the remaining (whole) subtrees */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 85404f1..5cb9a21 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1024,6 +1024,7 @@ static int ext4_write_end(struct file *file,
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
 	int ret = 0, ret2;
 	int i_size_changed = 0;
 
@@ -1054,6 +1055,8 @@ static int ext4_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
 	 * makes the holding time of page lock longer. Second, it forces lock
@@ -1095,6 +1098,7 @@ static int ext4_journalled_write_end(struct file *file,
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
 	int ret = 0, ret2;
 	int partial = 0;
 	unsigned from, to;
@@ -1127,6 +1131,9 @@ static int ext4_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+
 	if (size_changed) {
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		if (!ret)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1adac68..e061e66 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2779,6 +2779,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 	if (readonly)
 		return 1;
 
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
+		sb->s_flags |= MS_RDONLY;
+		return 1;
+	}
+
 	/* Check that feature set is OK for a read-write mount */
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
 		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
@@ -3936,9 +3942,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	init_timer(&sbi->s_err_report);
-	sbi->s_err_report.function = print_daily_error_info;
-	sbi->s_err_report.data = (unsigned long) sb;
+	setup_timer(&sbi->s_err_report, print_daily_error_info,
+		(unsigned long) sb);
 
 	/* Register extent status tree shrinker */
 	if (ext4_es_register_shrinker(sbi))
@@ -4866,9 +4871,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
 		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
 
-	/*
-	 * Allow the "check" option to be passed as a remount option.
-	 */
 	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
@@ -4877,17 +4879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
 	    test_opt(sb, JOURNAL_CHECKSUM)) {
 		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
-			 "during remount not supported");
-		err = -EINVAL;
-		goto restore_opts;
-	}
-
-	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
-	    test_opt(sb, JOURNAL_CHECKSUM)) {
-		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
-			 "during remount not supported");
-		err = -EINVAL;
-		goto restore_opts;
+			 "during remount not supported; ignoring");
+		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
 	}
 
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -4963,7 +4956,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				ext4_mark_recovery_complete(sb, es);
 		} else {
 			/* Make sure we can mount this feature set readwrite */
-			if (!ext4_feature_set_ok(sb, 0)) {
+			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_READONLY) ||
+			    !ext4_feature_set_ok(sb, 0)) {
 				err = -EROFS;
 				goto restore_opts;
 			}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 073657f..e907052 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -769,9 +769,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
 
-		if (!grab_super_passive(sb)) {
+		if (!trylock_super(sb)) {
 			/*
-			 * grab_super_passive() may fail consistently due to
+			 * trylock_super() may fail consistently due to
 			 * s_umount being grabbed by someone else. Don't use
 			 * requeue_io() to avoid busy retrying the inode/sb.
 			 */
@@ -779,7 +779,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 			continue;
 		}
 		wrote += writeback_sb_inodes(sb, wb, work);
-		drop_super(sb);
+		up_read(&sb->s_umount);
 
 		/* refer to the same tests at the end of writeback_sb_inodes */
 		if (wrote) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ed19a7d..39706c5 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -890,8 +890,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 
 	newpage = buf->page;
 
-	if (WARN_ON(!PageUptodate(newpage)))
-		return -EIO;
+	if (!PageUptodate(newpage))
+		SetPageUptodate(newpage);
 
 	ClearPageMappedToDisk(newpage);
 
@@ -1353,6 +1353,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
 	return err;
 }
 
+static int fuse_dev_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * The fuse device's file's private_data is used to hold
+	 * the fuse_conn(ection) when it is mounted, and is used to
+	 * keep track of whether the file has been mounted already.
+	 */
+	file->private_data = NULL;
+	return 0;
+}
+
 static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
@@ -1797,6 +1808,9 @@ copy_finish:
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
+	/* Don't try to move pages (yet) */
+	cs->move_pages = 0;
+
 	switch (code) {
 	case FUSE_NOTIFY_POLL:
 		return fuse_notify_poll(fc, size, cs);
@@ -2217,6 +2231,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
 
 const struct file_operations fuse_dev_operations = {
 	.owner		= THIS_MODULE,
+	.open		= fuse_dev_open,
 	.llseek		= no_llseek,
 	.read		= do_sync_read,
 	.aio_read	= fuse_dev_read,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 08e7b1a..1545b71 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -971,7 +971,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 			err = -EBUSY;
 			goto badentry;
 		}
-		if (S_ISDIR(entry->d_inode->i_mode)) {
+		if (d_is_dir(entry)) {
 			shrink_dcache_parent(entry);
 			if (!simple_empty(entry)) {
 				err = -ENOTEMPTY;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 6371192..487527b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1809,7 +1809,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 		gfs2_consist_inode(dip);
 	dip->i_entries--;
 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
-	if (S_ISDIR(dentry->d_inode->i_mode))
+	if (d_is_dir(dentry))
 		drop_nlink(&dip->i_inode);
 	mark_inode_dirty(&dip->i_inode);
 
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 435bea2..f0235c1 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -530,7 +530,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		if (S_ISDIR(new_dentry->d_inode->i_mode))
+		if (d_is_dir(new_dentry))
 			res = hfsplus_rmdir(new_dir, new_dentry);
 		else
 			res = hfsplus_unlink(new_dir, new_dentry);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 5f27551..043ac9d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -678,10 +678,10 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
 		return NULL;
 	}
 
-	if (S_ISDIR(dentry->d_inode->i_mode)) {
+	if (d_is_dir(dentry)) {
 		inode->i_op = &hppfs_dir_iops;
 		inode->i_fop = &hppfs_dir_fops;
-	} else if (S_ISLNK(dentry->d_inode->i_mode)) {
+	} else if (d_is_symlink(dentry)) {
 		inode->i_op = &hppfs_link_iops;
 		inode->i_fop = &hppfs_file_fops;
 	} else {
diff --git a/fs/internal.h b/fs/internal.h
index 30459da..01dce1d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -84,7 +84,7 @@ extern struct file *get_empty_filp(void);
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
-extern bool grab_super_passive(struct super_block *sb);
+extern bool trylock_super(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
 			       int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index bcbef08..b5128c6 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -524,6 +524,9 @@ static int do_one_pass(journal_t *journal,
 			if (descr_csum_size > 0 &&
 			    !jbd2_descr_block_csum_verify(journal,
 							  bh->b_data)) {
+				printk(KERN_ERR "JBD2: Invalid checksum "
+				       "recovering block %lu in log\n",
+				       next_log_block);
 				err = -EIO;
 				brelse(bh);
 				goto failed;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9385560..f21b6fb 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -252,7 +252,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
 	if (!f->inocache)
 		return -EIO;
 
-	if (S_ISDIR(old_dentry->d_inode->i_mode))
+	if (d_is_dir(old_dentry))
 		return -EPERM;
 
 	/* XXX: This is ugly */
@@ -772,7 +772,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	 */
 	if (new_dentry->d_inode) {
 		victim_f = JFFS2_INODE_INFO(new_dentry->d_inode);
-		if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+		if (d_is_dir(new_dentry)) {
 			struct jffs2_full_dirent *fd;
 
 			mutex_lock(&victim_f->sem);
@@ -807,7 +807,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 
 	if (victim_f) {
 		/* There was a victim. Kill it off nicely */
-		if (S_ISDIR(new_dentry->d_inode->i_mode))
+		if (d_is_dir(new_dentry))
 			clear_nlink(new_dentry->d_inode);
 		else
 			drop_nlink(new_dentry->d_inode);
@@ -815,7 +815,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		   inode which didn't exist. */
 		if (victim_f->inocache) {
 			mutex_lock(&victim_f->sem);
-			if (S_ISDIR(new_dentry->d_inode->i_mode))
+			if (d_is_dir(new_dentry))
 				victim_f->inocache->pino_nlink = 0;
 			else
 				victim_f->inocache->pino_nlink--;
@@ -825,7 +825,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 
 	/* If it was a directory we moved, and there was no victim,
 	   increase i_nlink on its new parent */
-	if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f)
+	if (d_is_dir(old_dentry) && !victim_f)
 		inc_nlink(new_dir_i);
 
 	/* Unlink the original */
@@ -839,7 +839,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode);
 		mutex_lock(&f->sem);
 		inc_nlink(old_dentry->d_inode);
-		if (f->inocache && !S_ISDIR(old_dentry->d_inode->i_mode))
+		if (f->inocache && !d_is_dir(old_dentry))
 			f->inocache->pino_nlink++;
 		mutex_unlock(&f->sem);
 
@@ -852,7 +852,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		return ret;
 	}
 
-	if (S_ISDIR(old_dentry->d_inode->i_mode))
+	if (d_is_dir(old_dentry))
 		drop_nlink(old_dir_i);
 
 	new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0918f0e..3d76f28 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -138,7 +138,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
 	struct jffs2_inode_info *f;
 	uint32_t pino;
 
-	BUG_ON(!S_ISDIR(child->d_inode->i_mode));
+	BUG_ON(!d_is_dir(child));
 
 	f = JFFS2_INODE_INFO(child->d_inode);
 
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b684e8a..2bacb99 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
 		goto out_free;
 	}
 
+	of->event = atomic_read(&of->kn->attr.open->event);
 	ops = kernfs_ops(of->kn);
 	if (ops->read)
 		len = ops->read(of, buf, len, *ppos);
diff --git a/fs/libfs.c b/fs/libfs.c
index b2ffdb0..0ab6512 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -329,7 +329,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
-	int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
+	int they_are_dirs = d_is_dir(old_dentry);
 
 	if (!simple_empty(new_dentry))
 		return -ENOTEMPTY;
diff --git a/fs/locks.c b/fs/locks.c
index 365c82e..528fedf 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1665,7 +1665,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	}
 
 	if (my_fl != NULL) {
-		error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
+		lease = my_fl;
+		error = lease->fl_lmops->lm_change(lease, arg, &dispose);
 		if (error)
 			goto out;
 		goto out_setup;
@@ -1727,7 +1728,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
 			break;
 		}
 	}
-	trace_generic_delete_lease(inode, fl);
+	trace_generic_delete_lease(inode, victim);
 	if (victim)
 		error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
diff --git a/fs/namei.c b/fs/namei.c
index 96ca11d..c83145a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2814,7 +2814,7 @@ no_open:
 			} else if (!dentry->d_inode) {
 				goto out;
 			} else if ((open_flag & O_TRUNC) &&
-				   S_ISREG(dentry->d_inode->i_mode)) {
+				   d_is_reg(dentry)) {
 				goto out;
 			}
 			/* will fail later, go on to get the right error */
diff --git a/fs/namespace.c b/fs/namespace.c
index 72a286e..82ef140 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1907,8 +1907,8 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
 	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
 		return -EINVAL;
 
-	if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
-	      S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
+	if (d_is_dir(mp->m_dentry) !=
+	      d_is_dir(mnt->mnt.mnt_root))
 		return -ENOTDIR;
 
 	return attach_recursive_mnt(mnt, p, mp, NULL);
@@ -2180,8 +2180,8 @@ static int do_move_mount(struct path *path, const char *old_name)
 	if (!mnt_has_parent(old))
 		goto out1;
 
-	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
-	      S_ISDIR(old_path.dentry->d_inode->i_mode))
+	if (d_is_dir(path->dentry) !=
+	      d_is_dir(old_path.dentry))
 		goto out1;
 	/*
 	 * Don't move a mount residing in a shared parent.
@@ -2271,7 +2271,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 		goto unlock;
 
 	err = -EINVAL;
-	if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
+	if (d_is_symlink(newmnt->mnt.mnt_root))
 		goto unlock;
 
 	newmnt->mnt.mnt_flags = mnt_flags;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e36a9d7..197806f 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -427,6 +427,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	if (clp == NULL)
 		goto out;
 
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		goto out;
 	tbl = &clp->cl_session->bc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f4ccfe6..19ca95c 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -313,7 +313,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
 		goto out;
 	}
 
-	args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
+	args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
 	if (!args->devs) {
 		status = htonl(NFS4ERR_DELAY);
 		goto out;
@@ -415,7 +415,7 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
 			     rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
 		if (unlikely(p == NULL))
 			goto out;
-		rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls *
+		rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
 						sizeof(*rc_list->rcl_refcalls),
 						GFP_KERNEL);
 		if (unlikely(rc_list->rcl_refcalls == NULL))
@@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
 
 		for (i = 0; i < args->csa_nrclists; i++) {
 			status = decode_rc_list(xdr, &args->csa_rclists[i]);
-			if (status)
+			if (status) {
+				args->csa_nrclists = i;
 				goto out_free;
+			}
 		}
 	}
 	status = 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f9f4845..1987415 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -433,7 +433,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 
 static bool nfs_client_init_is_complete(const struct nfs_client *clp)
 {
-	return clp->cl_cons_state != NFS_CS_INITING;
+	return clp->cl_cons_state <= NFS_CS_READY;
 }
 
 int nfs_wait_client_init_complete(const struct nfs_client *clp)
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index da54332..a6ad688 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -180,10 +180,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
 			delegation->cred = get_rpccred(cred);
 			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
 				  &delegation->flags);
-			NFS_I(inode)->delegation_state = delegation->type;
 			spin_unlock(&delegation->lock);
-			put_rpccred(oldcred);
 			rcu_read_unlock();
+			put_rpccred(oldcred);
 			trace_nfs4_reclaim_delegation(inode, res->delegation_type);
 		} else {
 			/* We appear to have raced with a delegation return. */
@@ -275,7 +274,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
 	set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
 	list_del_rcu(&delegation->super_list);
 	delegation->inode = NULL;
-	nfsi->delegation_state = 0;
 	rcu_assign_pointer(nfsi->delegation, NULL);
 	spin_unlock(&delegation->lock);
 	return delegation;
@@ -355,7 +353,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 					&delegation->stateid)) {
 			nfs_update_inplace_delegation(old_delegation,
 					delegation);
-			nfsi->delegation_state = old_delegation->type;
 			goto out;
 		}
 		/*
@@ -373,13 +370,15 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 			delegation = NULL;
 			goto out;
 		}
-		freeme = nfs_detach_delegation_locked(nfsi, 
+		if (test_and_set_bit(NFS_DELEGATION_RETURNING,
+					&old_delegation->flags))
+			goto out;
+		freeme = nfs_detach_delegation_locked(nfsi,
 				old_delegation, clp);
 		if (freeme == NULL)
 			goto out;
 	}
 	list_add_rcu(&delegation->super_list, &server->delegations);
-	nfsi->delegation_state = delegation->type;
 	rcu_assign_pointer(nfsi->delegation, delegation);
 	delegation = NULL;
 
@@ -437,6 +436,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
 {
 	bool ret = false;
 
+	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		goto out;
 	if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
 		ret = true;
 	if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) {
@@ -448,6 +449,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
 			ret = true;
 		spin_unlock(&delegation->lock);
 	}
+out:
 	return ret;
 }
 
@@ -475,14 +477,20 @@ restart:
 								super_list) {
 			if (!nfs_delegation_need_return(delegation))
 				continue;
-			inode = nfs_delegation_grab_inode(delegation);
-			if (inode == NULL)
+			if (!nfs_sb_active(server->super))
 				continue;
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL) {
+				rcu_read_unlock();
+				nfs_sb_deactive(server->super);
+				goto restart;
+			}
 			delegation = nfs_start_delegation_return_locked(NFS_I(inode));
 			rcu_read_unlock();
 
 			err = nfs_end_delegation_return(inode, delegation, 0);
 			iput(inode);
+			nfs_sb_deactive(server->super);
 			if (!err)
 				goto restart;
 			set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
@@ -813,19 +821,30 @@ restart:
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		list_for_each_entry_rcu(delegation, &server->delegations,
 								super_list) {
+			if (test_bit(NFS_DELEGATION_RETURNING,
+						&delegation->flags))
+				continue;
 			if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
 						&delegation->flags) == 0)
 				continue;
-			inode = nfs_delegation_grab_inode(delegation);
-			if (inode == NULL)
+			if (!nfs_sb_active(server->super))
 				continue;
-			delegation = nfs_detach_delegation(NFS_I(inode),
-					delegation, server);
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL) {
+				rcu_read_unlock();
+				nfs_sb_deactive(server->super);
+				goto restart;
+			}
+			delegation = nfs_start_delegation_return_locked(NFS_I(inode));
 			rcu_read_unlock();
-
-			if (delegation != NULL)
-				nfs_free_delegation(delegation);
+			if (delegation != NULL) {
+				delegation = nfs_detach_delegation(NFS_I(inode),
+					delegation, server);
+				if (delegation != NULL)
+					nfs_free_delegation(delegation);
+			}
 			iput(inode);
+			nfs_sb_deactive(server->super);
 			goto restart;
 		}
 	}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9b0c55c..c19e16f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -408,14 +408,22 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc,
 	return 0;
 }
 
+/* Match file and dirent using either filehandle or fileid
+ * Note: caller is responsible for checking the fsid
+ */
 static
 int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 {
+	struct nfs_inode *nfsi;
+
 	if (dentry->d_inode == NULL)
 		goto different;
-	if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
-		goto different;
-	return 1;
+
+	nfsi = NFS_I(dentry->d_inode);
+	if (entry->fattr->fileid == nfsi->fileid)
+		return 1;
+	if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
+		return 1;
 different:
 	return 0;
 }
@@ -469,6 +477,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 	struct inode *inode;
 	int status;
 
+	if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID))
+		return;
+	if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
+		return;
 	if (filename.name[0] == '.') {
 		if (filename.len == 1)
 			return;
@@ -479,6 +491,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 
 	dentry = d_lookup(parent, &filename);
 	if (dentry != NULL) {
+		/* Is there a mountpoint here? If so, just exit */
+		if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
+					&entry->fattr->fsid))
+			goto out;
 		if (nfs_same_file(dentry, entry)) {
 			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 			status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 7077521..e907c8c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -283,7 +283,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 			      struct nfs_direct_req *dreq)
 {
-	cinfo->lock = &dreq->lock;
+	cinfo->lock = &dreq->inode->i_lock;
 	cinfo->mds = &dreq->mds_cinfo;
 	cinfo->ds = &dreq->ds_cinfo;
 	cinfo->dreq = dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 94712fc..e679d24 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -178,7 +178,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 		iocb->ki_filp,
 		iov_iter_count(to), (unsigned long) iocb->ki_pos);
 
-	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+	result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
 	if (!result) {
 		result = generic_file_read_iter(iocb, to);
 		if (result > 0)
@@ -199,7 +199,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
 	dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
 		filp, (unsigned long) count, (unsigned long long) *ppos);
 
-	res = nfs_revalidate_mapping(inode, filp->f_mapping);
+	res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
 	if (!res) {
 		res = generic_file_splice_read(filp, ppos, pipe, count, flags);
 		if (res > 0)
@@ -372,6 +372,10 @@ start:
 				 nfs_wait_bit_killable, TASK_KILLABLE);
 	if (ret)
 		return ret;
+	/*
+	 * Wait for O_DIRECT to complete
+	 */
+	nfs_inode_dio_wait(mapping->host);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
@@ -619,6 +623,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	/* make sure the cache has finished storing the page */
 	nfs_fscache_wait_on_page_write(NFS_I(inode), page);
 
+	wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+
 	lock_page(page);
 	mapping = page_file_mapping(page);
 	if (mapping != inode->i_mapping)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7ae1c26..91e88a7 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -960,52 +960,19 @@ filelayout_mark_request_commit(struct nfs_page *req,
 {
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	u32 i, j;
-	struct list_head *list;
-	struct pnfs_commit_bucket *buckets;
 
 	if (fl->commit_through_mds) {
-		list = &cinfo->mds->list;
-		spin_lock(cinfo->lock);
-		goto mds_commit;
-	}
-
-	/* Note that we are calling nfs4_fl_calc_j_index on each page
-	 * that ends up being committed to a data server.  An attractive
-	 * alternative is to add a field to nfs_write_data and nfs_page
-	 * to store the value calculated in filelayout_write_pagelist
-	 * and just use that here.
-	 */
-	j = nfs4_fl_calc_j_index(lseg, req_offset(req));
-	i = select_bucket_index(fl, j);
-	spin_lock(cinfo->lock);
-	buckets = cinfo->ds->buckets;
-	list = &buckets[i].written;
-	if (list_empty(list)) {
-		/* Non-empty buckets hold a reference on the lseg.  That ref
-		 * is normally transferred to the COMMIT call and released
-		 * there.  It could also be released if the last req is pulled
-		 * off due to a rewrite, in which case it will be done in
-		 * pnfs_generic_clear_request_commit
+		nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+	} else {
+		/* Note that we are calling nfs4_fl_calc_j_index on each page
+		 * that ends up being committed to a data server.  An attractive
+		 * alternative is to add a field to nfs_write_data and nfs_page
+		 * to store the value calculated in filelayout_write_pagelist
+		 * and just use that here.
 		 */
-		buckets[i].wlseg = pnfs_get_lseg(lseg);
-	}
-	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
-	cinfo->ds->nwritten++;
-
-mds_commit:
-	/* nfs_request_add_commit_list(). We need to add req to list without
-	 * dropping cinfo lock.
-	 */
-	set_bit(PG_CLEAN, &(req)->wb_flags);
-	nfs_list_add_request(req, list);
-	cinfo->mds->ncommit++;
-	spin_unlock(cinfo->lock);
-	if (!cinfo->dreq) {
-		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
-			     BDI_RECLAIMABLE);
-		__mark_inode_dirty(req->wb_context->dentry->d_inode,
-				   I_DIRTY_DATASYNC);
+		j = nfs4_fl_calc_j_index(lseg, req_offset(req));
+		i = select_bucket_index(fl, j);
+		pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
 	}
 }
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index c22ecaa..315cc68 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1332,47 +1332,6 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 	return PNFS_ATTEMPTED;
 }
 
-static void
-ff_layout_mark_request_commit(struct nfs_page *req,
-			      struct pnfs_layout_segment *lseg,
-			      struct nfs_commit_info *cinfo,
-			      u32 ds_commit_idx)
-{
-	struct list_head *list;
-	struct pnfs_commit_bucket *buckets;
-
-	spin_lock(cinfo->lock);
-	buckets = cinfo->ds->buckets;
-	list = &buckets[ds_commit_idx].written;
-	if (list_empty(list)) {
-		/* Non-empty buckets hold a reference on the lseg.  That ref
-		 * is normally transferred to the COMMIT call and released
-		 * there.  It could also be released if the last req is pulled
-		 * off due to a rewrite, in which case it will be done in
-		 * pnfs_common_clear_request_commit
-		 */
-		WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
-		buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
-	}
-	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
-	cinfo->ds->nwritten++;
-
-	/* nfs_request_add_commit_list(). We need to add req to list without
-	 * dropping cinfo lock.
-	 */
-	set_bit(PG_CLEAN, &(req)->wb_flags);
-	nfs_list_add_request(req, list);
-	cinfo->mds->ncommit++;
-	spin_unlock(cinfo->lock);
-	if (!cinfo->dreq) {
-		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
-			     BDI_RECLAIMABLE);
-		__mark_inode_dirty(req->wb_context->dentry->d_inode,
-				   I_DIRTY_DATASYNC);
-	}
-}
-
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 {
 	return i;
@@ -1540,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.pg_write_ops		= &ff_layout_pg_write_ops,
 	.get_ds_info		= ff_layout_get_ds_info,
 	.free_deviceid_node	= ff_layout_free_deveiceid_node,
-	.mark_request_commit	= ff_layout_mark_request_commit,
+	.mark_request_commit	= pnfs_layout_mark_request_commit,
 	.clear_request_commit	= pnfs_generic_clear_request_commit,
 	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
 	.recover_commit_reqs	= pnfs_generic_recover_commit_reqs,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e4f0dce..d42dff6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -556,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
  * This is a copy of the common vmtruncate, but with the locking
  * corrected to take into account the fact that NFS requires
  * inode->i_size to be updated under the inode->i_lock.
+ * Note: must be called with inode->i_lock held!
  */
 static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 {
@@ -565,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 	if (err)
 		goto out;
 
-	spin_lock(&inode->i_lock);
 	i_size_write(inode, offset);
 	/* Optimisation */
 	if (offset == 0)
 		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
-	spin_unlock(&inode->i_lock);
 
+	spin_unlock(&inode->i_lock);
 	truncate_pagecache(inode, offset);
+	spin_lock(&inode->i_lock);
 out:
 	return err;
 }
@@ -585,10 +586,15 @@ out:
  * Note: we do this in the *proc.c in order to ensure that
  *       it works for things like exclusive creates too.
  */
-void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
+		struct nfs_fattr *fattr)
 {
+	/* Barrier: bump the attribute generation count. */
+	nfs_fattr_set_barrier(fattr);
+
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->attr_gencount = fattr->gencount;
 	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
-		spin_lock(&inode->i_lock);
 		if ((attr->ia_valid & ATTR_MODE) != 0) {
 			int mode = attr->ia_mode & S_IALLUGO;
 			mode |= inode->i_mode & ~S_IALLUGO;
@@ -600,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 			inode->i_gid = attr->ia_gid;
 		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
 				| NFS_INO_INVALID_ACL);
-		spin_unlock(&inode->i_lock);
 	}
 	if ((attr->ia_valid & ATTR_SIZE) != 0) {
 		nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
 		nfs_vmtruncate(inode, attr->ia_size);
 	}
+	nfs_update_inode(inode, fattr);
+	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
 
@@ -1028,6 +1035,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 
 	if (mapping->nrpages != 0) {
 		if (S_ISREG(inode->i_mode)) {
+			unmap_mapping_range(mapping, 0, 0, 0);
 			ret = nfs_sync_mapping(mapping);
 			if (ret < 0)
 				return ret;
@@ -1060,11 +1068,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 }
 
 /**
- * nfs_revalidate_mapping - Revalidate the pagecache
+ * __nfs_revalidate_mapping - Revalidate the pagecache
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
+ * @may_lock - take inode->i_mutex?
  */
-int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+static int __nfs_revalidate_mapping(struct inode *inode,
+		struct address_space *mapping,
+		bool may_lock)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	unsigned long *bitlock = &nfsi->flags;
@@ -1113,7 +1124,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
-	ret = nfs_invalidate_mapping(inode, mapping);
+	if (may_lock) {
+		mutex_lock(&inode->i_mutex);
+		ret = nfs_invalidate_mapping(inode, mapping);
+		mutex_unlock(&inode->i_mutex);
+	} else
+		ret = nfs_invalidate_mapping(inode, mapping);
 	trace_nfs_invalidate_mapping_exit(inode, ret);
 
 	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1123,6 +1139,29 @@ out:
 	return ret;
 }
 
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	return __nfs_revalidate_mapping(inode, mapping, false);
+}
+
+/**
+ * nfs_revalidate_mapping_protected - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ *
+ * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
+ * while invalidating the mapping.
+ */
+int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+{
+	return __nfs_revalidate_mapping(inode, mapping, true);
+}
+
 static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -1231,13 +1270,6 @@ static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fat
 	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
 }
 
-static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
-	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
-		return 0;
-	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
-}
-
 static atomic_long_t nfs_attr_generation_counter;
 
 static unsigned long nfs_read_attr_generation_counter(void)
@@ -1249,6 +1281,7 @@ unsigned long nfs_inc_attr_generation_counter(void)
 {
 	return atomic_long_inc_return(&nfs_attr_generation_counter);
 }
+EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter);
 
 void nfs_fattr_init(struct nfs_fattr *fattr)
 {
@@ -1260,6 +1293,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
 }
 EXPORT_SYMBOL_GPL(nfs_fattr_init);
 
+/**
+ * nfs_fattr_set_barrier
+ * @fattr: attributes
+ *
+ * Used to set a barrier after an attribute was updated. This
+ * barrier ensures that older attributes from RPC calls that may
+ * have raced with our update cannot clobber these new values.
+ * Note that you are still responsible for ensuring that other
+ * operations which change the attribute on the server do not
+ * collide.
+ */
+void nfs_fattr_set_barrier(struct nfs_fattr *fattr)
+{
+	fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
 struct nfs_fattr *nfs_alloc_fattr(void)
 {
 	struct nfs_fattr *fattr;
@@ -1370,7 +1419,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
 
 	return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
 		nfs_ctime_need_update(inode, fattr) ||
-		nfs_size_need_update(inode, fattr) ||
 		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 
@@ -1460,6 +1508,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	int status;
 
 	spin_lock(&inode->i_lock);
+	nfs_fattr_set_barrier(fattr);
 	status = nfs_post_op_update_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
 
@@ -1468,7 +1517,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
 
 /**
- * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
  * @inode - pointer to inode
  * @fattr - updated attributes
  *
@@ -1478,11 +1527,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
  *
  * This function is mainly designed to be used by the ->write_done() functions.
  */
-int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
 	int status;
 
-	spin_lock(&inode->i_lock);
 	/* Don't do a WCC update if these attributes are already stale */
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
 			!nfs_inode_attrs_need_update(inode, fattr)) {
@@ -1514,6 +1562,27 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 	}
 out_noforce:
 	status = nfs_post_op_update_inode_locked(inode, fattr);
+	return status;
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int status;
+
+	spin_lock(&inode->i_lock);
+	nfs_fattr_set_barrier(fattr);
+	status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
 	return status;
 }
@@ -1715,6 +1784,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
+		/* Set barrier to be more recent than all outstanding updates */
 		nfsi->attr_gencount = nfs_inc_attr_generation_counter();
 	} else {
 		if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
@@ -1722,6 +1792,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
 			nfsi->attrtimeo_timestamp = now;
 		}
+		/* Set the barrier to be more recent than this fattr */
+		if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+			nfsi->attr_gencount = fattr->gencount;
 	}
 	invalid &= ~NFS_INO_INVALID_ATTR;
 	/* Don't invalidate the data if we were to blame */
@@ -1775,7 +1848,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 #if IS_ENABLED(CONFIG_NFS_V4)
 	INIT_LIST_HEAD(&nfsi->open_states);
 	nfsi->delegation = NULL;
-	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
 	nfsi->layout = NULL;
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 212b8c8..9e6475b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -459,6 +459,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
 			     struct nfs_commit_info *cinfo,
 			     u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
@@ -598,6 +599,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
 }
 
 /*
+ * Record the page as unstable and mark its inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+
+	inc_zone_page_state(page, NR_UNSTABLE_NFS);
+	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+	 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+}
+
+/*
  * Determine the number of bytes of data the page contains
  */
 static inline
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 78e557c..1f11d25 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	nfs_fattr_init(fattr);
 	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	if (status == 0)
-		nfs_setattr_update_inode(inode, sattr);
+		nfs_setattr_update_inode(inode, sattr, fattr);
 	dprintk("NFS reply setattr: %d\n", status);
 	return status;
 }
@@ -834,7 +834,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
+		nfs_writeback_update_inode(hdr);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 2a932fd..53852a4 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1987,6 +1987,11 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
 			entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
 
+		if (entry->fattr->fileid != entry->ino) {
+			entry->fattr->mounted_on_fileid = entry->ino;
+			entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+		}
+
 		/* In fact, a post_op_fh3: */
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(p == NULL))
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8646af9..86d6214 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -621,6 +621,9 @@ int nfs41_walk_client_list(struct nfs_client *new,
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
 
+		if (pos == new)
+			goto found;
+
 		if (pos->rpc_ops != new->rpc_ops)
 			continue;
 
@@ -639,10 +642,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
 			prev = pos;
 
 			status = nfs_wait_client_init_complete(pos);
-			if (pos->cl_cons_state == NFS_CS_SESSION_INITING) {
-				nfs4_schedule_lease_recovery(pos);
-				status = nfs4_wait_clnt_recover(pos);
-			}
 			spin_lock(&nn->nfs_client_lock);
 			if (status < 0)
 				break;
@@ -668,7 +667,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
 		 */
 		if (!nfs4_match_client_owner_id(pos, new))
 			continue;
-
+found:
 		atomic_inc(&pos->cl_count);
 		*result = pos;
 		status = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2e7c9f7..627f37c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -901,6 +901,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 	if (!cinfo->atomic || cinfo->before != dir->i_version)
 		nfs_force_lookup_revalidate(dir);
 	dir->i_version = cinfo->after;
+	nfsi->attr_gencount = nfs_inc_attr_generation_counter();
 	nfs_fscache_invalidate(dir);
 	spin_unlock(&dir->i_lock);
 }
@@ -1552,6 +1553,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
 
 	opendata->o_arg.open_flags = 0;
 	opendata->o_arg.fmode = fmode;
+	opendata->o_arg.share_access = nfs4_map_atomic_open_share(
+			NFS_SB(opendata->dentry->d_sb),
+			fmode, 0);
 	memset(&opendata->o_res, 0, sizeof(opendata->o_res));
 	memset(&opendata->c_res, 0, sizeof(opendata->c_res));
 	nfs4_init_opendata_res(opendata);
@@ -2413,8 +2417,8 @@ static int _nfs4_do_open(struct inode *dir,
 				opendata->o_res.f_attr, sattr,
 				state, label, olabel);
 		if (status == 0) {
-			nfs_setattr_update_inode(state->inode, sattr);
-			nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+			nfs_setattr_update_inode(state->inode, sattr,
+					opendata->o_res.f_attr);
 			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
 		}
 	}
@@ -2651,7 +2655,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_EXPIRED:
 			if (!nfs4_stateid_match(&calldata->arg.stateid,
-						&state->stateid)) {
+						&state->open_stateid)) {
 				rpc_restart_call_prepare(task);
 				goto out_release;
 			}
@@ -2687,7 +2691,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
 	is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
 	is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
-	nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
+	nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid);
 	/* Calculate the change in open mode */
 	calldata->arg.fmode = 0;
 	if (state->n_rdwr == 0) {
@@ -3288,7 +3292,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 	status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
 	if (status == 0) {
-		nfs_setattr_update_inode(inode, sattr);
+		nfs_setattr_update_inode(inode, sattr, fattr);
 		nfs_setsecurity(inode, fattr, label);
 	}
 	nfs4_label_free(label);
@@ -4234,7 +4238,7 @@ static int nfs4_write_done_cb(struct rpc_task *task,
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), hdr->timestamp);
-		nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
+		nfs_writeback_update_inode(hdr);
 	}
 	return 0;
 }
@@ -6648,47 +6652,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
 int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status;
+	struct nfs41_bind_conn_to_session_args args = {
+		.client = clp,
+		.dir = NFS4_CDFC4_FORE_OR_BOTH,
+	};
 	struct nfs41_bind_conn_to_session_res res;
 	struct rpc_message msg = {
 		.rpc_proc =
 			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
-		.rpc_argp = clp,
+		.rpc_argp = &args,
 		.rpc_resp = &res,
 		.rpc_cred = cred,
 	};
 
 	dprintk("--> %s\n", __func__);
 
-	res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
-	if (unlikely(res.session == NULL)) {
-		status = -ENOMEM;
-		goto out;
-	}
+	nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		args.dir = NFS4_CDFC4_FORE;
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	trace_nfs4_bind_conn_to_session(clp, status);
 	if (status == 0) {
-		if (memcmp(res.session->sess_id.data,
+		if (memcmp(res.sessionid.data,
 		    clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
 			dprintk("NFS: %s: Session ID mismatch\n", __func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
-		if (res.dir != NFS4_CDFS4_BOTH) {
+		if ((res.dir & args.dir) != res.dir || res.dir == 0) {
 			dprintk("NFS: %s: Unexpected direction from server\n",
 				__func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
-		if (res.use_conn_in_rdma_mode) {
+		if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
 			dprintk("NFS: %s: Server returned RDMA mode = true\n",
 				__func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
 	}
-out_session:
-	kfree(res.session);
 out:
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
@@ -6893,9 +6897,13 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 
 	if (status == 0) {
 		clp->cl_clientid = res.clientid;
-		clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R);
-		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R))
+		clp->cl_exchange_flags = res.flags;
+		/* Client ID is not confirmed */
+		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+			clear_bit(NFS4_SESSION_ESTABLISHED,
+					&clp->cl_session->session_state);
 			clp->cl_seqid = res.seqid;
+		}
 
 		kfree(clp->cl_serverowner);
 		clp->cl_serverowner = res.server_owner;
@@ -7166,10 +7174,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 		args->bc_attrs.max_reqs);
 }
 
-static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
 {
 	struct nfs4_channel_attrs *sent = &args->fc_attrs;
-	struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
 
 	if (rcvd->max_resp_sz > sent->max_resp_sz)
 		return -EINVAL;
@@ -7188,11 +7197,14 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
 	return 0;
 }
 
-static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
 {
 	struct nfs4_channel_attrs *sent = &args->bc_attrs;
-	struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
 
+	if (!(res->flags & SESSION4_BACK_CHAN))
+		goto out;
 	if (rcvd->max_rqst_sz > sent->max_rqst_sz)
 		return -EINVAL;
 	if (rcvd->max_resp_sz < sent->max_resp_sz)
@@ -7204,18 +7216,33 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
 		return -EINVAL;
 	if (rcvd->max_reqs != sent->max_reqs)
 		return -EINVAL;
+out:
 	return 0;
 }
 
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
-				     struct nfs4_session *session)
+				     struct nfs41_create_session_res *res)
 {
 	int ret;
 
-	ret = nfs4_verify_fore_channel_attrs(args, session);
+	ret = nfs4_verify_fore_channel_attrs(args, res);
 	if (ret)
 		return ret;
-	return nfs4_verify_back_channel_attrs(args, session);
+	return nfs4_verify_back_channel_attrs(args, res);
+}
+
+static void nfs4_update_session(struct nfs4_session *session,
+		struct nfs41_create_session_res *res)
+{
+	nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+	/* Mark client id and session as being confirmed */
+	session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+	set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state);
+	session->flags = res->flags;
+	memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+	if (res->flags & SESSION4_BACK_CHAN)
+		memcpy(&session->bc_attrs, &res->bc_attrs,
+				sizeof(session->bc_attrs));
 }
 
 static int _nfs4_proc_create_session(struct nfs_client *clp,
@@ -7224,11 +7251,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 	struct nfs4_session *session = clp->cl_session;
 	struct nfs41_create_session_args args = {
 		.client = clp,
+		.clientid = clp->cl_clientid,
+		.seqid = clp->cl_seqid,
 		.cb_program = NFS4_CALLBACK,
 	};
-	struct nfs41_create_session_res res = {
-		.client = clp,
-	};
+	struct nfs41_create_session_res res;
+
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
 		.rpc_argp = &args,
@@ -7245,11 +7273,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 
 	if (!status) {
 		/* Verify the session's negotiated channel_attrs values */
-		status = nfs4_verify_channel_attrs(&args, session);
+		status = nfs4_verify_channel_attrs(&args, &res);
 		/* Increment the clientid slot sequence id */
-		clp->cl_seqid++;
+		if (clp->cl_seqid == res.seqid)
+			clp->cl_seqid++;
+		if (status)
+			goto out;
+		nfs4_update_session(session, &res);
 	}
-
+out:
 	return status;
 }
 
@@ -7301,8 +7333,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
 	dprintk("--> nfs4_proc_destroy_session\n");
 
 	/* session is still being setup */
-	if (session->clp->cl_cons_state != NFS_CS_READY)
-		return status;
+	if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
+		return 0;
 
 	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	trace_nfs4_destroy_session(session->clp, status);
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e799dc3..e23366e 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -450,7 +450,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
 	tbl = &ses->fc_slot_table;
 	tbl->session = ses;
 	status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
-	if (status) /* -ENOMEM */
+	if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
 		return status;
 	/* Back channel */
 	tbl = &ses->bc_slot_table;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index b34ada9..e3ea2c5 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -70,6 +70,7 @@ struct nfs4_session {
 
 enum nfs4_session_state {
 	NFS4_SESSION_INITING,
+	NFS4_SESSION_ESTABLISHED,
 };
 
 extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
@@ -118,6 +119,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
 	return 0;
 }
 
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+		const struct nfs4_sessionid *src)
+{
+	memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
 #ifdef CONFIG_CRC32
 /*
  * nfs_session_id_hash - calculate the crc32 hash for the session id
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5ad908e..f95e3b5 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -346,9 +346,23 @@ int nfs41_discover_server_trunking(struct nfs_client *clp,
 	status = nfs4_proc_exchange_id(clp, cred);
 	if (status != NFS4_OK)
 		return status;
-	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 
-	return nfs41_walk_client_list(clp, result, cred);
+	status = nfs41_walk_client_list(clp, result, cred);
+	if (status < 0)
+		return status;
+	if (clp != *result)
+		return 0;
+
+	/* Purge state if the client id was established in a prior instance */
+	if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R)
+		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+	else
+		set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+	status = nfs_wait_client_init_complete(clp);
+	if (status < 0)
+		nfs_put_client(clp);
+	return status;
 }
 
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e23a0a6..5c399ec 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1715,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru
 #if defined(CONFIG_NFS_V4_1)
 /* NFSv4.1 operations */
 static void encode_bind_conn_to_session(struct xdr_stream *xdr,
-				   struct nfs4_session *session,
+				   struct nfs41_bind_conn_to_session_args *args,
 				   struct compound_hdr *hdr)
 {
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
 		decode_bind_conn_to_session_maxsz, hdr);
-	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
 	p = xdr_reserve_space(xdr, 8);
-	*p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH);
-	*p = 0;	/* use_conn_in_rdma_mode = False */
+	*p++ = cpu_to_be32(args->dir);
+	*p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
 }
 
 static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
@@ -1806,8 +1806,8 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
 	p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
-	p = xdr_encode_hyper(p, clp->cl_clientid);
-	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
+	p = xdr_encode_hyper(p, args->clientid);
+	*p++ = cpu_to_be32(args->seqid);			/*Sequence id */
 	*p++ = cpu_to_be32(args->flags);			/*flags */
 
 	/* Fore Channel */
@@ -2734,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
  */
 static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
 				struct xdr_stream *xdr,
-				struct nfs_client *clp)
+				struct nfs41_bind_conn_to_session_args *args)
 {
 	struct compound_hdr hdr = {
-		.minorversion = clp->cl_mvops->minor_version,
+		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
 	encode_compound_hdr(xdr, req, &hdr);
-	encode_bind_conn_to_session(xdr, clp->cl_session, &hdr);
+	encode_bind_conn_to_session(xdr, args, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -5613,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
 
 	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
 	if (!status)
-		status = decode_sessionid(xdr, &res->session->sess_id);
+		status = decode_sessionid(xdr, &res->sessionid);
 	if (unlikely(status))
 		return status;
 
@@ -5641,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	int status;
-	struct nfs_client *clp = res->client;
-	struct nfs4_session *session = clp->cl_session;
 
 	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
 	if (!status)
-		status = decode_sessionid(xdr, &session->sess_id);
+		status = decode_sessionid(xdr, &res->sessionid);
 	if (unlikely(status))
 		return status;
 
@@ -5654,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	clp->cl_seqid = be32_to_cpup(p++);
-	session->flags = be32_to_cpup(p);
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p);
 
 	/* Channel attributes */
-	status = decode_chan_attrs(xdr, &session->fc_attrs);
+	status = decode_chan_attrs(xdr, &res->fc_attrs);
 	if (!status)
-		status = decode_chan_attrs(xdr, &session->bc_attrs);
+		status = decode_chan_attrs(xdr, &res->bc_attrs);
 	return status;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 797cd62..635f086 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -344,6 +344,10 @@ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
 struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
 						 struct xdr_stream *xdr,
 						 gfp_t gfp_flags);
+void pnfs_layout_mark_request_commit(struct nfs_page *req,
+				     struct pnfs_layout_segment *lseg,
+				     struct nfs_commit_info *cinfo,
+				     u32 ds_commit_idx);
 
 static inline bool nfs_have_layout(struct inode *inode)
 {
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index fdc4f65..54e36b3 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -838,3 +838,33 @@ out_err:
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
+
+void
+pnfs_layout_mark_request_commit(struct nfs_page *req,
+				struct pnfs_layout_segment *lseg,
+				struct nfs_commit_info *cinfo,
+				u32 ds_commit_idx)
+{
+	struct list_head *list;
+	struct pnfs_commit_bucket *buckets;
+
+	spin_lock(cinfo->lock);
+	buckets = cinfo->ds->buckets;
+	list = &buckets[ds_commit_idx].written;
+	if (list_empty(list)) {
+		/* Non-empty buckets hold a reference on the lseg.  That ref
+		 * is normally transferred to the COMMIT call and released
+		 * there.  It could also be released if the last req is pulled
+		 * off due to a rewrite, in which case it will be done in
+		 * pnfs_common_clear_request_commit
+		 */
+		WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
+		buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
+	}
+	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+	cinfo->ds->nwritten++;
+	spin_unlock(cinfo->lock);
+
+	nfs_request_add_commit_list(req, list, cinfo);
+}
+EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b09cc23..c63189a 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	nfs_fattr_init(fattr);
 	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	if (status == 0)
-		nfs_setattr_update_inode(inode, sattr);
+		nfs_setattr_update_inode(inode, sattr, fattr);
 	dprintk("NFS reply setattr: %d\n", status);
 	return status;
 }
@@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = hdr->inode;
-
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
+		nfs_writeback_update_inode(hdr);
 	return 0;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 88a6d21..849ed78 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -789,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 	nfs_list_add_request(req, dst);
 	cinfo->mds->ncommit++;
 	spin_unlock(cinfo->lock);
-	if (!cinfo->dreq) {
-		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
-			     BDI_RECLAIMABLE);
-		__mark_inode_dirty(req->wb_context->dentry->d_inode,
-				   I_DIRTY_DATASYNC);
-	}
+	if (!cinfo->dreq)
+		nfs_mark_page_unstable(req->wb_page);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
@@ -1382,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode)
 	return 0;
 }
 
+static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
+		struct nfs_fattr *fattr)
+{
+	struct nfs_pgio_args *argp = &hdr->args;
+	struct nfs_pgio_res *resp = &hdr->res;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+		return;
+	if (argp->offset + resp->count != fattr->size)
+		return;
+	if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode))
+		return;
+	/* Set attribute barrier */
+	nfs_fattr_set_barrier(fattr);
+}
+
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
+{
+	struct nfs_fattr *fattr = hdr->res.fattr;
+	struct inode *inode = hdr->inode;
+
+	if (fattr == NULL)
+		return;
+	spin_lock(&inode->i_lock);
+	nfs_writeback_check_extend(hdr, fattr);
+	nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_writeback_update_inode);
+
 /*
  * This function is called when the WRITE call is complete.
  */
@@ -1605,11 +1630,8 @@ void nfs_retry_commit(struct list_head *page_list,
 		req = nfs_list_entry(page_list->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
-		if (!cinfo->dreq) {
-			dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-			dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
-				     BDI_RECLAIMABLE);
-		}
+		if (!cinfo->dreq)
+			nfs_clear_page_commit(req->wb_page);
 		nfs_unlock_and_release_request(req);
 	}
 }
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 3c1bfa1..1028a06 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -587,8 +587,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 
 	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
 
-	nfsd4_cb_layout_fail(ls);
-
 	printk(KERN_WARNING
 		"nfsd: client %s failed to respond to layout recall. "
 		"  Fencing..\n", addr_str);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index cc6a760..1c307f0 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -583,7 +583,7 @@ nfs4_reset_recoverydir(char *recdir)
 	if (status)
 		return status;
 	status = -ENOTDIR;
-	if (S_ISDIR(path.dentry->d_inode->i_mode)) {
+	if (d_is_dir(path.dentry)) {
 		strcpy(user_recovery_dirname, recdir);
 		status = 0;
 	}
@@ -1426,7 +1426,7 @@ nfsd4_client_tracking_init(struct net *net)
 	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
 	status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
 	if (!status) {
-		status = S_ISDIR(path.dentry->d_inode->i_mode);
+		status = d_is_dir(path.dentry);
 		path_put(&path);
 		if (status)
 			goto do_init;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f6b2a09..d2f2c37 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1638,7 +1638,7 @@ __destroy_client(struct nfs4_client *clp)
 		nfs4_put_stid(&dp->dl_stid);
 	}
 	while (!list_empty(&clp->cl_revoked)) {
-		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
+		dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru);
 		list_del_init(&dp->dl_recall_lru);
 		nfs4_put_stid(&dp->dl_stid);
 	}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 965b478..e9fa966 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -114,8 +114,8 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
 	 * We're exposing only the directories and symlinks that have to be
 	 * traversed on the way to real exports:
 	 */
-	if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) &&
-		     !S_ISLNK(dentry->d_inode->i_mode)))
+	if (unlikely(!d_is_dir(dentry) &&
+		     !d_is_symlink(dentry)))
 		return nfserr_stale;
 	/*
 	 * A pseudoroot export gives permission to access only one
@@ -259,7 +259,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		goto out;
 	}
 
-	if (S_ISDIR(dentry->d_inode->i_mode) &&
+	if (d_is_dir(dentry) &&
 			(dentry->d_flags & DCACHE_DISCONNECTED)) {
 		printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n",
 				dentry);
@@ -414,7 +414,7 @@ static inline void _fh_update_old(struct dentry *dentry,
 {
 	fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino);
 	fh->ofh_generation = dentry->d_inode->i_generation;
-	if (S_ISDIR(dentry->d_inode->i_mode) ||
+	if (d_is_dir(dentry) ||
 	    (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
 		fh->ofh_dirino = 0;
 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5685c67..3685265 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -615,9 +615,9 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 	export = fhp->fh_export;
 	dentry = fhp->fh_dentry;
 
-	if (S_ISREG(dentry->d_inode->i_mode))
+	if (d_is_reg(dentry))
 		map = nfs3_regaccess;
-	else if (S_ISDIR(dentry->d_inode->i_mode))
+	else if (d_is_dir(dentry))
 		map = nfs3_diraccess;
 	else
 		map = nfs3_anyaccess;
@@ -1402,7 +1402,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 		switch (createmode) {
 		case NFS3_CREATE_UNCHECKED:
-			if (! S_ISREG(dchild->d_inode->i_mode))
+			if (! d_is_reg(dchild))
 				goto out;
 			else if (truncp) {
 				/* in nfsv4, we need to treat this case a little
@@ -1615,7 +1615,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	if (err)
 		goto out;
 	err = nfserr_isdir;
-	if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode))
+	if (d_is_dir(tfhp->fh_dentry))
 		goto out;
 	err = nfserr_perm;
 	if (!len)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b2e3ff3..ecdbae1 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,6 +31,8 @@
 #include "alloc.h"
 #include "dat.h"
 
+static void __nilfs_btree_init(struct nilfs_bmap *bmap);
+
 static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
 	struct nilfs_btree_path *path;
@@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
 	return ret;
 }
 
+/**
+ * nilfs_btree_root_broken - verify consistency of btree root node
+ * @node: btree root node to be examined
+ * @ino: inode number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
+				   unsigned long ino)
+{
+	int level, flags, nchildren;
+	int ret = 0;
+
+	level = nilfs_btree_node_get_level(node);
+	flags = nilfs_btree_node_get_flags(node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+
+	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+		     level > NILFS_BTREE_LEVEL_MAX ||
+		     nchildren < 0 ||
+		     nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+		pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
+			ino, level, flags, nchildren);
+		ret = 1;
+	}
+	return ret;
+}
+
 int nilfs_btree_broken_node_block(struct buffer_head *bh)
 {
 	int ret;
@@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
 
 	/* convert and insert */
 	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
-	nilfs_btree_init(btree);
+	__nilfs_btree_init(btree);
 	if (nreq != NULL) {
 		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
 		nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
@@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 	.bop_gather_data	=	NULL,
 };
 
-int nilfs_btree_init(struct nilfs_bmap *bmap)
+static void __nilfs_btree_init(struct nilfs_bmap *bmap)
 {
 	bmap->b_ops = &nilfs_btree_ops;
 	bmap->b_nchildren_per_block =
 		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
-	return 0;
+}
+
+int nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+	int ret = 0;
+
+	__nilfs_btree_init(bmap);
+
+	if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
+				    bmap->b_inode->i_ino))
+		ret = -EIO;
+	return ret;
 }
 
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b..0c3f303 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
 	struct nilfs_inode_info *ii, *n;
+	int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
 	int defer_iput = false;
 
 	spin_lock(&nilfs->ns_inode_lock);
@@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 		brelse(ii->i_bh);
 		ii->i_bh = NULL;
 		list_del_init(&ii->i_dirty);
-		if (!ii->vfs_inode.i_nlink) {
+		if (!ii->vfs_inode.i_nlink || during_mount) {
 			/*
-			 * Defer calling iput() to avoid a deadlock
-			 * over I_SYNC flag for inodes with i_nlink == 0
+			 * Defer calling iput() to avoid deadlocks if
+			 * i_nlink == 0 or mount is not yet finished.
 			 */
 			list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
 			defer_iput = true;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 51ceb81..d2f97ec 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -115,8 +115,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
 		return false;
 
 	/* sorry, fanotify only gives a damn about files and dirs */
-	if (!S_ISREG(path->dentry->d_inode->i_mode) &&
-	    !S_ISDIR(path->dentry->d_inode->i_mode))
+	if (!d_is_reg(path->dentry) &&
+	    !d_can_lookup(path->dentry))
 		return false;
 
 	if (inode_mark && vfsmnt_mark) {
@@ -139,11 +139,12 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
 		BUG();
 	}
 
-	if (S_ISDIR(path->dentry->d_inode->i_mode) &&
+	if (d_is_dir(path->dentry) &&
 	    !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
 		return false;
 
-	if (event_mask & marks_mask & ~marks_ignored_mask)
+	if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
+				 ~marks_ignored_mask)
 		return true;
 
 	return false;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8490c64..460c6c3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
 
 static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
 {
-	if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
 		return 1;
 	return 0;
 }
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 20e37a3..db64ce2 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -102,11 +102,11 @@
 					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
 					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
 					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
-					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+					 | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
-					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
-					 | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 
 /*
  * Heartbeat-only devices are missing journals and other files.  The
@@ -179,6 +179,11 @@
 #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO	0x4000
 
 /*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO	0x8000
+
+/*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
  */
@@ -200,10 +205,6 @@
 #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA	0x0002
 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA	0x0004
 
-/*
- * Append Direct IO support
- */
-#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO	0x0008
 
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index ea10a87..24f6404 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -191,7 +191,6 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
 		ovl_set_timestamps(upperdentry, stat);
 
 	return err;
-
 }
 
 static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
@@ -385,7 +384,7 @@ int ovl_copy_up(struct dentry *dentry)
 		struct kstat stat;
 		enum ovl_path_type type = ovl_path_type(dentry);
 
-		if (type != OVL_PATH_LOWER)
+		if (OVL_TYPE_UPPER(type))
 			break;
 
 		next = dget(dentry);
@@ -394,7 +393,7 @@ int ovl_copy_up(struct dentry *dentry)
 			parent = dget_parent(next);
 
 			type = ovl_path_type(parent);
-			if (type != OVL_PATH_LOWER)
+			if (OVL_TYPE_UPPER(type))
 				break;
 
 			dput(next);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 8ffc4b9..d139405 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -19,7 +19,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
 	int err;
 
 	dget(wdentry);
-	if (S_ISDIR(wdentry->d_inode->i_mode))
+	if (d_is_dir(wdentry))
 		err = ovl_do_rmdir(wdir, wdentry);
 	else
 		err = ovl_do_unlink(wdir, wdentry);
@@ -118,14 +118,14 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry,
 
 static int ovl_set_opaque(struct dentry *upperdentry)
 {
-	return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
+	return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
 }
 
 static void ovl_remove_opaque(struct dentry *upperdentry)
 {
 	int err;
 
-	err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr);
+	err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
 	if (err) {
 		pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
 			upperdentry->d_name.name, err);
@@ -152,7 +152,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	 * correct link count.  nlink=1 seems to pacify 'find' and
 	 * other utilities.
 	 */
-	if (type == OVL_PATH_MERGE)
+	if (OVL_TYPE_MERGE(type))
 		stat->nlink = 1;
 
 	return 0;
@@ -506,7 +506,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
 	struct dentry *opaquedir = NULL;
 	int err;
 
-	if (is_dir) {
+	if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
 		opaquedir = ovl_check_empty_and_clear(dentry);
 		err = PTR_ERR(opaquedir);
 		if (IS_ERR(opaquedir))
@@ -630,7 +630,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
 		goto out_drop_write;
 
 	type = ovl_path_type(dentry);
-	if (type == OVL_PATH_PURE_UPPER) {
+	if (OVL_TYPE_PURE_UPPER(type)) {
 		err = ovl_remove_upper(dentry, is_dir);
 	} else {
 		const struct cred *old_cred;
@@ -693,7 +693,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
 	bool new_create = false;
 	bool cleanup_whiteout = false;
 	bool overwrite = !(flags & RENAME_EXCHANGE);
-	bool is_dir = S_ISDIR(old->d_inode->i_mode);
+	bool is_dir = d_is_dir(old);
 	bool new_is_dir = false;
 	struct dentry *opaquedir = NULL;
 	const struct cred *old_cred = NULL;
@@ -712,7 +712,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
 	/* Don't copy up directory trees */
 	old_type = ovl_path_type(old);
 	err = -EXDEV;
-	if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir)
+	if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
 		goto out;
 
 	if (new->d_inode) {
@@ -720,30 +720,30 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
 		if (err)
 			goto out;
 
-		if (S_ISDIR(new->d_inode->i_mode))
+		if (d_is_dir(new))
 			new_is_dir = true;
 
 		new_type = ovl_path_type(new);
 		err = -EXDEV;
-		if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir)
+		if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
 			goto out;
 
 		err = 0;
-		if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
+		if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
 			if (ovl_dentry_lower(old)->d_inode ==
 			    ovl_dentry_lower(new)->d_inode)
 				goto out;
 		}
-		if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
+		if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
 			if (ovl_dentry_upper(old)->d_inode ==
 			    ovl_dentry_upper(new)->d_inode)
 				goto out;
 		}
 	} else {
 		if (ovl_dentry_is_opaque(new))
-			new_type = OVL_PATH_UPPER;
+			new_type = __OVL_PATH_UPPER;
 		else
-			new_type = OVL_PATH_PURE_UPPER;
+			new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
 	}
 
 	err = ovl_want_write(old);
@@ -763,8 +763,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
 			goto out_drop_write;
 	}
 
-	old_opaque = old_type != OVL_PATH_PURE_UPPER;
-	new_opaque = new_type != OVL_PATH_PURE_UPPER;
+	old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
+	new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
 
 	if (old_opaque || new_opaque) {
 		err = -ENOMEM;
@@ -787,7 +787,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
 		old_cred = override_creds(override_cred);
 	}
 
-	if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
+	if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
 		opaquedir = ovl_check_empty_and_clear(new);
 		err = PTR_ERR(opaquedir);
 		if (IS_ERR(opaquedir)) {
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 07d74b2..04f1248 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -205,7 +205,7 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 
 static bool ovl_is_private_xattr(const char *name)
 {
-	return strncmp(name, "trusted.overlay.", 14) == 0;
+	return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
 }
 
 int ovl_setxattr(struct dentry *dentry, const char *name,
@@ -238,7 +238,10 @@ out:
 static bool ovl_need_xattr_filter(struct dentry *dentry,
 				  enum ovl_path_type type)
 {
-	return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode);
+	if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
+		return S_ISDIR(dentry->d_inode->i_mode);
+	else
+		return false;
 }
 
 ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
@@ -299,7 +302,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
 	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
 		goto out_drop_write;
 
-	if (type == OVL_PATH_LOWER) {
+	if (!OVL_TYPE_UPPER(type)) {
 		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
 		if (err < 0)
 			goto out_drop_write;
@@ -321,7 +324,7 @@ out:
 static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
 				  struct dentry *realdentry)
 {
-	if (type != OVL_PATH_LOWER)
+	if (OVL_TYPE_UPPER(type))
 		return false;
 
 	if (special_file(realdentry->d_inode->i_mode))
@@ -430,5 +433,4 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
 	}
 
 	return inode;
-
 }
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 814bed3..17ac5af 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -12,13 +12,20 @@
 struct ovl_entry;
 
 enum ovl_path_type {
-	OVL_PATH_PURE_UPPER,
-	OVL_PATH_UPPER,
-	OVL_PATH_MERGE,
-	OVL_PATH_LOWER,
+	__OVL_PATH_PURE		= (1 << 0),
+	__OVL_PATH_UPPER	= (1 << 1),
+	__OVL_PATH_MERGE	= (1 << 2),
 };
 
-extern const char *ovl_opaque_xattr;
+#define OVL_TYPE_UPPER(type)	((type) & __OVL_PATH_UPPER)
+#define OVL_TYPE_MERGE(type)	((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
+#define OVL_TYPE_MERGE_OR_LOWER(type) \
+	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
+
+#define OVL_XATTR_PRE_NAME "trusted.overlay."
+#define OVL_XATTR_PRE_LEN  16
+#define OVL_XATTR_OPAQUE   OVL_XATTR_PRE_NAME"opaque"
 
 static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
 {
@@ -130,6 +137,7 @@ void ovl_dentry_version_inc(struct dentry *dentry);
 void ovl_path_upper(struct dentry *dentry, struct path *path);
 void ovl_path_lower(struct dentry *dentry, struct path *path);
 enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
 struct dentry *ovl_dentry_upper(struct dentry *dentry);
 struct dentry *ovl_dentry_lower(struct dentry *dentry);
 struct dentry *ovl_dentry_real(struct dentry *dentry);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index c020599..907870e 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -24,7 +24,6 @@ struct ovl_cache_entry {
 	struct list_head l_node;
 	struct rb_node node;
 	bool is_whiteout;
-	bool is_cursor;
 	char name[];
 };
 
@@ -40,6 +39,7 @@ struct ovl_readdir_data {
 	struct rb_root root;
 	struct list_head *list;
 	struct list_head middle;
+	struct dentry *dir;
 	int count;
 	int err;
 };
@@ -48,7 +48,7 @@ struct ovl_dir_file {
 	bool is_real;
 	bool is_upper;
 	struct ovl_dir_cache *cache;
-	struct ovl_cache_entry cursor;
+	struct list_head *cursor;
 	struct file *realfile;
 	struct file *upperfile;
 };
@@ -79,23 +79,49 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
 	return NULL;
 }
 
-static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
+static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
+						   const char *name, int len,
 						   u64 ino, unsigned int d_type)
 {
 	struct ovl_cache_entry *p;
 	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
 
 	p = kmalloc(size, GFP_KERNEL);
-	if (p) {
-		memcpy(p->name, name, len);
-		p->name[len] = '\0';
-		p->len = len;
-		p->type = d_type;
-		p->ino = ino;
-		p->is_whiteout = false;
-		p->is_cursor = false;
-	}
+	if (!p)
+		return NULL;
+
+	memcpy(p->name, name, len);
+	p->name[len] = '\0';
+	p->len = len;
+	p->type = d_type;
+	p->ino = ino;
+	p->is_whiteout = false;
+
+	if (d_type == DT_CHR) {
+		struct dentry *dentry;
+		const struct cred *old_cred;
+		struct cred *override_cred;
+
+		override_cred = prepare_creds();
+		if (!override_cred) {
+			kfree(p);
+			return NULL;
+		}
+
+		/*
+		 * CAP_DAC_OVERRIDE for lookup
+		 */
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		old_cred = override_creds(override_cred);
 
+		dentry = lookup_one_len(name, dir, len);
+		if (!IS_ERR(dentry)) {
+			p->is_whiteout = ovl_is_whiteout(dentry);
+			dput(dentry);
+		}
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
 	return p;
 }
 
@@ -122,7 +148,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
 			return 0;
 	}
 
-	p = ovl_cache_entry_new(name, len, ino, d_type);
+	p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
 	if (p == NULL)
 		return -ENOMEM;
 
@@ -143,7 +169,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd,
 	if (p) {
 		list_move_tail(&p->l_node, &rdd->middle);
 	} else {
-		p = ovl_cache_entry_new(name, namelen, ino, d_type);
+		p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
 		if (p == NULL)
 			rdd->err = -ENOMEM;
 		else
@@ -168,7 +194,6 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
 {
 	struct ovl_dir_cache *cache = od->cache;
 
-	list_del_init(&od->cursor.l_node);
 	WARN_ON(cache->refcount <= 0);
 	cache->refcount--;
 	if (!cache->refcount) {
@@ -204,6 +229,7 @@ static inline int ovl_dir_read(struct path *realpath,
 	if (IS_ERR(realfile))
 		return PTR_ERR(realfile);
 
+	rdd->dir = realpath->dentry;
 	rdd->ctx.pos = 0;
 	do {
 		rdd->count = 0;
@@ -227,108 +253,58 @@ static void ovl_dir_reset(struct file *file)
 	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
 		ovl_cache_put(od, dentry);
 		od->cache = NULL;
+		od->cursor = NULL;
 	}
-	WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
-	if (od->is_real && type == OVL_PATH_MERGE)
+	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
+	if (od->is_real && OVL_TYPE_MERGE(type))
 		od->is_real = false;
 }
 
-static int ovl_dir_mark_whiteouts(struct dentry *dir,
-				  struct ovl_readdir_data *rdd)
-{
-	struct ovl_cache_entry *p;
-	struct dentry *dentry;
-	const struct cred *old_cred;
-	struct cred *override_cred;
-
-	override_cred = prepare_creds();
-	if (!override_cred) {
-		ovl_cache_free(rdd->list);
-		return -ENOMEM;
-	}
-
-	/*
-	 * CAP_DAC_OVERRIDE for lookup
-	 */
-	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
-	old_cred = override_creds(override_cred);
-
-	mutex_lock(&dir->d_inode->i_mutex);
-	list_for_each_entry(p, rdd->list, l_node) {
-		if (p->is_cursor)
-			continue;
-
-		if (p->type != DT_CHR)
-			continue;
-
-		dentry = lookup_one_len(p->name, dir, p->len);
-		if (IS_ERR(dentry))
-			continue;
-
-		p->is_whiteout = ovl_is_whiteout(dentry);
-		dput(dentry);
-	}
-	mutex_unlock(&dir->d_inode->i_mutex);
-
-	revert_creds(old_cred);
-	put_cred(override_cred);
-
-	return 0;
-}
-
 static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
 {
 	int err;
-	struct path lowerpath;
-	struct path upperpath;
+	struct path realpath;
 	struct ovl_readdir_data rdd = {
 		.ctx.actor = ovl_fill_merge,
 		.list = list,
 		.root = RB_ROOT,
 		.is_merge = false,
 	};
+	int idx, next;
 
-	ovl_path_lower(dentry, &lowerpath);
-	ovl_path_upper(dentry, &upperpath);
+	for (idx = 0; idx != -1; idx = next) {
+		next = ovl_path_next(idx, dentry, &realpath);
 
-	if (upperpath.dentry) {
-		err = ovl_dir_read(&upperpath, &rdd);
-		if (err)
-			goto out;
-
-		if (lowerpath.dentry) {
-			err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd);
+		if (next != -1) {
+			err = ovl_dir_read(&realpath, &rdd);
 			if (err)
-				goto out;
+				break;
+		} else {
+			/*
+			 * Insert lowest layer entries before upper ones, this
+			 * allows offsets to be reasonably constant
+			 */
+			list_add(&rdd.middle, rdd.list);
+			rdd.is_merge = true;
+			err = ovl_dir_read(&realpath, &rdd);
+			list_del(&rdd.middle);
 		}
 	}
-	if (lowerpath.dentry) {
-		/*
-		 * Insert lowerpath entries before upperpath ones, this allows
-		 * offsets to be reasonably constant
-		 */
-		list_add(&rdd.middle, rdd.list);
-		rdd.is_merge = true;
-		err = ovl_dir_read(&lowerpath, &rdd);
-		list_del(&rdd.middle);
-	}
-out:
 	return err;
 }
 
 static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
 {
-	struct ovl_cache_entry *p;
+	struct list_head *p;
 	loff_t off = 0;
 
-	list_for_each_entry(p, &od->cache->entries, l_node) {
-		if (p->is_cursor)
-			continue;
+	list_for_each(p, &od->cache->entries) {
 		if (off >= pos)
 			break;
 		off++;
 	}
-	list_move_tail(&od->cursor.l_node, &p->l_node);
+	/* Cursor is safe since the cache is stable */
+	od->cursor = p;
 }
 
 static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
@@ -367,6 +343,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
 {
 	struct ovl_dir_file *od = file->private_data;
 	struct dentry *dentry = file->f_path.dentry;
+	struct ovl_cache_entry *p;
 
 	if (!ctx->pos)
 		ovl_dir_reset(file);
@@ -385,19 +362,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
 		ovl_seek_cursor(od, ctx->pos);
 	}
 
-	while (od->cursor.l_node.next != &od->cache->entries) {
-		struct ovl_cache_entry *p;
-
-		p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
-		/* Skip cursors */
-		if (!p->is_cursor) {
-			if (!p->is_whiteout) {
-				if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
-					break;
-			}
-			ctx->pos++;
-		}
-		list_move(&od->cursor.l_node, &p->l_node);
+	while (od->cursor != &od->cache->entries) {
+		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
+		if (!p->is_whiteout)
+			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+				break;
+		od->cursor = p->l_node.next;
+		ctx->pos++;
 	}
 	return 0;
 }
@@ -452,7 +423,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 	/*
 	 * Need to check if we started out being a lower dir, but got copied up
 	 */
-	if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) {
+	if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
 		struct inode *inode = file_inode(file);
 
 		realfile = lockless_dereference(od->upperfile);
@@ -516,11 +487,9 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
 		kfree(od);
 		return PTR_ERR(realfile);
 	}
-	INIT_LIST_HEAD(&od->cursor.l_node);
 	od->realfile = realfile;
-	od->is_real = (type != OVL_PATH_MERGE);
-	od->is_upper = (type != OVL_PATH_LOWER);
-	od->cursor.is_cursor = true;
+	od->is_real = !OVL_TYPE_MERGE(type);
+	od->is_upper = OVL_TYPE_UPPER(type);
 	file->private_data = od;
 
 	return 0;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index f16d318..5f0d199 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -35,7 +35,8 @@ struct ovl_config {
 /* private information held for overlayfs's superblock */
 struct ovl_fs {
 	struct vfsmount *upper_mnt;
-	struct vfsmount *lower_mnt;
+	unsigned numlower;
+	struct vfsmount **lower_mnt;
 	struct dentry *workdir;
 	long lower_namelen;
 	/* pathnames of lower and upper dirs, for show_options */
@@ -47,7 +48,6 @@ struct ovl_dir_cache;
 /* private information held for every overlayfs dentry */
 struct ovl_entry {
 	struct dentry *__upperdentry;
-	struct dentry *lowerdentry;
 	struct ovl_dir_cache *cache;
 	union {
 		struct {
@@ -56,30 +56,36 @@ struct ovl_entry {
 		};
 		struct rcu_head rcu;
 	};
+	unsigned numlower;
+	struct path lowerstack[];
 };
 
-const char *ovl_opaque_xattr = "trusted.overlay.opaque";
+#define OVL_MAX_STACK 500
 
+static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe)
+{
+	return oe->numlower ? oe->lowerstack[0].dentry : NULL;
+}
 
 enum ovl_path_type ovl_path_type(struct dentry *dentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
+	enum ovl_path_type type = 0;
 
 	if (oe->__upperdentry) {
-		if (oe->lowerdentry) {
+		type = __OVL_PATH_UPPER;
+
+		if (oe->numlower) {
 			if (S_ISDIR(dentry->d_inode->i_mode))
-				return OVL_PATH_MERGE;
-			else
-				return OVL_PATH_UPPER;
-		} else {
-			if (oe->opaque)
-				return OVL_PATH_UPPER;
-			else
-				return OVL_PATH_PURE_UPPER;
+				type |= __OVL_PATH_MERGE;
+		} else if (!oe->opaque) {
+			type |= __OVL_PATH_PURE;
 		}
 	} else {
-		return OVL_PATH_LOWER;
+		if (oe->numlower > 1)
+			type |= __OVL_PATH_MERGE;
 	}
+	return type;
 }
 
 static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
@@ -98,10 +104,9 @@ void ovl_path_upper(struct dentry *dentry, struct path *path)
 
 enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
 {
-
 	enum ovl_path_type type = ovl_path_type(dentry);
 
-	if (type == OVL_PATH_LOWER)
+	if (!OVL_TYPE_UPPER(type))
 		ovl_path_lower(dentry, path);
 	else
 		ovl_path_upper(dentry, path);
@@ -120,7 +125,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
 
-	return oe->lowerdentry;
+	return __ovl_dentry_lower(oe);
 }
 
 struct dentry *ovl_dentry_real(struct dentry *dentry)
@@ -130,7 +135,7 @@ struct dentry *ovl_dentry_real(struct dentry *dentry)
 
 	realdentry = ovl_upperdentry_dereference(oe);
 	if (!realdentry)
-		realdentry = oe->lowerdentry;
+		realdentry = __ovl_dentry_lower(oe);
 
 	return realdentry;
 }
@@ -143,7 +148,7 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
 	if (realdentry) {
 		*is_upper = true;
 	} else {
-		realdentry = oe->lowerdentry;
+		realdentry = __ovl_dentry_lower(oe);
 		*is_upper = false;
 	}
 	return realdentry;
@@ -165,11 +170,9 @@ void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
 
 void ovl_path_lower(struct dentry *dentry, struct path *path)
 {
-	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
 	struct ovl_entry *oe = dentry->d_fsdata;
 
-	path->mnt = ofs->lower_mnt;
-	path->dentry = oe->lowerdentry;
+	*path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
 }
 
 int ovl_want_write(struct dentry *dentry)
@@ -249,7 +252,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry)
 	if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
 		return false;
 
-	res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1);
+	res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
 	if (res == 1 && val == 'y')
 		return true;
 
@@ -261,8 +264,11 @@ static void ovl_dentry_release(struct dentry *dentry)
 	struct ovl_entry *oe = dentry->d_fsdata;
 
 	if (oe) {
+		unsigned int i;
+
 		dput(oe->__upperdentry);
-		dput(oe->lowerdentry);
+		for (i = 0; i < oe->numlower; i++)
+			dput(oe->lowerstack[i].dentry);
 		kfree_rcu(oe, rcu);
 	}
 }
@@ -271,9 +277,15 @@ static const struct dentry_operations ovl_dentry_operations = {
 	.d_release = ovl_dentry_release,
 };
 
-static struct ovl_entry *ovl_alloc_entry(void)
+static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
 {
-	return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
+	size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
+	struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+
+	if (oe)
+		oe->numlower = numlower;
+
+	return oe;
 }
 
 static inline struct dentry *ovl_lookup_real(struct dentry *dir,
@@ -295,82 +307,154 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
 	return dentry;
 }
 
+/*
+ * Returns next layer in stack starting from top.
+ * Returns -1 if this is the last layer.
+ */
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	BUG_ON(idx < 0);
+	if (idx == 0) {
+		ovl_path_upper(dentry, path);
+		if (path->dentry)
+			return oe->numlower ? 1 : -1;
+		idx++;
+	}
+	BUG_ON(idx > oe->numlower);
+	*path = oe->lowerstack[idx - 1];
+
+	return (idx < oe->numlower) ? idx + 1 : -1;
+}
+
 struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			  unsigned int flags)
 {
 	struct ovl_entry *oe;
-	struct dentry *upperdir;
-	struct dentry *lowerdir;
-	struct dentry *upperdentry = NULL;
-	struct dentry *lowerdentry = NULL;
+	struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+	struct path *stack = NULL;
+	struct dentry *upperdir, *upperdentry = NULL;
+	unsigned int ctr = 0;
 	struct inode *inode = NULL;
+	bool upperopaque = false;
+	struct dentry *this, *prev = NULL;
+	unsigned int i;
 	int err;
 
-	err = -ENOMEM;
-	oe = ovl_alloc_entry();
-	if (!oe)
-		goto out;
-
-	upperdir = ovl_dentry_upper(dentry->d_parent);
-	lowerdir = ovl_dentry_lower(dentry->d_parent);
-
+	upperdir = ovl_upperdentry_dereference(poe);
 	if (upperdir) {
-		upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
-		err = PTR_ERR(upperdentry);
-		if (IS_ERR(upperdentry))
-			goto out_put_dir;
-
-		if (lowerdir && upperdentry) {
-			if (ovl_is_whiteout(upperdentry)) {
-				dput(upperdentry);
-				upperdentry = NULL;
-				oe->opaque = true;
-			} else if (ovl_is_opaquedir(upperdentry)) {
-				oe->opaque = true;
+		this = ovl_lookup_real(upperdir, &dentry->d_name);
+		err = PTR_ERR(this);
+		if (IS_ERR(this))
+			goto out;
+
+		if (this) {
+			if (ovl_is_whiteout(this)) {
+				dput(this);
+				this = NULL;
+				upperopaque = true;
+			} else if (poe->numlower && ovl_is_opaquedir(this)) {
+				upperopaque = true;
 			}
 		}
+		upperdentry = prev = this;
 	}
-	if (lowerdir && !oe->opaque) {
-		lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
-		err = PTR_ERR(lowerdentry);
-		if (IS_ERR(lowerdentry))
-			goto out_dput_upper;
+
+	if (!upperopaque && poe->numlower) {
+		err = -ENOMEM;
+		stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
+		if (!stack)
+			goto out_put_upper;
 	}
 
-	if (lowerdentry && upperdentry &&
-	    (!S_ISDIR(upperdentry->d_inode->i_mode) ||
-	     !S_ISDIR(lowerdentry->d_inode->i_mode))) {
-		dput(lowerdentry);
-		lowerdentry = NULL;
-		oe->opaque = true;
+	for (i = 0; !upperopaque && i < poe->numlower; i++) {
+		bool opaque = false;
+		struct path lowerpath = poe->lowerstack[i];
+
+		this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
+		err = PTR_ERR(this);
+		if (IS_ERR(this)) {
+			/*
+			 * If it's positive, then treat ENAMETOOLONG as ENOENT.
+			 */
+			if (err == -ENAMETOOLONG && (upperdentry || ctr))
+				continue;
+			goto out_put;
+		}
+		if (!this)
+			continue;
+		if (ovl_is_whiteout(this)) {
+			dput(this);
+			break;
+		}
+		/*
+		 * Only makes sense to check opaque dir if this is not the
+		 * lowermost layer.
+		 */
+		if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
+			opaque = true;
+
+		if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
+			     !S_ISDIR(this->d_inode->i_mode))) {
+			/*
+			 * FIXME: check for upper-opaqueness maybe better done
+			 * in remove code.
+			 */
+			if (prev == upperdentry)
+				upperopaque = true;
+			dput(this);
+			break;
+		}
+		/*
+		 * If this is a non-directory then stop here.
+		 */
+		if (!S_ISDIR(this->d_inode->i_mode))
+			opaque = true;
+
+		stack[ctr].dentry = this;
+		stack[ctr].mnt = lowerpath.mnt;
+		ctr++;
+		prev = this;
+		if (opaque)
+			break;
 	}
 
-	if (lowerdentry || upperdentry) {
+	oe = ovl_alloc_entry(ctr);
+	err = -ENOMEM;
+	if (!oe)
+		goto out_put;
+
+	if (upperdentry || ctr) {
 		struct dentry *realdentry;
 
-		realdentry = upperdentry ? upperdentry : lowerdentry;
+		realdentry = upperdentry ? upperdentry : stack[0].dentry;
+
 		err = -ENOMEM;
 		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
 				      oe);
 		if (!inode)
-			goto out_dput;
+			goto out_free_oe;
 		ovl_copyattr(realdentry->d_inode, inode);
 	}
 
+	oe->opaque = upperopaque;
 	oe->__upperdentry = upperdentry;
-	oe->lowerdentry = lowerdentry;
-
+	memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
+	kfree(stack);
 	dentry->d_fsdata = oe;
 	d_add(dentry, inode);
 
 	return NULL;
 
-out_dput:
-	dput(lowerdentry);
-out_dput_upper:
-	dput(upperdentry);
-out_put_dir:
+out_free_oe:
 	kfree(oe);
+out_put:
+	for (i = 0; i < ctr; i++)
+		dput(stack[i].dentry);
+	kfree(stack);
+out_put_upper:
+	dput(upperdentry);
 out:
 	return ERR_PTR(err);
 }
@@ -383,10 +467,12 @@ struct file *ovl_path_open(struct path *path, int flags)
 static void ovl_put_super(struct super_block *sb)
 {
 	struct ovl_fs *ufs = sb->s_fs_info;
+	unsigned i;
 
 	dput(ufs->workdir);
 	mntput(ufs->upper_mnt);
-	mntput(ufs->lower_mnt);
+	for (i = 0; i < ufs->numlower; i++)
+		mntput(ufs->lower_mnt[i]);
 
 	kfree(ufs->config.lowerdir);
 	kfree(ufs->config.upperdir);
@@ -400,7 +486,7 @@ static void ovl_put_super(struct super_block *sb)
  * @buf: The struct kstatfs to fill in with stats
  *
  * Get the filesystem statistics.  As writes always target the upper layer
- * filesystem pass the statfs to the same filesystem.
+ * filesystem pass the statfs to the upper filesystem (if it exists)
  */
 static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
@@ -409,7 +495,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct path path;
 	int err;
 
-	ovl_path_upper(root_dentry, &path);
+	ovl_path_real(root_dentry, &path);
 
 	err = vfs_statfs(&path, buf);
 	if (!err) {
@@ -432,8 +518,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
 	struct ovl_fs *ufs = sb->s_fs_info;
 
 	seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
-	seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
-	seq_printf(m, ",workdir=%s", ufs->config.workdir);
+	if (ufs->config.upperdir) {
+		seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
+		seq_printf(m, ",workdir=%s", ufs->config.workdir);
+	}
+	return 0;
+}
+
+static int ovl_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct ovl_fs *ufs = sb->s_fs_info;
+
+	if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
+		return -EROFS;
+
 	return 0;
 }
 
@@ -441,6 +539,7 @@ static const struct super_operations ovl_super_operations = {
 	.put_super	= ovl_put_super,
 	.statfs		= ovl_statfs,
 	.show_options	= ovl_show_options,
+	.remount_fs	= ovl_remount,
 };
 
 enum {
@@ -515,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
 			break;
 
 		default:
+			pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
 			return -EINVAL;
 		}
 	}
+
+	/* Workdir is useless in non-upper mount */
+	if (!config->upperdir && config->workdir) {
+		pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+			config->workdir);
+		kfree(config->workdir);
+		config->workdir = NULL;
+	}
+
 	return 0;
 }
 
@@ -585,24 +694,6 @@ static void ovl_unescape(char *s)
 	}
 }
 
-static int ovl_mount_dir(const char *name, struct path *path)
-{
-	int err;
-	char *tmp = kstrdup(name, GFP_KERNEL);
-
-	if (!tmp)
-		return -ENOMEM;
-
-	ovl_unescape(tmp);
-	err = kern_path(tmp, LOOKUP_FOLLOW, path);
-	if (err) {
-		pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err);
-		err = -EINVAL;
-	}
-	kfree(tmp);
-	return err;
-}
-
 static bool ovl_is_allowed_fs_type(struct dentry *root)
 {
 	const struct dentry_operations *dop = root->d_op;
@@ -622,6 +713,75 @@ static bool ovl_is_allowed_fs_type(struct dentry *root)
 	return true;
 }
 
+static int ovl_mount_dir_noesc(const char *name, struct path *path)
+{
+	int err = -EINVAL;
+
+	if (!*name) {
+		pr_err("overlayfs: empty lowerdir\n");
+		goto out;
+	}
+	err = kern_path(name, LOOKUP_FOLLOW, path);
+	if (err) {
+		pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+		goto out;
+	}
+	err = -EINVAL;
+	if (!ovl_is_allowed_fs_type(path->dentry)) {
+		pr_err("overlayfs: filesystem on '%s' not supported\n", name);
+		goto out_put;
+	}
+	if (!S_ISDIR(path->dentry->d_inode->i_mode)) {
+		pr_err("overlayfs: '%s' not a directory\n", name);
+		goto out_put;
+	}
+	return 0;
+
+out_put:
+	path_put(path);
+out:
+	return err;
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+	int err = -ENOMEM;
+	char *tmp = kstrdup(name, GFP_KERNEL);
+
+	if (tmp) {
+		ovl_unescape(tmp);
+		err = ovl_mount_dir_noesc(tmp, path);
+		kfree(tmp);
+	}
+	return err;
+}
+
+static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
+			 int *stack_depth)
+{
+	int err;
+	struct kstatfs statfs;
+
+	err = ovl_mount_dir_noesc(name, path);
+	if (err)
+		goto out;
+
+	err = vfs_statfs(path, &statfs);
+	if (err) {
+		pr_err("overlayfs: statfs failed on '%s'\n", name);
+		goto out_put;
+	}
+	*namelen = max(*namelen, statfs.f_namelen);
+	*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
+
+	return 0;
+
+out_put:
+	path_put(path);
+out:
+	return err;
+}
+
 /* Workdir should not be subdir of upperdir and vice versa */
 static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
 {
@@ -634,16 +794,39 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
 	return ok;
 }
 
+static unsigned int ovl_split_lowerdirs(char *str)
+{
+	unsigned int ctr = 1;
+	char *s, *d;
+
+	for (s = d = str;; s++, d++) {
+		if (*s == '\\') {
+			s++;
+		} else if (*s == ':') {
+			*d = '\0';
+			ctr++;
+			continue;
+		}
+		*d = *s;
+		if (!*s)
+			break;
+	}
+	return ctr;
+}
+
 static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 {
-	struct path lowerpath;
-	struct path upperpath;
-	struct path workpath;
-	struct inode *root_inode;
+	struct path upperpath = { NULL, NULL };
+	struct path workpath = { NULL, NULL };
 	struct dentry *root_dentry;
 	struct ovl_entry *oe;
 	struct ovl_fs *ufs;
-	struct kstatfs statfs;
+	struct path *stack = NULL;
+	char *lowertmp;
+	char *lower;
+	unsigned int numlower;
+	unsigned int stacklen = 0;
+	unsigned int i;
 	int err;
 
 	err = -ENOMEM;
@@ -655,123 +838,147 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto out_free_config;
 
-	/* FIXME: workdir is not needed for a R/O mount */
 	err = -EINVAL;
-	if (!ufs->config.upperdir || !ufs->config.lowerdir ||
-	    !ufs->config.workdir) {
-		pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
+	if (!ufs->config.lowerdir) {
+		pr_err("overlayfs: missing 'lowerdir'\n");
 		goto out_free_config;
 	}
 
-	err = -ENOMEM;
-	oe = ovl_alloc_entry();
-	if (oe == NULL)
-		goto out_free_config;
-
-	err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
-	if (err)
-		goto out_free_oe;
+	sb->s_stack_depth = 0;
+	if (ufs->config.upperdir) {
+		if (!ufs->config.workdir) {
+			pr_err("overlayfs: missing 'workdir'\n");
+			goto out_free_config;
+		}
 
-	err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath);
-	if (err)
-		goto out_put_upperpath;
+		err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
+		if (err)
+			goto out_free_config;
 
-	err = ovl_mount_dir(ufs->config.workdir, &workpath);
-	if (err)
-		goto out_put_lowerpath;
+		/* Upper fs should not be r/o */
+		if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) {
+			pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+			err = -EINVAL;
+			goto out_put_upperpath;
+		}
 
-	err = -EINVAL;
-	if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
-	    !S_ISDIR(lowerpath.dentry->d_inode->i_mode) ||
-	    !S_ISDIR(workpath.dentry->d_inode->i_mode)) {
-		pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n");
-		goto out_put_workpath;
-	}
+		err = ovl_mount_dir(ufs->config.workdir, &workpath);
+		if (err)
+			goto out_put_upperpath;
 
-	if (upperpath.mnt != workpath.mnt) {
-		pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
-		goto out_put_workpath;
+		err = -EINVAL;
+		if (upperpath.mnt != workpath.mnt) {
+			pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+			goto out_put_workpath;
+		}
+		if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
+			pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+			goto out_put_workpath;
+		}
+		sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth;
 	}
-	if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
-		pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+	err = -ENOMEM;
+	lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL);
+	if (!lowertmp)
 		goto out_put_workpath;
-	}
 
-	if (!ovl_is_allowed_fs_type(upperpath.dentry)) {
-		pr_err("overlayfs: filesystem of upperdir is not supported\n");
-		goto out_put_workpath;
+	err = -EINVAL;
+	stacklen = ovl_split_lowerdirs(lowertmp);
+	if (stacklen > OVL_MAX_STACK) {
+		pr_err("overlayfs: too many lower directries, limit is %d\n",
+		       OVL_MAX_STACK);
+		goto out_free_lowertmp;
+	} else if (!ufs->config.upperdir && stacklen == 1) {
+		pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
+		goto out_free_lowertmp;
 	}
 
-	if (!ovl_is_allowed_fs_type(lowerpath.dentry)) {
-		pr_err("overlayfs: filesystem of lowerdir is not supported\n");
-		goto out_put_workpath;
-	}
+	stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
+	if (!stack)
+		goto out_free_lowertmp;
 
-	err = vfs_statfs(&lowerpath, &statfs);
-	if (err) {
-		pr_err("overlayfs: statfs failed on lowerpath\n");
-		goto out_put_workpath;
-	}
-	ufs->lower_namelen = statfs.f_namelen;
+	lower = lowertmp;
+	for (numlower = 0; numlower < stacklen; numlower++) {
+		err = ovl_lower_dir(lower, &stack[numlower],
+				    &ufs->lower_namelen, &sb->s_stack_depth);
+		if (err)
+			goto out_put_lowerpath;
 
-	sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
-				lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
+		lower = strchr(lower, '\0') + 1;
+	}
 
 	err = -EINVAL;
+	sb->s_stack_depth++;
 	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
 		pr_err("overlayfs: maximum fs stacking depth exceeded\n");
-		goto out_put_workpath;
+		goto out_put_lowerpath;
 	}
 
-	ufs->upper_mnt = clone_private_mount(&upperpath);
-	err = PTR_ERR(ufs->upper_mnt);
-	if (IS_ERR(ufs->upper_mnt)) {
-		pr_err("overlayfs: failed to clone upperpath\n");
-		goto out_put_workpath;
-	}
+	if (ufs->config.upperdir) {
+		ufs->upper_mnt = clone_private_mount(&upperpath);
+		err = PTR_ERR(ufs->upper_mnt);
+		if (IS_ERR(ufs->upper_mnt)) {
+			pr_err("overlayfs: failed to clone upperpath\n");
+			goto out_put_lowerpath;
+		}
 
-	ufs->lower_mnt = clone_private_mount(&lowerpath);
-	err = PTR_ERR(ufs->lower_mnt);
-	if (IS_ERR(ufs->lower_mnt)) {
-		pr_err("overlayfs: failed to clone lowerpath\n");
-		goto out_put_upper_mnt;
+		ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
+		err = PTR_ERR(ufs->workdir);
+		if (IS_ERR(ufs->workdir)) {
+			pr_err("overlayfs: failed to create directory %s/%s\n",
+			       ufs->config.workdir, OVL_WORKDIR_NAME);
+			goto out_put_upper_mnt;
+		}
 	}
 
-	ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
-	err = PTR_ERR(ufs->workdir);
-	if (IS_ERR(ufs->workdir)) {
-		pr_err("overlayfs: failed to create directory %s/%s\n",
-		       ufs->config.workdir, OVL_WORKDIR_NAME);
-		goto out_put_lower_mnt;
-	}
+	err = -ENOMEM;
+	ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL);
+	if (ufs->lower_mnt == NULL)
+		goto out_put_workdir;
+	for (i = 0; i < numlower; i++) {
+		struct vfsmount *mnt = clone_private_mount(&stack[i]);
 
-	/*
-	 * Make lower_mnt R/O.  That way fchmod/fchown on lower file
-	 * will fail instead of modifying lower fs.
-	 */
-	ufs->lower_mnt->mnt_flags |= MNT_READONLY;
+		err = PTR_ERR(mnt);
+		if (IS_ERR(mnt)) {
+			pr_err("overlayfs: failed to clone lowerpath\n");
+			goto out_put_lower_mnt;
+		}
+		/*
+		 * Make lower_mnt R/O.  That way fchmod/fchown on lower file
+		 * will fail instead of modifying lower fs.
+		 */
+		mnt->mnt_flags |= MNT_READONLY;
+
+		ufs->lower_mnt[ufs->numlower] = mnt;
+		ufs->numlower++;
+	}
 
-	/* If the upper fs is r/o, we mark overlayfs r/o too */
-	if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
+	/* If the upper fs is nonexistent, we mark overlayfs r/o too */
+	if (!ufs->upper_mnt)
 		sb->s_flags |= MS_RDONLY;
 
 	sb->s_d_op = &ovl_dentry_operations;
 
 	err = -ENOMEM;
-	root_inode = ovl_new_inode(sb, S_IFDIR, oe);
-	if (!root_inode)
-		goto out_put_workdir;
+	oe = ovl_alloc_entry(numlower);
+	if (!oe)
+		goto out_put_lower_mnt;
 
-	root_dentry = d_make_root(root_inode);
+	root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
 	if (!root_dentry)
-		goto out_put_workdir;
+		goto out_free_oe;
 
 	mntput(upperpath.mnt);
-	mntput(lowerpath.mnt);
+	for (i = 0; i < numlower; i++)
+		mntput(stack[i].mnt);
 	path_put(&workpath);
+	kfree(lowertmp);
 
 	oe->__upperdentry = upperpath.dentry;
-	oe->lowerdentry = lowerpath.dentry;
+	for (i = 0; i < numlower; i++) {
+		oe->lowerstack[i].dentry = stack[i].dentry;
+		oe->lowerstack[i].mnt = ufs->lower_mnt[i];
+	}
 
 	root_dentry->d_fsdata = oe;
 
@@ -782,20 +989,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 
+out_free_oe:
+	kfree(oe);
+out_put_lower_mnt:
+	for (i = 0; i < ufs->numlower; i++)
+		mntput(ufs->lower_mnt[i]);
+	kfree(ufs->lower_mnt);
 out_put_workdir:
 	dput(ufs->workdir);
-out_put_lower_mnt:
-	mntput(ufs->lower_mnt);
 out_put_upper_mnt:
 	mntput(ufs->upper_mnt);
+out_put_lowerpath:
+	for (i = 0; i < numlower; i++)
+		path_put(&stack[i]);
+	kfree(stack);
+out_free_lowertmp:
+	kfree(lowertmp);
 out_put_workpath:
 	path_put(&workpath);
-out_put_lowerpath:
-	path_put(&lowerpath);
 out_put_upperpath:
 	path_put(&upperpath);
-out_free_oe:
-	kfree(oe);
 out_free_config:
 	kfree(ufs->config.lowerdir);
 	kfree(ufs->config.upperdir);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 0855f77..3a48bb7 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -564,13 +564,11 @@ posix_acl_create(struct inode *dir, umode_t *mode,
 
 	*acl = posix_acl_clone(p, GFP_NOFS);
 	if (!*acl)
-		return -ENOMEM;
+		goto no_mem;
 
 	ret = posix_acl_create_masq(*acl, mode);
-	if (ret < 0) {
-		posix_acl_release(*acl);
-		return -ENOMEM;
-	}
+	if (ret < 0)
+		goto no_mem_clone;
 
 	if (ret == 0) {
 		posix_acl_release(*acl);
@@ -591,6 +589,12 @@ no_acl:
 	*default_acl = NULL;
 	*acl = NULL;
 	return 0;
+
+no_mem_clone:
+	posix_acl_release(*acl);
+no_mem:
+	posix_acl_release(p);
+	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(posix_acl_create);
 
@@ -772,7 +776,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
 
 	if (!IS_POSIXACL(dentry->d_inode))
 		return -EOPNOTSUPP;
-	if (S_ISLNK(dentry->d_inode->i_mode))
+	if (d_is_symlink(dentry))
 		return -EOPNOTSUPP;
 
 	acl = get_acl(dentry->d_inode, type);
@@ -832,7 +836,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
 
 	if (!IS_POSIXACL(dentry->d_inode))
 		return -EOPNOTSUPP;
-	if (S_ISLNK(dentry->d_inode->i_mode))
+	if (d_is_symlink(dentry))
 		return -EOPNOTSUPP;
 
 	if (type == ACL_TYPE_ACCESS)
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 3309f59..be65b20 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -19,7 +19,6 @@
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
-#include <linux/namei.h>
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
@@ -223,17 +222,6 @@ void proc_free_inum(unsigned int inum)
 	spin_unlock_irqrestore(&proc_inum_lock, flags);
 }
 
-static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	nd_set_link(nd, __PDE_DATA(dentry->d_inode));
-	return NULL;
-}
-
-static const struct inode_operations proc_link_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= proc_follow_link,
-};
-
 /*
  * Don't create negative dentries here, return -ENOENT by hand
  * instead.
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 13a50a3..7697b66 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/magic.h>
+#include <linux/namei.h>
 
 #include <asm/uaccess.h>
 
@@ -393,6 +394,26 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 };
 #endif
 
+static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct proc_dir_entry *pde = PDE(dentry->d_inode);
+	if (unlikely(!use_pde(pde)))
+		return ERR_PTR(-EINVAL);
+	nd_set_link(nd, pde->data);
+	return pde;
+}
+
+static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	unuse_pde(p);
+}
+
+const struct inode_operations proc_link_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= proc_follow_link,
+	.put_link	= proc_put_link,
+};
+
 struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 {
 	struct inode *inode = new_inode_pseudo(sb);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 6fcdba5..c835b94 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -200,6 +200,7 @@ struct pde_opener {
 	int closing;
 	struct completion *c;
 };
+extern const struct inode_operations proc_link_inode_operations;
 
 extern const struct inode_operations proc_pid_link_inode_operations;
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 956b75d..6dee68d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1325,6 +1325,9 @@ out:
 
 static int pagemap_open(struct inode *inode, struct file *file)
 {
+	/* do not disclose physical addresses: attack vector */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
 	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
 			"to stop being page-shift some time soon. See the "
 			"linux/Documentation/vm/pagemap.txt for details.\n");
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 04b0614..4e781e6 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -266,7 +266,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
 			struct dentry *dentry = buf.dentries[i];
 
-			if (!S_ISDIR(dentry->d_inode->i_mode))
+			if (!d_is_dir(dentry))
 				err = action(dentry, data);
 
 			dput(dentry);
@@ -322,7 +322,7 @@ static int delete_one_xattr(struct dentry *dentry, void *data)
 	struct inode *dir = dentry->d_parent->d_inode;
 
 	/* This is the xattr dir, handle specially. */
-	if (S_ISDIR(dentry->d_inode->i_mode))
+	if (d_is_dir(dentry))
 		return xattr_rmdir(dir, dentry);
 
 	return xattr_unlink(dir, dentry);
diff --git a/fs/super.c b/fs/super.c
index 65a53ef..2b7dc90 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,7 +71,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	if (!(sc->gfp_mask & __GFP_FS))
 		return SHRINK_STOP;
 
-	if (!grab_super_passive(sb))
+	if (!trylock_super(sb))
 		return SHRINK_STOP;
 
 	if (sb->s_op->nr_cached_objects)
@@ -105,7 +105,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
-	drop_super(sb);
+	up_read(&sb->s_umount);
 	return freed;
 }
 
@@ -118,7 +118,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	sb = container_of(shrink, struct super_block, s_shrink);
 
 	/*
-	 * Don't call grab_super_passive as it is a potential
+	 * Don't call trylock_super as it is a potential
 	 * scalability bottleneck. The counts could get updated
 	 * between super_cache_count and super_cache_scan anyway.
 	 * Call to super_cache_count with shrinker_rwsem held
@@ -348,35 +348,31 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
 }
 
 /*
- *	grab_super_passive - acquire a passive reference
+ *	trylock_super - try to grab ->s_umount shared
  *	@sb: reference we are trying to grab
  *
- *	Tries to acquire a passive reference. This is used in places where we
+ *	Try to prevent fs shutdown.  This is used in places where we
  *	cannot take an active reference but we need to ensure that the
- *	superblock does not go away while we are working on it. It returns
- *	false if a reference was not gained, and returns true with the s_umount
- *	lock held in read mode if a reference is gained. On successful return,
- *	the caller must drop the s_umount lock and the passive reference when
- *	done.
+ *	filesystem is not shut down while we are working on it. It returns
+ *	false if we cannot acquire s_umount or if we lose the race and
+ *	filesystem already got into shutdown, and returns true with the s_umount
+ *	lock held in read mode in case of success. On successful return,
+ *	the caller must drop the s_umount lock when done.
+ *
+ *	Note that unlike get_super() et.al. this one does *not* bump ->s_count.
+ *	The reason why it's safe is that we are OK with doing trylock instead
+ *	of down_read().  There's a couple of places that are OK with that, but
+ *	it's very much not a general-purpose interface.
  */
-bool grab_super_passive(struct super_block *sb)
+bool trylock_super(struct super_block *sb)
 {
-	spin_lock(&sb_lock);
-	if (hlist_unhashed(&sb->s_instances)) {
-		spin_unlock(&sb_lock);
-		return false;
-	}
-
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root && (sb->s_flags & MS_BORN))
+		if (!hlist_unhashed(&sb->s_instances) &&
+		    sb->s_root && (sb->s_flags & MS_BORN))
 			return true;
 		up_read(&sb->s_umount);
 	}
 
-	put_super(sb);
 	return false;
 }
 
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d617999..df68285 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
+xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5eb4a14..b97359b 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -30,6 +30,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
+#include "xfs_pnfs.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
@@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = {
 	.fh_to_parent		= xfs_fs_fh_to_parent,
 	.get_parent		= xfs_fs_get_parent,
 	.commit_metadata	= xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_NFSD_PNFS
+	.get_uuid		= xfs_fs_get_uuid,
+	.map_blocks		= xfs_fs_map_blocks,
+	.commit_blocks		= xfs_fs_commit_blocks,
+#endif
 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1cdba95..a2e1cb8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_icache.h"
+#include "xfs_pnfs.h"
 
 #include <linux/aio.h>
 #include <linux/dcache.h>
@@ -396,7 +397,8 @@ STATIC int				/* error (positive) */
 xfs_zero_last_block(
 	struct xfs_inode	*ip,
 	xfs_fsize_t		offset,
-	xfs_fsize_t		isize)
+	xfs_fsize_t		isize,
+	bool			*did_zeroing)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		last_fsb = XFS_B_TO_FSBT(mp, isize);
@@ -424,6 +426,7 @@ xfs_zero_last_block(
 	zero_len = mp->m_sb.sb_blocksize - zero_offset;
 	if (isize + zero_len > offset)
 		zero_len = offset - isize;
+	*did_zeroing = true;
 	return xfs_iozero(ip, isize, zero_len);
 }
 
@@ -442,7 +445,8 @@ int					/* error (positive) */
 xfs_zero_eof(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,		/* starting I/O offset */
-	xfs_fsize_t		isize)		/* current inode size */
+	xfs_fsize_t		isize,		/* current inode size */
+	bool			*did_zeroing)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		start_zero_fsb;
@@ -464,7 +468,7 @@ xfs_zero_eof(
 	 * We only zero a part of that block so it is handled specially.
 	 */
 	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-		error = xfs_zero_last_block(ip, offset, isize);
+		error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
 		if (error)
 			return error;
 	}
@@ -524,6 +528,7 @@ xfs_zero_eof(
 		if (error)
 			return error;
 
+		*did_zeroing = true;
 		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
 		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 	}
@@ -554,6 +559,10 @@ restart:
 	if (error)
 		return error;
 
+	error = xfs_break_layouts(inode, iolock);
+	if (error)
+		return error;
+
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -562,13 +571,15 @@ restart:
 	 * having to redo all checks before.
 	 */
 	if (*pos > i_size_read(inode)) {
+		bool	zero = false;
+
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
 			xfs_rw_ilock(ip, *iolock);
 			goto restart;
 		}
-		error = xfs_zero_eof(ip, *pos, i_size_read(inode));
+		error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
 		if (error)
 			return error;
 	}
@@ -822,6 +833,7 @@ xfs_file_fallocate(
 	struct xfs_inode	*ip = XFS_I(inode);
 	long			error;
 	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
 	loff_t			new_size = 0;
 
 	if (!S_ISREG(inode->i_mode))
@@ -830,7 +842,11 @@ xfs_file_fallocate(
 		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock);
+	if (error)
+		goto out_unlock;
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
@@ -894,7 +910,7 @@ xfs_file_fallocate(
 	}
 
 out_unlock:
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, iolock);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fba6532..74efe5b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -602,6 +602,12 @@ xfs_growfs_data(
 	if (!mutex_trylock(&mp->m_growlock))
 		return -EWOULDBLOCK;
 	error = xfs_growfs_data_private(mp, in);
+	/*
+	 * Increment the generation unconditionally, the error could be from
+	 * updating the secondary superblocks, in which case the new size
+	 * is live already.
+	 */
+	mp->m_generation++;
 	mutex_unlock(&mp->m_growlock);
 	return error;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index daafa1f..6163767 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2867,6 +2867,10 @@ xfs_rename(
 	 * Handle RENAME_EXCHANGE flags
 	 */
 	if (flags & RENAME_EXCHANGE) {
+		if (target_ip == NULL) {
+			error = -EINVAL;
+			goto error_return;
+		}
 		error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
 					 target_dp, target_name, target_ip,
 					 &free_list, &first_block, spaceres);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 86cd6b3..a1cd55f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -384,10 +384,11 @@ enum xfs_prealloc_flags {
 	XFS_PREALLOC_INVISIBLE	= (1 << 4),
 };
 
-int		xfs_update_prealloc_flags(struct xfs_inode *,
-			enum xfs_prealloc_flags);
-int		xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int		xfs_iozero(struct xfs_inode *, loff_t, size_t);
+int	xfs_update_prealloc_flags(struct xfs_inode *ip,
+				  enum xfs_prealloc_flags flags);
+int	xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
+		     xfs_fsize_t isize, bool *did_zeroing);
+int	xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
 
 
 #define IHOLD(ip) \
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f7afb86..ac4feae 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -39,6 +39,7 @@
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_trans.h"
+#include "xfs_pnfs.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -286,7 +287,7 @@ xfs_readlink_by_handle(
 		return PTR_ERR(dentry);
 
 	/* Restrict this handle operation to symlinks only. */
-	if (!S_ISLNK(dentry->d_inode->i_mode)) {
+	if (!d_is_symlink(dentry)) {
 		error = -EINVAL;
 		goto out_dput;
 	}
@@ -608,6 +609,7 @@ xfs_ioc_space(
 {
 	struct iattr		iattr;
 	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
 	int			error;
 
 	/*
@@ -636,7 +638,10 @@ xfs_ioc_space(
 	if (error)
 		return error;
 
-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock);
+	if (error)
+		goto out_unlock;
 
 	switch (bf->l_whence) {
 	case 0: /*SEEK_SET*/
@@ -725,7 +730,7 @@ xfs_ioc_space(
 	error = xfs_update_prealloc_flags(ip, flags);
 
 out_unlock:
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, iolock);
 	mnt_drop_write_file(filp);
 	return error;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ce80eeb..e53a903 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -37,6 +37,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
+#include "xfs_pnfs.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -505,7 +506,7 @@ xfs_setattr_mode(
 	inode->i_mode |= mode & ~S_IFMT;
 }
 
-static void
+void
 xfs_setattr_time(
 	struct xfs_inode	*ip,
 	struct iattr		*iattr)
@@ -750,6 +751,7 @@ xfs_setattr_size(
 	int			error;
 	uint			lock_flags = 0;
 	uint			commit_flags = 0;
+	bool			did_zeroing = false;
 
 	trace_xfs_setattr(ip);
 
@@ -793,20 +795,16 @@ xfs_setattr_size(
 		return error;
 
 	/*
-	 * Now we can make the changes.  Before we join the inode to the
-	 * transaction, take care of the part of the truncation that must be
-	 * done without the inode lock.  This needs to be done before joining
-	 * the inode to the transaction, because the inode cannot be unlocked
-	 * once it is a part of the transaction.
+	 * File data changes must be complete before we start the transaction to
+	 * modify the inode.  This needs to be done before joining the inode to
+	 * the transaction because the inode cannot be unlocked once it is a
+	 * part of the transaction.
+	 *
+	 * Start with zeroing any data block beyond EOF that we may expose on
+	 * file extension.
 	 */
 	if (newsize > oldsize) {
-		/*
-		 * Do the first part of growing a file: zero any data in the
-		 * last block that is beyond the old EOF.  We need to do this
-		 * before the inode is joined to the transaction to modify
-		 * i_size.
-		 */
-		error = xfs_zero_eof(ip, newsize, oldsize);
+		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
 		if (error)
 			return error;
 	}
@@ -816,23 +814,18 @@ xfs_setattr_size(
 	 * any previous writes that are beyond the on disk EOF and the new
 	 * EOF that have not been written out need to be written here.  If we
 	 * do not write the data out, we expose ourselves to the null files
-	 * problem.
-	 *
-	 * Only flush from the on disk size to the smaller of the in memory
-	 * file size or the new size as that's the range we really care about
-	 * here and prevents waiting for other data not within the range we
-	 * care about here.
+	 * problem. Note that this includes any block zeroing we did above;
+	 * otherwise those blocks may not be zeroed after a crash.
 	 */
-	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+	if (newsize > ip->i_d.di_size &&
+	    (oldsize != ip->i_d.di_size || did_zeroing)) {
 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						      ip->i_d.di_size, newsize);
 		if (error)
 			return error;
 	}
 
-	/*
-	 * Wait for all direct I/O to complete.
-	 */
+	/* Now wait for all direct I/O to complete. */
 	inode_dio_wait(inode);
 
 	/*
@@ -979,9 +972,13 @@ xfs_vn_setattr(
 	int			error;
 
 	if (iattr->ia_valid & ATTR_SIZE) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		error = xfs_setattr_size(ip, iattr);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		uint		iolock = XFS_IOLOCK_EXCL;
+
+		xfs_ilock(ip, iolock);
+		error = xfs_break_layouts(dentry->d_inode, &iolock);
+		if (!error)
+			error = xfs_setattr_size(ip, iattr);
+		xfs_iunlock(ip, iolock);
 	} else {
 		error = xfs_setattr_nonsize(ip, iattr, 0);
 	}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 1c34e43..ea7a98e 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
  */
 #define XFS_ATTR_NOACL		0x01	/* Don't call posix_acl_chmod */
 
+extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
 			       int flags);
 extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5b2ff8..0d8abd6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -174,6 +174,17 @@ typedef struct xfs_mount {
 	struct workqueue_struct	*m_reclaim_workqueue;
 	struct workqueue_struct	*m_log_workqueue;
 	struct workqueue_struct *m_eofblocks_workqueue;
+
+	/*
+	 * Generation of the filesysyem layout.  This is incremented by each
+	 * growfs, and used by the pNFS server to ensure the client updates
+	 * its view of the block device once it gets a layout that might
+	 * reference the newly added blocks.  Does not need to be persistent
+	 * as long as we only allow file system size increments, but if we
+	 * ever support shrinks it would have to be persisted in addition
+	 * to various other kinds of pain inflicted on the pNFS server.
+	 */
+	__uint32_t		m_generation;
 } xfs_mount_t;
 
 /*
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
new file mode 100644
index 0000000..365dd57
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_pnfs.h"
+
+/*
+ * Ensure that we do not have any outstanding pNFS layouts that can be used by
+ * clients to directly read from or write to this inode.  This must be called
+ * before every operation that can remove blocks from the extent map.
+ * Additionally we call it during the write operation, where aren't concerned
+ * about exposing unallocated blocks but just want to provide basic
+ * synchronization between a local writer and pNFS clients.  mmap writes would
+ * also benefit from this sort of synchronization, but due to the tricky locking
+ * rules in the page fault path we don't bother.
+ */
+int
+xfs_break_layouts(
+	struct inode		*inode,
+	uint			*iolock)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+
+	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
+		xfs_iunlock(ip, *iolock);
+		error = break_layout(inode, true);
+		*iolock = XFS_IOLOCK_EXCL;
+		xfs_ilock(ip, *iolock);
+	}
+
+	return error;
+}
+
+/*
+ * Get a unique ID including its location so that the client can identify
+ * the exported device.
+ */
+int
+xfs_fs_get_uuid(
+	struct super_block	*sb,
+	u8			*buf,
+	u32			*len,
+	u64			*offset)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	printk_once(KERN_NOTICE
+"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
+		mp->m_fsname);
+
+	if (*len < sizeof(uuid_t))
+		return -EINVAL;
+
+	memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+	*len = sizeof(uuid_t);
+	*offset = offsetof(struct xfs_dsb, sb_uuid);
+	return 0;
+}
+
+static void
+xfs_bmbt_to_iomap(
+	struct xfs_inode	*ip,
+	struct iomap		*iomap,
+	struct xfs_bmbt_irec	*imap)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (imap->br_startblock == HOLESTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_DELALLOC;
+	} else {
+		iomap->blkno =
+			XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
+		if (imap->br_state == XFS_EXT_UNWRITTEN)
+			iomap->type = IOMAP_UNWRITTEN;
+		else
+			iomap->type = IOMAP_MAPPED;
+	}
+	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+}
+
+/*
+ * Get a layout for the pNFS client.
+ */
+int
+xfs_fs_map_blocks(
+	struct inode		*inode,
+	loff_t			offset,
+	u64			length,
+	struct iomap		*iomap,
+	bool			write,
+	u32			*device_generation)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	loff_t			limit;
+	int			bmapi_flags = XFS_BMAPI_ENTIRE;
+	int			nimaps = 1;
+	uint			lock_flags;
+	int			error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/*
+	 * We can't export inodes residing on the realtime device.  The realtime
+	 * device doesn't have a UUID to identify it, so the client has no way
+	 * to find it.
+	 */
+	if (XFS_IS_REALTIME_INODE(ip))
+		return -ENXIO;
+
+	/*
+	 * Lock out any other I/O before we flush and invalidate the pagecache,
+	 * and then hand out a layout to the remote system.  This is very
+	 * similar to direct I/O, except that the synchronization is much more
+	 * complicated.  See the comment near xfs_break_layouts for a detailed
+	 * explanation.
+	 */
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	error = -EINVAL;
+	limit = mp->m_super->s_maxbytes;
+	if (!write)
+		limit = max(limit, round_up(i_size_read(inode),
+				     inode->i_sb->s_blocksize));
+	if (offset > limit)
+		goto out_unlock;
+	if (offset > limit - length)
+		length = limit - offset;
+
+	error = filemap_write_and_wait(inode->i_mapping);
+	if (error)
+		goto out_unlock;
+	error = invalidate_inode_pages2(inode->i_mapping);
+	if (WARN_ON_ONCE(error))
+		return error;
+
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	lock_flags = xfs_ilock_data_map_shared(ip);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, bmapi_flags);
+	xfs_iunlock(ip, lock_flags);
+
+	if (error)
+		goto out_unlock;
+
+	if (write) {
+		enum xfs_prealloc_flags	flags = 0;
+
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+		if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+			error = xfs_iomap_write_direct(ip, offset, length,
+						       &imap, nimaps);
+			if (error)
+				goto out_unlock;
+
+			/*
+			 * Ensure the next transaction is committed
+			 * synchronously so that the blocks allocated and
+			 * handed out to the client are guaranteed to be
+			 * present even after a server crash.
+			 */
+			flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
+		}
+
+		error = xfs_update_prealloc_flags(ip, flags);
+		if (error)
+			goto out_unlock;
+	}
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	*device_generation = mp->m_generation;
+	return error;
+out_unlock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * Ensure the size update falls into a valid allocated block.
+ */
+static int
+xfs_pnfs_validate_isize(
+	struct xfs_inode	*ip,
+	xfs_off_t		isize)
+{
+	struct xfs_bmbt_irec	imap;
+	int			nimaps = 1;
+	int			error = 0;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
+				&imap, &nimaps, 0);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (error)
+		return error;
+
+	if (imap.br_startblock == HOLESTARTBLOCK ||
+	    imap.br_startblock == DELAYSTARTBLOCK ||
+	    imap.br_state == XFS_EXT_UNWRITTEN)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Make sure the blocks described by maps are stable on disk.  This includes
+ * converting any unwritten extents, flushing the disk cache and updating the
+ * time stamps.
+ *
+ * Note that we rely on the caller to always send us a timestamp update so that
+ * we always commit a transaction here.  If that stops being true we will have
+ * to manually flush the cache here similar to what the fsync code path does
+ * for datasyncs on files that have no dirty metadata.
+ */
+int
+xfs_fs_commit_blocks(
+	struct inode		*inode,
+	struct iomap		*maps,
+	int			nr_maps,
+	struct iattr		*iattr)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	bool			update_isize = false;
+	int			error, i;
+	loff_t			size;
+
+	ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	size = i_size_read(inode);
+	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
+		update_isize = true;
+		size = iattr->ia_size;
+	}
+
+	for (i = 0; i < nr_maps; i++) {
+		u64 start, length, end;
+
+		start = maps[i].offset;
+		if (start > size)
+			continue;
+
+		end = start + maps[i].length;
+		if (end > size)
+			end = size;
+
+		length = end - start;
+		if (!length)
+			continue;
+	
+		/*
+		 * Make sure reads through the pagecache see the new data.
+		 */
+		error = invalidate_inode_pages2_range(inode->i_mapping,
+					start >> PAGE_CACHE_SHIFT,
+					(end - 1) >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(error);
+
+		error = xfs_iomap_write_unwritten(ip, start, length);
+		if (error)
+			goto out_drop_iolock;
+	}
+
+	if (update_isize) {
+		error = xfs_pnfs_validate_isize(ip, size);
+		if (error)
+			goto out_drop_iolock;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_drop_iolock;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	xfs_setattr_time(ip, iattr);
+	if (update_isize) {
+		i_size_write(inode, iattr->ia_size);
+		ip->i_d.di_size = iattr->ia_size;
+	}
+
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+
+out_drop_iolock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
new file mode 100644
index 0000000..b7fbfce
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,18 @@
+#ifndef _XFS_PNFS_H
+#define _XFS_PNFS_H 1
+
+#ifdef CONFIG_NFSD_PNFS
+int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
+		struct iomap *iomap, bool write, u32 *device_generation);
+int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
+		struct iattr *iattr);
+
+int xfs_break_layouts(struct inode *inode, uint *iolock);
+#else
+static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
+{
+	return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _XFS_PNFS_H */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 53cc2aa..fbbb9e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -836,6 +836,11 @@ xfs_qm_reset_dqcounts(
 		 */
 		xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
 			    "xfs_quotacheck");
+		/*
+		 * Reset type in case we are reusing group quota file for
+		 * project quotas or vice versa
+		 */
+		ddq->d_flags = type;
 		ddq->d_bcount = 0;
 		ddq->d_icount = 0;
 		ddq->d_rtbcount = 0;
author	Artem Bityutskiy <artem.bityutskiy@linux.intel.com>	2015-03-25 09:03:07 (GMT)
committer	Artem Bityutskiy <artem.bityutskiy@linux.intel.com>	2015-03-25 09:03:07 (GMT)
commit	3527a86b7ae17c949307d00e1eb7087604bca1b4 (patch)
tree	8e8eab905e7e73a3bf2c4d47abd356166533c9db /fs
parent	b388e6a7a6ba988998ddd83919ae8d3debf1a13d (diff)
parent	bc465aa9d045feb0e13b4a8f32cc33c1943f62d6 (diff)
download	linux-3527a86b7ae17c949307d00e1eb7087604bca1b4.tar.xz