From fac7d1917dfddfa53e98524f0abfbe60252740fe Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 4 Jun 2016 11:41:49 -0400
Subject: fix EOPENSTALE bug in do_last()

EOPENSTALE occuring at the last component of a trailing symlink ends up
with do_last() retrying its lookup.  After the symlink body has been
discarded.  The thing is, all this retry_lookup logics in there is not
needed at all - the upper layers will do the right thing if we simply
return that -EOPENSTALE as we would with any other error.  Trying to
microoptimize in do_last() is a lot of headache for no good reason.

Cc: stable@vger.kernel.org # v4.2+
Tested-by: Oleg Drokin <green@linuxhacker.ru>
Reviewed-and-Tested-by: Jeff Layton <jlayton@poochiereds.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 4c4f95a..3d9511e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3166,9 +3166,7 @@ static int do_last(struct nameidata *nd,
 	int acc_mode = op->acc_mode;
 	unsigned seq;
 	struct inode *inode;
-	struct path save_parent = { .dentry = NULL, .mnt = NULL };
 	struct path path;
-	bool retried = false;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -3211,7 +3209,6 @@ static int do_last(struct nameidata *nd,
 			return -EISDIR;
 	}
 
-retry_lookup:
 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
 		error = mnt_want_write(nd->path.mnt);
 		if (!error)
@@ -3292,23 +3289,14 @@ finish_lookup:
 	if (unlikely(error))
 		return error;
 
-	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
-		path_to_nameidata(&path, nd);
-	} else {
-		save_parent.dentry = nd->path.dentry;
-		save_parent.mnt = mntget(path.mnt);
-		nd->path.dentry = path.dentry;
-
-	}
+	path_to_nameidata(&path, nd);
 	nd->inode = inode;
 	nd->seq = seq;
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 finish_open:
 	error = complete_walk(nd);
-	if (error) {
-		path_put(&save_parent);
+	if (error)
 		return error;
-	}
 	audit_inode(nd->name, nd->path.dentry, 0);
 	error = -EISDIR;
 	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
@@ -3331,13 +3319,9 @@ finish_open_created:
 		goto out;
 	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
 	error = vfs_open(&nd->path, file, current_cred());
-	if (!error) {
-		*opened |= FILE_OPENED;
-	} else {
-		if (error == -EOPENSTALE)
-			goto stale_open;
+	if (error)
 		goto out;
-	}
+	*opened |= FILE_OPENED;
 opened:
 	error = open_check_o_direct(file);
 	if (!error)
@@ -3353,26 +3337,7 @@ out:
 	}
 	if (got_write)
 		mnt_drop_write(nd->path.mnt);
-	path_put(&save_parent);
 	return error;
-
-stale_open:
-	/* If no saved parent or already retried then can't retry */
-	if (!save_parent.dentry || retried)
-		goto out;
-
-	BUG_ON(save_parent.dentry != dir);
-	path_put(&nd->path);
-	nd->path = save_parent;
-	nd->inode = dir->d_inode;
-	save_parent.mnt = NULL;
-	save_parent.dentry = NULL;
-	if (got_write) {
-		mnt_drop_write(nd->path.mnt);
-		got_write = false;
-	}
-	retried = true;
-	goto retry_lookup;
 }
 
 static int do_tmpfile(struct nameidata *nd, unsigned flags,
-- 
cgit v0.10.2


From e6ec03a25f12b312b7e0c037fe4a6471c4ee5665 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 5 Jun 2016 00:23:09 -0400
Subject: autofs braino fix for do_last()

It's an analogue of commit 7500c38a (fix the braino in "namei:
massage lookup_slow() to be usable by lookup_one_len_unlocked()").
The same problem (->lookup()-returned unhashed negative dentry
just might be an autofs one with ->d_manage() that would wait
until the daemon makes it positive) applies in do_last() - we
need to do follow_managed() first.

Fortunately, remaining callers of follow_managed() are OK - only
autofs has that weirdness (negative dentry that does not mean
an instant -ENOENT)) and autofs never has its negative dentries
hashed, so we can't pick one from a dcache lookup.

->d_manage() is a bloody mess ;-/

Cc: stable@vger.kernel.org # v4.6
Spotted-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 3d9511e..d7c0cac 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3260,6 +3260,10 @@ static int do_last(struct nameidata *nd,
 		got_write = false;
 	}
 
+	error = follow_managed(&path, nd);
+	if (unlikely(error < 0))
+		return error;
+
 	if (unlikely(d_is_negative(path.dentry))) {
 		path_to_nameidata(&path, nd);
 		return -ENOENT;
@@ -3275,10 +3279,6 @@ static int do_last(struct nameidata *nd,
 		return -EEXIST;
 	}
 
-	error = follow_managed(&path, nd);
-	if (unlikely(error < 0))
-		return error;
-
 	seq = 0;	/* out of RCU mode, so the value doesn't matter */
 	inode = d_backing_inode(path.dentry);
 finish_lookup:
-- 
cgit v0.10.2


From 3d56c25e3bb0726a5c5e16fc2d9e38f8ed763085 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Jun 2016 21:26:55 -0400
Subject: fix d_walk()/non-delayed __d_free() race

Ascend-to-parent logics in d_walk() depends on all encountered child
dentries not getting freed without an RCU delay.  Unfortunately, in
quite a few cases it is not true, with hard-to-hit oopsable race as
the result.

Fortunately, the fix is simiple; right now the rule is "if it ever
been hashed, freeing must be delayed" and changing it to "if it
ever had a parent, freeing must be delayed" closes that hole and
covers all cases the old rule used to cover.  Moreover, pipes and
sockets remain _not_ covered, so we do not introduce RCU delay in
the cases which are the reason for having that delay conditional
in the first place.

Cc: stable@vger.kernel.org # v3.2+ (and watch out for __d_materialise_dentry())
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index ad4a542..817c243 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1636,7 +1636,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	struct dentry *dentry = __d_alloc(parent->d_sb, name);
 	if (!dentry)
 		return NULL;
-
+	dentry->d_flags |= DCACHE_RCUACCESS;
 	spin_lock(&parent->d_lock);
 	/*
 	 * don't need child lock because it is not subject
@@ -2358,7 +2358,6 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
 {
 	BUG_ON(!d_unhashed(entry));
 	hlist_bl_lock(b);
-	entry->d_flags |= DCACHE_RCUACCESS;
 	hlist_bl_add_head_rcu(&entry->d_hash, b);
 	hlist_bl_unlock(b);
 }
@@ -2843,6 +2842,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	/* ... and switch them in the tree */
 	if (IS_ROOT(dentry)) {
 		/* splicing a tree */
+		dentry->d_flags |= DCACHE_RCUACCESS;
 		dentry->d_parent = target->d_parent;
 		target->d_parent = target;
 		list_del_init(&target->d_child);
-- 
cgit v0.10.2


From a01e718f7241c53f564402f7acff373eed5bd166 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Jun 2016 21:53:51 -0400
Subject: fix a regression in atomic_open()

open("/foo/no_such_file", O_RDONLY | O_CREAT) on should fail with
EACCES when /foo is not writable; failing with ENOENT is obviously
wrong.  That got broken by a braino introduced when moving the
creat_error logics from atomic_open() to lookup_open().  Easy to
fix, fortunately.

Spotted-by: "Yan, Zheng" <ukernel@gmail.com>
Tested-by: "Yan, Zheng" <ukernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index d7c0cac..28cb1cd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2995,9 +2995,13 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 			}
 			if (*opened & FILE_CREATED)
 				fsnotify_create(dir, dentry);
-			path->dentry = dentry;
-			path->mnt = nd->path.mnt;
-			return 1;
+			if (unlikely(d_is_negative(dentry))) {
+				error = -ENOENT;
+			} else {
+				path->dentry = dentry;
+				path->mnt = nd->path.mnt;
+				return 1;
+			}
 		}
 	}
 	dput(dentry);
-- 
cgit v0.10.2


From 1607f09c226d1378439c411baaaa020042750338 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mguzik@redhat.com>
Date: Sun, 5 Jun 2016 23:14:14 +0200
Subject: coredump: fix dumping through pipes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The offset in the core file used to be tracked with ->written field of
the coredump_params structure. The field was retired in favour of
file->f_pos.

However, ->f_pos is not maintained for pipes which leads to breakage.

Restore explicit tracking of the offset in coredump_params. Introduce
->pos field for this purpose since ->written was already reused.

Fixes: a00839395103 ("get rid of coredump_params->written").

Reported-by: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Signed-off-by: Mateusz Guzik <mguzik@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 84fb984..85c85eb 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -172,7 +172,7 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i,
 	if (rc < 0)
 		goto out;
 
-	skip = roundup(cprm->file->f_pos - total + sz, 4) - cprm->file->f_pos;
+	skip = roundup(cprm->pos - total + sz, 4) - cprm->pos;
 	if (!dump_skip(cprm, skip))
 		goto Eio;
 out:
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e158b22..a7a28110 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2275,7 +2275,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		goto end_coredump;
 
 	/* Align to page */
-	if (!dump_skip(cprm, dataoff - cprm->file->f_pos))
+	if (!dump_skip(cprm, dataoff - cprm->pos))
 		goto end_coredump;
 
 	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 71ade0e..2035893 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1787,7 +1787,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 				goto end_coredump;
 	}
 
-	if (!dump_skip(cprm, dataoff - cprm->file->f_pos))
+	if (!dump_skip(cprm, dataoff - cprm->pos))
 		goto end_coredump;
 
 	if (!elf_fdpic_dump_segments(cprm))
diff --git a/fs/coredump.c b/fs/coredump.c
index 38a7ab8..281b768 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -794,6 +794,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 			return 0;
 		file->f_pos = pos;
 		cprm->written += n;
+		cprm->pos += n;
 		nr -= n;
 	}
 	return 1;
@@ -808,6 +809,7 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
 		if (dump_interrupted() ||
 		    file->f_op->llseek(file, nr, SEEK_CUR) < 0)
 			return 0;
+		cprm->pos += nr;
 		return 1;
 	} else {
 		while (nr > PAGE_SIZE) {
@@ -822,7 +824,7 @@ EXPORT_SYMBOL(dump_skip);
 
 int dump_align(struct coredump_params *cprm, int align)
 {
-	unsigned mod = cprm->file->f_pos & (align - 1);
+	unsigned mod = cprm->pos & (align - 1);
 	if (align & (align - 1))
 		return 0;
 	return mod ? dump_skip(cprm, align - mod) : 1;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 576e463..314b3ca 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -65,6 +65,7 @@ struct coredump_params {
 	unsigned long limit;
 	unsigned long mm_flags;
 	loff_t written;
+	loff_t pos;
 };
 
 /*
-- 
cgit v0.10.2


From ba65dc5ef16f82fba77869cecf7a7d515f61446b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Jun 2016 11:32:47 -0400
Subject: much milder d_walk() race

d_walk() relies upon the tree not getting rearranged under it without
rename_lock being touched.  And we do grab rename_lock around the
places that change the tree topology.  Unfortunately, branch reordering
is just as bad from d_walk() POV and we have two places that do it
without touching rename_lock - one in handling of cursors (for ramfs-style
directories) and another in autofs.  autofs one is a separate story; this
commit deals with the cursors.
	* mark cursor dentries explicitly at allocation time
	* make __dentry_kill() leave ->d_child.next pointing to the next
non-cursor sibling, making sure that it won't be moved around unnoticed
before the parent is relocked on ascend-to-parent path in d_walk().
	* make d_walk() skip cursors explicitly; strictly speaking it's
not necessary (all callbacks we pass to d_walk() are no-ops on cursors),
but it makes analysis easier.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 817c243..b7eddfd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -507,6 +507,44 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
+static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+{
+	struct dentry *next;
+	/*
+	 * Inform d_walk() and shrink_dentry_list() that we are no longer
+	 * attached to the dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	if (unlikely(list_empty(&dentry->d_child)))
+		return;
+	__list_del_entry(&dentry->d_child);
+	/*
+	 * Cursors can move around the list of children.  While we'd been
+	 * a normal list member, it didn't matter - ->d_child.next would've
+	 * been updated.  However, from now on it won't be and for the
+	 * things like d_walk() it might end up with a nasty surprise.
+	 * Normally d_walk() doesn't care about cursors moving around -
+	 * ->d_lock on parent prevents that and since a cursor has no children
+	 * of its own, we get through it without ever unlocking the parent.
+	 * There is one exception, though - if we ascend from a child that
+	 * gets killed as soon as we unlock it, the next sibling is found
+	 * using the value left in its ->d_child.next.  And if _that_
+	 * pointed to a cursor, and cursor got moved (e.g. by lseek())
+	 * before d_walk() regains parent->d_lock, we'll end up skipping
+	 * everything the cursor had been moved past.
+	 *
+	 * Solution: make sure that the pointer left behind in ->d_child.next
+	 * points to something that won't be moving around.  I.e. skip the
+	 * cursors.
+	 */
+	while (dentry->d_child.next != &parent->d_subdirs) {
+		next = list_entry(dentry->d_child.next, struct dentry, d_child);
+		if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
+			break;
+		dentry->d_child.next = next->d_child.next;
+	}
+}
+
 static void __dentry_kill(struct dentry *dentry)
 {
 	struct dentry *parent = NULL;
@@ -532,12 +570,7 @@ static void __dentry_kill(struct dentry *dentry)
 	}
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
-	__list_del_entry(&dentry->d_child);
-	/*
-	 * Inform d_walk() that we are no longer attached to the
-	 * dentry tree
-	 */
-	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	dentry_unlist(dentry, parent);
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
@@ -1203,6 +1236,9 @@ resume:
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
 		next = tmp->next;
 
+		if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
+			continue;
+
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
 		ret = enter(data, dentry);
@@ -1651,6 +1687,16 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 }
 EXPORT_SYMBOL(d_alloc);
 
+struct dentry *d_alloc_cursor(struct dentry * parent)
+{
+	struct dentry *dentry = __d_alloc(parent->d_sb, NULL);
+	if (dentry) {
+		dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR;
+		dentry->d_parent = dget(parent);
+	}
+	return dentry;
+}
+
 /**
  * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
  * @sb: the superblock
diff --git a/fs/internal.h b/fs/internal.h
index b71deee..f57ced5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -130,6 +130,7 @@ extern int invalidate_inodes(struct super_block *, bool);
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
 extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
+extern struct dentry *d_alloc_cursor(struct dentry *);
 
 /*
  * read_write.c
diff --git a/fs/libfs.c b/fs/libfs.c
index 3db2721..cedeacb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -71,9 +71,7 @@ EXPORT_SYMBOL(simple_lookup);
 
 int dcache_dir_open(struct inode *inode, struct file *file)
 {
-	static struct qstr cursor_name = QSTR_INIT(".", 1);
-
-	file->private_data = d_alloc(file->f_path.dentry, &cursor_name);
+	file->private_data = d_alloc_cursor(file->f_path.dentry);
 
 	return file->private_data ? 0 : -ENOMEM;
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 484c879..bcd0c64 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -212,6 +212,7 @@ struct dentry_operations {
 #define DCACHE_OP_REAL			0x08000000
 
 #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
+#define DCACHE_DENTRY_CURSOR		0x20000000
 
 extern seqlock_t rename_lock;
 
-- 
cgit v0.10.2


From ea01a18494b3d7a91b2f1f2a6a5aaef4741bc294 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Jun 2016 11:24:46 -0400
Subject: autofs races

* make autofs4_expire_indirect() skip the dentries being in process of
expiry
* do *not* mess with list_move(); making sure that dentry with
AUTOFS_INF_EXPIRING are not picked for expiry is enough.
* do not remove NO_RCU when we set EXPIRING, don't bother with smp_mb()
there.  Clear it at the same time we clear EXPIRING.  Makes a bunch of
tests simpler.
* rename NO_RCU to WANT_EXPIRE, which is what it really is.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index f0d268b..a439548 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -70,9 +70,13 @@ struct autofs_info {
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry in the process of expiring */
-#define AUTOFS_INF_NO_RCU	(1<<1) /* the dentry is being considered
+#define AUTOFS_INF_WANT_EXPIRE	(1<<1) /* the dentry is being considered
 					* for expiry, so RCU_walk is
-					* not permitted
+					* not permitted.  If it progresses to
+					* actual expiry attempt, the flag is
+					* not cleared when EXPIRING is set -
+					* in that case it gets cleared only
+					* when it comes to clearing EXPIRING.
 					*/
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 9510d8d..b493909 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -316,19 +316,17 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	if (ino->flags & AUTOFS_INF_PENDING)
 		goto out;
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
-		ino->flags |= AUTOFS_INF_NO_RCU;
+		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
 		spin_lock(&sbi->fs_lock);
 		if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 			ino->flags |= AUTOFS_INF_EXPIRING;
-			smp_mb();
-			ino->flags &= ~AUTOFS_INF_NO_RCU;
 			init_completion(&ino->expire_complete);
 			spin_unlock(&sbi->fs_lock);
 			return root;
 		}
-		ino->flags &= ~AUTOFS_INF_NO_RCU;
+		ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
 	}
 out:
 	spin_unlock(&sbi->fs_lock);
@@ -446,7 +444,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
-		if (ino->flags & AUTOFS_INF_NO_RCU)
+		if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
 			expired = NULL;
 		else
 			expired = should_expire(dentry, mnt, timeout, how);
@@ -455,7 +453,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			continue;
 		}
 		ino = autofs4_dentry_ino(expired);
-		ino->flags |= AUTOFS_INF_NO_RCU;
+		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
 		spin_lock(&sbi->fs_lock);
@@ -465,7 +463,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			goto found;
 		}
 
-		ino->flags &= ~AUTOFS_INF_NO_RCU;
+		ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
 		if (expired != dentry)
 			dput(expired);
 		spin_unlock(&sbi->fs_lock);
@@ -475,17 +473,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 found:
 	pr_debug("returning %p %pd\n", expired, expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
-	smp_mb();
-	ino->flags &= ~AUTOFS_INF_NO_RCU;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&sbi->lookup_lock);
-	spin_lock(&expired->d_parent->d_lock);
-	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
-	list_move(&expired->d_parent->d_subdirs, &expired->d_child);
-	spin_unlock(&expired->d_lock);
-	spin_unlock(&expired->d_parent->d_lock);
-	spin_unlock(&sbi->lookup_lock);
 	return expired;
 }
 
@@ -496,7 +485,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	int status;
 
 	/* Block on any pending expire */
-	if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
+	if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
 		return 0;
 	if (rcu_walk)
 		return -ECHILD;
@@ -554,7 +543,7 @@ int autofs4_expire_run(struct super_block *sb,
 	ino = autofs4_dentry_ino(dentry);
 	/* avoid rapid-fire expire attempts if expiry fails */
 	ino->last_used = now;
-	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -583,7 +572,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		spin_lock(&sbi->fs_lock);
 		/* avoid rapid-fire expire attempts if expiry fails */
 		ino->last_used = now;
-		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 78bd802..3767f66 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -458,7 +458,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 		 */
 		struct inode *inode;
 
-		if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
+		if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
 			return 0;
 		if (d_mountpoint(dentry))
 			return 0;
-- 
cgit v0.10.2


From e7d6ef9790bc281f5c29d0132b68031248523fe8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2016 01:35:59 -0400
Subject: fix idiotic braino in d_alloc_parallel()

Check for d_unhashed() while searching in in-lookup hash was absolutely
wrong.  Worse, it masked a deadlock on dget() done under bitlock that
nests inside ->d_lock.  Thanks to J. R. Okajima for spotting it.

Spotted-by: "J. R. Okajima" <hooanon05g@gmail.com>
Wearing-brown-paperbag: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index b7eddfd..d6847d7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2503,7 +2503,6 @@ retry:
 		rcu_read_unlock();
 		goto retry;
 	}
-	rcu_read_unlock();
 	/*
 	 * No changes for the parent since the beginning of d_lookup().
 	 * Since all removals from the chain happen with hlist_bl_lock(),
@@ -2516,8 +2515,6 @@ retry:
 			continue;
 		if (dentry->d_parent != parent)
 			continue;
-		if (d_unhashed(dentry))
-			continue;
 		if (parent->d_flags & DCACHE_OP_COMPARE) {
 			int tlen = dentry->d_name.len;
 			const char *tname = dentry->d_name.name;
@@ -2529,9 +2526,18 @@ retry:
 			if (dentry_cmp(dentry, str, len))
 				continue;
 		}
-		dget(dentry);
 		hlist_bl_unlock(b);
-		/* somebody is doing lookup for it right now; wait for it */
+		/* now we can try to grab a reference */
+		if (!lockref_get_not_dead(&dentry->d_lockref)) {
+			rcu_read_unlock();
+			goto retry;
+		}
+
+		rcu_read_unlock();
+		/*
+		 * somebody is likely to be still doing lookup for it;
+		 * wait for them to finish
+		 */
 		spin_lock(&dentry->d_lock);
 		d_wait_lookup(dentry);
 		/*
@@ -2562,6 +2568,7 @@ retry:
 		dput(new);
 		return dentry;
 	}
+	rcu_read_unlock();
 	/* we can't take ->d_lock here; it's OK, though. */
 	new->d_flags |= DCACHE_PAR_LOOKUP;
 	new->d_wait = wq;
-- 
cgit v0.10.2


From 274f5b041d3c9c0974a7dd1f66b67c33bb5b0f42 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 6 Jun 2016 18:55:57 -0400
Subject: dcache_{readdir,dir_lseek}(): don't bother with nested ->d_lock

Make sure that directory is locked shared in dcache_dir_lseek();
for dcache_readdir() it's already tru, and that's enough to make
simple_positive() stable.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/libfs.c b/fs/libfs.c
index cedeacb..f56acb1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -103,6 +103,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 			struct dentry *cursor = file->private_data;
 			loff_t n = file->f_pos - 2;
 
+			inode_lock_shared(dentry->d_inode);
 			spin_lock(&dentry->d_lock);
 			/* d_lock not required for cursor */
 			list_del(&cursor->d_child);
@@ -110,14 +111,13 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 			while (n && p != &dentry->d_subdirs) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_child);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
 				if (simple_positive(next))
 					n--;
-				spin_unlock(&next->d_lock);
 				p = p->next;
 			}
 			list_add_tail(&cursor->d_child, p);
 			spin_unlock(&dentry->d_lock);
+			inode_unlock_shared(dentry->d_inode);
 		}
 	}
 	return offset;
@@ -150,22 +150,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 
 	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
 		struct dentry *next = list_entry(p, struct dentry, d_child);
-		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-		if (!simple_positive(next)) {
-			spin_unlock(&next->d_lock);
+		if (!simple_positive(next))
 			continue;
-		}
 
-		spin_unlock(&next->d_lock);
 		spin_unlock(&dentry->d_lock);
 		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
 			      d_inode(next)->i_ino, dt_type(d_inode(next))))
 			return 0;
 		spin_lock(&dentry->d_lock);
-		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
 		/* next is still alive */
 		list_move(q, p);
-		spin_unlock(&next->d_lock);
 		p = q;
 		ctx->pos++;
 	}
-- 
cgit v0.10.2


From 4f42c1b5b9c27b6228e6b9c57eee4beb3118b6b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 6 Jun 2016 19:37:13 -0400
Subject: libfs.c: new helper - next_positive()

Return nth positive child after given or NULL if there's
less than n left.  dcache_readdir() and dcache_dir_lseek()
switched to it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/libfs.c b/fs/libfs.c
index f56acb1..b05b74a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -84,6 +84,39 @@ int dcache_dir_close(struct inode *inode, struct file *file)
 }
 EXPORT_SYMBOL(dcache_dir_close);
 
+/* parent is locked at least shared */
+static struct dentry *next_positive(struct dentry *parent,
+				    struct list_head *from,
+				    int count)
+{
+	struct dentry *res = NULL;
+	struct list_head *p;
+
+	spin_lock(&parent->d_lock);
+	for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+		struct dentry *d = list_entry(p, struct dentry, d_child);
+		if (simple_positive(d) && !--count) {
+			res = d;
+			break;
+		}
+	}
+	spin_unlock(&parent->d_lock);
+	return res;
+}
+
+static void move_cursor(struct dentry *cursor, struct list_head *after)
+{
+	struct dentry *parent = cursor->d_parent;
+
+	spin_lock(&parent->d_lock);
+	__list_del(cursor->d_child.prev, cursor->d_child.next);
+	if (after)
+		list_add(&cursor->d_child, after);
+	else
+		list_add_tail(&cursor->d_child, &parent->d_subdirs);
+	spin_unlock(&parent->d_lock);
+}
+
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 {
 	struct dentry *dentry = file->f_path.dentry;
@@ -99,24 +132,13 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 	if (offset != file->f_pos) {
 		file->f_pos = offset;
 		if (file->f_pos >= 2) {
-			struct list_head *p;
 			struct dentry *cursor = file->private_data;
+			struct dentry *to;
 			loff_t n = file->f_pos - 2;
 
 			inode_lock_shared(dentry->d_inode);
-			spin_lock(&dentry->d_lock);
-			/* d_lock not required for cursor */
-			list_del(&cursor->d_child);
-			p = dentry->d_subdirs.next;
-			while (n && p != &dentry->d_subdirs) {
-				struct dentry *next;
-				next = list_entry(p, struct dentry, d_child);
-				if (simple_positive(next))
-					n--;
-				p = p->next;
-			}
-			list_add_tail(&cursor->d_child, p);
-			spin_unlock(&dentry->d_lock);
+			to = next_positive(dentry, &dentry->d_subdirs, n);
+			move_cursor(cursor, to ? &to->d_child : NULL);
 			inode_unlock_shared(dentry->d_inode);
 		}
 	}
@@ -140,30 +162,25 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct dentry *cursor = file->private_data;
-	struct list_head *p, *q = &cursor->d_child;
+	struct list_head *p = &cursor->d_child;
+	struct dentry *next;
+	bool moved = false;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
-	spin_lock(&dentry->d_lock);
-	if (ctx->pos == 2)
-		list_move(q, &dentry->d_subdirs);
 
-	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
-		struct dentry *next = list_entry(p, struct dentry, d_child);
-		if (!simple_positive(next))
-			continue;
-
-		spin_unlock(&dentry->d_lock);
+	if (ctx->pos == 2)
+		p = &dentry->d_subdirs;
+	while ((next = next_positive(dentry, p, 1)) != NULL) {
 		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
 			      d_inode(next)->i_ino, dt_type(d_inode(next))))
-			return 0;
-		spin_lock(&dentry->d_lock);
-		/* next is still alive */
-		list_move(q, p);
-		p = q;
+			break;
+		moved = true;
+		p = &next->d_child;
 		ctx->pos++;
 	}
-	spin_unlock(&dentry->d_lock);
+	if (moved)
+		move_cursor(cursor, p);
 	return 0;
 }
 EXPORT_SYMBOL(dcache_readdir);
-- 
cgit v0.10.2


From ebaaa80e8f20ff2cbbccd6823f73a99565487502 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 6 Jun 2016 20:55:34 -0400
Subject: lockless next_positive()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/libfs.c b/fs/libfs.c
index b05b74a..74dc8b9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -89,31 +89,53 @@ static struct dentry *next_positive(struct dentry *parent,
 				    struct list_head *from,
 				    int count)
 {
-	struct dentry *res = NULL;
+	unsigned *seq = &parent->d_inode->i_dir_seq, n;
+	struct dentry *res;
 	struct list_head *p;
+	bool skipped;
+	int i;
 
-	spin_lock(&parent->d_lock);
+retry:
+	i = count;
+	skipped = false;
+	n = smp_load_acquire(seq) & ~1;
+	res = NULL;
+	rcu_read_lock();
 	for (p = from->next; p != &parent->d_subdirs; p = p->next) {
 		struct dentry *d = list_entry(p, struct dentry, d_child);
-		if (simple_positive(d) && !--count) {
+		if (!simple_positive(d)) {
+			skipped = true;
+		} else if (!--i) {
 			res = d;
 			break;
 		}
 	}
-	spin_unlock(&parent->d_lock);
+	rcu_read_unlock();
+	if (skipped) {
+		smp_rmb();
+		if (unlikely(*seq != n))
+			goto retry;
+	}
 	return res;
 }
 
 static void move_cursor(struct dentry *cursor, struct list_head *after)
 {
 	struct dentry *parent = cursor->d_parent;
-
+	unsigned n, *seq = &parent->d_inode->i_dir_seq;
 	spin_lock(&parent->d_lock);
+	for (;;) {
+		n = *seq;
+		if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
+			break;
+		cpu_relax();
+	}
 	__list_del(cursor->d_child.prev, cursor->d_child.next);
 	if (after)
 		list_add(&cursor->d_child, after);
 	else
 		list_add_tail(&cursor->d_child, &parent->d_subdirs);
+	smp_store_release(seq, n + 2);
 	spin_unlock(&parent->d_lock);
 }
 
-- 
cgit v0.10.2


From b42b90d177d0c7f46f9c888c10c7900578ae7e09 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 24 Jun 2016 23:49:03 -0400
Subject: ceph: fix d_obtain_alias() misuses

on failure d_obtain_alias() will have done iput()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 6e72c98..1780218 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -95,10 +95,8 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
 	}
 
 	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		iput(inode);
+	if (IS_ERR(dentry))
 		return dentry;
-	}
 	err = ceph_init_dentry(dentry);
 	if (err < 0) {
 		dput(dentry);
@@ -167,10 +165,8 @@ static struct dentry *__get_parent(struct super_block *sb,
 		return ERR_PTR(-ENOENT);
 
 	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		iput(inode);
+	if (IS_ERR(dentry))
 		return dentry;
-	}
 	err = ceph_init_dentry(dentry);
 	if (err < 0) {
 		dput(dentry);
@@ -210,7 +206,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
 
 	dout("fh_to_parent %llx\n", cfh->parent_ino);
 	dentry = __get_parent(sb, NULL, cfh->ino);
-	if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+	if (unlikely(dentry == ERR_PTR(-ENOENT)))
 		dentry = __fh_to_dentry(sb, cfh->parent_ino);
 	return dentry;
 }
-- 
cgit v0.10.2


From b403f0e37a11f84f7ceaf40b0075499e5bcfd220 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 29 Jun 2016 10:54:23 +0200
Subject: 9p: use file_dentry()

v9fs may be used as lower layer of overlayfs and accessing f_path.dentry
can lead to a crash.  In this case it's a NULL pointer dereference in
p9_fid_create().

Fix by replacing direct access of file->f_path.dentry with the
file_dentry() accessor, which will always return a native object.

Reported-by: Alessio Igor Bogani <alessioigorbogani@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Tested-by: Alessio Igor Bogani <alessioigorbogani@gmail.com>
Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay")
Cc: <stable@vger.kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index b84c291..d7b78d5 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -74,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 					v9fs_proto_dotu(v9ses));
 	fid = file->private_data;
 	if (!fid) {
-		fid = v9fs_fid_clone(file->f_path.dentry);
+		fid = v9fs_fid_clone(file_dentry(file));
 		if (IS_ERR(fid))
 			return PTR_ERR(fid);
 
@@ -100,7 +100,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		 * because we want write after unlink usecase
 		 * to work.
 		 */
-		fid = v9fs_writeback_fid(file->f_path.dentry);
+		fid = v9fs_writeback_fid(file_dentry(file));
 		if (IS_ERR(fid)) {
 			err = PTR_ERR(fid);
 			mutex_unlock(&v9inode->v_mutex);
@@ -516,7 +516,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
 		 * because we want write after unlink usecase
 		 * to work.
 		 */
-		fid = v9fs_writeback_fid(filp->f_path.dentry);
+		fid = v9fs_writeback_fid(file_dentry(filp));
 		if (IS_ERR(fid)) {
 			retval = PTR_ERR(fid);
 			mutex_unlock(&v9inode->v_mutex);
-- 
cgit v0.10.2


From e06b933e6ded42384164d28a2060b7f89243b895 Mon Sep 17 00:00:00 2001
From: Andrey Ulanov <andreyu@google.com>
Date: Fri, 15 Apr 2016 14:24:41 -0700
Subject: namespace: update event counter when umounting a deleted dentry

- m_start() in fs/namespace.c expects that ns->event is incremented each
  time a mount added or removed from ns->list.
- umount_tree() removes items from the list but does not increment event
  counter, expecting that it's done before the function is called.
- There are some codepaths that call umount_tree() without updating
  "event" counter. e.g. from __detach_mounts().
- When this happens m_start may reuse a cached mount structure that no
  longer belongs to ns->list (i.e. use after free which usually leads
  to infinite loop).

This change fixes the above problem by incrementing global event counter
before invoking umount_tree().

Change-Id: I622c8e84dcb9fb63542372c5dbf0178ee86bb589
Cc: stable@vger.kernel.org
Signed-off-by: Andrey Ulanov <andreyu@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 4fb1691..298618b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1562,6 +1562,7 @@ void __detach_mounts(struct dentry *dentry)
 		goto out_unlock;
 
 	lock_mount_hash();
+	event++;
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
 		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
-- 
cgit v0.10.2