From 35a92fe8770ce54c5eb275cd76128645bea2d200 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 07:47:08 -0400
Subject: nfsd: serialize state seqid morphing operations

Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
running in parallel. The server would receive the OPEN_DOWNGRADE first
and check its seqid, but then an OPEN would race in and bump it. The
OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
it should have been rejected since the seqid changed.

The only recourse we have here I think is to serialize operations that
bump the seqid in a stateid, particularly when we're given a seqid in
the call. To address this, we add a new rw_semaphore to the
nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
after looking up the stateid to ensure that nothing else is going to
bump it while we're operating on it.

In the case of OPEN, we do a down_read, as the call doesn't contain a
seqid. Those can run in parallel -- we just need to serialize them when
there is a concurrent OPEN_DOWNGRADE or CLOSE.

LOCK and LOCKU however always take the write lock as there is no
opportunity for parallelizing those.

Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f1d569..1b39edf 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
 	stp->st_openstp = NULL;
+	init_rwsem(&stp->st_rwsem);
 	spin_lock(&oo->oo_owner.so_client->cl_lock);
 	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
 	spin_lock(&fp->fi_lock);
@@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
+		down_read(&stp->st_rwsem);
 		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
-		if (status)
+		if (status) {
+			up_read(&stp->st_rwsem);
 			goto out;
+		}
 	} else {
 		stp = open->op_stp;
 		open->op_stp = NULL;
 		init_open_stateid(stp, fp, open);
+		down_read(&stp->st_rwsem);
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
 		if (status) {
+			up_read(&stp->st_rwsem);
 			release_open_stateid(stp);
 			goto out;
 		}
@@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	}
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_read(&stp->st_rwsem);
 
 	if (nfsd4_has_session(&resp->cstate)) {
 		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
 		 * revoked delegations are kept only for free_stateid.
 		 */
 		return nfserr_bad_stateid;
+	down_write(&stp->st_rwsem);
 	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
-	if (status)
-		return status;
-	return nfs4_check_fh(current_fh, &stp->st_stid);
+	if (status == nfs_ok)
+		status = nfs4_check_fh(current_fh, &stp->st_stid);
+	if (status != nfs_ok)
+		up_write(&stp->st_rwsem);
+	return status;
 }
 
 /* 
@@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
 		return status;
 	oo = openowner(stp->st_stateowner);
 	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+		up_write(&stp->st_rwsem);
 		nfs4_put_stid(&stp->st_stid);
 		return nfserr_bad_stateid;
 	}
@@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	oo = openowner(stp->st_stateowner);
 	status = nfserr_bad_stateid;
-	if (oo->oo_flags & NFS4_OO_CONFIRMED)
+	if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+		up_write(&stp->st_rwsem);
 		goto put_stateid;
+	}
 	oo->oo_flags |= NFS4_OO_CONFIRMED;
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_write(&stp->st_rwsem);
 	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
 		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
 
@@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
 put_stateid:
+	up_write(&stp->st_rwsem);
 	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
@@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out; 
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_write(&stp->st_rwsem);
 
 	nfsd4_close_open_stateid(stp);
 
@@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
+	init_rwsem(&stp->st_rwsem);
 	list_add(&stp->st_locks, &open_stp->st_locks);
 	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
 	spin_lock(&fp->fi_lock);
@@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&open_stp, nn);
 		if (status)
 			goto out;
+		up_write(&open_stp->st_rwsem);
 		open_sop = openowner(open_stp->st_stateowner);
 		status = nfserr_bad_stateid;
 		if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		status = lookup_or_create_lock_state(cstate, open_stp, lock,
 							&lock_stp, &new);
+		if (status == nfs_ok)
+			down_write(&lock_stp->st_rwsem);
 	} else {
 		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
@@ -5540,6 +5560,8 @@ out:
 		    seqid_mutating_err(ntohl(status)))
 			lock_sop->lo_owner.so_seqid++;
 
+		up_write(&lock_stp->st_rwsem);
+
 		/*
 		 * If this is a new, never-before-used stateid, and we are
 		 * returning an error, then just go ahead and release it.
@@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 fput:
 	fput(filp);
 put_stateid:
+	up_write(&stp->st_rwsem);
 	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 583ffc1..31bde12 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -534,15 +534,16 @@ struct nfs4_file {
  * Better suggestions welcome.
  */
 struct nfs4_ol_stateid {
-	struct nfs4_stid    st_stid; /* must be first field */
-	struct list_head              st_perfile;
-	struct list_head              st_perstateowner;
-	struct list_head              st_locks;
-	struct nfs4_stateowner      * st_stateowner;
-	struct nfs4_clnt_odstate    * st_clnt_odstate;
-	unsigned char                 st_access_bmap;
-	unsigned char                 st_deny_bmap;
-	struct nfs4_ol_stateid         * st_openstp;
+	struct nfs4_stid		st_stid;
+	struct list_head		st_perfile;
+	struct list_head		st_perstateowner;
+	struct list_head		st_locks;
+	struct nfs4_stateowner		*st_stateowner;
+	struct nfs4_clnt_odstate	*st_clnt_odstate;
+	unsigned char			st_access_bmap;
+	unsigned char			st_deny_bmap;
+	struct nfs4_ol_stateid		*st_openstp;
+	struct rw_semaphore		st_rwsem;
 };
 
 static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
-- 
cgit v0.10.2


From e79017ddce8273514d24728523d17033bcbeba6f Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Sun, 13 Sep 2015 14:15:15 +0200
Subject: nfsd: drop null test before destroy functions

Remove unneeded NULL test.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@ expression x; @@
-if (x != NULL) {
  \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
  x = NULL;
-}
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 46ec934..116940c 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -217,10 +217,8 @@ void nfsd_reply_cache_shutdown(void)
 	drc_hashtbl = NULL;
 	drc_hashsize = 0;
 
-	if (drc_slab) {
-		kmem_cache_destroy(drc_slab);
-		drc_slab = NULL;
-	}
+	kmem_cache_destroy(drc_slab);
+	drc_slab = NULL;
 }
 
 /*
-- 
cgit v0.10.2


From fcaba026a55803dd21523e6e191ba7f59e02a737 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 08:28:38 -0400
Subject: nfsd: move svc_fh->fh_maxsize to just after fh_handle

This moves the hole in the struct down below the flags fields, which
allows us to potentially add a new flag without growing the struct.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 1e90dad..b1edd6b 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -26,9 +26,9 @@ static inline ino_t u32_to_ino_t(__u32 uino)
  */
 typedef struct svc_fh {
 	struct knfsd_fh		fh_handle;	/* FH data */
+	int			fh_maxsize;	/* max size for fh_handle */
 	struct dentry *		fh_dentry;	/* validated dentry */
 	struct svc_export *	fh_export;	/* export pointer */
-	int			fh_maxsize;	/* max size for fh_handle */
 
 	unsigned char		fh_locked;	/* inode locked by us */
 	unsigned char		fh_want_write;	/* remount protection taken */
-- 
cgit v0.10.2


From aaf91ec148910e0c2bfd135ea19f870e7196e64f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 08:28:39 -0400
Subject: nfsd: switch unsigned char flags in svc_fh to bools

...just for clarity.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cba..00575d7 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp)
 	err = fh_getattr(fhp, &fhp->fh_post_attr);
 	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
 	if (err) {
-		fhp->fh_post_saved = 0;
+		fhp->fh_post_saved = false;
 		/* Grab the ctime anyway - set_change_info might use it */
 		fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
 	} else
-		fhp->fh_post_saved = 1;
+		fhp->fh_post_saved = true;
 }
 
 /*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 350041a..c1681ce 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp)
 		fh_unlock(fhp);
 		fhp->fh_dentry = NULL;
 		dput(dentry);
-#ifdef CONFIG_NFSD_V3
-		fhp->fh_pre_saved = 0;
-		fhp->fh_post_saved = 0;
-#endif
+		fh_clear_wcc(fhp);
 	}
 	fh_drop_write(fhp);
 	if (exp) {
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index b1edd6b..2087bae 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -30,12 +30,12 @@ typedef struct svc_fh {
 	struct dentry *		fh_dentry;	/* validated dentry */
 	struct svc_export *	fh_export;	/* export pointer */
 
-	unsigned char		fh_locked;	/* inode locked by us */
-	unsigned char		fh_want_write;	/* remount protection taken */
+	bool			fh_locked;	/* inode locked by us */
+	bool			fh_want_write;	/* remount protection taken */
 
 #ifdef CONFIG_NFSD_V3
-	unsigned char		fh_post_saved;	/* post-op attrs saved */
-	unsigned char		fh_pre_saved;	/* pre-op attrs saved */
+	bool			fh_post_saved;	/* post-op attrs saved */
+	bool			fh_pre_saved;	/* pre-op attrs saved */
 
 	/* Pre-op attributes saved during fh_lock */
 	__u64			fh_pre_size;	/* size before operation */
@@ -213,8 +213,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
 static inline void
 fh_clear_wcc(struct svc_fh *fhp)
 {
-	fhp->fh_post_saved = 0;
-	fhp->fh_pre_saved = 0;
+	fhp->fh_post_saved = false;
+	fhp->fh_pre_saved = false;
 }
 
 /*
@@ -231,7 +231,7 @@ fill_pre_wcc(struct svc_fh *fhp)
 		fhp->fh_pre_ctime = inode->i_ctime;
 		fhp->fh_pre_size  = inode->i_size;
 		fhp->fh_pre_change = inode->i_version;
-		fhp->fh_pre_saved = 1;
+		fhp->fh_pre_saved = true;
 	}
 }
 
@@ -267,7 +267,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
 	inode = d_inode(dentry);
 	mutex_lock_nested(&inode->i_mutex, subclass);
 	fill_pre_wcc(fhp);
-	fhp->fh_locked = 1;
+	fhp->fh_locked = true;
 }
 
 static inline void
@@ -285,7 +285,7 @@ fh_unlock(struct svc_fh *fhp)
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
 		mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
-		fhp->fh_locked = 0;
+		fhp->fh_locked = false;
 	}
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c0497..994d66f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1631,7 +1631,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	/* cannot use fh_lock as we need deadlock protective ordering
 	 * so do it by hand */
 	trap = lock_rename(tdentry, fdentry);
-	ffhp->fh_locked = tfhp->fh_locked = 1;
+	ffhp->fh_locked = tfhp->fh_locked = true;
 	fill_pre_wcc(ffhp);
 	fill_pre_wcc(tfhp);
 
@@ -1681,7 +1681,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	fill_post_wcc(ffhp);
 	fill_post_wcc(tfhp);
 	unlock_rename(tdentry, fdentry);
-	ffhp->fh_locked = tfhp->fh_locked = 0;
+	ffhp->fh_locked = tfhp->fh_locked = false;
 	fh_drop_write(ffhp);
 
 out:
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fee2451..fcfc48c 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -112,14 +112,14 @@ static inline int fh_want_write(struct svc_fh *fh)
 	int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
 
 	if (!ret)
-		fh->fh_want_write = 1;
+		fh->fh_want_write = true;
 	return ret;
 }
 
 static inline void fh_drop_write(struct svc_fh *fh)
 {
 	if (fh->fh_want_write) {
-		fh->fh_want_write = 0;
+		fh->fh_want_write = false;
 		mnt_drop_write(fh->fh_export->ex_path.mnt);
 	}
 }
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9f99100..ce7362c 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -632,7 +632,7 @@ static inline void
 set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
 {
 	BUG_ON(!fhp->fh_pre_saved);
-	cinfo->atomic = fhp->fh_post_saved;
+	cinfo->atomic = (u32)fhp->fh_post_saved;
 	cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
 
 	cinfo->before_change = fhp->fh_pre_change;
-- 
cgit v0.10.2


From 0ad95472bf169a3501991f8f33f5147f792a8116 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Wed, 23 Sep 2015 15:49:29 +0300
Subject: lockd: create NSM handles per net namespace

Commit cb7323fffa85 ("lockd: create and use per-net NSM
 RPC clients on MON/UNMON requests") introduced per-net
NSM RPC clients. Unfortunately this doesn't make any sense
without per-net nsm_handle.

E.g. the following scenario could happen
Two hosts (X and Y) in different namespaces (A and B) share
the same nsm struct.

1. nsm_monitor(host_X) called => NSM rpc client created,
	nsm->sm_monitored bit set.
2. nsm_mointor(host-Y) called => nsm->sm_monitored already set,
	we just exit. Thus in namespace B ln->nsm_clnt == NULL.
3. host X destroyed => nsm->sm_count decremented to 1
4. host Y destroyed => nsm_unmonitor() => nsm_mon_unmon() => NULL-ptr
	dereference of *ln->nsm_clnt

So this could be fixed by making per-net nsm_handles list,
instead of global. Thus different net namespaces will not be able
share the same nsm_handle.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 969d589..b5f3c3ab 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
 		atomic_inc(&nsm->sm_count);
 	else {
 		host = NULL;
-		nsm = nsm_get_handle(ni->sap, ni->salen,
+		nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
 					ni->hostname, ni->hostname_len);
 		if (unlikely(nsm == NULL)) {
 			dprintk("lockd: %s failed; no nsm handle\n",
@@ -534,17 +534,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
 
 /**
  * nlm_host_rebooted - Release all resources held by rebooted host
+ * @net:  network namespace
  * @info: pointer to decoded results of NLM_SM_NOTIFY call
  *
  * We were notified that the specified host has rebooted.  Release
  * all resources held by that peer.
  */
-void nlm_host_rebooted(const struct nlm_reboot *info)
+void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info)
 {
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
-	nsm = nsm_reboot_lookup(info);
+	nsm = nsm_reboot_lookup(net, info);
 	if (unlikely(nsm == NULL))
 		return;
 
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 47a32b6..6c05cd1 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,6 @@ struct nsm_res {
 };
 
 static const struct rpc_program	nsm_program;
-static				LIST_HEAD(nsm_handles);
 static				DEFINE_SPINLOCK(nsm_lock);
 
 /*
@@ -264,33 +263,35 @@ void nsm_unmonitor(const struct nlm_host *host)
 	}
 }
 
-static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
-					      const size_t len)
+static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles,
+					const char *hostname, const size_t len)
 {
 	struct nsm_handle *nsm;
 
-	list_for_each_entry(nsm, &nsm_handles, sm_link)
+	list_for_each_entry(nsm, nsm_handles, sm_link)
 		if (strlen(nsm->sm_name) == len &&
 		    memcmp(nsm->sm_name, hostname, len) == 0)
 			return nsm;
 	return NULL;
 }
 
-static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles,
+					const struct sockaddr *sap)
 {
 	struct nsm_handle *nsm;
 
-	list_for_each_entry(nsm, &nsm_handles, sm_link)
+	list_for_each_entry(nsm, nsm_handles, sm_link)
 		if (rpc_cmp_addr(nsm_addr(nsm), sap))
 			return nsm;
 	return NULL;
 }
 
-static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles,
+					const struct nsm_private *priv)
 {
 	struct nsm_handle *nsm;
 
-	list_for_each_entry(nsm, &nsm_handles, sm_link)
+	list_for_each_entry(nsm, nsm_handles, sm_link)
 		if (memcmp(nsm->sm_priv.data, priv->data,
 					sizeof(priv->data)) == 0)
 			return nsm;
@@ -353,6 +354,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
 
 /**
  * nsm_get_handle - Find or create a cached nsm_handle
+ * @net: network namespace
  * @sap: pointer to socket address of handle to find
  * @salen: length of socket address
  * @hostname: pointer to C string containing hostname to find
@@ -365,11 +367,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
  * @hostname cannot be found in the handle cache.  Returns NULL if
  * an error occurs.
  */
-struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+struct nsm_handle *nsm_get_handle(const struct net *net,
+				  const struct sockaddr *sap,
 				  const size_t salen, const char *hostname,
 				  const size_t hostname_len)
 {
 	struct nsm_handle *cached, *new = NULL;
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
 	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
 		if (printk_ratelimit()) {
@@ -384,9 +388,10 @@ retry:
 	spin_lock(&nsm_lock);
 
 	if (nsm_use_hostnames && hostname != NULL)
-		cached = nsm_lookup_hostname(hostname, hostname_len);
+		cached = nsm_lookup_hostname(&ln->nsm_handles,
+					hostname, hostname_len);
 	else
-		cached = nsm_lookup_addr(sap);
+		cached = nsm_lookup_addr(&ln->nsm_handles, sap);
 
 	if (cached != NULL) {
 		atomic_inc(&cached->sm_count);
@@ -400,7 +405,7 @@ retry:
 	}
 
 	if (new != NULL) {
-		list_add(&new->sm_link, &nsm_handles);
+		list_add(&new->sm_link, &ln->nsm_handles);
 		spin_unlock(&nsm_lock);
 		dprintk("lockd: created nsm_handle for %s (%s)\n",
 				new->sm_name, new->sm_addrbuf);
@@ -417,19 +422,22 @@ retry:
 
 /**
  * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @net:  network namespace
  * @info: pointer to NLMPROC_SM_NOTIFY arguments
  *
  * Returns a matching nsm_handle if found in the nsm cache. The returned
  * nsm_handle's reference count is bumped. Otherwise returns NULL if some
  * error occurred.
  */
-struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+struct nsm_handle *nsm_reboot_lookup(const struct net *net,
+				const struct nlm_reboot *info)
 {
 	struct nsm_handle *cached;
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
 	spin_lock(&nsm_lock);
 
-	cached = nsm_lookup_priv(&info->priv);
+	cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv);
 	if (unlikely(cached == NULL)) {
 		spin_unlock(&nsm_lock);
 		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 097bfa3..89fe011 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -15,6 +15,7 @@ struct lockd_net {
 	spinlock_t nsm_clnt_lock;
 	unsigned int nsm_users;
 	struct rpc_clnt *nsm_clnt;
+	struct list_head nsm_handles;
 };
 
 extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d678bcc..0dff13f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -593,6 +593,7 @@ static int lockd_init_net(struct net *net)
 	INIT_LIST_HEAD(&ln->lockd_manager.list);
 	ln->lockd_manager.block_opens = false;
 	spin_lock_init(&ln->nsm_clnt_lock);
+	INIT_LIST_HEAD(&ln->nsm_handles);
 	return 0;
 }
 
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b147d1a..09c576f 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	nlm_host_rebooted(argp);
+	nlm_host_rebooted(SVC_NET(rqstp), argp);
 	return rpc_success;
 }
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 21171f0..fb26b9f 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	nlm_host_rebooted(argp);
+	nlm_host_rebooted(SVC_NET(rqstp), argp);
 	return rpc_success;
 }
 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index ff82a32..fd3b65b 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -235,7 +235,8 @@ void		  nlm_rebind_host(struct nlm_host *);
 struct nlm_host * nlm_get_host(struct nlm_host *);
 void		  nlm_shutdown_hosts(void);
 void		  nlm_shutdown_hosts_net(struct net *net);
-void		  nlm_host_rebooted(const struct nlm_reboot *);
+void		  nlm_host_rebooted(const struct net *net,
+					const struct nlm_reboot *);
 
 /*
  * Host monitoring
@@ -243,11 +244,13 @@ void		  nlm_host_rebooted(const struct nlm_reboot *);
 int		  nsm_monitor(const struct nlm_host *host);
 void		  nsm_unmonitor(const struct nlm_host *host);
 
-struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+struct nsm_handle *nsm_get_handle(const struct net *net,
+					const struct sockaddr *sap,
 					const size_t salen,
 					const char *hostname,
 					const size_t hostname_len);
-struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info);
+struct nsm_handle *nsm_reboot_lookup(const struct net *net,
+					const struct nlm_reboot *info);
 void		  nsm_release(struct nsm_handle *nsm);
 
 /*
-- 
cgit v0.10.2


From 226453d8cfbd5f3bbdcef0d71e67186715bb811d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 7 Oct 2015 13:54:42 -0400
Subject: SUNRPC: Use MSG_SENDPAGE_NOTLAST when calling sendpage()

If we're sending more pages via kernel_sendpage(), then set
MSG_SENDPAGE_NOTLAST.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0c81202..4892373 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -181,7 +181,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
 	struct page	**ppage = xdr->pages;
 	size_t		base = xdr->page_base;
 	unsigned int	pglen = xdr->page_len;
-	unsigned int	flags = MSG_MORE;
+	unsigned int	flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
 	int		slen;
 	int		len = 0;
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 1a85e0e..f264962 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -360,8 +360,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
 		int flags = XS_SENDMSG_FLAGS;
 
 		remainder -= len;
-		if (remainder != 0 || more)
+		if (more)
 			flags |= MSG_MORE;
+		if (remainder != 0)
+			flags |= MSG_SENDPAGE_NOTLAST | MSG_MORE;
 		err = do_sendpage(sock, *ppage, base, len, flags);
 		if (remainder == 0 || err != len)
 			break;
-- 
cgit v0.10.2


From 0d0f4aab4e4d290138a4ae7f2ef8469e48c9a669 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Wed, 7 Oct 2015 14:39:55 +0300
Subject: lockd: get rid of reference-counted NSM RPC clients

Currently we have reference-counted per-net NSM RPC client
which created on the first monitor request and destroyed
after the last unmonitor request. It's needed because
RPC client need to know 'utsname()->nodename', but utsname()
might be NULL when nsm_unmonitor() called.

So instead of holding the rpc client we could just save nodename
in struct nlm_host and pass it to the rpc_create().
Thus ther is no need in keeping rpc client until last
unmonitor request. We could create separate RPC clients
for each monitor/unmonitor requests.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index b5f3c3ab..d716c99 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
 	host->h_nsmhandle  = nsm;
 	host->h_addrbuf    = nsm->sm_addrbuf;
 	host->net	   = ni->net;
+	strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
 
 out:
 	return host;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 6c05cd1..19166d4 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -42,7 +42,7 @@ struct nsm_args {
 	u32			proc;
 
 	char			*mon_name;
-	char			*nodename;
+	const char		*nodename;
 };
 
 struct nsm_res {
@@ -86,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
 	return rpc_create(&args);
 }
 
-static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
-		struct rpc_clnt *clnt)
-{
-	spin_lock(&ln->nsm_clnt_lock);
-	if (ln->nsm_users == 0) {
-		if (clnt == NULL)
-			goto out;
-		ln->nsm_clnt = clnt;
-	}
-	clnt = ln->nsm_clnt;
-	ln->nsm_users++;
-out:
-	spin_unlock(&ln->nsm_clnt_lock);
-	return clnt;
-}
-
-static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
-{
-	struct rpc_clnt	*clnt, *new;
-	struct lockd_net *ln = net_generic(net, lockd_net_id);
-
-	clnt = nsm_client_set(ln, NULL);
-	if (clnt != NULL)
-		goto out;
-
-	clnt = new = nsm_create(net, nodename);
-	if (IS_ERR(clnt))
-		goto out;
-
-	clnt = nsm_client_set(ln, new);
-	if (clnt != new)
-		rpc_shutdown_client(new);
-out:
-	return clnt;
-}
-
-static void nsm_client_put(struct net *net)
-{
-	struct lockd_net *ln = net_generic(net, lockd_net_id);
-	struct rpc_clnt	*clnt = NULL;
-
-	spin_lock(&ln->nsm_clnt_lock);
-	ln->nsm_users--;
-	if (ln->nsm_users == 0) {
-		clnt = ln->nsm_clnt;
-		ln->nsm_clnt = NULL;
-	}
-	spin_unlock(&ln->nsm_clnt_lock);
-	if (clnt != NULL)
-		rpc_shutdown_client(clnt);
-}
-
 static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
-			 struct rpc_clnt *clnt)
+			 const struct nlm_host *host)
 {
 	int		status;
+	struct rpc_clnt *clnt;
 	struct nsm_args args = {
 		.priv		= &nsm->sm_priv,
 		.prog		= NLM_PROGRAM,
 		.vers		= 3,
 		.proc		= NLMPROC_NSM_NOTIFY,
 		.mon_name	= nsm->sm_mon_name,
-		.nodename	= clnt->cl_nodename,
+		.nodename	= host->nodename,
 	};
 	struct rpc_message msg = {
 		.rpc_argp	= &args,
@@ -157,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
 
 	memset(res, 0, sizeof(*res));
 
+	clnt = nsm_create(host->net, host->nodename);
+	if (IS_ERR(clnt)) {
+		dprintk("lockd: failed to create NSM upcall transport, "
+			"status=%ld, net=%p\n", PTR_ERR(clnt), host->net);
+		return PTR_ERR(clnt);
+	}
+
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
 	if (status == -ECONNREFUSED) {
@@ -170,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
 				status);
 	else
 		status = 0;
+
+	rpc_shutdown_client(clnt);
 	return status;
 }
 
@@ -189,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host)
 	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
 	int		status;
-	struct rpc_clnt *clnt;
-	const char *nodename = NULL;
 
 	dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
 
 	if (nsm->sm_monitored)
 		return 0;
 
-	if (host->h_rpcclnt)
-		nodename = host->h_rpcclnt->cl_nodename;
-
 	/*
 	 * Choose whether to record the caller_name or IP address of
 	 * this peer in the local rpc.statd's database.
 	 */
 	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
-	clnt = nsm_client_get(host->net, nodename);
-	if (IS_ERR(clnt)) {
-		status = PTR_ERR(clnt);
-		dprintk("lockd: failed to create NSM upcall transport, "
-				"status=%d, net=%p\n", status, host->net);
-		return status;
-	}
-
-	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
+	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host);
 	if (unlikely(res.status != 0))
 		status = -EIO;
 	if (unlikely(status < 0)) {
@@ -246,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host)
 
 	if (atomic_read(&nsm->sm_count) == 1
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
-		struct lockd_net *ln = net_generic(host->net, lockd_net_id);
-
 		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
-		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
+		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host);
 		if (res.status != 0)
 			status = -EIO;
 		if (status < 0)
@@ -258,8 +201,6 @@ void nsm_unmonitor(const struct nlm_host *host)
 					nsm->sm_name);
 		else
 			nsm->sm_monitored = 0;
-
-		nsm_client_put(host->net);
 	}
 }
 
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 89fe011..5426189 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,9 +12,6 @@ struct lockd_net {
 	struct delayed_work grace_period_end;
 	struct lock_manager lockd_manager;
 
-	spinlock_t nsm_clnt_lock;
-	unsigned int nsm_users;
-	struct rpc_clnt *nsm_clnt;
 	struct list_head nsm_handles;
 };
 
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 0dff13f..5f31ebd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -592,7 +592,6 @@ static int lockd_init_net(struct net *net)
 	INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
 	INIT_LIST_HEAD(&ln->lockd_manager.list);
 	ln->lockd_manager.block_opens = false;
-	spin_lock_init(&ln->nsm_clnt_lock);
 	INIT_LIST_HEAD(&ln->nsm_handles);
 	return 0;
 }
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index fd3b65b..c153738 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -68,6 +68,7 @@ struct nlm_host {
 	struct nsm_handle	*h_nsmhandle;	/* NSM status handle */
 	char			*h_addrbuf;	/* address eyecatcher */
 	struct net		*net;		/* host net */
+	char			nodename[UNX_MAXNODENAME + 1];
 };
 
 /*
-- 
cgit v0.10.2


From cc6a7aab5570beef884ff95f7cade6634bf815a1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 9 Oct 2015 16:13:45 +0200
Subject: sunrpc: avoid warning in gss_key_timeout

The gss_key_timeout() function causes a harmless warning in some
configurations, e.g. ARM imx_v6_v7_defconfig with gcc-5.2, if the
compiler cannot figure out the state of the 'expire' variable across
an rcu_read_unlock():

net/sunrpc/auth_gss/auth_gss.c: In function 'gss_key_timeout':
net/sunrpc/auth_gss/auth_gss.c:1422:211: warning: 'expire' may be used uninitialized in this function [-Wmaybe-uninitialized]

To avoid this warning without adding a bogus initialization, this
rewrites the function so the comparison is done inside of the
critical section. As a side-effect, it also becomes slightly
easier to understand because the implementation now more closely
resembles the comment above it.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: c5e6aecd034e7 ("sunrpc: fix RCU handling of gc_ctx field")
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index dace13d..799e65b 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1411,17 +1411,16 @@ gss_key_timeout(struct rpc_cred *rc)
 {
 	struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
 	struct gss_cl_ctx *ctx;
-	unsigned long now = jiffies;
-	unsigned long expire;
+	unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ);
+	int ret = 0;
 
 	rcu_read_lock();
 	ctx = rcu_dereference(gss_cred->gc_ctx);
-	if (ctx)
-		expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+	if (!ctx || time_after(timeout, ctx->gc_expiry))
+		ret = -EACCES;
 	rcu_read_unlock();
-	if (!ctx || time_after(now, expire))
-		return -EACCES;
-	return 0;
+
+	return ret;
 }
 
 static int
-- 
cgit v0.10.2


From 825213e59ec24110b0a0f94456db42621f928421 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sat, 3 Oct 2015 08:19:57 -0400
Subject: nfsd: move include of state.h from trace.c to trace.h

Any file which includes trace.h will need to include state.h, even if
they aren't using any state tracepoints. Ensure that we include any
headers that might be needed in trace.h instead of relying on the
*.c files to have the right ones.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
index 82f8907..9096746 100644
--- a/fs/nfsd/trace.c
+++ b/fs/nfsd/trace.c
@@ -1,5 +1,3 @@
 
-#include "state.h"
-
 #define CREATE_TRACE_POINTS
 #include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c668520..0befe76 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,6 +9,8 @@
 
 #include <linux/tracepoint.h>
 
+#include "state.h"
+
 DECLARE_EVENT_CLASS(nfsd_stateid_class,
 	TP_PROTO(stateid_t *stp),
 	TP_ARGS(stp),
-- 
cgit v0.10.2


From 778620364ef525e83597a6edee4d0a69db67fd3d Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.com>
Date: Fri, 16 Oct 2015 08:59:08 +1100
Subject: sunrpc/cache: make cache flushing more reliable.

The caches used to store sunrpc authentication information can be
flushed by writing a timestamp to a file in /proc.

This timestamp has a one-second resolution and any entry in cache that
was last_refreshed *before* that time is treated as expired.

This is problematic as it is not possible to reliably flush the cache
without interrupting NFS service.
If the current time is written to the "flush" file, any entry that was
added since the current second started will still be treated as valid.
If one second beyond than the current time is written to the file
then no entries can be valid until the second ticks over.  This will
mean that no NFS request will be handled for up to 1 second.

To resolve this issue we make two changes:

1/ treat an entry as expired if the timestamp when it was last_refreshed
  is before *or the same as* the expiry time.  This means that current
  code which writes out the current time will now flush the cache
  reliably.

2/ when a new entry in added to the cache -  set the last_refresh timestamp
  to 1 second *beyond* the current flush time, when that not in the
  past.
  This ensures that newly added entries will always be valid.

Now that we have a very reliable way to flush the cache, and also
since we are using "since-boot" timestamps which are monotonic,
change cache_purge() to set the smallest future flush_time which
will work, and leave it there: don't revert to '1'.

Also disable the setting of the 'flush_time' far into the future.
That has never been useful and is now awkward as it would cause
last_refresh times to be strange.
Finally: if a request is made to set the 'flush_time' to the current
second, assume the intent is to flush the cache and advance it, if
necessary, to 1 second beyond the current 'flush_time' so that all
active entries will be deemed to be expired.

As part of this we need to add a 'cache_detail' arg to cache_init()
and cache_fresh_locked() so they can find the current ->flush_time.

Signed-off-by: NeilBrown <neilb@suse.com>
Reported-by: Olaf Kirch <okir@suse.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 03d3b4c..ed03c9f 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -48,8 +48,10 @@
 struct cache_head {
 	struct hlist_node	cache_list;
 	time_t		expiry_time;	/* After time time, don't use the data */
-	time_t		last_refresh;   /* If CACHE_PENDING, this is when upcall 
-					 * was sent, else this is when update was received
+	time_t		last_refresh;   /* If CACHE_PENDING, this is when upcall was
+					 * sent, else this is when update was
+					 * received, though it is alway set to
+					 * be *after* ->flush_time.
 					 */
 	struct kref	ref;
 	unsigned long	flags;
@@ -105,8 +107,12 @@ struct cache_detail {
 	/* fields below this comment are for internal use
 	 * and should not be touched by cache owners
 	 */
-	time_t			flush_time;		/* flush all cache items with last_refresh
-							 * earlier than this */
+	time_t			flush_time;		/* flush all cache items with
+							 * last_refresh at or earlier
+							 * than this.  last_refresh
+							 * is never set at or earlier
+							 * than this.
+							 */
 	struct list_head	others;
 	time_t			nextcheck;
 	int			entries;
@@ -203,7 +209,7 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
 {
 	return  (h->expiry_time < seconds_since_boot()) ||
-		(detail->flush_time > h->last_refresh);
+		(detail->flush_time >= h->last_refresh);
 }
 
 extern int cache_check(struct cache_detail *detail,
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4a2340a..5e4f815 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -41,13 +41,16 @@
 static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
 static void cache_revisit_request(struct cache_head *item);
 
-static void cache_init(struct cache_head *h)
+static void cache_init(struct cache_head *h, struct cache_detail *detail)
 {
 	time_t now = seconds_since_boot();
 	INIT_HLIST_NODE(&h->cache_list);
 	h->flags = 0;
 	kref_init(&h->ref);
 	h->expiry_time = now + CACHE_NEW_EXPIRY;
+	if (now <= detail->flush_time)
+		/* ensure it isn't already expired */
+		now = detail->flush_time + 1;
 	h->last_refresh = now;
 }
 
@@ -81,7 +84,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 	 * we might get lose if we need to
 	 * cache_put it soon.
 	 */
-	cache_init(new);
+	cache_init(new, detail);
 	detail->init(new, key);
 
 	write_lock(&detail->hash_lock);
@@ -116,10 +119,15 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
 
 static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
 
-static void cache_fresh_locked(struct cache_head *head, time_t expiry)
+static void cache_fresh_locked(struct cache_head *head, time_t expiry,
+			       struct cache_detail *detail)
 {
+	time_t now = seconds_since_boot();
+	if (now <= detail->flush_time)
+		/* ensure it isn't immediately treated as expired */
+		now = detail->flush_time + 1;
 	head->expiry_time = expiry;
-	head->last_refresh = seconds_since_boot();
+	head->last_refresh = now;
 	smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */
 	set_bit(CACHE_VALID, &head->flags);
 }
@@ -149,7 +157,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 				set_bit(CACHE_NEGATIVE, &old->flags);
 			else
 				detail->update(old, new);
-			cache_fresh_locked(old, new->expiry_time);
+			cache_fresh_locked(old, new->expiry_time, detail);
 			write_unlock(&detail->hash_lock);
 			cache_fresh_unlocked(old, detail);
 			return old;
@@ -162,7 +170,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 		cache_put(old, detail);
 		return NULL;
 	}
-	cache_init(tmp);
+	cache_init(tmp, detail);
 	detail->init(tmp, old);
 
 	write_lock(&detail->hash_lock);
@@ -173,8 +181,8 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 	hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
 	detail->entries++;
 	cache_get(tmp);
-	cache_fresh_locked(tmp, new->expiry_time);
-	cache_fresh_locked(old, 0);
+	cache_fresh_locked(tmp, new->expiry_time, detail);
+	cache_fresh_locked(old, 0, detail);
 	write_unlock(&detail->hash_lock);
 	cache_fresh_unlocked(tmp, detail);
 	cache_fresh_unlocked(old, detail);
@@ -219,7 +227,8 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
 	rv = cache_is_valid(h);
 	if (rv == -EAGAIN) {
 		set_bit(CACHE_NEGATIVE, &h->flags);
-		cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
+		cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY,
+				   detail);
 		rv = -ENOENT;
 	}
 	write_unlock(&detail->hash_lock);
@@ -487,10 +496,13 @@ EXPORT_SYMBOL_GPL(cache_flush);
 
 void cache_purge(struct cache_detail *detail)
 {
-	detail->flush_time = LONG_MAX;
+	time_t now = seconds_since_boot();
+	if (detail->flush_time >= now)
+		now = detail->flush_time + 1;
+	/* 'now' is the maximum value any 'last_refresh' can have */
+	detail->flush_time = now;
 	detail->nextcheck = seconds_since_boot();
 	cache_flush();
-	detail->flush_time = 1;
 }
 EXPORT_SYMBOL_GPL(cache_purge);
 
@@ -1436,6 +1448,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
 {
 	char tbuf[20];
 	char *bp, *ep;
+	time_t then, now;
 
 	if (*ppos || count > sizeof(tbuf)-1)
 		return -EINVAL;
@@ -1447,8 +1460,22 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
 		return -EINVAL;
 
 	bp = tbuf;
-	cd->flush_time = get_expiry(&bp);
-	cd->nextcheck = seconds_since_boot();
+	then = get_expiry(&bp);
+	now = seconds_since_boot();
+	cd->nextcheck = now;
+	/* Can only set flush_time to 1 second beyond "now", or
+	 * possibly 1 second beyond flushtime.  This is because
+	 * flush_time never goes backwards so it mustn't get too far
+	 * ahead of time.
+	 */
+	if (then >= now) {
+		/* Want to flush everything, so behave like cache_purge() */
+		if (cd->flush_time >= now)
+			now = cd->flush_time + 1;
+		then = now;
+	}
+
+	cd->flush_time = then;
 	cache_flush();
 
 	*ppos += count;
-- 
cgit v0.10.2


From 2b63482185e6054cc11ca6d6c073f90160c161fd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Oct 2015 15:33:23 -0400
Subject: nfsd: fix clid_inuse on mount with security change

In bakeathon testing Solaris client was getting CLID_INUSE error when
doing a krb5 mount soon after an auth_sys mount, or vice versa.

That's not really necessary since in this case the old client doesn't
have any state any more:

	http://tools.ietf.org/html/rfc7530#page-103

	"when the server gets a SETCLIENTID for a client ID that
	currently has no state, or it has state but the lease has
	expired, rather than returning NFS4ERR_CLID_INUSE, the server
	MUST allow the SETCLIENTID and confirm the new client ID if
	followed by the appropriate SETCLIENTID_CONFIRM."

This doesn't fix the problem completely since our client_has_state()
check counts openowners left around to handle close replays, which we
should probably just remove in this case.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1b39edf..bac6207 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2262,7 +2262,8 @@ static bool client_has_state(struct nfs4_client *clp)
 	 * Note clp->cl_openowners check isn't quite right: there's no
 	 * need to count owners without stateid's.
 	 *
-	 * Also note we should probably be using this in 4.0 case too.
+	 * Also note in 4.0 case should also be checking for openowners
+	 * kept around just for close handling.
 	 */
 	return !list_empty(&clp->cl_openowners)
 #ifdef CONFIG_NFSD_PNFS
@@ -3049,7 +3050,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/* Cases below refer to rfc 3530 section 14.2.33: */
 	spin_lock(&nn->client_lock);
 	conf = find_confirmed_client_by_name(&clname, nn);
-	if (conf) {
+	if (conf && client_has_state(conf)) {
 		/* case 0: */
 		status = nfserr_clid_inuse;
 		if (clp_used_exchangeid(conf))
@@ -3136,6 +3137,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	} else { /* case 3: normal case; new or rebooted client */
 		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
 		if (old) {
+			status = nfserr_clid_inuse;
+			if (client_has_state(old)
+					&& !same_creds(&unconf->cl_cred,
+							&old->cl_cred))
+				goto out;
 			status = mark_client_expired_locked(old);
 			if (status) {
 				old = NULL;
-- 
cgit v0.10.2


From 4eaea13425078272895ec37814c6878d78b8db9f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Oct 2015 16:11:01 -0400
Subject: nfsd: improve client_has_state to check for unused openowners

At least in the v4.0 case openowners can hang around for a while after
last close, but they shouldn't really block (for example), a new mount
with a different principal.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bac6207..f2ea343 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2256,16 +2256,20 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
 	clid->flags = new->cl_exchange_flags;
 }
 
+static bool client_has_openowners(struct nfs4_client *clp)
+{
+	struct nfs4_openowner *oo;
+
+	list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) {
+		if (!list_empty(&oo->oo_owner.so_stateids))
+			return true;
+	}
+	return false;
+}
+
 static bool client_has_state(struct nfs4_client *clp)
 {
-	/*
-	 * Note clp->cl_openowners check isn't quite right: there's no
-	 * need to count owners without stateid's.
-	 *
-	 * Also note in 4.0 case should also be checking for openowners
-	 * kept around just for close handling.
-	 */
-	return !list_empty(&clp->cl_openowners)
+	return client_has_openowners(clp)
 #ifdef CONFIG_NFSD_PNFS
 		|| !list_empty(&clp->cl_lo_states)
 #endif
-- 
cgit v0.10.2


From cc8a55320b5f1196bee5bd14e4bb2ebd3b983317 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 07:58:24 -0400
Subject: nfsd: serialize layout stateid morphing operations

In order to allow the client to make a sane determination of what
happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we
must ensure that the seqids return accurately represent the order of
operations. The simplest way to do that is to ensure that operations on
a single stateid are serialized.

This patch adds a mutex to the layout stateid, and locks it when
checking the layout stateid's seqid. The mutex is held over the entire
operation and released after the seqid is bumped.

Note that in the case of CB_LAYOUTRECALL we must move the increment of
the seqid and setting into a new cb "prepare" operation. The lease
infrastructure will call the lm_break callback with a spinlock held, so
and we can't take the mutex in that codepath.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ebf90e4..4a68ab9 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	INIT_LIST_HEAD(&ls->ls_perfile);
 	spin_lock_init(&ls->ls_lock);
 	INIT_LIST_HEAD(&ls->ls_layouts);
+	mutex_init(&ls->ls_mutex);
 	ls->ls_layout_type = layout_type;
 	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
 			NFSPROC4_CLNT_CB_LAYOUT);
@@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		status = nfserr_jukebox;
 		if (!ls)
 			goto out;
+		mutex_lock(&ls->ls_mutex);
 	} else {
 		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
 
 		status = nfserr_bad_stateid;
+		mutex_lock(&ls->ls_mutex);
 		if (stateid->si_generation > stid->sc_stateid.si_generation)
-			goto out_put_stid;
+			goto out_unlock_stid;
 		if (layout_type != ls->ls_layout_type)
-			goto out_put_stid;
+			goto out_unlock_stid;
 	}
 
 	*lsp = ls;
 	return 0;
 
+out_unlock_stid:
+	mutex_unlock(&ls->ls_mutex);
 out_put_stid:
 	nfs4_put_stid(stid);
 out:
@@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
 	trace_layout_recall(&ls->ls_stid.sc_stateid);
 
 	atomic_inc(&ls->ls_stid.sc_count);
-	update_stateid(&ls->ls_stid.sc_stateid);
-	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
 	nfsd4_run_cb(&ls->ls_recall);
 
 out_unlock:
@@ -494,6 +497,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 	}
 	spin_unlock(&ls->ls_lock);
 
+	mutex_unlock(&ls->ls_mutex);
 	nfs4_put_stid(&ls->ls_stid);
 	nfsd4_free_layouts(&reaplist);
 	return nfs_ok;
@@ -608,6 +612,17 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 	}
 }
 
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+	mutex_lock(&ls->ls_mutex);
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+}
+
 static int
 nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 {
@@ -649,12 +664,14 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
 
 	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
 
+	mutex_unlock(&ls->ls_mutex);
 	nfsd4_return_all_layouts(ls, &reaplist);
 	nfsd4_free_layouts(&reaplist);
 	nfs4_put_stid(&ls->ls_stid);
 }
 
 static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+	.prepare	= nfsd4_cb_layout_prepare,
 	.done		= nfsd4_cb_layout_done,
 	.release	= nfsd4_cb_layout_release,
 };
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4ce6b97..a9f096c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1309,6 +1309,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
 	nfserr = nfsd4_insert_layout(lgp, ls);
 
 out_put_stid:
+	mutex_unlock(&ls->ls_mutex);
 	nfs4_put_stid(&ls->ls_stid);
 out:
 	return nfserr;
@@ -1362,6 +1363,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/* LAYOUTCOMMIT does not require any serialization */
+	mutex_unlock(&ls->ls_mutex);
+
 	if (new_size > i_size_read(inode)) {
 		lcp->lc_size_chg = 1;
 		lcp->lc_newsize = new_size;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 31bde12..1fa0f38 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -562,6 +562,7 @@ struct nfs4_layout_stateid {
 	struct nfsd4_callback		ls_recall;
 	stateid_t			ls_recall_sid;
 	bool				ls_recalled;
+	struct mutex			ls_mutex;
 };
 
 static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
-- 
cgit v0.10.2


From 9767feb2c64b29775f1ea683130b44f95f67d169 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 1 Oct 2015 09:05:50 -0400
Subject: nfsd: ensure that seqid morphing operations are atomic wrt to copies

Bruce points out that the increment of the seqid in stateids is not
serialized in any way, so it's possible for racing calls to bump it
twice and end up sending the same stateid. While we don't have any
reports of this problem it _is_ theoretically possible, and could lead
to spurious state recovery by the client.

In the current code, update_stateid is always followed by a memcpy of
that stateid, so we can combine the two operations. For better
atomicity, we add a spinlock to the nfs4_stid and hold that when bumping
the seqid and copying the stateid.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 4a68ab9..9ffef06 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -409,8 +409,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
 	list_add_tail(&new->lo_perstate, &ls->ls_layouts);
 	new = NULL;
 done:
-	update_stateid(&ls->ls_stid.sc_stateid);
-	memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid);
 	spin_unlock(&ls->ls_lock);
 out:
 	spin_unlock(&fp->fi_lock);
@@ -484,11 +483,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 		}
 	}
 	if (!list_empty(&ls->ls_layouts)) {
-		if (found) {
-			update_stateid(&ls->ls_stid.sc_stateid);
-			memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
-				sizeof(stateid_t));
-		}
+		if (found)
+			nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
 		lrp->lrs_present = 1;
 	} else {
 		trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
@@ -619,8 +615,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
 
 	mutex_lock(&ls->ls_mutex);
-	update_stateid(&ls->ls_stid.sc_stateid);
-	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
 }
 
 static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f2ea343..0a69715 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
 	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
 	/* Will be incremented before return to client: */
 	atomic_set(&stid->sc_count, 1);
+	spin_lock_init(&stid->sc_lock);
 
 	/*
 	 * It shouldn't be a problem to reuse an opaque stateid value.
@@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s)
 		put_nfs4_file(fp);
 }
 
+void
+nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
+{
+	stateid_t *src = &stid->sc_stateid;
+
+	spin_lock(&stid->sc_lock);
+	if (unlikely(++src->si_generation == 0))
+		src->si_generation = 1;
+	memcpy(dst, src, sizeof(*dst));
+	spin_unlock(&stid->sc_lock);
+}
+
 static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 {
 	struct file *filp = NULL;
@@ -4221,8 +4234,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		if (stp->st_clnt_odstate == open->op_odstate)
 			open->op_odstate = NULL;
 	}
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
 	up_read(&stp->st_rwsem);
 
 	if (nfsd4_has_session(&resp->cstate)) {
@@ -4925,8 +4937,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto put_stateid;
 	}
 	oo->oo_flags |= NFS4_OO_CONFIRMED;
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
 	up_write(&stp->st_rwsem);
 	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
 		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -4999,11 +5010,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		goto put_stateid;
 	}
 	nfs4_stateid_downgrade(stp, od->od_share_access);
-
 	reset_union_bmap_deny(od->od_share_deny, stp);
-
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
 	status = nfs_ok;
 put_stateid:
 	up_write(&stp->st_rwsem);
@@ -5058,8 +5066,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfsd4_bump_seqid(cstate, status);
 	if (status)
 		goto out; 
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 	up_write(&stp->st_rwsem);
 
 	nfsd4_close_open_stateid(stp);
@@ -5542,9 +5549,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
 	switch (-err) {
 	case 0: /* success! */
-		update_stateid(&lock_stp->st_stid.sc_stateid);
-		memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, 
-				sizeof(stateid_t));
+		nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
 		status = 0;
 		break;
 	case (EAGAIN):		/* conflock holds conflicting lock */
@@ -5736,8 +5741,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
 fput:
 	fput(filp);
 put_stateid:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1fa0f38..77fdf4d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -84,7 +84,7 @@ struct nfsd4_callback_ops {
  * fields that are of general use to any stateid.
  */
 struct nfs4_stid {
-	atomic_t sc_count;
+	atomic_t		sc_count;
 #define NFS4_OPEN_STID 1
 #define NFS4_LOCK_STID 2
 #define NFS4_DELEG_STID 4
@@ -94,11 +94,12 @@ struct nfs4_stid {
 #define NFS4_REVOKED_DELEG_STID 16
 #define NFS4_CLOSED_DELEG_STID 32
 #define NFS4_LAYOUT_STID 64
-	unsigned char sc_type;
-	stateid_t sc_stateid;
-	struct nfs4_client *sc_client;
-	struct nfs4_file *sc_file;
-	void (*sc_free)(struct nfs4_stid *);
+	unsigned char		sc_type;
+	stateid_t		sc_stateid;
+	spinlock_t		sc_lock;
+	struct nfs4_client	*sc_client;
+	struct nfs4_file	*sc_file;
+	void			(*sc_free)(struct nfs4_stid *);
 };
 
 /*
@@ -364,15 +365,6 @@ struct nfs4_client_reclaim {
 	char			cr_recdir[HEXDIR_LEN]; /* recover dir */
 };
 
-static inline void
-update_stateid(stateid_t *stateid)
-{
-	stateid->si_generation++;
-	/* Wraparound recommendation from 3530bis-13 9.1.3.2: */
-	if (stateid->si_generation == 0)
-		stateid->si_generation = 1;
-}
-
 /* A reasonable value for REPLAY_ISIZE was estimated as follows:  
  * The OPEN response, typically the largest, requires 
  *   4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) +  8(verifier) + 
@@ -595,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
 		struct kmem_cache *slab);
 void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
 extern void nfs4_release_reclaim(struct nfsd_net *);
 extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
-- 
cgit v0.10.2


From ea833f5de3ab49a0aac79a16633fd510390b83a6 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 27 Oct 2015 10:23:30 +0000
Subject: SUNRPC: drop stale comment in svc_setup_socket()

The svc_setup_socket() function does set the send and receive buffer
sizes, so the comment is out-of-date:

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 4892373..e0c7b33 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1367,7 +1367,6 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
 
 /*
  * Initialize socket for RPC use and create svc_sock struct
- * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
  */
 static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 						struct socket *sock,
-- 
cgit v0.10.2


From 3e80dbcda7f3e1e349a779d7a14c0e08677c39fa Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 4 Nov 2015 11:02:29 -0500
Subject: nfsd: remove recurring workqueue job to clean DRC

We have a shrinker, we clean out the cache when nfsd is shut down, and
prune the chains on each request. A recurring workqueue job seems like
unnecessary overhead. Just remove it.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 116940c..54cde9a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -63,7 +63,6 @@ static unsigned int		longest_chain;
 static unsigned int		longest_chain_cachesize;
 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
-static void	cache_cleaner_func(struct work_struct *unused);
 static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc);
 static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
@@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
 };
 
 /*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
- */
-static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-
-/*
  * Put a cap on the size of the DRC based on the amount of available
  * low memory in the machine.
  *
@@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void)
 	unsigned int i;
 
 	unregister_shrinker(&nfsd_reply_cache_shrinker);
-	cancel_delayed_work_sync(&cache_cleaner);
 
 	for (i = 0; i < drc_hashsize; i++) {
 		struct list_head *head = &drc_hashtbl[i].lru_head;
@@ -230,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
 {
 	rp->c_timestamp = jiffies;
 	list_move_tail(&rp->c_lru, &b->lru_head);
-	schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
 }
 
 static long
@@ -264,7 +254,6 @@ prune_cache_entries(void)
 {
 	unsigned int i;
 	long freed = 0;
-	bool cancel = true;
 
 	for (i = 0; i < drc_hashsize; i++) {
 		struct nfsd_drc_bucket *b = &drc_hashtbl[i];
@@ -273,26 +262,11 @@ prune_cache_entries(void)
 			continue;
 		spin_lock(&b->cache_lock);
 		freed += prune_bucket(b);
-		if (!list_empty(&b->lru_head))
-			cancel = false;
 		spin_unlock(&b->cache_lock);
 	}
-
-	/*
-	 * Conditionally rearm the job to run in RC_EXPIRE since we just
-	 * ran the pruner.
-	 */
-	if (!cancel)
-		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
 	return freed;
 }
 
-static void
-cache_cleaner_func(struct work_struct *unused)
-{
-	prune_cache_entries();
-}
-
 static unsigned long
 nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-- 
cgit v0.10.2


From 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 Mon Sep 17 00:00:00 2001
From: Andrew Elble <aweits@rit.edu>
Date: Thu, 15 Oct 2015 12:07:28 -0400
Subject: nfsd: eliminate sending duplicate and repeated delegations

We've observed the nfsd server in a state where there are
multiple delegations on the same nfs4_file for the same client.
The nfs client does attempt to DELEGRETURN these when they are presented to
it - but apparently under some (unknown) circumstances the client does not
manage to return all of them. This leads to the eventual
attempt to CB_RECALL more than one delegation with the same nfs
filehandle to the same client. The first recall will succeed, but the
next recall will fail with NFS4ERR_BADHANDLE. This leads to the server
having delegations on cl_revoked that the client has no way to FREE
or DELEGRETURN, with resulting inability to recover. The state manager
on the server will continually assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED,
and the state manager on the client will be looping unable to satisfy
the server.

List discussion also reports a race between OPEN and DELEGRETURN that
will be avoided by only sending the delegation once to the
client. This is also logically in accordance with RFC5561 9.1.1 and 10.2.

So, let's:

1.) Not hand out duplicate delegations.
2.) Only send them to the client once.

RFC 5561:

9.1.1:
"Delegations and layouts, on the other hand, are not associated with a
specific owner but are associated with the client as a whole
(identified by a client ID)."

10.2:
"...the stateid for a delegation is associated with a client ID and may be
used on behalf of all the open-owners for the given client.  A
delegation is made to the client as a whole and not to any specific
process or thread of control within it."

Reported-by: Eric Meddaugh <etmsys@rit.edu>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Olga Kornievskaia <aglo@umich.edu>
Signed-off-by: Andrew Elble <aweits@rit.edu>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0a69715..6411c34 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -778,16 +778,68 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
 	s->sc_type = 0;
 }
 
-static void
+/**
+ * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * @clp:     a pointer to the nfs4_client we're granting a delegation to
+ * @fp:      a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ *      On success: NULL if an existing delegation was not found.
+ *
+ *      On error: -EAGAIN if one was previously granted to this nfs4_client
+ *                 for this nfs4_file.
+ *
+ */
+
+static int
+nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_delegation *searchdp = NULL;
+	struct nfs4_client *searchclp = NULL;
+
+	lockdep_assert_held(&state_lock);
+	lockdep_assert_held(&fp->fi_lock);
+
+	list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
+		searchclp = searchdp->dl_stid.sc_client;
+		if (clp == searchclp) {
+			return -EAGAIN;
+		}
+	}
+	return 0;
+}
+
+/**
+ * hash_delegation_locked - Add a delegation to the appropriate lists
+ * @dp:     a pointer to the nfs4_delegation we are adding.
+ * @fp:     a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ *      On success: NULL if the delegation was successfully hashed.
+ *
+ *      On error: -EAGAIN if one was previously granted to this
+ *                 nfs4_client for this nfs4_file. Delegation is not hashed.
+ *
+ */
+
+static int
 hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 {
+	int status;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
+
 	lockdep_assert_held(&state_lock);
 	lockdep_assert_held(&fp->fi_lock);
 
+	status = nfs4_get_existing_delegation(clp, fp);
+	if (status)
+		return status;
+	++fp->fi_delegees;
 	atomic_inc(&dp->dl_stid.sc_count);
 	dp->dl_stid.sc_type = NFS4_DELEG_STID;
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
-	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+	list_add(&dp->dl_perclnt, &clp->cl_delegations);
+	return 0;
 }
 
 static bool
@@ -3969,6 +4021,18 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
 	return fl;
 }
 
+/**
+ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
+ * @dp:   a pointer to the nfs4_delegation we're adding.
+ *
+ * Return:
+ *      On success: Return code will be 0 on success.
+ *
+ *      On error: -EAGAIN if there was an existing delegation.
+ *                 nonzero if there is an error in other cases.
+ *
+ */
+
 static int nfs4_setlease(struct nfs4_delegation *dp)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
@@ -4000,16 +4064,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
 		goto out_unlock;
 	/* Race breaker */
 	if (fp->fi_deleg_file) {
-		status = 0;
-		++fp->fi_delegees;
-		hash_delegation_locked(dp, fp);
+		status = hash_delegation_locked(dp, fp);
 		goto out_unlock;
 	}
 	fp->fi_deleg_file = filp;
-	fp->fi_delegees = 1;
-	hash_delegation_locked(dp, fp);
+	fp->fi_delegees = 0;
+	status = hash_delegation_locked(dp, fp);
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
+	if (status) {
+		/* Should never happen, this is a new fi_deleg_file  */
+		WARN_ON_ONCE(1);
+		goto out_fput;
+	}
 	return 0;
 out_unlock:
 	spin_unlock(&fp->fi_lock);
@@ -4029,6 +4096,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 	if (fp->fi_had_conflict)
 		return ERR_PTR(-EAGAIN);
 
+	spin_lock(&state_lock);
+	spin_lock(&fp->fi_lock);
+	status = nfs4_get_existing_delegation(clp, fp);
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&state_lock);
+
+	if (status)
+		return ERR_PTR(status);
+
 	dp = alloc_init_deleg(clp, fh, odstate);
 	if (!dp)
 		return ERR_PTR(-ENOMEM);
@@ -4047,9 +4123,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 		status = -EAGAIN;
 		goto out_unlock;
 	}
-	++fp->fi_delegees;
-	hash_delegation_locked(dp, fp);
-	status = 0;
+	status = hash_delegation_locked(dp, fp);
 out_unlock:
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
-- 
cgit v0.10.2


From 7fc0564e3a8d16df096f48c9c6425ba84d945c6e Mon Sep 17 00:00:00 2001
From: Andrew Elble <aweits@rit.edu>
Date: Thu, 5 Nov 2015 20:42:43 -0500
Subject: nfsd: fix race with open / open upgrade stateids

We observed multiple open stateids on the server for files that
seemingly should have been closed.

nfsd4_process_open2() tests for the existence of a preexisting
stateid. If one is not found, the locks are dropped and a new
one is created. The problem is that init_open_stateid(), which
is also responsible for hashing the newly initialized stateid,
doesn't check to see if another open has raced in and created
a matching stateid. This fix is to enable init_open_stateid() to
return the matching stateid and have nfsd4_process_open2()
swap to that stateid and switch to the open upgrade path.
In testing this patch, coverage to the newly created
path indicates that the race was indeed happening.

Signed-off-by: Andrew Elble <aweits@rit.edu>
Reviewed-by: Jeff Layton <jlayton@poochiereds.net>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6411c34..6b800b5b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3392,6 +3392,27 @@ static const struct nfs4_stateowner_operations openowner_ops = {
 	.so_free =	nfs4_free_openowner,
 };
 
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+	struct nfs4_ol_stateid *local, *ret = NULL;
+	struct nfs4_openowner *oo = open->op_openowner;
+
+	lockdep_assert_held(&fp->fi_lock);
+
+	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
+		/* ignore lock owners */
+		if (local->st_stateowner->so_is_open_owner == 0)
+			continue;
+		if (local->st_stateowner == &oo->oo_owner) {
+			ret = local;
+			atomic_inc(&ret->st_stid.sc_count);
+			break;
+		}
+	}
+	return ret;
+}
+
 static struct nfs4_openowner *
 alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
 			   struct nfsd4_compound_state *cstate)
@@ -3423,9 +3444,20 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
 	return ret;
 }
 
-static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+static struct nfs4_ol_stateid *
+init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
+		struct nfsd4_open *open)
+{
+
 	struct nfs4_openowner *oo = open->op_openowner;
+	struct nfs4_ol_stateid *retstp = NULL;
 
+	spin_lock(&oo->oo_owner.so_client->cl_lock);
+	spin_lock(&fp->fi_lock);
+
+	retstp = nfsd4_find_existing_open(fp, open);
+	if (retstp)
+		goto out_unlock;
 	atomic_inc(&stp->st_stid.sc_count);
 	stp->st_stid.sc_type = NFS4_OPEN_STID;
 	INIT_LIST_HEAD(&stp->st_locks);
@@ -3436,12 +3468,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
 	stp->st_deny_bmap = 0;
 	stp->st_openstp = NULL;
 	init_rwsem(&stp->st_rwsem);
-	spin_lock(&oo->oo_owner.so_client->cl_lock);
 	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
-	spin_lock(&fp->fi_lock);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
+
+out_unlock:
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&oo->oo_owner.so_client->cl_lock);
+	return retstp;
 }
 
 /*
@@ -3852,27 +3885,6 @@ out:
 	return nfs_ok;
 }
 
-static struct nfs4_ol_stateid *
-nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
-{
-	struct nfs4_ol_stateid *local, *ret = NULL;
-	struct nfs4_openowner *oo = open->op_openowner;
-
-	spin_lock(&fp->fi_lock);
-	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
-		/* ignore lock owners */
-		if (local->st_stateowner->so_is_open_owner == 0)
-			continue;
-		if (local->st_stateowner == &oo->oo_owner) {
-			ret = local;
-			atomic_inc(&ret->st_stid.sc_count);
-			break;
-		}
-	}
-	spin_unlock(&fp->fi_lock);
-	return ret;
-}
-
 static inline int nfs4_access_to_access(u32 nfs4_access)
 {
 	int flags = 0;
@@ -4258,6 +4270,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
 	struct nfs4_file *fp = NULL;
 	struct nfs4_ol_stateid *stp = NULL;
+	struct nfs4_ol_stateid *swapstp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	__be32 status;
 
@@ -4271,7 +4284,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		status = nfs4_check_deleg(cl, open, &dp);
 		if (status)
 			goto out;
+		spin_lock(&fp->fi_lock);
 		stp = nfsd4_find_existing_open(fp, open);
+		spin_unlock(&fp->fi_lock);
 	} else {
 		open->op_file = NULL;
 		status = nfserr_bad_stateid;
@@ -4294,7 +4309,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	} else {
 		stp = open->op_stp;
 		open->op_stp = NULL;
-		init_open_stateid(stp, fp, open);
+		swapstp = init_open_stateid(stp, fp, open);
+		if (swapstp) {
+			nfs4_put_stid(&stp->st_stid);
+			stp = swapstp;
+			down_read(&stp->st_rwsem);
+			status = nfs4_upgrade_open(rqstp, fp, current_fh,
+						stp, open);
+			if (status) {
+				up_read(&stp->st_rwsem);
+				goto out;
+			}
+			goto upgrade_out;
+		}
 		down_read(&stp->st_rwsem);
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
 		if (status) {
@@ -4308,6 +4335,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		if (stp->st_clnt_odstate == open->op_odstate)
 			open->op_odstate = NULL;
 	}
+upgrade_out:
 	nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
 	up_read(&stp->st_rwsem);
 
-- 
cgit v0.10.2


From 0442f14b15f8e7a8b3778a9f8cf640ef89b2df26 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 9 Oct 2015 01:44:07 +0000
Subject: svcrpc: document lack of some memory barriers

We're missing memory barriers in net/sunrpc/svcsock.c in some spots we'd
expect them.  But it doesn't appear they're necessary in our case, and
this is likely a hot path--for now just document the odd behavior.

Kosuke Tatsukawa found this issue while looking through the linux source
code for places calling waitqueue_active() before wake_up*(), but
without preceding memory barriers, after sending a patch to fix a
similar issue in drivers/tty/n_tty.c  (Details about the original issue
can be found here: https://lkml.org/lkml/2015/9/28/849).

Reported-by: Kosuke Tatsukawa <tatsu@ab.jp.nec.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index e0c7b33..1413cdc 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -399,6 +399,31 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp)
 	return svc_port_is_privileged(svc_addr(rqstp));
 }
 
+static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
+{
+	if (!wq)
+		return false;
+	/*
+	 * There should normally be a memory * barrier here--see
+	 * wq_has_sleeper().
+	 *
+	 * It appears that isn't currently necessary, though, basically
+	 * because callers all appear to have sufficient memory barriers
+	 * between the time the relevant change is made and the
+	 * time they call these callbacks.
+	 *
+	 * The nfsd code itself doesn't actually explicitly wait on
+	 * these waitqueues, but it may wait on them for example in
+	 * sendpage() or sendmsg() calls.  (And those may be the only
+	 * places, since it it uses nonblocking reads.)
+	 *
+	 * Maybe we should add the memory barriers anyway, but these are
+	 * hot paths so we'd need to be convinced there's no sigificant
+	 * penalty.
+	 */
+	return waitqueue_active(wq);
+}
+
 /*
  * INET callback when data has been received on the socket.
  */
@@ -414,7 +439,7 @@ static void svc_udp_data_ready(struct sock *sk)
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
-	if (wq && waitqueue_active(wq))
+	if (sunrpc_waitqueue_active(wq))
 		wake_up_interruptible(wq);
 }
 
@@ -432,7 +457,7 @@ static void svc_write_space(struct sock *sk)
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
 
-	if (wq && waitqueue_active(wq)) {
+	if (sunrpc_waitqueue_active(wq)) {
 		dprintk("RPC svc_write_space: someone sleeping on %p\n",
 		       svsk);
 		wake_up_interruptible(wq);
@@ -787,7 +812,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
 	}
 
 	wq = sk_sleep(sk);
-	if (wq && waitqueue_active(wq))
+	if (sunrpc_waitqueue_active(wq))
 		wake_up_interruptible_all(wq);
 }
 
@@ -808,7 +833,7 @@ static void svc_tcp_state_change(struct sock *sk)
 		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
-	if (wq && waitqueue_active(wq))
+	if (sunrpc_waitqueue_active(wq))
 		wake_up_interruptible_all(wq);
 }
 
@@ -823,7 +848,7 @@ static void svc_tcp_data_ready(struct sock *sk)
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
-	if (wq && waitqueue_active(wq))
+	if (sunrpc_waitqueue_active(wq))
 		wake_up_interruptible(wq);
 }
 
@@ -1593,7 +1618,7 @@ static void svc_sock_detach(struct svc_xprt *xprt)
 	sk->sk_write_space = svsk->sk_owspace;
 
 	wq = sk_sleep(sk);
-	if (wq && waitqueue_active(wq))
+	if (sunrpc_waitqueue_active(wq))
 		wake_up_interruptible(wq);
 }
 
-- 
cgit v0.10.2