From 35a92fe8770ce54c5eb275cd76128645bea2d200 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 17 Sep 2015 07:47:08 -0400 Subject: nfsd: serialize state seqid morphing operations Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were running in parallel. The server would receive the OPEN_DOWNGRADE first and check its seqid, but then an OPEN would race in and bump it. The OPEN_DOWNGRADE would then complete and bump the seqid again. The result was that the OPEN_DOWNGRADE would be applied after the OPEN, even though it should have been rejected since the seqid changed. The only recourse we have here I think is to serialize operations that bump the seqid in a stateid, particularly when we're given a seqid in the call. To address this, we add a new rw_semaphore to the nfs4_ol_stateid struct. We do a down_write prior to checking the seqid after looking up the stateid to ensure that nothing else is going to bump it while we're operating on it. In the case of OPEN, we do a down_read, as the call doesn't contain a seqid. Those can run in parallel -- we just need to serialize them when there is a concurrent OPEN_DOWNGRADE or CLOSE. LOCK and LOCKU however always take the write lock as there is no opportunity for parallelizing those. Reported-and-Tested-by: Andrew W Elble Signed-off-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0f1d569..1b39edf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, stp->st_access_bmap = 0; stp->st_deny_bmap = 0; stp->st_openstp = NULL; + init_rwsem(&stp->st_rwsem); spin_lock(&oo->oo_owner.so_client->cl_lock); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); spin_lock(&fp->fi_lock); @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf */ if (stp) { /* Stateid was found, this is an OPEN upgrade */ + down_read(&stp->st_rwsem); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); - if (status) + if (status) { + up_read(&stp->st_rwsem); goto out; + } } else { stp = open->op_stp; open->op_stp = NULL; init_open_stateid(stp, fp, open); + down_read(&stp->st_rwsem); status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { + up_read(&stp->st_rwsem); release_open_stateid(stp); goto out; } @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + up_read(&stp->st_rwsem); if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { @@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * revoked delegations are kept only for free_stateid. */ return nfserr_bad_stateid; + down_write(&stp->st_rwsem); status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); - if (status) - return status; - return nfs4_check_fh(current_fh, &stp->st_stid); + if (status == nfs_ok) + status = nfs4_check_fh(current_fh, &stp->st_stid); + if (status != nfs_ok) + up_write(&stp->st_rwsem); + return status; } /* @@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs return status; oo = openowner(stp->st_stateowner); if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + up_write(&stp->st_rwsem); nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; } @@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; - if (oo->oo_flags & NFS4_OO_CONFIRMED) + if (oo->oo_flags & NFS4_OO_CONFIRMED) { + up_write(&stp->st_rwsem); goto put_stateid; + } oo->oo_flags |= NFS4_OO_CONFIRMED; update_stateid(&stp->st_stid.sc_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + up_write(&stp->st_rwsem); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); @@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; put_stateid: + up_write(&stp->st_rwsem); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); @@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + up_write(&stp->st_rwsem); nfsd4_close_open_stateid(stp); @@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; + init_rwsem(&stp->st_rwsem); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); spin_lock(&fp->fi_lock); @@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &open_stp, nn); if (status) goto out; + up_write(&open_stp->st_rwsem); open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, @@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); + if (status == nfs_ok) + down_write(&lock_stp->st_rwsem); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, @@ -5540,6 +5560,8 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; + up_write(&lock_stp->st_rwsem); + /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. @@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput: fput(filp); put_stateid: + up_write(&stp->st_rwsem); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 583ffc1..31bde12 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -534,15 +534,16 @@ struct nfs4_file { * Better suggestions welcome. */ struct nfs4_ol_stateid { - struct nfs4_stid st_stid; /* must be first field */ - struct list_head st_perfile; - struct list_head st_perstateowner; - struct list_head st_locks; - struct nfs4_stateowner * st_stateowner; - struct nfs4_clnt_odstate * st_clnt_odstate; - unsigned char st_access_bmap; - unsigned char st_deny_bmap; - struct nfs4_ol_stateid * st_openstp; + struct nfs4_stid st_stid; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_locks; + struct nfs4_stateowner *st_stateowner; + struct nfs4_clnt_odstate *st_clnt_odstate; + unsigned char st_access_bmap; + unsigned char st_deny_bmap; + struct nfs4_ol_stateid *st_openstp; + struct rw_semaphore st_rwsem; }; static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) -- cgit v0.10.2 From e79017ddce8273514d24728523d17033bcbeba6f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Sep 2015 14:15:15 +0200 Subject: nfsd: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) { \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); x = NULL; -} // Signed-off-by: Julia Lawall Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 46ec934..116940c 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -217,10 +217,8 @@ void nfsd_reply_cache_shutdown(void) drc_hashtbl = NULL; drc_hashsize = 0; - if (drc_slab) { - kmem_cache_destroy(drc_slab); - drc_slab = NULL; - } + kmem_cache_destroy(drc_slab); + drc_slab = NULL; } /* -- cgit v0.10.2 From fcaba026a55803dd21523e6e191ba7f59e02a737 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 17 Sep 2015 08:28:38 -0400 Subject: nfsd: move svc_fh->fh_maxsize to just after fh_handle This moves the hole in the struct down below the flags fields, which allows us to potentially add a new flag without growing the struct. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 1e90dad..b1edd6b 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -26,9 +26,9 @@ static inline ino_t u32_to_ino_t(__u32 uino) */ typedef struct svc_fh { struct knfsd_fh fh_handle; /* FH data */ + int fh_maxsize; /* max size for fh_handle */ struct dentry * fh_dentry; /* validated dentry */ struct svc_export * fh_export; /* export pointer */ - int fh_maxsize; /* max size for fh_handle */ unsigned char fh_locked; /* inode locked by us */ unsigned char fh_want_write; /* remount protection taken */ -- cgit v0.10.2 From aaf91ec148910e0c2bfd135ea19f870e7196e64f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 17 Sep 2015 08:28:39 -0400 Subject: nfsd: switch unsigned char flags in svc_fh to bools ...just for clarity. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f6e7cba..00575d7 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp) err = fh_getattr(fhp, &fhp->fh_post_attr); fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; if (err) { - fhp->fh_post_saved = 0; + fhp->fh_post_saved = false; /* Grab the ctime anyway - set_change_info might use it */ fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; } else - fhp->fh_post_saved = 1; + fhp->fh_post_saved = true; } /* diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 350041a..c1681ce 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp) fh_unlock(fhp); fhp->fh_dentry = NULL; dput(dentry); -#ifdef CONFIG_NFSD_V3 - fhp->fh_pre_saved = 0; - fhp->fh_post_saved = 0; -#endif + fh_clear_wcc(fhp); } fh_drop_write(fhp); if (exp) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index b1edd6b..2087bae 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -30,12 +30,12 @@ typedef struct svc_fh { struct dentry * fh_dentry; /* validated dentry */ struct svc_export * fh_export; /* export pointer */ - unsigned char fh_locked; /* inode locked by us */ - unsigned char fh_want_write; /* remount protection taken */ + bool fh_locked; /* inode locked by us */ + bool fh_want_write; /* remount protection taken */ #ifdef CONFIG_NFSD_V3 - unsigned char fh_post_saved; /* post-op attrs saved */ - unsigned char fh_pre_saved; /* pre-op attrs saved */ + bool fh_post_saved; /* post-op attrs saved */ + bool fh_pre_saved; /* pre-op attrs saved */ /* Pre-op attributes saved during fh_lock */ __u64 fh_pre_size; /* size before operation */ @@ -213,8 +213,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) static inline void fh_clear_wcc(struct svc_fh *fhp) { - fhp->fh_post_saved = 0; - fhp->fh_pre_saved = 0; + fhp->fh_post_saved = false; + fhp->fh_pre_saved = false; } /* @@ -231,7 +231,7 @@ fill_pre_wcc(struct svc_fh *fhp) fhp->fh_pre_ctime = inode->i_ctime; fhp->fh_pre_size = inode->i_size; fhp->fh_pre_change = inode->i_version; - fhp->fh_pre_saved = 1; + fhp->fh_pre_saved = true; } } @@ -267,7 +267,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) inode = d_inode(dentry); mutex_lock_nested(&inode->i_mutex, subclass); fill_pre_wcc(fhp); - fhp->fh_locked = 1; + fhp->fh_locked = true; } static inline void @@ -285,7 +285,7 @@ fh_unlock(struct svc_fh *fhp) if (fhp->fh_locked) { fill_post_wcc(fhp); mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); - fhp->fh_locked = 0; + fhp->fh_locked = false; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 45c0497..994d66f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1631,7 +1631,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, /* cannot use fh_lock as we need deadlock protective ordering * so do it by hand */ trap = lock_rename(tdentry, fdentry); - ffhp->fh_locked = tfhp->fh_locked = 1; + ffhp->fh_locked = tfhp->fh_locked = true; fill_pre_wcc(ffhp); fill_pre_wcc(tfhp); @@ -1681,7 +1681,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, fill_post_wcc(ffhp); fill_post_wcc(tfhp); unlock_rename(tdentry, fdentry); - ffhp->fh_locked = tfhp->fh_locked = 0; + ffhp->fh_locked = tfhp->fh_locked = false; fh_drop_write(ffhp); out: diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fee2451..fcfc48c 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -112,14 +112,14 @@ static inline int fh_want_write(struct svc_fh *fh) int ret = mnt_want_write(fh->fh_export->ex_path.mnt); if (!ret) - fh->fh_want_write = 1; + fh->fh_want_write = true; return ret; } static inline void fh_drop_write(struct svc_fh *fh) { if (fh->fh_want_write) { - fh->fh_want_write = 0; + fh->fh_want_write = false; mnt_drop_write(fh->fh_export->ex_path.mnt); } } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 9f99100..ce7362c 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -632,7 +632,7 @@ static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = fhp->fh_post_saved; + cinfo->atomic = (u32)fhp->fh_post_saved; cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); cinfo->before_change = fhp->fh_pre_change; -- cgit v0.10.2 From 0ad95472bf169a3501991f8f33f5147f792a8116 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 23 Sep 2015 15:49:29 +0300 Subject: lockd: create NSM handles per net namespace Commit cb7323fffa85 ("lockd: create and use per-net NSM RPC clients on MON/UNMON requests") introduced per-net NSM RPC clients. Unfortunately this doesn't make any sense without per-net nsm_handle. E.g. the following scenario could happen Two hosts (X and Y) in different namespaces (A and B) share the same nsm struct. 1. nsm_monitor(host_X) called => NSM rpc client created, nsm->sm_monitored bit set. 2. nsm_mointor(host-Y) called => nsm->sm_monitored already set, we just exit. Thus in namespace B ln->nsm_clnt == NULL. 3. host X destroyed => nsm->sm_count decremented to 1 4. host Y destroyed => nsm_unmonitor() => nsm_mon_unmon() => NULL-ptr dereference of *ln->nsm_clnt So this could be fixed by making per-net nsm_handles list, instead of global. Thus different net namespaces will not be able share the same nsm_handle. Signed-off-by: Andrey Ryabinin Cc: Signed-off-by: J. Bruce Fields diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 969d589..b5f3c3ab 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_get_handle(ni->sap, ni->salen, + nsm = nsm_get_handle(ni->net, ni->sap, ni->salen, ni->hostname, ni->hostname_len); if (unlikely(nsm == NULL)) { dprintk("lockd: %s failed; no nsm handle\n", @@ -534,17 +534,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache, /** * nlm_host_rebooted - Release all resources held by rebooted host + * @net: network namespace * @info: pointer to decoded results of NLM_SM_NOTIFY call * * We were notified that the specified host has rebooted. Release * all resources held by that peer. */ -void nlm_host_rebooted(const struct nlm_reboot *info) +void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info) { struct nsm_handle *nsm; struct nlm_host *host; - nsm = nsm_reboot_lookup(info); + nsm = nsm_reboot_lookup(net, info); if (unlikely(nsm == NULL)) return; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 47a32b6..6c05cd1 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -51,7 +51,6 @@ struct nsm_res { }; static const struct rpc_program nsm_program; -static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); /* @@ -264,33 +263,35 @@ void nsm_unmonitor(const struct nlm_host *host) } } -static struct nsm_handle *nsm_lookup_hostname(const char *hostname, - const size_t len) +static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles, + const char *hostname, const size_t len) { struct nsm_handle *nsm; - list_for_each_entry(nsm, &nsm_handles, sm_link) + list_for_each_entry(nsm, nsm_handles, sm_link) if (strlen(nsm->sm_name) == len && memcmp(nsm->sm_name, hostname, len) == 0) return nsm; return NULL; } -static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) +static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles, + const struct sockaddr *sap) { struct nsm_handle *nsm; - list_for_each_entry(nsm, &nsm_handles, sm_link) + list_for_each_entry(nsm, nsm_handles, sm_link) if (rpc_cmp_addr(nsm_addr(nsm), sap)) return nsm; return NULL; } -static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) +static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles, + const struct nsm_private *priv) { struct nsm_handle *nsm; - list_for_each_entry(nsm, &nsm_handles, sm_link) + list_for_each_entry(nsm, nsm_handles, sm_link) if (memcmp(nsm->sm_priv.data, priv->data, sizeof(priv->data)) == 0) return nsm; @@ -353,6 +354,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, /** * nsm_get_handle - Find or create a cached nsm_handle + * @net: network namespace * @sap: pointer to socket address of handle to find * @salen: length of socket address * @hostname: pointer to C string containing hostname to find @@ -365,11 +367,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, * @hostname cannot be found in the handle cache. Returns NULL if * an error occurs. */ -struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, +struct nsm_handle *nsm_get_handle(const struct net *net, + const struct sockaddr *sap, const size_t salen, const char *hostname, const size_t hostname_len) { struct nsm_handle *cached, *new = NULL; + struct lockd_net *ln = net_generic(net, lockd_net_id); if (hostname && memchr(hostname, '/', hostname_len) != NULL) { if (printk_ratelimit()) { @@ -384,9 +388,10 @@ retry: spin_lock(&nsm_lock); if (nsm_use_hostnames && hostname != NULL) - cached = nsm_lookup_hostname(hostname, hostname_len); + cached = nsm_lookup_hostname(&ln->nsm_handles, + hostname, hostname_len); else - cached = nsm_lookup_addr(sap); + cached = nsm_lookup_addr(&ln->nsm_handles, sap); if (cached != NULL) { atomic_inc(&cached->sm_count); @@ -400,7 +405,7 @@ retry: } if (new != NULL) { - list_add(&new->sm_link, &nsm_handles); + list_add(&new->sm_link, &ln->nsm_handles); spin_unlock(&nsm_lock); dprintk("lockd: created nsm_handle for %s (%s)\n", new->sm_name, new->sm_addrbuf); @@ -417,19 +422,22 @@ retry: /** * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle + * @net: network namespace * @info: pointer to NLMPROC_SM_NOTIFY arguments * * Returns a matching nsm_handle if found in the nsm cache. The returned * nsm_handle's reference count is bumped. Otherwise returns NULL if some * error occurred. */ -struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) +struct nsm_handle *nsm_reboot_lookup(const struct net *net, + const struct nlm_reboot *info) { struct nsm_handle *cached; + struct lockd_net *ln = net_generic(net, lockd_net_id); spin_lock(&nsm_lock); - cached = nsm_lookup_priv(&info->priv); + cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv); if (unlikely(cached == NULL)) { spin_unlock(&nsm_lock); dprintk("lockd: never saw rebooted peer '%.*s' before\n", diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 097bfa3..89fe011 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -15,6 +15,7 @@ struct lockd_net { spinlock_t nsm_clnt_lock; unsigned int nsm_users; struct rpc_clnt *nsm_clnt; + struct list_head nsm_handles; }; extern int lockd_net_id; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index d678bcc..0dff13f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -593,6 +593,7 @@ static int lockd_init_net(struct net *net) INIT_LIST_HEAD(&ln->lockd_manager.list); ln->lockd_manager.block_opens = false; spin_lock_init(&ln->nsm_clnt_lock); + INIT_LIST_HEAD(&ln->nsm_handles); return 0; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index b147d1a..09c576f 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - nlm_host_rebooted(argp); + nlm_host_rebooted(SVC_NET(rqstp), argp); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 21171f0..fb26b9f 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - nlm_host_rebooted(argp); + nlm_host_rebooted(SVC_NET(rqstp), argp); return rpc_success; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index ff82a32..fd3b65b 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -235,7 +235,8 @@ void nlm_rebind_host(struct nlm_host *); struct nlm_host * nlm_get_host(struct nlm_host *); void nlm_shutdown_hosts(void); void nlm_shutdown_hosts_net(struct net *net); -void nlm_host_rebooted(const struct nlm_reboot *); +void nlm_host_rebooted(const struct net *net, + const struct nlm_reboot *); /* * Host monitoring @@ -243,11 +244,13 @@ void nlm_host_rebooted(const struct nlm_reboot *); int nsm_monitor(const struct nlm_host *host); void nsm_unmonitor(const struct nlm_host *host); -struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, +struct nsm_handle *nsm_get_handle(const struct net *net, + const struct sockaddr *sap, const size_t salen, const char *hostname, const size_t hostname_len); -struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info); +struct nsm_handle *nsm_reboot_lookup(const struct net *net, + const struct nlm_reboot *info); void nsm_release(struct nsm_handle *nsm); /* -- cgit v0.10.2 From 226453d8cfbd5f3bbdcef0d71e67186715bb811d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 7 Oct 2015 13:54:42 -0400 Subject: SUNRPC: Use MSG_SENDPAGE_NOTLAST when calling sendpage() If we're sending more pages via kernel_sendpage(), then set MSG_SENDPAGE_NOTLAST. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0c81202..4892373 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -181,7 +181,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr, struct page **ppage = xdr->pages; size_t base = xdr->page_base; unsigned int pglen = xdr->page_len; - unsigned int flags = MSG_MORE; + unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; int slen; int len = 0; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1a85e0e..f264962 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -360,8 +360,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i int flags = XS_SENDMSG_FLAGS; remainder -= len; - if (remainder != 0 || more) + if (more) flags |= MSG_MORE; + if (remainder != 0) + flags |= MSG_SENDPAGE_NOTLAST | MSG_MORE; err = do_sendpage(sock, *ppage, base, len, flags); if (remainder == 0 || err != len) break; -- cgit v0.10.2 From 0d0f4aab4e4d290138a4ae7f2ef8469e48c9a669 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 7 Oct 2015 14:39:55 +0300 Subject: lockd: get rid of reference-counted NSM RPC clients Currently we have reference-counted per-net NSM RPC client which created on the first monitor request and destroyed after the last unmonitor request. It's needed because RPC client need to know 'utsname()->nodename', but utsname() might be NULL when nsm_unmonitor() called. So instead of holding the rpc client we could just save nodename in struct nlm_host and pass it to the rpc_create(). Thus ther is no need in keeping rpc client until last unmonitor request. We could create separate RPC clients for each monitor/unmonitor requests. Signed-off-by: Andrey Ryabinin Signed-off-by: J. Bruce Fields diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b5f3c3ab..d716c99 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_nsmhandle = nsm; host->h_addrbuf = nsm->sm_addrbuf; host->net = ni->net; + strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); out: return host; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 6c05cd1..19166d4 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -42,7 +42,7 @@ struct nsm_args { u32 proc; char *mon_name; - char *nodename; + const char *nodename; }; struct nsm_res { @@ -86,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename) return rpc_create(&args); } -static struct rpc_clnt *nsm_client_set(struct lockd_net *ln, - struct rpc_clnt *clnt) -{ - spin_lock(&ln->nsm_clnt_lock); - if (ln->nsm_users == 0) { - if (clnt == NULL) - goto out; - ln->nsm_clnt = clnt; - } - clnt = ln->nsm_clnt; - ln->nsm_users++; -out: - spin_unlock(&ln->nsm_clnt_lock); - return clnt; -} - -static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename) -{ - struct rpc_clnt *clnt, *new; - struct lockd_net *ln = net_generic(net, lockd_net_id); - - clnt = nsm_client_set(ln, NULL); - if (clnt != NULL) - goto out; - - clnt = new = nsm_create(net, nodename); - if (IS_ERR(clnt)) - goto out; - - clnt = nsm_client_set(ln, new); - if (clnt != new) - rpc_shutdown_client(new); -out: - return clnt; -} - -static void nsm_client_put(struct net *net) -{ - struct lockd_net *ln = net_generic(net, lockd_net_id); - struct rpc_clnt *clnt = NULL; - - spin_lock(&ln->nsm_clnt_lock); - ln->nsm_users--; - if (ln->nsm_users == 0) { - clnt = ln->nsm_clnt; - ln->nsm_clnt = NULL; - } - spin_unlock(&ln->nsm_clnt_lock); - if (clnt != NULL) - rpc_shutdown_client(clnt); -} - static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, - struct rpc_clnt *clnt) + const struct nlm_host *host) { int status; + struct rpc_clnt *clnt; struct nsm_args args = { .priv = &nsm->sm_priv, .prog = NLM_PROGRAM, .vers = 3, .proc = NLMPROC_NSM_NOTIFY, .mon_name = nsm->sm_mon_name, - .nodename = clnt->cl_nodename, + .nodename = host->nodename, }; struct rpc_message msg = { .rpc_argp = &args, @@ -157,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, memset(res, 0, sizeof(*res)); + clnt = nsm_create(host->net, host->nodename); + if (IS_ERR(clnt)) { + dprintk("lockd: failed to create NSM upcall transport, " + "status=%ld, net=%p\n", PTR_ERR(clnt), host->net); + return PTR_ERR(clnt); + } + msg.rpc_proc = &clnt->cl_procinfo[proc]; status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); if (status == -ECONNREFUSED) { @@ -170,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, status); else status = 0; + + rpc_shutdown_client(clnt); return status; } @@ -189,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host) struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; int status; - struct rpc_clnt *clnt; - const char *nodename = NULL; dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); if (nsm->sm_monitored) return 0; - if (host->h_rpcclnt) - nodename = host->h_rpcclnt->cl_nodename; - /* * Choose whether to record the caller_name or IP address of * this peer in the local rpc.statd's database. */ nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - clnt = nsm_client_get(host->net, nodename); - if (IS_ERR(clnt)) { - status = PTR_ERR(clnt); - dprintk("lockd: failed to create NSM upcall transport, " - "status=%d, net=%p\n", status, host->net); - return status; - } - - status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt); + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host); if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { @@ -246,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host) if (atomic_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { - struct lockd_net *ln = net_generic(host->net, lockd_net_id); - dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); - status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt); + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host); if (res.status != 0) status = -EIO; if (status < 0) @@ -258,8 +201,6 @@ void nsm_unmonitor(const struct nlm_host *host) nsm->sm_name); else nsm->sm_monitored = 0; - - nsm_client_put(host->net); } } diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 89fe011..5426189 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -12,9 +12,6 @@ struct lockd_net { struct delayed_work grace_period_end; struct lock_manager lockd_manager; - spinlock_t nsm_clnt_lock; - unsigned int nsm_users; - struct rpc_clnt *nsm_clnt; struct list_head nsm_handles; }; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 0dff13f..5f31ebd 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -592,7 +592,6 @@ static int lockd_init_net(struct net *net) INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->lockd_manager.list); ln->lockd_manager.block_opens = false; - spin_lock_init(&ln->nsm_clnt_lock); INIT_LIST_HEAD(&ln->nsm_handles); return 0; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index fd3b65b..c153738 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -68,6 +68,7 @@ struct nlm_host { struct nsm_handle *h_nsmhandle; /* NSM status handle */ char *h_addrbuf; /* address eyecatcher */ struct net *net; /* host net */ + char nodename[UNX_MAXNODENAME + 1]; }; /* -- cgit v0.10.2 From cc6a7aab5570beef884ff95f7cade6634bf815a1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Oct 2015 16:13:45 +0200 Subject: sunrpc: avoid warning in gss_key_timeout The gss_key_timeout() function causes a harmless warning in some configurations, e.g. ARM imx_v6_v7_defconfig with gcc-5.2, if the compiler cannot figure out the state of the 'expire' variable across an rcu_read_unlock(): net/sunrpc/auth_gss/auth_gss.c: In function 'gss_key_timeout': net/sunrpc/auth_gss/auth_gss.c:1422:211: warning: 'expire' may be used uninitialized in this function [-Wmaybe-uninitialized] To avoid this warning without adding a bogus initialization, this rewrites the function so the comparison is done inside of the critical section. As a side-effect, it also becomes slightly easier to understand because the implementation now more closely resembles the comment above it. Signed-off-by: Arnd Bergmann Fixes: c5e6aecd034e7 ("sunrpc: fix RCU handling of gc_ctx field") Signed-off-by: J. Bruce Fields diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index dace13d..799e65b 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1411,17 +1411,16 @@ gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; - unsigned long now = jiffies; - unsigned long expire; + unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ); + int ret = 0; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); - if (ctx) - expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ); + if (!ctx || time_after(timeout, ctx->gc_expiry)) + ret = -EACCES; rcu_read_unlock(); - if (!ctx || time_after(now, expire)) - return -EACCES; - return 0; + + return ret; } static int -- cgit v0.10.2 From 825213e59ec24110b0a0f94456db42621f928421 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 3 Oct 2015 08:19:57 -0400 Subject: nfsd: move include of state.h from trace.c to trace.h Any file which includes trace.h will need to include state.h, even if they aren't using any state tracepoints. Ensure that we include any headers that might be needed in trace.h instead of relying on the *.c files to have the right ones. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c index 82f8907..9096746 100644 --- a/fs/nfsd/trace.c +++ b/fs/nfsd/trace.c @@ -1,5 +1,3 @@ -#include "state.h" - #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c668520..0befe76 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,6 +9,8 @@ #include +#include "state.h" + DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), TP_ARGS(stp), -- cgit v0.10.2 From 778620364ef525e83597a6edee4d0a69db67fd3d Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Fri, 16 Oct 2015 08:59:08 +1100 Subject: sunrpc/cache: make cache flushing more reliable. The caches used to store sunrpc authentication information can be flushed by writing a timestamp to a file in /proc. This timestamp has a one-second resolution and any entry in cache that was last_refreshed *before* that time is treated as expired. This is problematic as it is not possible to reliably flush the cache without interrupting NFS service. If the current time is written to the "flush" file, any entry that was added since the current second started will still be treated as valid. If one second beyond than the current time is written to the file then no entries can be valid until the second ticks over. This will mean that no NFS request will be handled for up to 1 second. To resolve this issue we make two changes: 1/ treat an entry as expired if the timestamp when it was last_refreshed is before *or the same as* the expiry time. This means that current code which writes out the current time will now flush the cache reliably. 2/ when a new entry in added to the cache - set the last_refresh timestamp to 1 second *beyond* the current flush time, when that not in the past. This ensures that newly added entries will always be valid. Now that we have a very reliable way to flush the cache, and also since we are using "since-boot" timestamps which are monotonic, change cache_purge() to set the smallest future flush_time which will work, and leave it there: don't revert to '1'. Also disable the setting of the 'flush_time' far into the future. That has never been useful and is now awkward as it would cause last_refresh times to be strange. Finally: if a request is made to set the 'flush_time' to the current second, assume the intent is to flush the cache and advance it, if necessary, to 1 second beyond the current 'flush_time' so that all active entries will be deemed to be expired. As part of this we need to add a 'cache_detail' arg to cache_init() and cache_fresh_locked() so they can find the current ->flush_time. Signed-off-by: NeilBrown Reported-by: Olaf Kirch Signed-off-by: J. Bruce Fields diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 03d3b4c..ed03c9f 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -48,8 +48,10 @@ struct cache_head { struct hlist_node cache_list; time_t expiry_time; /* After time time, don't use the data */ - time_t last_refresh; /* If CACHE_PENDING, this is when upcall - * was sent, else this is when update was received + time_t last_refresh; /* If CACHE_PENDING, this is when upcall was + * sent, else this is when update was + * received, though it is alway set to + * be *after* ->flush_time. */ struct kref ref; unsigned long flags; @@ -105,8 +107,12 @@ struct cache_detail { /* fields below this comment are for internal use * and should not be touched by cache owners */ - time_t flush_time; /* flush all cache items with last_refresh - * earlier than this */ + time_t flush_time; /* flush all cache items with + * last_refresh at or earlier + * than this. last_refresh + * is never set at or earlier + * than this. + */ struct list_head others; time_t nextcheck; int entries; @@ -203,7 +209,7 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd) static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) { return (h->expiry_time < seconds_since_boot()) || - (detail->flush_time > h->last_refresh); + (detail->flush_time >= h->last_refresh); } extern int cache_check(struct cache_detail *detail, diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 4a2340a..5e4f815 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -41,13 +41,16 @@ static bool cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); -static void cache_init(struct cache_head *h) +static void cache_init(struct cache_head *h, struct cache_detail *detail) { time_t now = seconds_since_boot(); INIT_HLIST_NODE(&h->cache_list); h->flags = 0; kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; + if (now <= detail->flush_time) + /* ensure it isn't already expired */ + now = detail->flush_time + 1; h->last_refresh = now; } @@ -81,7 +84,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, * we might get lose if we need to * cache_put it soon. */ - cache_init(new); + cache_init(new, detail); detail->init(new, key); write_lock(&detail->hash_lock); @@ -116,10 +119,15 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); -static void cache_fresh_locked(struct cache_head *head, time_t expiry) +static void cache_fresh_locked(struct cache_head *head, time_t expiry, + struct cache_detail *detail) { + time_t now = seconds_since_boot(); + if (now <= detail->flush_time) + /* ensure it isn't immediately treated as expired */ + now = detail->flush_time + 1; head->expiry_time = expiry; - head->last_refresh = seconds_since_boot(); + head->last_refresh = now; smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */ set_bit(CACHE_VALID, &head->flags); } @@ -149,7 +157,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); - cache_fresh_locked(old, new->expiry_time); + cache_fresh_locked(old, new->expiry_time, detail); write_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); return old; @@ -162,7 +170,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_put(old, detail); return NULL; } - cache_init(tmp); + cache_init(tmp, detail); detail->init(tmp, old); write_lock(&detail->hash_lock); @@ -173,8 +181,8 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); - cache_fresh_locked(tmp, new->expiry_time); - cache_fresh_locked(old, 0); + cache_fresh_locked(tmp, new->expiry_time, detail); + cache_fresh_locked(old, 0, detail); write_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail); cache_fresh_unlocked(old, detail); @@ -219,7 +227,8 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h rv = cache_is_valid(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); - cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY); + cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY, + detail); rv = -ENOENT; } write_unlock(&detail->hash_lock); @@ -487,10 +496,13 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { - detail->flush_time = LONG_MAX; + time_t now = seconds_since_boot(); + if (detail->flush_time >= now) + now = detail->flush_time + 1; + /* 'now' is the maximum value any 'last_refresh' can have */ + detail->flush_time = now; detail->nextcheck = seconds_since_boot(); cache_flush(); - detail->flush_time = 1; } EXPORT_SYMBOL_GPL(cache_purge); @@ -1436,6 +1448,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, { char tbuf[20]; char *bp, *ep; + time_t then, now; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; @@ -1447,8 +1460,22 @@ static ssize_t write_flush(struct file *file, const char __user *buf, return -EINVAL; bp = tbuf; - cd->flush_time = get_expiry(&bp); - cd->nextcheck = seconds_since_boot(); + then = get_expiry(&bp); + now = seconds_since_boot(); + cd->nextcheck = now; + /* Can only set flush_time to 1 second beyond "now", or + * possibly 1 second beyond flushtime. This is because + * flush_time never goes backwards so it mustn't get too far + * ahead of time. + */ + if (then >= now) { + /* Want to flush everything, so behave like cache_purge() */ + if (cd->flush_time >= now) + now = cd->flush_time + 1; + then = now; + } + + cd->flush_time = then; cache_flush(); *ppos += count; -- cgit v0.10.2 From 2b63482185e6054cc11ca6d6c073f90160c161fd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Oct 2015 15:33:23 -0400 Subject: nfsd: fix clid_inuse on mount with security change In bakeathon testing Solaris client was getting CLID_INUSE error when doing a krb5 mount soon after an auth_sys mount, or vice versa. That's not really necessary since in this case the old client doesn't have any state any more: http://tools.ietf.org/html/rfc7530#page-103 "when the server gets a SETCLIENTID for a client ID that currently has no state, or it has state but the lease has expired, rather than returning NFS4ERR_CLID_INUSE, the server MUST allow the SETCLIENTID and confirm the new client ID if followed by the appropriate SETCLIENTID_CONFIRM." This doesn't fix the problem completely since our client_has_state() check counts openowners left around to handle close replays, which we should probably just remove in this case. Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1b39edf..bac6207 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2262,7 +2262,8 @@ static bool client_has_state(struct nfs4_client *clp) * Note clp->cl_openowners check isn't quite right: there's no * need to count owners without stateid's. * - * Also note we should probably be using this in 4.0 case too. + * Also note in 4.0 case should also be checking for openowners + * kept around just for close handling. */ return !list_empty(&clp->cl_openowners) #ifdef CONFIG_NFSD_PNFS @@ -3049,7 +3050,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Cases below refer to rfc 3530 section 14.2.33: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); - if (conf) { + if (conf && client_has_state(conf)) { /* case 0: */ status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) @@ -3136,6 +3137,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } else { /* case 3: normal case; new or rebooted client */ old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { + status = nfserr_clid_inuse; + if (client_has_state(old) + && !same_creds(&unconf->cl_cred, + &old->cl_cred)) + goto out; status = mark_client_expired_locked(old); if (status) { old = NULL; -- cgit v0.10.2 From 4eaea13425078272895ec37814c6878d78b8db9f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Oct 2015 16:11:01 -0400 Subject: nfsd: improve client_has_state to check for unused openowners At least in the v4.0 case openowners can hang around for a while after last close, but they shouldn't really block (for example), a new mount with a different principal. Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bac6207..f2ea343 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2256,16 +2256,20 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) clid->flags = new->cl_exchange_flags; } +static bool client_has_openowners(struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + + list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) { + if (!list_empty(&oo->oo_owner.so_stateids)) + return true; + } + return false; +} + static bool client_has_state(struct nfs4_client *clp) { - /* - * Note clp->cl_openowners check isn't quite right: there's no - * need to count owners without stateid's. - * - * Also note in 4.0 case should also be checking for openowners - * kept around just for close handling. - */ - return !list_empty(&clp->cl_openowners) + return client_has_openowners(clp) #ifdef CONFIG_NFSD_PNFS || !list_empty(&clp->cl_lo_states) #endif -- cgit v0.10.2 From cc8a55320b5f1196bee5bd14e4bb2ebd3b983317 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 17 Sep 2015 07:58:24 -0400 Subject: nfsd: serialize layout stateid morphing operations In order to allow the client to make a sane determination of what happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we must ensure that the seqids return accurately represent the order of operations. The simplest way to do that is to ensure that operations on a single stateid are serialized. This patch adds a mutex to the layout stateid, and locks it when checking the layout stateid's seqid. The mutex is held over the entire operation and released after the seqid is bumped. Note that in the case of CB_LAYOUTRECALL we must move the increment of the seqid and setting into a new cb "prepare" operation. The lease infrastructure will call the lm_break callback with a spinlock held, so and we can't take the mutex in that codepath. Cc: Christoph Hellwig Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ebf90e4..4a68ab9 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, INIT_LIST_HEAD(&ls->ls_perfile); spin_lock_init(&ls->ls_lock); INIT_LIST_HEAD(&ls->ls_layouts); + mutex_init(&ls->ls_mutex); ls->ls_layout_type = layout_type; nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); @@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, status = nfserr_jukebox; if (!ls) goto out; + mutex_lock(&ls->ls_mutex); } else { ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); status = nfserr_bad_stateid; + mutex_lock(&ls->ls_mutex); if (stateid->si_generation > stid->sc_stateid.si_generation) - goto out_put_stid; + goto out_unlock_stid; if (layout_type != ls->ls_layout_type) - goto out_put_stid; + goto out_unlock_stid; } *lsp = ls; return 0; +out_unlock_stid: + mutex_unlock(&ls->ls_mutex); out_put_stid: nfs4_put_stid(stid); out: @@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) trace_layout_recall(&ls->ls_stid.sc_stateid); atomic_inc(&ls->ls_stid.sc_count); - update_stateid(&ls->ls_stid.sc_stateid); - memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); nfsd4_run_cb(&ls->ls_recall); out_unlock: @@ -494,6 +497,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, } spin_unlock(&ls->ls_lock); + mutex_unlock(&ls->ls_mutex); nfs4_put_stid(&ls->ls_stid); nfsd4_free_layouts(&reaplist); return nfs_ok; @@ -608,6 +612,17 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) } } +static void +nfsd4_cb_layout_prepare(struct nfsd4_callback *cb) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + + mutex_lock(&ls->ls_mutex); + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); +} + static int nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) { @@ -649,12 +664,14 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb) trace_layout_recall_release(&ls->ls_stid.sc_stateid); + mutex_unlock(&ls->ls_mutex); nfsd4_return_all_layouts(ls, &reaplist); nfsd4_free_layouts(&reaplist); nfs4_put_stid(&ls->ls_stid); } static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { + .prepare = nfsd4_cb_layout_prepare, .done = nfsd4_cb_layout_done, .release = nfsd4_cb_layout_release, }; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4ce6b97..a9f096c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1309,6 +1309,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, nfserr = nfsd4_insert_layout(lgp, ls); out_put_stid: + mutex_unlock(&ls->ls_mutex); nfs4_put_stid(&ls->ls_stid); out: return nfserr; @@ -1362,6 +1363,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, goto out; } + /* LAYOUTCOMMIT does not require any serialization */ + mutex_unlock(&ls->ls_mutex); + if (new_size > i_size_read(inode)) { lcp->lc_size_chg = 1; lcp->lc_newsize = new_size; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 31bde12..1fa0f38 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -562,6 +562,7 @@ struct nfs4_layout_stateid { struct nfsd4_callback ls_recall; stateid_t ls_recall_sid; bool ls_recalled; + struct mutex ls_mutex; }; static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) -- cgit v0.10.2 From 9767feb2c64b29775f1ea683130b44f95f67d169 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 1 Oct 2015 09:05:50 -0400 Subject: nfsd: ensure that seqid morphing operations are atomic wrt to copies Bruce points out that the increment of the seqid in stateids is not serialized in any way, so it's possible for racing calls to bump it twice and end up sending the same stateid. While we don't have any reports of this problem it _is_ theoretically possible, and could lead to spurious state recovery by the client. In the current code, update_stateid is always followed by a memcpy of that stateid, so we can combine the two operations. For better atomicity, we add a spinlock to the nfs4_stid and hold that when bumping the seqid and copying the stateid. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 4a68ab9..9ffef06 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -409,8 +409,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) list_add_tail(&new->lo_perstate, &ls->ls_layouts); new = NULL; done: - update_stateid(&ls->ls_stid.sc_stateid); - memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid); spin_unlock(&ls->ls_lock); out: spin_unlock(&fp->fi_lock); @@ -484,11 +483,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, } } if (!list_empty(&ls->ls_layouts)) { - if (found) { - update_stateid(&ls->ls_stid.sc_stateid); - memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, - sizeof(stateid_t)); - } + if (found) + nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); lrp->lrs_present = 1; } else { trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); @@ -619,8 +615,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb) container_of(cb, struct nfs4_layout_stateid, ls_recall); mutex_lock(&ls->ls_mutex); - update_stateid(&ls->ls_stid.sc_stateid); - memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid); } static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f2ea343..0a69715 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ atomic_set(&stid->sc_count, 1); + spin_lock_init(&stid->sc_lock); /* * It shouldn't be a problem to reuse an opaque stateid value. @@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s) put_nfs4_file(fp); } +void +nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) +{ + stateid_t *src = &stid->sc_stateid; + + spin_lock(&stid->sc_lock); + if (unlikely(++src->si_generation == 0)) + src->si_generation = 1; + memcpy(dst, src, sizeof(*dst)); + spin_unlock(&stid->sc_lock); +} + static void nfs4_put_deleg_lease(struct nfs4_file *fp) { struct file *filp = NULL; @@ -4221,8 +4234,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); up_read(&stp->st_rwsem); if (nfsd4_has_session(&resp->cstate)) { @@ -4925,8 +4937,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto put_stateid; } oo->oo_flags |= NFS4_OO_CONFIRMED; - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); up_write(&stp->st_rwsem); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); @@ -4999,11 +5010,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, goto put_stateid; } nfs4_stateid_downgrade(stp, od->od_share_access); - reset_union_bmap_deny(od->od_share_deny, stp); - - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); status = nfs_ok; put_stateid: up_write(&stp->st_rwsem); @@ -5058,8 +5066,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); up_write(&stp->st_rwsem); nfsd4_close_open_stateid(stp); @@ -5542,9 +5549,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); switch (-err) { case 0: /* success! */ - update_stateid(&lock_stp->st_stid.sc_stateid); - memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, - sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; break; case (EAGAIN): /* conflock holds conflicting lock */ @@ -5736,8 +5741,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); fput: fput(filp); put_stateid: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1fa0f38..77fdf4d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -84,7 +84,7 @@ struct nfsd4_callback_ops { * fields that are of general use to any stateid. */ struct nfs4_stid { - atomic_t sc_count; + atomic_t sc_count; #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 @@ -94,11 +94,12 @@ struct nfs4_stid { #define NFS4_REVOKED_DELEG_STID 16 #define NFS4_CLOSED_DELEG_STID 32 #define NFS4_LAYOUT_STID 64 - unsigned char sc_type; - stateid_t sc_stateid; - struct nfs4_client *sc_client; - struct nfs4_file *sc_file; - void (*sc_free)(struct nfs4_stid *); + unsigned char sc_type; + stateid_t sc_stateid; + spinlock_t sc_lock; + struct nfs4_client *sc_client; + struct nfs4_file *sc_file; + void (*sc_free)(struct nfs4_stid *); }; /* @@ -364,15 +365,6 @@ struct nfs4_client_reclaim { char cr_recdir[HEXDIR_LEN]; /* recover dir */ }; -static inline void -update_stateid(stateid_t *stateid) -{ - stateid->si_generation++; - /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ - if (stateid->si_generation == 0) - stateid->si_generation = 1; -} - /* A reasonable value for REPLAY_ISIZE was estimated as follows: * The OPEN response, typically the largest, requires * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + @@ -595,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab); void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); +void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, -- cgit v0.10.2 From ea833f5de3ab49a0aac79a16633fd510390b83a6 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 27 Oct 2015 10:23:30 +0000 Subject: SUNRPC: drop stale comment in svc_setup_socket() The svc_setup_socket() function does set the send and receive buffer sizes, so the comment is out-of-date: Signed-off-by: Stefan Hajnoczi Signed-off-by: J. Bruce Fields diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 4892373..e0c7b33 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1367,7 +1367,6 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs); /* * Initialize socket for RPC use and create svc_sock struct - * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. */ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct socket *sock, -- cgit v0.10.2 From 3e80dbcda7f3e1e349a779d7a14c0e08677c39fa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Nov 2015 11:02:29 -0500 Subject: nfsd: remove recurring workqueue job to clean DRC We have a shrinker, we clean out the cache when nfsd is shut down, and prune the chains on each request. A recurring workqueue job seems like unnecessary overhead. Just remove it. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 116940c..54cde9a 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -63,7 +63,6 @@ static unsigned int longest_chain; static unsigned int longest_chain_cachesize; static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); -static void cache_cleaner_func(struct work_struct *unused); static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc); static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, @@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = { }; /* - * locking for the reply cache: - * A cache entry is "single use" if c_state == RC_INPROG - * Otherwise, it when accessing _prev or _next, the lock must be held. - */ -static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); - -/* * Put a cap on the size of the DRC based on the amount of available * low memory in the machine. * @@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void) unsigned int i; unregister_shrinker(&nfsd_reply_cache_shrinker); - cancel_delayed_work_sync(&cache_cleaner); for (i = 0; i < drc_hashsize; i++) { struct list_head *head = &drc_hashtbl[i].lru_head; @@ -230,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &b->lru_head); - schedule_delayed_work(&cache_cleaner, RC_EXPIRE); } static long @@ -264,7 +254,6 @@ prune_cache_entries(void) { unsigned int i; long freed = 0; - bool cancel = true; for (i = 0; i < drc_hashsize; i++) { struct nfsd_drc_bucket *b = &drc_hashtbl[i]; @@ -273,26 +262,11 @@ prune_cache_entries(void) continue; spin_lock(&b->cache_lock); freed += prune_bucket(b); - if (!list_empty(&b->lru_head)) - cancel = false; spin_unlock(&b->cache_lock); } - - /* - * Conditionally rearm the job to run in RC_EXPIRE since we just - * ran the pruner. - */ - if (!cancel) - mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); return freed; } -static void -cache_cleaner_func(struct work_struct *unused) -{ - prune_cache_entries(); -} - static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { -- cgit v0.10.2 From 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 Mon Sep 17 00:00:00 2001 From: Andrew Elble Date: Thu, 15 Oct 2015 12:07:28 -0400 Subject: nfsd: eliminate sending duplicate and repeated delegations We've observed the nfsd server in a state where there are multiple delegations on the same nfs4_file for the same client. The nfs client does attempt to DELEGRETURN these when they are presented to it - but apparently under some (unknown) circumstances the client does not manage to return all of them. This leads to the eventual attempt to CB_RECALL more than one delegation with the same nfs filehandle to the same client. The first recall will succeed, but the next recall will fail with NFS4ERR_BADHANDLE. This leads to the server having delegations on cl_revoked that the client has no way to FREE or DELEGRETURN, with resulting inability to recover. The state manager on the server will continually assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED, and the state manager on the client will be looping unable to satisfy the server. List discussion also reports a race between OPEN and DELEGRETURN that will be avoided by only sending the delegation once to the client. This is also logically in accordance with RFC5561 9.1.1 and 10.2. So, let's: 1.) Not hand out duplicate delegations. 2.) Only send them to the client once. RFC 5561: 9.1.1: "Delegations and layouts, on the other hand, are not associated with a specific owner but are associated with the client as a whole (identified by a client ID)." 10.2: "...the stateid for a delegation is associated with a client ID and may be used on behalf of all the open-owners for the given client. A delegation is made to the client as a whole and not to any specific process or thread of control within it." Reported-by: Eric Meddaugh Cc: Trond Myklebust Cc: Olga Kornievskaia Signed-off-by: Andrew Elble Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0a69715..6411c34 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -778,16 +778,68 @@ void nfs4_unhash_stid(struct nfs4_stid *s) s->sc_type = 0; } -static void +/** + * nfs4_get_existing_delegation - Discover if this delegation already exists + * @clp: a pointer to the nfs4_client we're granting a delegation to + * @fp: a pointer to the nfs4_file we're granting a delegation on + * + * Return: + * On success: NULL if an existing delegation was not found. + * + * On error: -EAGAIN if one was previously granted to this nfs4_client + * for this nfs4_file. + * + */ + +static int +nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_delegation *searchdp = NULL; + struct nfs4_client *searchclp = NULL; + + lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + + list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { + searchclp = searchdp->dl_stid.sc_client; + if (clp == searchclp) { + return -EAGAIN; + } + } + return 0; +} + +/** + * hash_delegation_locked - Add a delegation to the appropriate lists + * @dp: a pointer to the nfs4_delegation we are adding. + * @fp: a pointer to the nfs4_file we're granting a delegation on + * + * Return: + * On success: NULL if the delegation was successfully hashed. + * + * On error: -EAGAIN if one was previously granted to this + * nfs4_client for this nfs4_file. Delegation is not hashed. + * + */ + +static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { + int status; + struct nfs4_client *clp = dp->dl_stid.sc_client; + lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); + status = nfs4_get_existing_delegation(clp, fp); + if (status) + return status; + ++fp->fi_delegees; atomic_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); + list_add(&dp->dl_perclnt, &clp->cl_delegations); + return 0; } static bool @@ -3969,6 +4021,18 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) return fl; } +/** + * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer + * @dp: a pointer to the nfs4_delegation we're adding. + * + * Return: + * On success: Return code will be 0 on success. + * + * On error: -EAGAIN if there was an existing delegation. + * nonzero if there is an error in other cases. + * + */ + static int nfs4_setlease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -4000,16 +4064,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp) goto out_unlock; /* Race breaker */ if (fp->fi_deleg_file) { - status = 0; - ++fp->fi_delegees; - hash_delegation_locked(dp, fp); + status = hash_delegation_locked(dp, fp); goto out_unlock; } fp->fi_deleg_file = filp; - fp->fi_delegees = 1; - hash_delegation_locked(dp, fp); + fp->fi_delegees = 0; + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); + if (status) { + /* Should never happen, this is a new fi_deleg_file */ + WARN_ON_ONCE(1); + goto out_fput; + } return 0; out_unlock: spin_unlock(&fp->fi_lock); @@ -4029,6 +4096,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + status = nfs4_get_existing_delegation(clp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + + if (status) + return ERR_PTR(status); + dp = alloc_init_deleg(clp, fh, odstate); if (!dp) return ERR_PTR(-ENOMEM); @@ -4047,9 +4123,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, status = -EAGAIN; goto out_unlock; } - ++fp->fi_delegees; - hash_delegation_locked(dp, fp); - status = 0; + status = hash_delegation_locked(dp, fp); out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); -- cgit v0.10.2 From 7fc0564e3a8d16df096f48c9c6425ba84d945c6e Mon Sep 17 00:00:00 2001 From: Andrew Elble Date: Thu, 5 Nov 2015 20:42:43 -0500 Subject: nfsd: fix race with open / open upgrade stateids We observed multiple open stateids on the server for files that seemingly should have been closed. nfsd4_process_open2() tests for the existence of a preexisting stateid. If one is not found, the locks are dropped and a new one is created. The problem is that init_open_stateid(), which is also responsible for hashing the newly initialized stateid, doesn't check to see if another open has raced in and created a matching stateid. This fix is to enable init_open_stateid() to return the matching stateid and have nfsd4_process_open2() swap to that stateid and switch to the open upgrade path. In testing this patch, coverage to the newly created path indicates that the race was indeed happening. Signed-off-by: Andrew Elble Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6411c34..6b800b5b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3392,6 +3392,27 @@ static const struct nfs4_stateowner_operations openowner_ops = { .so_free = nfs4_free_openowner, }; +static struct nfs4_ol_stateid * +nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *local, *ret = NULL; + struct nfs4_openowner *oo = open->op_openowner; + + lockdep_assert_held(&fp->fi_lock); + + list_for_each_entry(local, &fp->fi_stateids, st_perfile) { + /* ignore lock owners */ + if (local->st_stateowner->so_is_open_owner == 0) + continue; + if (local->st_stateowner == &oo->oo_owner) { + ret = local; + atomic_inc(&ret->st_stid.sc_count); + break; + } + } + return ret; +} + static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) @@ -3423,9 +3444,20 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, return ret; } -static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +static struct nfs4_ol_stateid * +init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, + struct nfsd4_open *open) +{ + struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_ol_stateid *retstp = NULL; + spin_lock(&oo->oo_owner.so_client->cl_lock); + spin_lock(&fp->fi_lock); + + retstp = nfsd4_find_existing_open(fp, open); + if (retstp) + goto out_unlock; atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); @@ -3436,12 +3468,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, stp->st_deny_bmap = 0; stp->st_openstp = NULL; init_rwsem(&stp->st_rwsem); - spin_lock(&oo->oo_owner.so_client->cl_lock); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); - spin_lock(&fp->fi_lock); list_add(&stp->st_perfile, &fp->fi_stateids); + +out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); + return retstp; } /* @@ -3852,27 +3885,6 @@ out: return nfs_ok; } -static struct nfs4_ol_stateid * -nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) -{ - struct nfs4_ol_stateid *local, *ret = NULL; - struct nfs4_openowner *oo = open->op_openowner; - - spin_lock(&fp->fi_lock); - list_for_each_entry(local, &fp->fi_stateids, st_perfile) { - /* ignore lock owners */ - if (local->st_stateowner->so_is_open_owner == 0) - continue; - if (local->st_stateowner == &oo->oo_owner) { - ret = local; - atomic_inc(&ret->st_stid.sc_count); - break; - } - } - spin_unlock(&fp->fi_lock); - return ret; -} - static inline int nfs4_access_to_access(u32 nfs4_access) { int flags = 0; @@ -4258,6 +4270,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct nfs4_ol_stateid *stp = NULL; + struct nfs4_ol_stateid *swapstp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -4271,7 +4284,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; + spin_lock(&fp->fi_lock); stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); } else { open->op_file = NULL; status = nfserr_bad_stateid; @@ -4294,7 +4309,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } else { stp = open->op_stp; open->op_stp = NULL; - init_open_stateid(stp, fp, open); + swapstp = init_open_stateid(stp, fp, open); + if (swapstp) { + nfs4_put_stid(&stp->st_stid); + stp = swapstp; + down_read(&stp->st_rwsem); + status = nfs4_upgrade_open(rqstp, fp, current_fh, + stp, open); + if (status) { + up_read(&stp->st_rwsem); + goto out; + } + goto upgrade_out; + } down_read(&stp->st_rwsem); status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { @@ -4308,6 +4335,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } +upgrade_out: nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); up_read(&stp->st_rwsem); -- cgit v0.10.2 From 0442f14b15f8e7a8b3778a9f8cf640ef89b2df26 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Oct 2015 01:44:07 +0000 Subject: svcrpc: document lack of some memory barriers We're missing memory barriers in net/sunrpc/svcsock.c in some spots we'd expect them. But it doesn't appear they're necessary in our case, and this is likely a hot path--for now just document the odd behavior. Kosuke Tatsukawa found this issue while looking through the linux source code for places calling waitqueue_active() before wake_up*(), but without preceding memory barriers, after sending a patch to fix a similar issue in drivers/tty/n_tty.c (Details about the original issue can be found here: https://lkml.org/lkml/2015/9/28/849). Reported-by: Kosuke Tatsukawa Signed-off-by: J. Bruce Fields diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e0c7b33..1413cdc 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -399,6 +399,31 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp) return svc_port_is_privileged(svc_addr(rqstp)); } +static bool sunrpc_waitqueue_active(wait_queue_head_t *wq) +{ + if (!wq) + return false; + /* + * There should normally be a memory * barrier here--see + * wq_has_sleeper(). + * + * It appears that isn't currently necessary, though, basically + * because callers all appear to have sufficient memory barriers + * between the time the relevant change is made and the + * time they call these callbacks. + * + * The nfsd code itself doesn't actually explicitly wait on + * these waitqueues, but it may wait on them for example in + * sendpage() or sendmsg() calls. (And those may be the only + * places, since it it uses nonblocking reads.) + * + * Maybe we should add the memory barriers anyway, but these are + * hot paths so we'd need to be convinced there's no sigificant + * penalty. + */ + return waitqueue_active(wq); +} + /* * INET callback when data has been received on the socket. */ @@ -414,7 +439,7 @@ static void svc_udp_data_ready(struct sock *sk) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } @@ -432,7 +457,7 @@ static void svc_write_space(struct sock *sk) svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) { + if (sunrpc_waitqueue_active(wq)) { dprintk("RPC svc_write_space: someone sleeping on %p\n", svsk); wake_up_interruptible(wq); @@ -787,7 +812,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk) } wq = sk_sleep(sk); - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible_all(wq); } @@ -808,7 +833,7 @@ static void svc_tcp_state_change(struct sock *sk) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible_all(wq); } @@ -823,7 +848,7 @@ static void svc_tcp_data_ready(struct sock *sk) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } @@ -1593,7 +1618,7 @@ static void svc_sock_detach(struct svc_xprt *xprt) sk->sk_write_space = svsk->sk_owspace; wq = sk_sleep(sk); - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } -- cgit v0.10.2