From 8fb883f3e30065529e4f35d4b4f355193dcdb7a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 21 Sep 2013 00:09:31 +0100 Subject: FS-Cache: Add use/unuse/wake cookie wrappers Add wrapper functions for dealing with cookie->n_active: (*) __fscache_use_cookie() to increment it. (*) __fscache_unuse_cookie() to decrement and test against zero. (*) __fscache_wake_unused_cookie() to wake up anyone waiting for it to reach zero. The second and third are split so that the third can be done after cookie->lock has been released in case the waiter wakes up whilst we're still holding it and tries to get it. We will need to wake-on-zero once the cookie disablement patch is applied because it will then be possible to see n_active become zero without the cookie being relinquished. Also move the cookie usement out of fscache_attr_changed_op() and into fscache_attr_changed() and the operation struct so that cookie disablement will be able to track it. Whilst we're at it, only increment n_active if we're about to do fscache_submit_op() so that we don't have to deal with undoing it if anything earlier fails. Possibly this should be moved into fscache_submit_op() which could look at FSCACHE_OP_UNUSE_COOKIE. Signed-off-by: David Howells diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index b2a86e3..d851aa5 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -568,6 +568,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,", cookie); @@ -600,7 +601,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) op->debug_id = atomic_inc_return(&fscache_op_debug_id); - atomic_inc(&cookie->n_active); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, op) < 0) goto submit_failed; @@ -622,9 +623,11 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) return ret; submit_failed: - atomic_dec(&cookie->n_active); + wake_cookie = __fscache_unuse_cookie(cookie); inconsistent: spin_unlock(&cookie->lock); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); kfree(op); _leave(" = -ESTALE"); return -ESTALE; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 73899c1..0fe42a6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -163,12 +163,10 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat(&fscache_n_attr_changed_calls); - if (fscache_object_is_active(object) && - fscache_use_cookie(object)) { + if (fscache_object_is_active(object)) { fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); fscache_stat_d(&fscache_n_cop_attr_changed); - fscache_unuse_cookie(object); if (ret < 0) fscache_abort_object(object); } @@ -184,6 +182,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; + bool wake_cookie; _enter("%p", cookie); @@ -199,7 +198,9 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) } fscache_operation_init(op, fscache_attr_changed_op, NULL); - op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + op->flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_EXCLUSIVE) | + (1 << FSCACHE_OP_UNUSE_COOKIE); spin_lock(&cookie->lock); @@ -208,6 +209,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); if (fscache_submit_exclusive_op(object, op) < 0) goto nobufs; spin_unlock(&cookie->lock); @@ -217,8 +219,11 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return 0; nobufs: + wake_cookie = __fscache_unuse_cookie(cookie); spin_unlock(&cookie->lock); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); _leave(" = %d", -ENOBUFS); return -ENOBUFS; @@ -263,7 +268,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval( } fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); - atomic_inc(&cookie->n_active); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); @@ -384,6 +388,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%p,,,", cookie, page); @@ -421,6 +426,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); + __fscache_use_cookie(cookie); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -475,9 +481,11 @@ error: nobufs_unlock_dec: atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); kfree(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); @@ -514,6 +522,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,,%d,,,", cookie, *nr_pages); @@ -547,6 +556,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -601,10 +611,12 @@ error: nobufs_unlock_dec: atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -626,6 +638,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%p,,,", cookie, page); @@ -658,8 +671,9 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock; + goto nobufs_unlock_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_alloc_ops); @@ -689,10 +703,13 @@ error: _leave(" = %d", ret); return ret; +nobufs_unlock_dec: + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); nobufs: fscache_stat(&fscache_n_allocs_nobufs); _leave(" = -ENOBUFS"); @@ -889,6 +906,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, { struct fscache_storage *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%x,", cookie, (u32) page->flags); @@ -957,7 +975,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); op->store_limit = object->store_limit; - atomic_inc(&cookie->n_active); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, &op->op) < 0) goto submit_failed; @@ -984,10 +1002,10 @@ already_pending: return 0; submit_failed: - atomic_dec(&cookie->n_active); spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); + wake_cookie = __fscache_unuse_cookie(cookie); page_cache_release(page); ret = -ENOBUFS; goto nobufs; @@ -999,6 +1017,8 @@ nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); _leave(" = -ENOBUFS"); return -ENOBUFS; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 7823e9e..96a2b66 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -511,6 +511,11 @@ static inline void fscache_end_io(struct fscache_retrieval *op, op->end_io_func(page, op->context, error); } +static inline void __fscache_use_cookie(struct fscache_cookie *cookie) +{ + atomic_inc(&cookie->n_active); +} + /** * fscache_use_cookie - Request usage of cookie attached to an object * @object: Object description @@ -524,6 +529,16 @@ static inline bool fscache_use_cookie(struct fscache_object *object) return atomic_inc_not_zero(&cookie->n_active) != 0; } +static inline bool __fscache_unuse_cookie(struct fscache_cookie *cookie) +{ + return atomic_dec_and_test(&cookie->n_active); +} + +static inline void __fscache_wake_unused_cookie(struct fscache_cookie *cookie) +{ + wake_up_atomic_t(&cookie->n_active); +} + /** * fscache_unuse_cookie - Cease usage of cookie attached to an object * @object: Object description @@ -534,8 +549,8 @@ static inline bool fscache_use_cookie(struct fscache_object *object) static inline void fscache_unuse_cookie(struct fscache_object *object) { struct fscache_cookie *cookie = object->cookie; - if (atomic_dec_and_test(&cookie->n_active)) - wake_up_atomic_t(&cookie->n_active); + if (__fscache_unuse_cookie(cookie)) + __fscache_wake_unused_cookie(cookie); } /* -- cgit v0.10.2 From 94d30ae90a00cafe686c1057be57f4885f963abf Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 21 Sep 2013 00:09:31 +0100 Subject: FS-Cache: Provide the ability to enable/disable cookies Provide the ability to enable and disable fscache cookies. A disabled cookie will reject or ignore further requests to: Acquire a child cookie Invalidate and update backing objects Check the consistency of a backing object Allocate storage for backing page Read backing pages Write to backing pages but still allows: Checks/waits on the completion of already in-progress objects Uncaching of pages Relinquishment of cookies Two new operations are provided: (1) Disable a cookie: void fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate); If the cookie is not already disabled, this locks the cookie against other dis/enablement ops, marks the cookie as being disabled, discards or invalidates any backing objects and waits for cessation of activity on any associated object. This is a wrapper around a chunk split out of fscache_relinquish_cookie(), but it reinitialises the cookie such that it can be reenabled. All possible failures are handled internally. The caller should consider calling fscache_uncache_all_inode_pages() afterwards to make sure all page markings are cleared up. (2) Enable a cookie: void fscache_enable_cookie(struct fscache_cookie *cookie, bool (*can_enable)(void *data), void *data) If the cookie is not already enabled, this locks the cookie against other dis/enablement ops, invokes can_enable() and, if the cookie is not an index cookie, will begin the procedure of acquiring backing objects. The optional can_enable() function is passed the data argument and returns a ruling as to whether or not enablement should actually be permitted to begin. All possible failures are handled internally. The cookie will only be marked as enabled if provisional backing objects are allocated. A later patch will introduce these to NFS. Cookie enablement during nfs_open() is then contingent on i_writecount <= 0. can_enable() checks for a race between open(O_RDONLY) and open(O_WRONLY/O_RDWR). This simplifies NFS's cookie handling and allows us to get rid of open(O_RDONLY) accidentally introducing caching to an inode that's open for writing already. One operation has its API modified: (3) Acquire a cookie. struct fscache_cookie *fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, void *netfs_data, bool enable); This now has an additional argument that indicates whether the requested cookie should be enabled by default. It doesn't need the can_enable() function because the caller must prevent multiple calls for the same netfs object and it doesn't need to take the enablement lock because no one else can get at the cookie before this returns. Signed-off-by: David Howells cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, &afs_cell_cache_index_def, - cell); + cell, true); Then when a volume location was accessed, it would be entered into the cell's index and an inode would be allocated that acts as a volume type and hash chain @@ -366,7 +372,7 @@ combination: vlocation->cache = fscache_acquire_cookie(cell->cache, &afs_vlocation_cache_index_def, - vlocation); + vlocation, true); And then a particular flavour of volume (R/O for example) could be added to that index, creating another index for vnodes (AFS inode equivalents): @@ -374,7 +380,7 @@ that index, creating another index for vnodes (AFS inode equivalents): volume->cache = fscache_acquire_cookie(vlocation->cache, &afs_volume_cache_index_def, - volume); + volume, true); ====================== @@ -388,7 +394,7 @@ the object definition should be something other than index type. vnode->cache = fscache_acquire_cookie(volume->cache, &afs_vnode_cache_object_def, - vnode); + vnode, true); ================================= @@ -404,7 +410,7 @@ it would be some other type of object such as a data file. xattr->cache = fscache_acquire_cookie(vnode->cache, &afs_xattr_cache_object_def, - xattr); + xattr, true); Miscellaneous objects might be used to store extended attributes or directory entries for example. @@ -733,6 +739,47 @@ Note that partial updates may happen automatically at other times, such as when data blocks are added to a data file object. +================= +COOKIE ENABLEMENT +================= + +Cookies exist in one of two states: enabled and disabled. If a cookie is +disabled, it ignores all attempts to acquire child cookies; check, update or +invalidate its state; allocate, read or write backing pages - though it is +still possible to uncache pages and relinquish the cookie. + +The initial enablement state is set by fscache_acquire_cookie(), but the cookie +can be enabled or disabled later. To disable a cookie, call: + + void fscache_disable_cookie(struct fscache_cookie *cookie, + bool invalidate); + +If the cookie is not already disabled, this locks the cookie against other +enable and disable ops, marks the cookie as being disabled, discards or +invalidates any backing objects and waits for cessation of activity on any +associated object before unlocking the cookie. + +All possible failures are handled internally. The caller should consider +calling fscache_uncache_all_inode_pages() afterwards to make sure all page +markings are cleared up. + +Cookies can be enabled or reenabled with: + + void fscache_enable_cookie(struct fscache_cookie *cookie, + bool (*can_enable)(void *data), + void *data) + +If the cookie is not already enabled, this locks the cookie against other +enable and disable ops, invokes can_enable() and, if the cookie is not an index +cookie, will begin the procedure of acquiring backing objects. + +The optional can_enable() function is passed the data argument and returns a +ruling as to whether or not enablement should actually be permitted to begin. + +All possible failures are handled internally. The cookie will only be marked +as enabled if provisional backing objects are allocated. + + =============================== MISCELLANEOUS COOKIE OPERATIONS =============================== @@ -778,7 +825,7 @@ COOKIE UNREGISTRATION To get rid of a cookie, this function should be called. void fscache_relinquish_cookie(struct fscache_cookie *cookie, - int retire); + bool retire); If retire is non-zero, then the object will be marked for recycling, and all copies of it will be removed from all active caches in which it is present. diff --git a/fs/9p/cache.c b/fs/9p/cache.c index a9ea73d..2b7a032 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -90,7 +90,7 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, - v9ses); + v9ses, true); p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", v9ses, v9ses->fscache); } @@ -204,7 +204,7 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode); + v9inode, true); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9inode->fscache); @@ -271,7 +271,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode); + v9inode, true); p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", inode, old, v9inode->fscache); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 3c090b7..ca0a3cf 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz, /* put it up for caching (this never returns an error) */ cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, &afs_cell_cache_index_def, - cell); + cell, true); #endif /* add to the cell lists */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 789bc25..ce25d75 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -259,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, #ifdef CONFIG_AFS_FSCACHE vnode->cache = fscache_acquire_cookie(vnode->volume->cache, &afs_vnode_cache_index_def, - vnode); + vnode, true); #endif ret = afs_inode_map_status(vnode, key); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 57bcb15..b6df2e8 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, /* see if we have an in-cache copy (will set vl->valid if there is) */ #ifdef CONFIG_AFS_FSCACHE vl->cache = fscache_acquire_cookie(vl->cell->cache, - &afs_vlocation_cache_index_def, vl); + &afs_vlocation_cache_index_def, vl, + true); #endif if (vl->valid) { diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 401eeb2..2b60725 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) #ifdef CONFIG_AFS_FSCACHE volume->cache = fscache_acquire_cookie(vlocation->cache, &afs_volume_cache_index_def, - volume); + volume, true); #endif afs_get_vlocation(vlocation); volume->vlocation = vlocation; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 43eb559..00baf14 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -270,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) #endif /* delete retired objects */ - if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) && + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 6bfe65e..7db2e6c 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) { fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, - fsc); + fsc, true); if (fsc->fscache == NULL) { pr_err("Unable to resgister fsid: %p fscache cookie", fsc); @@ -204,7 +204,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, - ci); + ci, true); done: mutex_unlock(&inode->i_mutex); diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 2f4bc5a..fe2492d 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -27,7 +27,7 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) { server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, server); + &cifs_fscache_server_index_def, server, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server, server->fscache); } @@ -46,7 +46,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, tcon); + &cifs_fscache_super_index_def, tcon, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server->fscache, tcon->fscache); } @@ -69,7 +69,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, cifsi); + &cifs_fscache_inode_object_def, cifsi, true); cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", __func__, tcon->fscache, cifsi->fscache); } @@ -119,7 +119,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifsi->fscache = fscache_acquire_cookie( cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, - cifsi); + cifsi, true); cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", __func__, cifsi->fscache, old); } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index d851aa5..29d7feb 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -58,15 +58,16 @@ void fscache_cookie_init_once(void *_cookie) struct fscache_cookie *__fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, - void *netfs_data) + void *netfs_data, + bool enable) { struct fscache_cookie *cookie; BUG_ON(!def); - _enter("{%s},{%s},%p", + _enter("{%s},{%s},%p,%u", parent ? (char *) parent->def->name : "", - def->name, netfs_data); + def->name, netfs_data, enable); fscache_stat(&fscache_n_acquires); @@ -106,7 +107,7 @@ struct fscache_cookie *__fscache_acquire_cookie( cookie->def = def; cookie->parent = parent; cookie->netfs_data = netfs_data; - cookie->flags = 0; + cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ @@ -124,16 +125,22 @@ struct fscache_cookie *__fscache_acquire_cookie( break; } - /* if the object is an index then we need do nothing more here - we - * create indices on disk when we need them as an index may exist in - * multiple caches */ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (fscache_acquire_non_index_cookie(cookie) < 0) { - atomic_dec(&parent->n_children); - __fscache_cookie_put(cookie); - fscache_stat(&fscache_n_acquires_nobufs); - _leave(" = NULL"); - return NULL; + if (enable) { + /* if the object is an index then we need do nothing more here + * - we create indices on disk when we need them as an index + * may exist in multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) == 0) { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } } @@ -144,6 +151,39 @@ struct fscache_cookie *__fscache_acquire_cookie( EXPORT_SYMBOL(__fscache_acquire_cookie); /* + * Enable a cookie to permit it to accept new operations. + */ +void __fscache_enable_cookie(struct fscache_cookie *cookie, + bool (*can_enable)(void *data), + void *data) +{ + _enter("%p", cookie); + + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + + if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock; + + if (can_enable && !can_enable(data)) { + /* The netfs decided it didn't want to enable after all */ + } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* Wait for outstanding disablement to complete */ + __fscache_wait_on_invalidate(cookie); + + if (fscache_acquire_non_index_cookie(cookie) == 0) + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } + +out_unlock: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); +} +EXPORT_SYMBOL(__fscache_enable_cookie); + +/* * acquire a non-index cookie * - this must make sure the index chain is instantiated and instantiate the * object representation too @@ -157,7 +197,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) _enter(""); - cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); /* now we need to see whether the backing objects for this cookie yet * exist, if not there'll be nothing to search */ @@ -180,9 +220,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) _debug("cache %s", cache->tag->name); - cookie->flags = - (1 << FSCACHE_COOKIE_LOOKING_UP) | - (1 << FSCACHE_COOKIE_NO_DATA_YET); + set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); /* ask the cache to allocate objects for this cookie and its parent * chain */ @@ -398,7 +436,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie) if (!hlist_empty(&cookie->backing_objects)) { spin_lock(&cookie->lock); - if (!hlist_empty(&cookie->backing_objects) && + if (fscache_cookie_enabled(cookie) && + !hlist_empty(&cookie->backing_objects) && !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { object = hlist_entry(cookie->backing_objects.first, @@ -452,10 +491,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) spin_lock(&cookie->lock); - /* update the index entry on disk in each cache backing this cookie */ - hlist_for_each_entry(object, - &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + if (fscache_cookie_enabled(cookie)) { + /* update the index entry on disk in each cache backing this + * cookie. + */ + hlist_for_each_entry(object, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } } spin_unlock(&cookie->lock); @@ -464,28 +507,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) EXPORT_SYMBOL(__fscache_update_cookie); /* - * release a cookie back to the cache - * - the object will be marked as recyclable on disk if retire is true - * - all dependents of this cookie must have already been unregistered - * (indices/files/pages) + * Disable a cookie to stop it from accepting new requests from the netfs. */ -void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) { struct fscache_object *object; + bool awaken = false; - fscache_stat(&fscache_n_relinquishes); - if (retire) - fscache_stat(&fscache_n_relinquishes_retire); - - if (!cookie) { - fscache_stat(&fscache_n_relinquishes_null); - _leave(" [no cookie]"); - return; - } - - _enter("%p{%s,%p,%d},%d", - cookie, cookie->def->name, cookie->netfs_data, - atomic_read(&cookie->n_active), retire); + _enter("%p,%u", cookie, invalidate); ASSERTCMP(atomic_read(&cookie->n_active), >, 0); @@ -495,24 +524,82 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) BUG(); } - /* No further netfs-accessing operations on this cookie permitted */ - set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); - if (retire) - set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock_enable; + + /* If the cookie is being invalidated, wait for that to complete first + * so that we can reuse the flag. + */ + __fscache_wait_on_invalidate(cookie); + + /* Dispose of the backing objects */ + set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags); spin_lock(&cookie->lock); - hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + if (!hlist_empty(&cookie->backing_objects)) { + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { + if (invalidate) + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + } + } else { + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + awaken = true; } spin_unlock(&cookie->lock); + if (awaken) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); /* Wait for cessation of activity requiring access to the netfs (when - * n_active reaches 0). + * n_active reaches 0). This makes sure outstanding reads and writes + * have completed. */ if (!atomic_dec_and_test(&cookie->n_active)) wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, TASK_UNINTERRUPTIBLE); + /* Reset the cookie state if it wasn't relinquished */ + if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { + atomic_inc(&cookie->n_active); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + } + +out_unlock_enable: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + _leave(""); +} +EXPORT_SYMBOL(__fscache_disable_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) +{ + fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p,%d},%d", + cookie, cookie->def->name, cookie->netfs_data, + atomic_read(&cookie->n_active), retire); + + /* No further netfs-accessing operations on this cookie permitted */ + set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); + + __fscache_disable_cookie(cookie, retire); + /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; @@ -592,7 +679,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto inconsistent; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index 10a2ade..5a117df 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -59,6 +59,7 @@ struct fscache_cookie fscache_fsdef_index = { .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), .backing_objects = HLIST_HEAD_INIT, .def = &fscache_fsdef_index_def, + .flags = 1 << FSCACHE_COOKIE_ENABLED, }; EXPORT_SYMBOL(fscache_fsdef_index); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index b1bb611..989f394 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -45,6 +45,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) netfs->primary_index->def = &fscache_fsdef_netfs_def; netfs->primary_index->parent = &fscache_fsdef_index; netfs->primary_index->netfs_data = netfs; + netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED; atomic_inc(&netfs->primary_index->parent->usage); atomic_inc(&netfs->primary_index->parent->n_children); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 86d75a6..dcb8216 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -495,6 +495,7 @@ void fscache_object_lookup_negative(struct fscache_object *object) * returning ENODATA. */ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); _debug("wake up lookup %p", &cookie->flags); clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); @@ -527,6 +528,7 @@ void fscache_obtained_object(struct fscache_object *object) /* We do (presumably) have data */ clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); /* Allow write requests to begin stacking up and read requests * to begin shovelling data. @@ -679,7 +681,8 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob */ spin_lock(&cookie->lock); hlist_del_init(&object->cookie_link); - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + if (hlist_empty(&cookie->backing_objects) && + test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) awaken = true; spin_unlock(&cookie->lock); @@ -927,7 +930,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj */ if (!fscache_use_cookie(object)) { ASSERT(object->cookie->stores.rnode == NULL); - set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 0fe42a6..7f5c658 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -204,7 +204,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -410,7 +411,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, return -ERESTARTSYS; op = fscache_alloc_retrieval(cookie, page->mapping, - end_io_func,context); + end_io_func, context); if (!op) { _leave(" = -ENOMEM"); return -ENOMEM; @@ -419,7 +420,8 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -551,7 +553,8 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -666,7 +669,8 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -938,7 +942,8 @@ int __fscache_write_page(struct fscache_cookie *cookie, ret = -ENOBUFS; spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 24d1d1c..cd6e7ef 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp) /* create a cache index for looking up filehandles */ clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, &nfs_fscache_server_index_def, - clp); + clp, true); dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", clp, clp->fscache); } @@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int /* create a cache index for looking up filehandles */ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, &nfs_fscache_super_index_def, - nfss); + nfss, true); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); return; @@ -200,7 +200,7 @@ static void nfs_fscache_enable_inode_cookie(struct inode *inode) nfsi->fscache = fscache_acquire_cookie( NFS_SB(sb)->fscache, &nfs_fscache_inode_object_def, - nfsi); + nfsi, true); dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", sb, nfsi, nfsi->fscache); @@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode) nfsi->fscache = fscache_acquire_cookie( nfss->nfs_client->fscache, &nfs_fscache_inode_object_def, - nfsi); + nfsi, true); dfprintk(FSCACHE, "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 96a2b66..7714849 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -308,36 +308,6 @@ struct fscache_cache_ops { void (*dissociate_pages)(struct fscache_cache *cache); }; -/* - * data file or index object cookie - * - a file will only appear in one cache - * - a request to cache a file may or may not be honoured, subject to - * constraints such as disk space - * - indices are created on disk just-in-time - */ -struct fscache_cookie { - atomic_t usage; /* number of users of this cookie */ - atomic_t n_children; /* number of children of this cookie */ - atomic_t n_active; /* number of active users of netfs ptrs */ - spinlock_t lock; - spinlock_t stores_lock; /* lock on page store tree */ - struct hlist_head backing_objects; /* object(s) backing this file/index */ - const struct fscache_cookie_def *def; /* definition */ - struct fscache_cookie *parent; /* parent of this entry */ - void *netfs_data; /* back pointer to netfs */ - struct radix_tree_root stores; /* pages to be stored on this cookie */ -#define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ -#define FSCACHE_COOKIE_STORING_TAG 1 /* pages tag: writing to cache */ - - unsigned long flags; -#define FSCACHE_COOKIE_LOOKING_UP 0 /* T if non-index cookie being looked up still */ -#define FSCACHE_COOKIE_NO_DATA_YET 1 /* T if new object with no cached data yet */ -#define FSCACHE_COOKIE_UNAVAILABLE 2 /* T if cookie is unavailable (error, etc) */ -#define FSCACHE_COOKIE_INVALIDATING 3 /* T if cookie is being invalidated */ -#define FSCACHE_COOKIE_RELINQUISHED 4 /* T if cookie has been relinquished */ -#define FSCACHE_COOKIE_RETIRED 5 /* T if cookie was retired */ -}; - extern struct fscache_cookie fscache_fsdef_index; /* @@ -400,6 +370,7 @@ struct fscache_object { #define FSCACHE_OBJECT_IS_LIVE 3 /* T if object is not withdrawn or relinquished */ #define FSCACHE_OBJECT_IS_LOOKED_UP 4 /* T if object has been looked up */ #define FSCACHE_OBJECT_IS_AVAILABLE 5 /* T if object has become active */ +#define FSCACHE_OBJECT_RETIRED 6 /* T if object was retired on relinquishment */ struct list_head cache_link; /* link in cache->object_list */ struct hlist_node cookie_link; /* link in cookie->backing_objects */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 19b4645..115bb81 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -167,6 +167,42 @@ struct fscache_netfs { }; /* + * data file or index object cookie + * - a file will only appear in one cache + * - a request to cache a file may or may not be honoured, subject to + * constraints such as disk space + * - indices are created on disk just-in-time + */ +struct fscache_cookie { + atomic_t usage; /* number of users of this cookie */ + atomic_t n_children; /* number of children of this cookie */ + atomic_t n_active; /* number of active users of netfs ptrs */ + spinlock_t lock; + spinlock_t stores_lock; /* lock on page store tree */ + struct hlist_head backing_objects; /* object(s) backing this file/index */ + const struct fscache_cookie_def *def; /* definition */ + struct fscache_cookie *parent; /* parent of this entry */ + void *netfs_data; /* back pointer to netfs */ + struct radix_tree_root stores; /* pages to be stored on this cookie */ +#define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ +#define FSCACHE_COOKIE_STORING_TAG 1 /* pages tag: writing to cache */ + + unsigned long flags; +#define FSCACHE_COOKIE_LOOKING_UP 0 /* T if non-index cookie being looked up still */ +#define FSCACHE_COOKIE_NO_DATA_YET 1 /* T if new object with no cached data yet */ +#define FSCACHE_COOKIE_UNAVAILABLE 2 /* T if cookie is unavailable (error, etc) */ +#define FSCACHE_COOKIE_INVALIDATING 3 /* T if cookie is being invalidated */ +#define FSCACHE_COOKIE_RELINQUISHED 4 /* T if cookie has been relinquished */ +#define FSCACHE_COOKIE_ENABLED 5 /* T if cookie is enabled */ +#define FSCACHE_COOKIE_ENABLEMENT_LOCK 6 /* T if cookie is being en/disabled */ +}; + +static inline bool fscache_cookie_enabled(struct fscache_cookie *cookie) +{ + return test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); +} + +/* * slow-path functions for when there is actually caching available, and the * netfs does actually have a valid token * - these are not to be called directly @@ -181,8 +217,8 @@ extern void __fscache_release_cache_tag(struct fscache_cache_tag *); extern struct fscache_cookie *__fscache_acquire_cookie( struct fscache_cookie *, const struct fscache_cookie_def *, - void *); -extern void __fscache_relinquish_cookie(struct fscache_cookie *, int); + void *, bool); +extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); extern int __fscache_check_consistency(struct fscache_cookie *); extern void __fscache_update_cookie(struct fscache_cookie *); extern int __fscache_attr_changed(struct fscache_cookie *); @@ -211,6 +247,9 @@ extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *, struct inode *); extern void __fscache_readpages_cancel(struct fscache_cookie *cookie, struct list_head *pages); +extern void __fscache_disable_cookie(struct fscache_cookie *, bool); +extern void __fscache_enable_cookie(struct fscache_cookie *, + bool (*)(void *), void *); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -289,6 +328,7 @@ void fscache_release_cache_tag(struct fscache_cache_tag *tag) * @def: A description of the cache object, including callback operations * @netfs_data: An arbitrary piece of data to be kept in the cookie to * represent the cache object to the netfs + * @enable: Whether or not to enable a data cookie immediately * * This function is used to inform FS-Cache about part of an index hierarchy * that can be used to locate files. This is done by requesting a cookie for @@ -301,10 +341,12 @@ static inline struct fscache_cookie *fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, - void *netfs_data) + void *netfs_data, + bool enable) { - if (fscache_cookie_valid(parent)) - return __fscache_acquire_cookie(parent, def, netfs_data); + if (fscache_cookie_valid(parent) && fscache_cookie_enabled(parent)) + return __fscache_acquire_cookie(parent, def, netfs_data, + enable); else return NULL; } @@ -322,7 +364,7 @@ struct fscache_cookie *fscache_acquire_cookie( * description. */ static inline -void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) { if (fscache_cookie_valid(cookie)) __fscache_relinquish_cookie(cookie, retire); @@ -341,7 +383,7 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) static inline int fscache_check_consistency(struct fscache_cookie *cookie) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) return __fscache_check_consistency(cookie); else return 0; @@ -360,7 +402,7 @@ int fscache_check_consistency(struct fscache_cookie *cookie) static inline void fscache_update_cookie(struct fscache_cookie *cookie) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) __fscache_update_cookie(cookie); } @@ -407,7 +449,7 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie) static inline int fscache_attr_changed(struct fscache_cookie *cookie) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) return __fscache_attr_changed(cookie); else return -ENOBUFS; @@ -429,7 +471,7 @@ int fscache_attr_changed(struct fscache_cookie *cookie) static inline void fscache_invalidate(struct fscache_cookie *cookie) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) __fscache_invalidate(cookie); } @@ -503,7 +545,7 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) return __fscache_read_or_alloc_page(cookie, page, end_io_func, context, gfp); else @@ -554,7 +596,7 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) return __fscache_read_or_alloc_pages(cookie, mapping, pages, nr_pages, end_io_func, context, gfp); @@ -585,7 +627,7 @@ int fscache_alloc_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) return __fscache_alloc_page(cookie, page, gfp); else return -ENOBUFS; @@ -634,7 +676,7 @@ int fscache_write_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { - if (fscache_cookie_valid(cookie)) + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) return __fscache_write_page(cookie, page, gfp); else return -ENOBUFS; @@ -744,4 +786,47 @@ void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, __fscache_uncache_all_inode_pages(cookie, inode); } +/** + * fscache_disable_cookie - Disable a cookie + * @cookie: The cookie representing the cache object + * @invalidate: Invalidate the backing object + * + * Disable a cookie from accepting further alloc, read, write, invalidate, + * update or acquire operations. Outstanding operations can still be waited + * upon and pages can still be uncached and the cookie relinquished. + * + * This will not return until all outstanding operations have completed. + * + * If @invalidate is set, then the backing object will be invalidated and + * detached, otherwise it will just be detached. + */ +static inline +void fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) +{ + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) + __fscache_disable_cookie(cookie, invalidate); +} + +/** + * fscache_enable_cookie - Reenable a cookie + * @cookie: The cookie representing the cache object + * @can_enable: A function to permit enablement once lock is held + * @data: Data for can_enable() + * + * Reenable a previously disabled cookie, allowing it to accept further alloc, + * read, write, invalidate, update or acquire operations. An attempt will be + * made to immediately reattach the cookie to a backing object. + * + * The can_enable() function is called (if not NULL) once the enablement lock + * is held to rule on whether enablement is still permitted to go ahead. + */ +static inline +void fscache_enable_cookie(struct fscache_cookie *cookie, + bool (*can_enable)(void *data), + void *data) +{ + if (fscache_cookie_valid(cookie) && !fscache_cookie_enabled(cookie)) + __fscache_enable_cookie(cookie, can_enable, data); +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v0.10.2 From f1fe29b4a02d0805aa7d0ff6b73410a9f9316d69 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 Sep 2013 11:20:03 +0100 Subject: NFS: Use i_writecount to control whether to get an fscache cookie in nfs_open() Use i_writecount to control whether to get an fscache cookie in nfs_open() as NFS does not do write caching yet. I *think* this is the cause of a problem encountered by Mark Moseley whereby __fscache_uncache_page() gets a NULL pointer dereference because cookie->def is NULL: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] __fscache_uncache_page+0x23/0x160 PGD 0 Thread overran stack, or stack corrupted Oops: 0000 [#1] SMP Modules linked in: ... CPU: 7 PID: 18993 Comm: php Not tainted 3.11.1 #1 Hardware name: Dell Inc. PowerEdge R420/072XWF, BIOS 1.3.5 08/21/2012 task: ffff8804203460c0 ti: ffff880420346640 RIP: 0010:[] __fscache_uncache_page+0x23/0x160 RSP: 0018:ffff8801053af878 EFLAGS: 00210286 RAX: 0000000000000000 RBX: ffff8800be2f8780 RCX: ffff88022ffae5e8 RDX: 0000000000004c66 RSI: ffffea00055ff440 RDI: ffff8800be2f8780 RBP: ffff8801053af898 R08: 0000000000000001 R09: 0000000000000003 R10: 0000000000000000 R11: 0000000000000000 R12: ffffea00055ff440 R13: 0000000000001000 R14: ffff8800c50be538 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88042fc60000(0063) knlGS:00000000e439c700 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000001d8f000 CR4: 00000000000607f0 Stack: ... Call Trace: [] __nfs_fscache_invalidate_page+0x42/0x70 [] nfs_invalidate_page+0x75/0x90 [] truncate_inode_page+0x8e/0x90 [] truncate_inode_pages_range.part.12+0x14d/0x620 [] ? __mutex_lock_slowpath+0x1fd/0x2e0 [] truncate_inode_pages_range+0x53/0x70 [] truncate_inode_pages+0x2d/0x40 [] truncate_pagecache+0x4f/0x70 [] nfs_setattr_update_inode+0xa0/0x120 [] nfs3_proc_setattr+0xc4/0xe0 [] nfs_setattr+0xc8/0x150 [] notify_change+0x1cb/0x390 [] do_truncate+0x7b/0xc0 [] do_last+0xa4c/0xfd0 [] path_openat+0xcc/0x670 [] do_filp_open+0x4e/0xb0 [] do_sys_open+0x13f/0x2b0 [] compat_SyS_open+0x36/0x50 [] sysenter_dispatch+0x7/0x24 The code at the instruction pointer was disassembled: > (gdb) disas __fscache_uncache_page > Dump of assembler code for function __fscache_uncache_page: > ... > 0xffffffff812a18ff <+31>: mov 0x48(%rbx),%rax > 0xffffffff812a1903 <+35>: cmpb $0x0,0x10(%rax) > 0xffffffff812a1907 <+39>: je 0xffffffff812a19cd <__fscache_uncache_page+237> These instructions make up: ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); That cmpb is the faulting instruction (%rax is 0). So cookie->def is NULL - which presumably means that the cookie has already been at least partway through __fscache_relinquish_cookie(). What I think may be happening is something like a three-way race on the same file: PROCESS 1 PROCESS 2 PROCESS 3 =============== =============== =============== open(O_TRUNC|O_WRONLY) open(O_RDONLY) open(O_WRONLY) -->nfs_open() -->nfs_fscache_set_inode_cookie() nfs_fscache_inode_lock() nfs_fscache_disable_inode_cookie() __fscache_relinquish_cookie() nfs_inode->fscache = NULL <--nfs_fscache_set_inode_cookie() -->nfs_open() -->nfs_fscache_set_inode_cookie() nfs_fscache_inode_lock() nfs_fscache_enable_inode_cookie() __fscache_acquire_cookie() nfs_inode->fscache = cookie <--nfs_fscache_set_inode_cookie() <--nfs_open() -->nfs_setattr() ... ... -->nfs_invalidate_page() -->__nfs_fscache_invalidate_page() cookie = nfsi->fscache -->nfs_open() -->nfs_fscache_set_inode_cookie() nfs_fscache_inode_lock() nfs_fscache_disable_inode_cookie() -->__fscache_relinquish_cookie() -->__fscache_uncache_page(cookie) <--__fscache_relinquish_cookie() nfs_inode->fscache = NULL <--nfs_fscache_set_inode_cookie() What is needed is something to prevent process #2 from reacquiring the cookie - and I think checking i_writecount should do the trick. It's also possible to have a two-way race on this if the file is opened O_TRUNC|O_RDONLY instead. Reported-by: Mark Moseley Signed-off-by: David Howells diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 854a8f0..4c5edcc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1381,7 +1381,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i static int do_open(struct inode *inode, struct file *filp) { - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); return 0; } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index cd6e7ef..3ef01f0 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) /* * Initialise the per-inode cache cookie pointer for an NFS inode. */ -void nfs_fscache_init_inode_cookie(struct inode *inode) +void nfs_fscache_init_inode(struct inode *inode) { - NFS_I(inode)->fscache = NULL; - if (S_ISREG(inode->i_mode)) - set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); -} - -/* - * Get the per-inode cache cookie for an NFS inode. - */ -static void nfs_fscache_enable_inode_cookie(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; struct nfs_inode *nfsi = NFS_I(inode); - if (nfsi->fscache || !NFS_FSCACHE(inode)) + nfsi->fscache = NULL; + if (!S_ISREG(inode->i_mode)) return; - - if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { - nfsi->fscache = fscache_acquire_cookie( - NFS_SB(sb)->fscache, - &nfs_fscache_inode_object_def, - nfsi, true); - - dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", - sb, nfsi, nfsi->fscache); - } + nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi, false); } /* * Release a per-inode cookie. */ -void nfs_fscache_release_inode_cookie(struct inode *inode) +void nfs_fscache_clear_inode(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", - nfsi, nfsi->fscache); + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(nfsi->fscache, 0); + fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } -/* - * Retire a per-inode cookie, destroying the data attached to it. - */ -void nfs_fscache_zap_inode_cookie(struct inode *inode) +static bool nfs_fscache_can_enable(void *data) { - struct nfs_inode *nfsi = NFS_I(inode); + struct inode *inode = data; - dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", - nfsi, nfsi->fscache); - - fscache_relinquish_cookie(nfsi->fscache, 1); - nfsi->fscache = NULL; + return !inode_is_open_for_write(inode); } /* - * Turn off the cache with regard to a per-inode cookie if opened for writing, - * invalidating all the pages in the page cache relating to the associated - * inode to clear the per-page caching. - */ -static void nfs_fscache_disable_inode_cookie(struct inode *inode) -{ - clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); - - if (NFS_I(inode)->fscache) { - dfprintk(FSCACHE, - "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); - - /* Need to uncache any pages attached to this inode that - * fscache knows about before turning off the cache. - */ - fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); - nfs_fscache_zap_inode_cookie(inode); - } -} - -/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -static int nfs_fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * Lock against someone else trying to also acquire or relinquish a cookie - */ -static inline void nfs_fscache_inode_lock(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) - wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, - nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); -} - -/* - * Unlock cookie management lock - */ -static inline void nfs_fscache_inode_unlock(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - smp_mb__before_clear_bit(); - clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); -} - -/* - * Decide if we should enable or disable local caching for this inode. - * - For now, with NFS, only regular files that are open read-only will be able - * to use the cache. - * - May be invoked multiple times in parallel by parallel nfs_open() functions. - */ -void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ - if (NFS_FSCACHE(inode)) { - nfs_fscache_inode_lock(inode); - if ((filp->f_flags & O_ACCMODE) != O_RDONLY) - nfs_fscache_disable_inode_cookie(inode); - else - nfs_fscache_enable_inode_cookie(inode); - nfs_fscache_inode_unlock(inode); - } -} -EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie); - -/* - * Replace a per-inode cookie due to revalidation detecting a file having - * changed on the server. + * Enable or disable caching for a file that is being opened as appropriate. + * The cookie is allocated when the inode is initialised, but is not enabled at + * that time. Enablement is deferred to file-open time to avoid stat() and + * access() thrashing the cache. + * + * For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * + * We enable the cache for an inode if we open it read-only and it isn't + * currently open for writing. We disable the cache if the inode is open + * write-only. + * + * The caller uses the file struct to pin i_writecount on the inode before + * calling us when a file is opened for writing, so we can make use of that. + * + * Note that this may be invoked multiple times in parallel by parallel + * nfs_open() functions. */ -void nfs_fscache_reset_inode_cookie(struct inode *inode) +void nfs_fscache_open_file(struct inode *inode, struct file *filp) { struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_server *nfss = NFS_SERVER(inode); - NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - nfs_fscache_inode_lock(inode); - if (nfsi->fscache) { - /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(nfsi->fscache, 1); - - nfsi->fscache = fscache_acquire_cookie( - nfss->nfs_client->fscache, - &nfs_fscache_inode_object_def, - nfsi, true); + if (!fscache_cookie_valid(cookie)) + return; - dfprintk(FSCACHE, - "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", - nfss, nfsi, old, nfsi->fscache); + if (inode_is_open_for_write(inode)) { + dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); + clear_bit(NFS_INO_FSCACHE, &nfsi->flags); + fscache_disable_cookie(cookie, true); + fscache_uncache_all_inode_pages(cookie, inode); + } else { + dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); + fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); + if (fscache_cookie_enabled(cookie)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); } - nfs_fscache_inode_unlock(inode); } +EXPORT_SYMBOL_GPL(nfs_fscache_open_file); /* * Release the caching state associated with a page, if the page isn't busy @@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode) int nfs_fscache_release_page(struct page *page, gfp_t gfp) { if (PageFsCache(page)) { - struct nfs_inode *nfsi = NFS_I(page->mapping->host); - struct fscache_cookie *cookie = nfsi->fscache; + struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", - cookie, page, nfsi); + cookie, page, NFS_I(page->mapping->host)); if (!fscache_maybe_release_page(cookie, page, gfp)) return 0; @@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp) */ void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct fscache_cookie *cookie = nfsi->fscache; + struct fscache_cookie *cookie = nfs_i_fscache(inode); BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", - cookie, page, nfsi); + cookie, page, NFS_I(inode)); fscache_wait_on_page_write(cookie, page); @@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, dfprintk(FSCACHE, "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", - NFS_I(inode)->fscache, page, page->index, page->flags, inode); + nfs_i_fscache(inode), page, page->index, page->flags, inode); - ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), page, nfs_readpage_from_fscache_complete, ctx, @@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, int ret; dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", - NFS_I(inode)->fscache, npages, inode); + nfs_i_fscache(inode), npages, inode); - ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), mapping, pages, nr_pages, nfs_readpage_from_fscache_complete, ctx, @@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) dfprintk(FSCACHE, "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", - NFS_I(inode)->fscache, page, page->index, page->flags, sync); + nfs_i_fscache(inode), page, page->index, page->flags, sync); - ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); dfprintk(FSCACHE, "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", page, page->index, page->flags, ret); if (ret != 0) { - fscache_uncache_page(NFS_I(inode)->fscache, page); + fscache_uncache_page(nfs_i_fscache(inode), page); nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 4ecb766..d7fe3e7 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *); extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); extern void nfs_fscache_release_super_cookie(struct super_block *); -extern void nfs_fscache_init_inode_cookie(struct inode *); -extern void nfs_fscache_release_inode_cookie(struct inode *); -extern void nfs_fscache_zap_inode_cookie(struct inode *); -extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void nfs_fscache_reset_inode_cookie(struct inode *); +extern void nfs_fscache_init_inode(struct inode *); +extern void nfs_fscache_clear_inode(struct inode *); +extern void nfs_fscache_open_file(struct inode *, struct file *); extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); extern int nfs_fscache_release_page(struct page *, gfp_t); @@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} -static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_set_inode_cookie(struct inode *inode, - struct file *filp) {} -static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_init_inode(struct inode *inode) {} +static inline void nfs_fscache_clear_inode(struct inode *inode) {} +static inline void nfs_fscache_open_file(struct inode *inode, + struct file *filp) {} static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index eda8879..bb90bff 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -122,7 +122,7 @@ void nfs_clear_inode(struct inode *inode) WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); - nfs_fscache_release_inode_cookie(inode); + nfs_fscache_clear_inode(inode); } EXPORT_SYMBOL_GPL(nfs_clear_inode); @@ -459,7 +459,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; - nfs_fscache_init_inode_cookie(inode); + nfs_fscache_init_inode(inode); unlock_new_inode(inode); } else @@ -854,7 +854,7 @@ int nfs_open(struct inode *inode, struct file *filp) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); return 0; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e5b804d..5b8a618 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -74,7 +74,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_file_set_open_context(filp, ctx); - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); err = 0; out_put_ctx: diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f40547..955dff5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2292,6 +2292,11 @@ static inline void allow_write_access(struct file *file) if (file) atomic_inc(&file_inode(file)->i_writecount); } +static inline bool inode_is_open_for_write(const struct inode *inode) +{ + return atomic_read(&inode->i_writecount) > 0; +} + #ifdef CONFIG_IMA static inline void i_readcount_dec(struct inode *inode) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 3ea4cde..14a4820 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -269,9 +269,13 @@ static inline int NFS_STALE(const struct inode *inode) return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } -static inline int NFS_FSCACHE(const struct inode *inode) +static inline struct fscache_cookie *nfs_i_fscache(struct inode *inode) { - return test_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +#ifdef CONFIG_NFS_FSCACHE + return NFS_I(inode)->fscache; +#else + return NULL; +#endif } static inline __u64 NFS_FILEID(const struct inode *inode) -- cgit v0.10.2 From a6f951ddbdfb7bd87d31a44f61abe202ed6ce57f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Oct 2013 14:24:58 -0400 Subject: NFSv4: Fix a use-after-free situation in _nfs4_proc_getlk() In nfs4_proc_getlk(), when some error causes a retry of the call to _nfs4_proc_getlk(), we can end up with Oopses of the form BUG: unable to handle kernel NULL pointer dereference at 0000000000000134 IP: [] _raw_spin_lock+0xe/0x30 Call Trace: [] _atomic_dec_and_lock+0x4d/0x70 [] nfs4_put_lock_state+0x32/0xb0 [nfsv4] [] nfs4_fl_release_lock+0x15/0x20 [nfsv4] [] _nfs4_proc_getlk.isra.40+0x146/0x170 [nfsv4] [] nfs4_proc_lock+0x399/0x5a0 [nfsv4] The problem is that we don't clear the request->fl_ops after the first try and so when we retry, nfs4_set_lock_state() exits early without setting the lock stateid. Regression introduced by commit 70cc6487a4e08b8698c0e2ec935fb48d10490162 (locks: make ->lock release private data before returning in GETLK case) Reported-by: Weston Andros Adamson Reported-by: Jorge Mora Signed-off-by: Trond Myklebust Cc: #2.6.22+ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d53d678..d2b4845 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5106,6 +5106,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = 0; } request->fl_ops->fl_release_private(request); + request->fl_ops = NULL; out: return status; } -- cgit v0.10.2 From 7f260e8575bf53b93b77978c1e39f8e67612759c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 24 Sep 2013 11:25:22 -0400 Subject: SUNRPC: Enable the keepalive option for TCP sockets For NFSv4 we want to avoid retransmitting RPC calls unless the TCP connection breaks. However we still want to detect TCP connection breakage as soon as possible. Do this by setting the keepalive option with the idle timeout and count set to the 'timeo' and 'retrans' mount options. Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ee03d35..208a763 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2112,6 +2112,19 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; + unsigned int keepidle = xprt->timeout->to_initval / HZ; + unsigned int keepcnt = xprt->timeout->to_retries + 1; + unsigned int opt_on = 1; + + /* TCP Keepalive options */ + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&opt_on, sizeof(opt_on)); + kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&keepidle, sizeof(keepidle)); + kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&keepidle, sizeof(keepidle)); + kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&keepcnt, sizeof(keepcnt)); write_lock_bh(&sk->sk_callback_lock); -- cgit v0.10.2 From 8b71798c0d389d4cadc884fc7d68c61ee8cd4f45 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Sep 2013 10:18:04 -0400 Subject: SUNRPC: Only update the TCP connect cookie on a successful connect Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 208a763..9928ba1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1511,6 +1511,7 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_copied = 0; transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; + xprt->connect_cookie++; xprt_wake_pending_tasks(xprt, -EAGAIN); } @@ -2164,7 +2165,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) case 0: case -EINPROGRESS: /* SYN_SENT! */ - xprt->connect_cookie++; if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; } -- cgit v0.10.2 From 0a6605213040dd2fb479f0d1a9a87a1d7fa70904 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Sep 2013 11:31:54 -0400 Subject: SUNRPC: Don't set the request connect_cookie until a successful transmit We're using the request connect_cookie to track whether or not a request was successfully transmitted on the current transport connection or not. For that reason we should ensure that it is only set after we've successfully transmitted the request. Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 095363e..e9ee7bf 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -912,7 +912,6 @@ void xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; - req->rq_connect_cookie = xprt->connect_cookie; req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); if (status != 0) { @@ -938,12 +937,14 @@ void xprt_transmit(struct rpc_task *task) /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; - else if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) { + else { /* * Sleep on the pending queue since * we're expecting a reply. */ - rpc_sleep_on(&xprt->pending, task, xprt_timer); + if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) + rpc_sleep_on(&xprt->pending, task, xprt_timer); + req->rq_connect_cookie = xprt->connect_cookie; } spin_unlock_bh(&xprt->transport_lock); } @@ -1186,6 +1187,7 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_xprt = xprt; req->rq_buffer = NULL; req->rq_xid = xprt_alloc_xid(xprt); + req->rq_connect_cookie = xprt->connect_cookie - 1; req->rq_release_snd_buf = NULL; xprt_reset_majortimeo(req); dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, -- cgit v0.10.2 From ee071eff0f1afafa9917254a6e4ee19d28085f1d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 Sep 2013 11:09:53 -0400 Subject: SUNRPC: Clear the request rq_bytes_sent field in xprt_release_write Otherwise the tests of req->rq_bytes_sent in xprt_prepare_transmit will fail if we're dealing with a resend. Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e9ee7bf..8cc5c8b 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -358,6 +358,11 @@ out_unlock: void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { + if (task != NULL) { + struct rpc_rqst *req = task->tk_rqstp; + if (req != NULL) + req->rq_bytes_sent = 0; + } xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -375,6 +380,11 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { + if (task != NULL) { + struct rpc_rqst *req = task->tk_rqstp; + if (req != NULL) + req->rq_bytes_sent = 0; + } xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } -- cgit v0.10.2 From 90051ea774613ffc6b8aad3dc665c8505d6205a8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Sep 2013 12:17:18 -0400 Subject: SUNRPC: Clean up - convert xprt_prepare_transmit to return a bool Signed-off-by: Trond Myklebust diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index cec7b9b..8097b9d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -288,7 +288,7 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); -int xprt_prepare_transmit(struct rpc_task *task); +bool xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7747960..fa50e42 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1722,8 +1722,7 @@ call_transmit(struct rpc_task *task) task->tk_action = call_status; if (task->tk_status < 0) return; - task->tk_status = xprt_prepare_transmit(task); - if (task->tk_status != 0) + if (!xprt_prepare_transmit(task)) return; task->tk_action = call_transmit_status; /* Encode here so that rpcsec_gss can use correct sequence number. */ @@ -1811,8 +1810,7 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - task->tk_status = xprt_prepare_transmit(task); - if (task->tk_status == -EAGAIN) { + if (!xprt_prepare_transmit(task)) { /* * Could not reserve the transport. Try again after the * transport is released. diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8cc5c8b..2326af5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -864,24 +864,27 @@ static inline int xprt_has_timer(struct rpc_xprt *xprt) * @task: RPC task about to send a request * */ -int xprt_prepare_transmit(struct rpc_task *task) +bool xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - int err = 0; + bool ret = false; dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); spin_lock_bh(&xprt->transport_lock); if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) { - err = req->rq_reply_bytes_recvd; + task->tk_status = req->rq_reply_bytes_recvd; goto out_unlock; } - if (!xprt->ops->reserve_xprt(xprt, task)) - err = -EAGAIN; + if (!xprt->ops->reserve_xprt(xprt, task)) { + task->tk_status = -EAGAIN; + goto out_unlock; + } + ret = true; out_unlock: spin_unlock_bh(&xprt->transport_lock); - return err; + return ret; } void xprt_end_transmit(struct rpc_task *task) -- cgit v0.10.2 From 8a19a0b6cb2e2216afd68ef2047f30260cc8a220 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 24 Sep 2013 12:00:27 -0400 Subject: SUNRPC: Add RPC task and client level options to disable the resend timeout Signed-off-by: Trond Myklebust diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 6740801..943ee89 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -49,6 +49,7 @@ struct rpc_clnt { unsigned int cl_softrtry : 1,/* soft timeouts */ cl_discrtry : 1,/* disconnect before retry */ + cl_noretranstimeo: 1,/* No retransmit timeouts */ cl_autobind : 1,/* use getport() */ cl_chatty : 1;/* be verbose */ @@ -126,6 +127,7 @@ struct rpc_create_args { #define RPC_CLNT_CREATE_QUIET (1UL << 6) #define RPC_CLNT_CREATE_INFINITE_SLOTS (1UL << 7) #define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT (1UL << 8) +#define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 096ee58..3a847de 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -122,6 +122,7 @@ struct rpc_task_setup { #define RPC_TASK_SENT 0x0800 /* message was sent */ #define RPC_TASK_TIMEOUT 0x1000 /* fail with ETIMEDOUT on timeout */ #define RPC_TASK_NOCONNECT 0x2000 /* return ENOTCONN if not connected */ +#define RPC_TASK_NO_RETRANS_TIMEOUT 0x4000 /* wait forever for a reply */ #define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC) #define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fa50e42..b585250 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -772,6 +772,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) atomic_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; + if (clnt->cl_noretranstimeo) + task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; if (sk_memalloc_socks()) { struct rpc_xprt *xprt; @@ -1898,7 +1900,8 @@ call_status(struct rpc_task *task) rpc_delay(task, 3*HZ); case -ETIMEDOUT: task->tk_action = call_timeout; - if (task->tk_client->cl_discrtry) + if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) + && task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2326af5..d166d99 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -873,9 +873,18 @@ bool xprt_prepare_transmit(struct rpc_task *task) dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); spin_lock_bh(&xprt->transport_lock); - if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) { - task->tk_status = req->rq_reply_bytes_recvd; - goto out_unlock; + if (!req->rq_bytes_sent) { + if (req->rq_reply_bytes_recvd) { + task->tk_status = req->rq_reply_bytes_recvd; + goto out_unlock; + } + if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) + && xprt_connected(xprt) + && req->rq_connect_cookie == xprt->connect_cookie) { + xprt->ops->set_retrans_timeout(task); + rpc_sleep_on(&xprt->pending, task, xprt_timer); + goto out_unlock; + } } if (!xprt->ops->reserve_xprt(xprt, task)) { task->tk_status = -EAGAIN; -- cgit v0.10.2 From 99875249bfbfb6d9a2aba020ce65da2862d0dafa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 24 Sep 2013 12:06:07 -0400 Subject: NFSv4: Ensure that we disable the resend timeout for NFSv4 The spec states that the client should not resend requests because the server will disconnect if it needs to drop an RPC request. Signed-off-by: Trond Myklebust diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2dceee4..af03258 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index a860ab5..511cdce 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -368,6 +368,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (clp->cl_minorversion != 0) __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b8cedce..f9c0a6c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -41,6 +41,7 @@ struct nfs_client { #define NFS_CS_DISCRTRY 1 /* - disconnect on RPC retry */ #define NFS_CS_MIGRATION 2 /* - transparent state migr */ #define NFS_CS_INFINITE_SLOTS 3 /* - don't limit TCP slots */ +#define NFS_CS_NO_RETRANS_TIMEOUT 4 /* - Disable retransmit timeouts */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ -- cgit v0.10.2 From ca7f33aa5b8051f17eec81766b8f39c83caf4196 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Sep 2013 13:39:45 -0400 Subject: SUNRPC: Fix RPC call retransmission statistics A retransmit should be when you successfully transmit an RPC call to the server a second time. Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b585250..bde3115 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1719,6 +1719,8 @@ call_connect_status(struct rpc_task *task) static void call_transmit(struct rpc_task *task) { + int is_retrans = RPC_WAS_SENT(task); + dprint_status(task); task->tk_action = call_status; @@ -1743,6 +1745,8 @@ call_transmit(struct rpc_task *task) xprt_transmit(task); if (task->tk_status < 0) return; + if (is_retrans) + task->tk_client->cl_stats->rpcretrans++; /* * On success, ensure that we call xprt_end_transmit() before sleeping * in order to allow access to the socket to other RPC requests. @@ -1983,7 +1987,6 @@ call_timeout(struct rpc_task *task) rpcauth_invalcred(task); retry: - clnt->cl_stats->rpcretrans++; task->tk_action = call_bind; task->tk_status = 0; } @@ -2026,7 +2029,6 @@ call_decode(struct rpc_task *task) if (req->rq_rcv_buf.len < 12) { if (!RPC_IS_SOFT(task)) { task->tk_action = call_bind; - clnt->cl_stats->rpcretrans++; goto out_retry; } dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", -- cgit v0.10.2 From 92551948174d079b12541437f51cbe3e17d9dd24 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 Sep 2013 11:28:40 -0400 Subject: SUNRPC: Remove redundant initialisations of request rq_bytes_sent Now that we clear the rq_bytes_sent field on unlock, we don't need to set it on lock, so we just set it once when initialising the request. Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d166d99..49535505 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -205,10 +205,8 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) goto out_sleep; } xprt->snd_task = task; - if (req != NULL) { - req->rq_bytes_sent = 0; + if (req != NULL) req->rq_ntrans++; - } return 1; @@ -263,7 +261,6 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) } if (__xprt_get_cong(xprt, task)) { xprt->snd_task = task; - req->rq_bytes_sent = 0; req->rq_ntrans++; return 1; } @@ -300,10 +297,8 @@ static bool __xprt_lock_write_func(struct rpc_task *task, void *data) req = task->tk_rqstp; xprt->snd_task = task; - if (req) { - req->rq_bytes_sent = 0; + if (req) req->rq_ntrans++; - } return true; } @@ -329,7 +324,6 @@ static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data) } if (__xprt_get_cong(xprt, task)) { xprt->snd_task = task; - req->rq_bytes_sent = 0; req->rq_ntrans++; return true; } @@ -1210,6 +1204,11 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_buffer = NULL; req->rq_xid = xprt_alloc_xid(xprt); req->rq_connect_cookie = xprt->connect_cookie - 1; + req->rq_bytes_sent = 0; + req->rq_snd_buf.len = 0; + req->rq_snd_buf.buflen = 0; + req->rq_rcv_buf.len = 0; + req->rq_rcv_buf.buflen = 0; req->rq_release_snd_buf = NULL; xprt_reset_majortimeo(req); dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, -- cgit v0.10.2 From 561ec1603171cd9b38dcf6cac53e8710f437a48d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Sep 2013 15:22:45 -0400 Subject: SUNRPC: call_connect_status should recheck bind and connect status on error Currently, we go directly to call_transmit which sends us to call_status on error. If we know that the connect attempt failed, we should rather just jump straight back to call_bind and call_connect. Ditto for EAGAIN, except do not delay. Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index bde3115..7352aef 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1692,6 +1692,7 @@ call_connect_status(struct rpc_task *task) dprint_status(task); trace_rpc_connect_status(task, status); + task->tk_status = 0; switch (status) { /* if soft mounted, test if we've timed out */ case -ETIMEDOUT: @@ -1700,12 +1701,14 @@ call_connect_status(struct rpc_task *task) case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: + /* retry with existing socket, after a delay */ + rpc_delay(task, 3*HZ); if (RPC_IS_SOFTCONN(task)) break; - /* retry with existing socket, after a delay */ - case 0: case -EAGAIN: - task->tk_status = 0; + task->tk_action = call_bind; + return; + case 0: clnt->cl_stats->netreconn++; task->tk_action = call_transmit; return; -- cgit v0.10.2 From 3660cd4322fce986689b06225d9c12d77193c252 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 3 Oct 2013 12:23:24 -0400 Subject: NFSv4 Remove zeroing state kern warnings As of commit 5d422301f97b821301efcdb6fc9d1a83a5c102d6 we no longer zero the state. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cc14cbb..fa2706a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1375,8 +1375,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out; default: - printk(KERN_ERR "NFS: %s: unhandled error %d. " - "Zeroing state\n", __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); case -ENOMEM: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: @@ -1439,15 +1439,12 @@ restart: } switch (status) { default: - printk(KERN_ERR "NFS: %s: unhandled error %d. " - "Zeroing state\n", __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); case -ENOENT: case -ENOMEM: case -ESTALE: - /* - * Open state on this file cannot be recovered - * All we can do is revert to using the zero stateid. - */ + /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); break; case -EAGAIN: -- cgit v0.10.2 From 57acc40d73407159727b3a1456f0a498133831ba Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 18 Oct 2013 10:03:37 -0400 Subject: nfs: reject version and minorversion changes on remount attempts Reported-by: Eric Doutreleau Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a03b9c6..137572b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2143,6 +2143,8 @@ nfs_compare_remount_data(struct nfs_server *nfss, if (data->flags != nfss->flags || data->rsize != nfss->rsize || data->wsize != nfss->wsize || + data->version != nfss->nfs_client->rpc_ops->version || + data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || data->acregmin != nfss->acregmin / HZ || @@ -2197,6 +2199,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + data->version = nfsvers; + data->minorversion = nfss->nfs_client->cl_minorversion; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); -- cgit v0.10.2 From 1966903f8e28b31ff82de2e2180f0c066399288d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 21 Oct 2013 09:52:19 -0400 Subject: nfs: fix handling of invalid mount options in nfs_remount nfs_parse_mount_options returns 0 on error, not -errno. Reported-by: Karel Zak Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 137572b..3f5a7a8 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2205,8 +2205,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->nfs_server.addrlen); /* overwrite those values with any that were specified */ - error = nfs_parse_mount_options((char *)options, data); - if (error < 0) + error = -EINVAL; + if (!nfs_parse_mount_options((char *)options, data)) goto out; /* -- cgit v0.10.2 From 83c78eb0425f587a34a6644da79bdcde987dc974 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Oct 2013 14:50:38 -0400 Subject: NFSv4.1: Don't change the security label as part of open reclaim. The current caching model calls for the security label to be set on first lookup and/or on any subsequent label changes. There is no need to do it as part of an open reclaim. Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2b4845..8dd61eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1337,8 +1337,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (ret) goto err; - nfs_setsecurity(inode, &data->f_attr, data->f_label); - if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, -- cgit v0.10.2 From f494a6071d31e3294a3b51ad7a3684f983953f9f Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 21 Oct 2013 13:10:10 -0400 Subject: NFSv4: fix NULL dereference in open recover _nfs4_opendata_reclaim_to_nfs4_state doesn't expect to see a cached open CLAIM_PREVIOUS, but this can happen. An example is when there are RDWR openers and RDONLY openers on a delegation stateid. The recovery path will first try an open CLAIM_PREVIOUS for the RDWR openers, this marks the delegation as not needing RECLAIM anymore, so the open CLAIM_PREVIOUS for the RDONLY openers will not actually send an rpc. The NULL dereference is due to _nfs4_opendata_reclaim_to_nfs4_state returning PTR_ERR(rpc_status) when !rpc_done. When the open is cached, rpc_done == 0 and rpc_status == 0, thus _nfs4_opendata_reclaim_to_nfs4_state returns NULL - this is unexpected by callers of nfs4_opendata_to_nfs4_state(). This can be reproduced easily by opening the same file two times on an NFSv4.0 mount with delegations enabled, once as RDWR and once as RDONLY then sleeping for a long time. While the files are held open, kick off state recovery and this NULL dereference will be hit every time. An example OOPS: [ 65.003602] BUG: unable to handle kernel NULL pointer dereference at 00000000 00000030 [ 65.005312] IP: [] __nfs4_close+0x1e/0x160 [nfsv4] [ 65.006820] PGD 7b0ea067 PUD 791ff067 PMD 0 [ 65.008075] Oops: 0000 [#1] SMP [ 65.008802] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache snd_ens1371 gameport nfsd snd_rawmidi snd_ac97_codec ac97_bus btusb snd_seq snd _seq_device snd_pcm ppdev bluetooth auth_rpcgss coretemp snd_page_alloc crc32_pc lmul crc32c_intel ghash_clmulni_intel microcode rfkill nfs_acl vmw_balloon serio _raw snd_timer lockd parport_pc e1000 snd soundcore parport i2c_piix4 shpchp vmw _vmci sunrpc ata_generic mperf pata_acpi mptspi vmwgfx ttm scsi_transport_spi dr m mptscsih mptbase i2c_core [ 65.018684] CPU: 0 PID: 473 Comm: 192.168.10.85-m Not tainted 3.11.2-201.fc19 .x86_64 #1 [ 65.020113] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013 [ 65.022012] task: ffff88003707e320 ti: ffff88007b906000 task.ti: ffff88007b906000 [ 65.023414] RIP: 0010:[] [] __nfs4_close+0x1e/0x160 [nfsv4] [ 65.025079] RSP: 0018:ffff88007b907d10 EFLAGS: 00010246 [ 65.026042] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 65.027321] RDX: 0000000000000050 RSI: 0000000000000001 RDI: 0000000000000000 [ 65.028691] RBP: ffff88007b907d38 R08: 0000000000016f60 R09: 0000000000000000 [ 65.029990] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 [ 65.031295] R13: 0000000000000050 R14: 0000000000000000 R15: 0000000000000001 [ 65.032527] FS: 0000000000000000(0000) GS:ffff88007f600000(0000) knlGS:0000000000000000 [ 65.033981] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 65.035177] CR2: 0000000000000030 CR3: 000000007b27f000 CR4: 00000000000407f0 [ 65.036568] Stack: [ 65.037011] 0000000000000000 0000000000000001 ffff88007b907d90 ffff88007a880220 [ 65.038472] ffff88007b768de8 ffff88007b907d48 ffffffffa037e4a5 ffff88007b907d80 [ 65.039935] ffffffffa036a6c8 ffff880037020e40 ffff88007a880000 ffff880037020e40 [ 65.041468] Call Trace: [ 65.042050] [] nfs4_close_state+0x15/0x20 [nfsv4] [ 65.043209] [] nfs4_open_recover_helper+0x148/0x1f0 [nfsv4] [ 65.044529] [] nfs4_open_recover+0x116/0x150 [nfsv4] [ 65.045730] [] nfs4_open_reclaim+0xad/0x150 [nfsv4] [ 65.046905] [] nfs4_do_reclaim+0x149/0x5f0 [nfsv4] [ 65.048071] [] nfs4_run_state_manager+0x3bc/0x670 [nfsv4] [ 65.049436] [] ? nfs4_do_reclaim+0x5f0/0x5f0 [nfsv4] [ 65.050686] [] ? nfs4_do_reclaim+0x5f0/0x5f0 [nfsv4] [ 65.051943] [] kthread+0xc0/0xd0 [ 65.052831] [] ? insert_kthread_work+0x40/0x40 [ 65.054697] [] ret_from_fork+0x7c/0xb0 [ 65.056396] [] ? insert_kthread_work+0x40/0x40 [ 65.058208] Code: 5c 41 5d 5d c3 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 89 d5 41 54 53 48 89 fb <4c> 8b 67 30 f0 41 ff 44 24 44 49 8d 7c 24 40 e8 0e 0a 2d e1 44 [ 65.065225] RIP [] __nfs4_close+0x1e/0x160 [nfsv4] [ 65.067175] RSP [ 65.068570] CR2: 0000000000000030 [ 65.070098] ---[ end trace 0d1fe4f5c7dd6f8b ]--- Cc: #3.7+ Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dd61eb..acd20c5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1317,7 +1317,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) struct nfs4_state *state = data->state; int ret; - if (!data->rpc_done) { + /* allow cached opens (!rpc_done && !rpc_status) */ + if (!data->rpc_done && data->rpc_status) { ret = data->rpc_status; goto err; } -- cgit v0.10.2 From a43ec98b72aae3e330f0673438f58316c3769b84 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 21 Oct 2013 13:10:11 -0400 Subject: NFSv4: don't fail on missing fattr in open recover This is an unneeded check that could cause the client to fail to recover opens. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index acd20c5..e76fd21 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1323,12 +1323,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) goto err; } - ret = -ESTALE; - if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) || - !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) || - !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE)) - goto err; - ret = -ENOMEM; state = nfs4_get_open_state(inode, data->owner); if (state == NULL) -- cgit v0.10.2 From d49f042aeec99c5f87160bb52dd52088b1051311 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Oct 2013 14:57:12 -0400 Subject: NFSv4: Fix state reference counting in _nfs4_opendata_reclaim_to_nfs4_state Currently, if the call to nfs_refresh_inode fails, then we end up leaking a reference count, due to the call to nfs4_get_open_state. While we're at it, replace nfs4_get_open_state with a simple call to atomic_inc(); there is no need to do a full lookup of the struct nfs_state since it is passed as an argument in the struct nfs4_opendata, and is already assigned to the variable 'state'. Cc: stable@vger.kernel.org # 3.7.x: a43ec98b72a: NFSv4: don't fail on missing Cc: stable@vger.kernel.org # 3.7.x Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e76fd21..d3c4255 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1323,11 +1323,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) goto err; } - ret = -ENOMEM; - state = nfs4_get_open_state(inode, data->owner); - if (state == NULL) - goto err; - ret = nfs_refresh_inode(inode, &data->f_attr); if (ret) goto err; @@ -1336,6 +1331,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); + atomic_inc(&state->count); return state; err: -- cgit v0.10.2 From d2bfda2e7aa036f90ccea610a657064b1e267913 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 21 Oct 2013 13:10:13 -0400 Subject: NFSv4: don't reprocess cached open CLAIM_PREVIOUS Cached opens have already been handled by _nfs4_opendata_reclaim_to_nfs4_state and can safely skip being reprocessed, but must still call update_open_stateid to make sure that all active fmodes are recovered. Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org # 3.7.x: f494a6071d3: NFSv4: fix NULL dereference Cc: stable@vger.kernel.org # 3.7.x: a43ec98b72a: NFSv4: don't fail on missin Cc: stable@vger.kernel.org # 3.7.x Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d3c4255..259f8be 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1317,10 +1317,13 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) struct nfs4_state *state = data->state; int ret; - /* allow cached opens (!rpc_done && !rpc_status) */ - if (!data->rpc_done && data->rpc_status) { - ret = data->rpc_status; - goto err; + if (!data->rpc_done) { + if (data->rpc_status) { + ret = data->rpc_status; + goto err; + } + /* cached opens have already been processed */ + goto update; } ret = nfs_refresh_inode(inode, &data->f_attr); @@ -1329,6 +1332,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); +update: update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); atomic_inc(&state->count); -- cgit v0.10.2 From d746e54522e52275865ca23917c16b3fdc226e51 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:12:17 -0400 Subject: SUNRPC: Modify synopsis of rpc_client_register() The rpc_client_register() helper was added in commit e73f4cc0, "SUNRPC: split client creation routine into setup and registration," Mon Jun 24 11:52:52 2013. In a subsequent patch, I'd like to invoke rpc_client_register() from a context where a struct rpc_create_args is not available. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7352aef..587f31e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -272,12 +272,13 @@ static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen); } -static int rpc_client_register(const struct rpc_create_args *args, - struct rpc_clnt *clnt) +static int rpc_client_register(struct rpc_clnt *clnt, + rpc_authflavor_t pseudoflavor, + const char *client_name) { struct rpc_auth_create_args auth_args = { - .pseudoflavor = args->authflavor, - .target_name = args->client_name, + .pseudoflavor = pseudoflavor, + .target_name = client_name, }; struct rpc_auth *auth; struct net *net = rpc_net_ns(clnt); @@ -298,7 +299,7 @@ static int rpc_client_register(const struct rpc_create_args *args, auth = rpcauth_create(&auth_args, clnt); if (IS_ERR(auth)) { dprintk("RPC: Couldn't create auth handle (flavor %u)\n", - args->authflavor); + pseudoflavor); err = PTR_ERR(auth); goto err_auth; } @@ -398,7 +399,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, /* save the nodename */ rpc_clnt_set_nodename(clnt, utsname()->nodename); - err = rpc_client_register(args, clnt); + err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) goto out_no_path; if (parent) -- cgit v0.10.2 From 40b00b6b17c412ff9ff28631250d32ee29ff0006 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 17 Oct 2013 14:12:23 -0400 Subject: SUNRPC: Add a helper to switch the transport of an rpc_clnt Add an RPC client API to redirect an rpc_clnt's transport from a source server to a destination server during a migration event. Signed-off-by: Trond Myklebust [ cel: forward ported to 3.12 ] Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 943ee89..8af2804 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -136,6 +136,10 @@ void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *, rpc_authflavor_t); +int rpc_switch_client_transport(struct rpc_clnt *, + struct xprt_create *, + const struct rpc_timeout *); + void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); void rpc_task_release_client(struct rpc_task *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 587f31e..f167d9c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -264,6 +265,25 @@ void rpc_clients_notifier_unregister(void) return rpc_pipefs_notifier_unregister(&rpc_clients_block); } +static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + const struct rpc_timeout *timeout) +{ + struct rpc_xprt *old; + + spin_lock(&clnt->cl_lock); + old = clnt->cl_xprt; + + if (!xprt_bound(xprt)) + clnt->cl_autobind = 1; + + clnt->cl_timeout = timeout; + rcu_assign_pointer(clnt->cl_xprt, xprt); + spin_unlock(&clnt->cl_lock); + + return old; +} + static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) { clnt->cl_nodelen = strlen(nodename); @@ -338,7 +358,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, { const struct rpc_program *program = args->program; const struct rpc_version *version; - struct rpc_clnt *clnt = NULL; + struct rpc_clnt *clnt = NULL; + const struct rpc_timeout *timeout; int err; /* sanity check the name before trying to print it */ @@ -366,7 +387,6 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, if (err) goto out_no_clid; - rcu_assign_pointer(clnt->cl_xprt, xprt); clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_prog = args->prognumber ? : program->number; @@ -381,16 +401,15 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, INIT_LIST_HEAD(&clnt->cl_tasks); spin_lock_init(&clnt->cl_lock); - if (!xprt_bound(xprt)) - clnt->cl_autobind = 1; - - clnt->cl_timeout = xprt->timeout; + timeout = xprt->timeout; if (args->timeout != NULL) { memcpy(&clnt->cl_timeout_default, args->timeout, sizeof(clnt->cl_timeout_default)); - clnt->cl_timeout = &clnt->cl_timeout_default; + timeout = &clnt->cl_timeout_default; } + rpc_clnt_set_transport(clnt, xprt, timeout); + clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); @@ -601,6 +620,80 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) } EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth); +/** + * rpc_switch_client_transport: switch the RPC transport on the fly + * @clnt: pointer to a struct rpc_clnt + * @args: pointer to the new transport arguments + * @timeout: pointer to the new timeout parameters + * + * This function allows the caller to switch the RPC transport for the + * rpc_clnt structure 'clnt' to allow it to connect to a mirrored NFS + * server, for instance. It assumes that the caller has ensured that + * there are no active RPC tasks by using some form of locking. + * + * Returns zero if "clnt" is now using the new xprt. Otherwise a + * negative errno is returned, and "clnt" continues to use the old + * xprt. + */ +int rpc_switch_client_transport(struct rpc_clnt *clnt, + struct xprt_create *args, + const struct rpc_timeout *timeout) +{ + const struct rpc_timeout *old_timeo; + rpc_authflavor_t pseudoflavor; + struct rpc_xprt *xprt, *old; + struct rpc_clnt *parent; + int err; + + xprt = xprt_create_transport(args); + if (IS_ERR(xprt)) { + dprintk("RPC: failed to create new xprt for clnt %p\n", + clnt); + return PTR_ERR(xprt); + } + + pseudoflavor = clnt->cl_auth->au_flavor; + + old_timeo = clnt->cl_timeout; + old = rpc_clnt_set_transport(clnt, xprt, timeout); + + rpc_unregister_client(clnt); + __rpc_clnt_remove_pipedir(clnt); + + /* + * A new transport was created. "clnt" therefore + * becomes the root of a new cl_parent tree. clnt's + * children, if it has any, still point to the old xprt. + */ + parent = clnt->cl_parent; + clnt->cl_parent = clnt; + + /* + * The old rpc_auth cache cannot be re-used. GSS + * contexts in particular are between a single + * client and server. + */ + err = rpc_client_register(clnt, pseudoflavor, NULL); + if (err) + goto out_revert; + + synchronize_rcu(); + if (parent != clnt) + rpc_release_client(parent); + xprt_put(old); + dprintk("RPC: replaced xprt for clnt %p\n", clnt); + return 0; + +out_revert: + rpc_clnt_set_transport(clnt, old, old_timeo); + clnt->cl_parent = parent; + rpc_client_register(clnt, pseudoflavor, NULL); + xprt_put(xprt); + dprintk("RPC: failed to switch xprt for clnt %p\n", clnt); + return err; +} +EXPORT_SYMBOL_GPL(rpc_switch_client_transport); + /* * Kill all tasks for the given client. * XXX: kill their descendants as well? -- cgit v0.10.2 From 32e62b7c3ef095eccbb6a8c96fddf05dacc749df Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:12:28 -0400 Subject: NFS: Add nfs4_update_server New function nfs4_update_server() moves an nfs_server to a different nfs_client. This is done as part of migration recovery. Though it may be appealing to think of them as the same thing, migration recovery is not the same as following a referral. For a referral, the client has not descended into the file system yet: it has no nfs_server, no super block, no inodes or open state. It is enough to simply instantiate the nfs_server and super block, and perform a referral mount. For a migration, however, we have all of those things already, and they have to be moved to a different nfs_client. No local namespace changes are needed here. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/client.c b/fs/nfs/client.c index af03258..692fd0e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -945,7 +945,7 @@ void nfs_server_insert_lists(struct nfs_server *server) } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); -static void nfs_server_remove_lists(struct nfs_server *server) +void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; @@ -962,6 +962,7 @@ static void nfs_server_remove_lists(struct nfs_server *server) synchronize_rcu(); } +EXPORT_SYMBOL_GPL(nfs_server_remove_lists); /* * Allocate and initialise a server record diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 38da8c2..e5a6bd1 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); +void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); @@ -174,6 +175,8 @@ extern struct nfs_server *nfs4_create_server( struct nfs_subversion *); extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, struct nfs_fh *); +extern int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 511cdce..c6eee81 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1092,3 +1092,111 @@ error: dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); } + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. + */ +static int nfs_probe_destination(struct nfs_server *server) +{ + struct inode *inode = server->super->s_root->d_inode; + struct nfs_fattr *fattr; + int error; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* Sanity: the probe won't work if the destination server + * does not recognize the migrated FH. */ + error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); + + nfs_free_fattr(fattr); + return error; +} + +/** + * nfs4_update_server - Move an nfs_server to a different nfs_client + * + * @server: represents FSID to be moved + * @hostname: new end-point's hostname + * @sap: new end-point's socket address + * @salen: size of "sap" + * + * The nfs_server must be quiescent before this function is invoked. + * Either its session is drained (NFSv4.1+), or its transport is + * plugged and drained (NFSv4.0). + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen) +{ + struct nfs_client *clp = server->nfs_client; + struct rpc_clnt *clnt = server->client; + struct xprt_create xargs = { + .ident = clp->cl_proto, + .net = &init_net, + .dstaddr = sap, + .addrlen = salen, + .servername = hostname, + }; + char buf[INET6_ADDRSTRLEN + 1]; + struct sockaddr_storage address; + struct sockaddr *localaddr = (struct sockaddr *)&address; + int error; + + dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + hostname); + + error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); + if (error != 0) { + dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", + __func__, error); + goto out; + } + + error = rpc_localaddr(clnt, localaddr, sizeof(address)); + if (error != 0) { + dprintk("<-- %s(): rpc_localaddr returned %d\n", + __func__, error); + goto out; + } + + error = -EAFNOSUPPORT; + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { + dprintk("<-- %s(): rpc_ntop returned %d\n", + __func__, error); + goto out; + } + + nfs_server_remove_lists(server); + error = nfs4_set_client(server, hostname, sap, salen, buf, + clp->cl_rpcclient->cl_auth->au_flavor, + clp->cl_proto, clnt->cl_timeout, + clp->cl_minorversion, clp->cl_net); + nfs_put_client(clp); + if (error != 0) { + nfs_server_insert_lists(server); + dprintk("<-- %s(): nfs4_set_client returned %d\n", + __func__, error); + goto out; + } + + if (server->nfs_client->cl_hostname == NULL) + server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + nfs_server_insert_lists(server); + + error = nfs_probe_destination(server); + if (error < 0) + goto out; + + dprintk("<-- %s() succeeded\n", __func__); + +out: + return error; +} -- cgit v0.10.2 From 800c06a5bf497e0587ba8382a761f31a7825e2cd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:12:34 -0400 Subject: NFS: Add functions to swap transports during migration recovery Introduce functions that can walk through an array of returned fs_locations information and connect a transport to one of the destination servers listed therein. Note that NFS minor version 1 introduces "fs_locations_info" which extends the locations array sorting criteria available to clients. This is not supported yet. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 28842ab..fcae728 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -217,6 +217,8 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations); /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2288cd3..ebd8b06 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -400,3 +400,104 @@ out: rpc_shutdown_client(client); return mnt; } + +/* + * Try one location from the fs_locations array. + * + * Returns zero on success, or a negative errno value. + */ +static int nfs4_try_replacing_one_location(struct nfs_server *server, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct sockaddr *sap; + unsigned int s; + size_t salen; + int error; + + sap = kmalloc(addr_bufsize, GFP_KERNEL); + if (sap == NULL) + return -ENOMEM; + + error = -ENOENT; + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + char *hostname; + + if (buf->len <= 0 || buf->len > PAGE_SIZE) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL) + continue; + + salen = nfs_parse_server_name(buf->data, buf->len, + sap, addr_bufsize, server); + if (salen == 0) + continue; + rpc_set_port(sap, NFS_PORT); + + error = -ENOMEM; + hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); + if (hostname == NULL) + break; + + error = nfs4_update_server(server, hostname, sap, salen); + kfree(hostname); + if (error == 0) + break; + } + + kfree(sap); + return error; +} + +/** + * nfs4_replace_transport - set up transport to destination server + * + * @server: export being migrated + * @locations: fs_locations array + * + * Returns zero on success, or a negative errno value. + * + * The client tries all the entries in the "locations" array, in the + * order returned by the server, until one works or the end of the + * array is reached. + */ +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations) +{ + char *page = NULL, *page2 = NULL; + int loc, error; + + error = -ENOENT; + if (locations == NULL || locations->nlocations <= 0) + goto out; + + error = -ENOMEM; + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = + &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + error = nfs4_try_replacing_one_location(server, page, + page2, location); + if (error == 0) + break; + } + +out: + free_page((unsigned long)page); + free_page((unsigned long)page2); + return error; +} -- cgit v0.10.2 From ec011fe847347b40c60fdb5085f65227762e2e08 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:12:39 -0400 Subject: NFS: Introduce a vector of migration recovery ops The differences between minor version 0 and minor version 1 migration will be abstracted by the addition of a set of migration recovery ops. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index fcae728..4f48000 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -50,6 +50,7 @@ struct nfs4_minor_version_ops { const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; + const struct nfs4_mig_recovery_ops *mig_recovery_ops; }; #define NFS_SEQID_CONFIRMED 1 @@ -203,6 +204,9 @@ struct nfs4_state_maintenance_ops { int (*renew_lease)(struct nfs_client *, struct rpc_cred *); }; +struct nfs4_mig_recovery_ops { +}; + extern const struct dentry_operations nfs4_dentry_operations; /* dir.c */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 259f8be..2aacd13 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7881,6 +7881,14 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { }; #endif +static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { +}; +#endif /* CONFIG_NFS_V4_1 */ + static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS @@ -7896,6 +7904,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, + .mig_recovery_ops = &nfs40_mig_recovery_ops, }; #if defined(CONFIG_NFS_V4_1) @@ -7916,6 +7925,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, }; #endif -- cgit v0.10.2 From 9e6ee76dfb7cd747fac5679542a9b45930173eec Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:12:45 -0400 Subject: NFS: Export _nfs_display_fhandle() Allow code in nfsv4.ko to use _nfs_display_fhandle(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index eda8879..4bc7538 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1209,6 +1209,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) * not on the result */ return nfs_fhandle_hash(fh); } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); /* * _nfs_display_fhandle - display an NFS file handle on the console @@ -1253,6 +1254,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) } } } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index c6eee81..a2cba66 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -925,7 +925,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - dprintk("Mount FH: %d\n", mntfh->size); + nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); nfs4_session_set_rwsize(server); -- cgit v0.10.2 From b03d735b4ca2375d2251195cd848713bc55e7d79 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:12:50 -0400 Subject: NFS: Add method to retrieve fs_locations during migration recovery The nfs4_proc_fs_locations() function is invoked during referral processing to perform a GETATTR(fs_locations) on an object's parent directory in order to discover the target of the referral. It performs a LOOKUP in the compound, so the client needs to know the parent's file handle a priori. Unfortunately this function is not adequate for handling migration recovery. We need to probe fs_locations information on an FSID, but there's no parent directory available for many operations that can return NFS4ERR_MOVED. Another subtlety: recovering from NFS4ERR_LEASE_MOVED is a process of walking over a list of known FSIDs that reside on the server, and probing whether they have migrated. Once the server has detected that the client has probed all migrated file systems, it stops returning NFS4ERR_LEASE_MOVED. A minor version zero server needs to know what client ID is requesting fs_locations information so it can clear the flag that forces it to continue returning NFS4ERR_LEASE_MOVED. This flag is set per client ID and per FSID. However, the client ID is not an argument of either the PUTFH or GETATTR operations. Later minor versions have client ID information embedded in the compound's SEQUENCE operation. Therefore, by convention, minor version zero clients send a RENEW operation in the same compound as the GETATTR(fs_locations), since RENEW's one argument is a clientid4. This allows a minor version zero server to identify correctly the client that is probing for a migration. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4f48000..e59d3b4 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -205,6 +205,8 @@ struct nfs4_state_maintenance_ops { }; struct nfs4_mig_recovery_ops { + int (*get_locations)(struct inode *, struct nfs4_fs_locations *, + struct page *, struct rpc_cred *); }; extern const struct dentry_operations nfs4_dentry_operations; @@ -237,6 +239,8 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); +extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, + struct page *page, struct rpc_cred *); extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2aacd13..c71c16e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5984,6 +5984,157 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, return err; } +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop returning + * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is + * appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .clientid = server->nfs_client->cl_clientid, + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + .renew = 1, /* append RENEW */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status) + return status; + + renew_lease(server, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID + * performing this operation is identified in the SEQUENCE + * operation in this compound. + * + * When the client supports GETATTR(fs_locations_info), it can + * be plumbed in here. + */ +static int _nfs41_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_get_locations - discover locations for a migrated FSID + * @inode: inode on FSID that is migrating + * @locations: result of query + * @page: buffer + * @cred: credential to use for this operation + * + * Returns NFS4_OK on success, a negative NFS4ERR status code if the + * operation failed, or a negative errno if a local error occurred. + * + * On success, "locations" is filled in, but if the server has + * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not + * asserted. + * + * -NFS4ERR_LEASE_MOVED is returned if the server still has leases + * from this client that require migration recovery. + */ +int nfs4_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->get_locations(inode, locations, page, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + /** * If 'use_integrity' is true and the state managment nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient @@ -7882,10 +8033,12 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { #endif static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { + .get_locations = _nfs40_proc_get_locations, }; #if defined(CONFIG_NFS_V4_1) static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { + .get_locations = _nfs41_proc_get_locations, }; #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 79210d2..1854b04 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -736,13 +736,15 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_lookup_maxsz + \ - encode_fs_locations_maxsz) + encode_fs_locations_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_fs_locations_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_lookup_maxsz + \ - decode_fs_locations_maxsz) + decode_fs_locations_maxsz + \ + decode_renew_maxsz) #define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -2687,11 +2689,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->dir_fh, &hdr); - encode_lookup(xdr, args->name, &hdr); - replen = hdr.replen; /* get the attribute into args->page */ - encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->migration) { + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + } else { + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + } + /* Set up reply kvec to capture returned fs_locations array. */ xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 0, PAGE_SIZE); encode_nops(&hdr); @@ -6824,13 +6835,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, status = decode_putfh(xdr); if (status) goto out; - status = decode_lookup(xdr); - if (status) - goto out; - xdr_enter_page(xdr, PAGE_SIZE); - status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, + if (res->migration) { + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, NULL, res->fs_locations, NULL, res->fs_locations->server); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); + } else { + status = decode_lookup(xdr); + if (status) + goto out; + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, + NULL, res->fs_locations, + NULL, res->fs_locations->server); + } out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 49f52c8..405dfad 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1053,14 +1053,18 @@ struct nfs4_fs_locations { struct nfs4_fs_locations_arg { struct nfs4_sequence_args seq_args; const struct nfs_fh *dir_fh; + const struct nfs_fh *fh; const struct qstr *name; struct page *page; const u32 *bitmask; + clientid4 clientid; + unsigned char migration:1, renew:1; }; struct nfs4_fs_locations_res { struct nfs4_sequence_res seq_res; struct nfs4_fs_locations *fs_locations; + unsigned char migration:1, renew:1; }; struct nfs4_secinfo4 { -- cgit v0.10.2 From ce6cda1845cf2332d2c411a5c72cd166256b21da Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:12:56 -0400 Subject: NFS: Add a super_block backpointer to the nfs_server struct NFS_SB() returns the pointer to an nfs_server struct, given a pointer to a super_block. But we have no way to go back the other way. Add a super_block backpointer field so that, given an nfs_server struct, it is easy to get to the filesystem's root dentry. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3f5a7a8..e26647b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2534,6 +2534,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_bdi; } + server->super = s; } if (!s->s_root) { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f9c0a6c..46b0cb4 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -148,6 +148,7 @@ struct nfs_server { __u64 maxfilesize; /* maximum file size */ struct timespec time_delta; /* smallest time granularity */ unsigned long mount_time; /* when this fs was mounted */ + struct super_block *super; /* VFS super block */ dev_t s_dev; /* superblock dev numbers */ #ifdef CONFIG_NFS_FSCACHE -- cgit v0.10.2 From c9fdeb280b8cc511c6730db9ab3973331ea344e0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:02 -0400 Subject: NFS: Add basic migration support to state manager thread Migration recovery and state recovery must be serialized, so handle both in the state manager thread. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index e59d3b4..94e783f 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -29,6 +29,7 @@ enum nfs4_client_state { NFS4CLNT_SERVER_SCOPE_MISMATCH, NFS4CLNT_PURGE_STATE, NFS4CLNT_BIND_CONN_TO_SESSION, + NFS4CLNT_MOVED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -421,6 +422,7 @@ extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_migration_recovery(const struct nfs_server *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index a2cba66..0cde95e 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -197,6 +197,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + clp->cl_mig_gen = 1; return clp; error: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index fa2706a..ba0db18 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -239,8 +239,6 @@ static void nfs4_end_drain_session(struct nfs_client *clp) } } -#if defined(CONFIG_NFS_V4_1) - static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); @@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) return nfs4_drain_slot_tbl(&ses->fc_slot_table); } +#if defined(CONFIG_NFS_V4_1) + static int nfs41_setup_state_renewal(struct nfs_client *clp) { int status; @@ -1197,6 +1197,42 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); +/** + * nfs4_schedule_migration_recovery - trigger migration recovery + * + * @server: FSID that is migrating + * + * Returns zero if recovery has started, otherwise a negative NFS4ERR + * value is returned. + */ +int nfs4_schedule_migration_recovery(const struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + if (server->fh_expire_type != NFS4_FH_PERSISTENT) { + pr_err("NFS: volatile file handles not supported (server %s)\n", + clp->cl_hostname); + return -NFS4ERR_IO; + } + + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -NFS4ERR_IO; + + dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n", + __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + set_bit(NFS_MIG_IN_TRANSITION, + &((struct nfs_server *)server)->mig_status); + set_bit(NFS4CLNT_MOVED, &clp->cl_state); + + nfs4_schedule_state_manager(clp); + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); + int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -1826,6 +1862,119 @@ static int nfs4_purge_lease(struct nfs_client *clp) return 0; } +/* + * Try remote migration of one FSID from a source server to a + * destination server. The source server provides a list of + * potential destinations. + * + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_fs_locations *locations = NULL; + struct inode *inode; + struct page *page; + int status, result; + + dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + result = 0; + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } + + inode = server->super->s_root->d_inode; + result = nfs4_proc_get_locations(inode, locations, page, cred); + if (result) { + dprintk("<-- %s: failed to retrieve fs_locations: %d\n", + __func__, result); + goto out; + } + + result = -NFS4ERR_NXIO; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + dprintk("<-- %s: No fs_locations data, migration skipped\n", + __func__); + goto out; + } + + nfs4_begin_drain_session(clp); + + status = nfs4_replace_transport(server, locations); + if (status != 0) { + dprintk("<-- %s: failed to replace transport: %d\n", + __func__, status); + goto out; + } + + result = 0; + dprintk("<-- %s: migration succeeded\n", __func__); + +out: + if (page != NULL) + __free_page(page); + kfree(locations); + if (result) { + pr_err("NFS: migration recovery failed (server %s)\n", + clp->cl_hostname); + set_bit(NFS_MIG_FAILED, &server->mig_status); + } + return result; +} + +/* + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_handle_migration(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: migration reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION, + &server->mig_status)) + continue; + + rcu_read_unlock(); + status = nfs4_try_migration(server, cred); + if (status < 0) { + put_rpccred(cred); + return status; + } + goto restart; + } + rcu_read_unlock(); + put_rpccred(cred); + return 0; +} + /** * nfs4_discover_server_trunking - Detect server IP address trunking * @@ -2154,7 +2303,13 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_check_lease(clp); if (status < 0) goto out_error; - continue; + } + + if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { + section = "migration"; + status = nfs4_handle_migration(clp); + if (status < 0) + goto out_error; } /* First recover reboot state... */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 46b0cb4..186ec49 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -79,6 +79,7 @@ struct nfs_client { char cl_ipaddr[48]; u32 cl_cb_ident; /* v4.0 callback identifier */ const struct nfs4_minor_version_ops *cl_mvops; + unsigned long cl_mig_gen; /* NFSv4.0 transport blocking */ struct nfs4_slot_table *cl_slot_tbl; @@ -189,6 +190,12 @@ struct nfs_server { struct list_head state_owners_lru; struct list_head layouts; struct list_head delegations; + + unsigned long mig_gen; + unsigned long mig_status; +#define NFS_MIG_IN_TRANSITION (1) +#define NFS_MIG_FAILED (2) + void (*destroy)(struct nfs_server *); atomic_t active; /* Keep trace of any activity to this server */ -- cgit v0.10.2 From f1478c13c0a606b960becaa70e82065835b16fbf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:07 -0400 Subject: NFS: Re-use exit code in nfs4_async_handle_error() Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c71c16e..2614b46 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4750,19 +4750,15 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - task->tk_status = 0; - return -EAGAIN; + goto restart_call; #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); - task->tk_status = 0; - return -EAGAIN; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_OLD_STATEID: - task->tk_status = 0; - return -EAGAIN; + goto restart_call; } task->tk_status = nfs4_map_errors(task->tk_status); return 0; @@ -4773,6 +4769,7 @@ wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); +restart_call: task->tk_status = 0; return -EAGAIN; } -- cgit v0.10.2 From 9f51a78e3a637a5923c661d2abe958c63441696f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:13 -0400 Subject: NFS: Rename "stateid_invalid" label I'm going to use this exit label also for migration recovery failures. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2614b46..fa87f81 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4728,12 +4728,12 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (state == NULL) break; if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto stateid_invalid; + goto recovery_failed; goto wait_on_recovery; case -NFS4ERR_EXPIRED: if (state != NULL) { if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto stateid_invalid; + goto recovery_failed; } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: @@ -4762,7 +4762,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; -stateid_invalid: +recovery_failed: task->tk_status = -EIO; return 0; wait_on_recovery: -- cgit v0.10.2 From 519ae255d403219fd5375de13c95dea3ef1cfda0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:19 -0400 Subject: NFS: Add migration recovery callouts in nfs4proc.c When a server returns NFS4ERR_MOVED, trigger the new migration recovery logic in the state manager. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fa87f81..a1f6209 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -384,6 +384,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; + case -NFS4ERR_MOVED: + ret = nfs4_schedule_migration_recovery(server); + if (ret < 0) + break; + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc return nfs4_map_errors(ret); wait_on_recovery: ret = nfs4_wait_clnt_recover(clp); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -EIO; if (ret == 0) exception->retry = 1; return ret; @@ -2974,11 +2981,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; - /* Make sure server returned a different fsid for the referral */ + + /* + * If the fsid didn't change, this is a migration event, not a + * referral. Cause us to drop into the exception handler, which + * will kick off migration recovery. + */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); - status = -EIO; + status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ @@ -4739,6 +4751,10 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; + case -NFS4ERR_MOVED: + if (nfs4_schedule_migration_recovery(server) < 0) + goto recovery_failed; + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -4769,6 +4785,8 @@ wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + goto recovery_failed; restart_call: task->tk_status = 0; return -EAGAIN; -- cgit v0.10.2 From 352297b917d8a3e61a778ffb0f0195ce8550d7f5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:24 -0400 Subject: NFS: Handle NFS4ERR_MOVED during delegation recall When a server returns NFS4ERR_MOVED during a delegation recall, trigger the new migration recovery logic in the state manager. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a1f6209..552e4f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1575,6 +1575,9 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; + case -NFS4ERR_MOVED: + nfs4_schedule_migration_recovery(server); + return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: -- cgit v0.10.2 From 44c9993384e9311cd56acf6ead3baffab616ae50 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:30 -0400 Subject: NFS: Add method to detect whether an FSID is still on the server Introduce a mechanism for probing a server to determine if an FSID is present or absent. The on-the-wire compound is different between minor version 0 and 1. Minor version 0 appends a RENEW operation to identify which client ID is probing. Minor version 1 has a SEQUENCE operation in the compound which effectively carries the same information. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 94e783f..2f0f8c2 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -208,6 +208,7 @@ struct nfs4_state_maintenance_ops { struct nfs4_mig_recovery_ops { int (*get_locations)(struct inode *, struct nfs4_fs_locations *, struct page *, struct rpc_cred *); + int (*fsid_present)(struct inode *, struct rpc_cred *); }; extern const struct dentry_operations nfs4_dentry_operations; @@ -242,6 +243,7 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc struct nfs4_fs_locations *, struct page *); extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, struct page *page, struct rpc_cred *); +extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 552e4f7..01b90bd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6153,6 +6153,132 @@ int nfs4_proc_get_locations(struct inode *inode, return status; } +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop + * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation + * is appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + .clientid = clp->cl_clientid, + .renew = 1, /* append RENEW */ + }; + struct nfs4_fsid_present_res res = { + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status) + return status; + + do_renew_lease(clp, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing + * this operation is identified in the SEQUENCE operation in this + * compound. + */ +static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + }; + struct nfs4_fsid_present_res res = { + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_fsid_present - Is this FSID present or absent on server? + * @inode: inode on FSID to check + * @cred: credential to use for this operation + * + * Server indicates whether the FSID is present, moved, or not + * recognized. This operation is necessary to clear a LEASE_MOVED + * condition for this client ID. + * + * Returns NFS4_OK if the FSID is present on this server, + * -NFS4ERR_MOVED if the FSID is no longer present, a negative + * NFS4ERR code if some error occurred on the server, or a + * negative errno if a local failure occurred. + */ +int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->fsid_present(inode, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + /** * If 'use_integrity' is true and the state managment nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient @@ -8052,11 +8178,13 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { .get_locations = _nfs40_proc_get_locations, + .fsid_present = _nfs40_proc_fsid_present, }; #if defined(CONFIG_NFS_V4_1) static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { .get_locations = _nfs41_proc_get_locations, + .fsid_present = _nfs41_proc_fsid_present, }; #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1854b04..f903389 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -595,11 +595,13 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_renew_maxsz) #define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -753,6 +755,18 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_secinfo_maxsz) +#define NFS4_enc_fsid_present_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getfh_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_fsid_present_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getfh_maxsz + \ + decode_renew_maxsz) #if defined(CONFIG_NFS_V4_1) #define NFS4_enc_bind_conn_to_session_sz \ (compound_encode_hdr_maxsz + \ @@ -2726,6 +2740,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode FSID_PRESENT request + */ +static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fsid_present_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfh(xdr, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + encode_nops(&hdr); +} + #if defined(CONFIG_NFS_V4_1) /* * BIND_CONN_TO_SESSION request @@ -6883,6 +6917,34 @@ out: return status; } +/* + * Decode FSID_PRESENT response + */ +static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_fsid_present_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); +out: + return status; +} + #if defined(CONFIG_NFS_V4_1) /* * Decode BIND_CONN_TO_SESSION response @@ -7397,6 +7459,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), + PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e36dee5..c56fa8f 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -460,6 +460,7 @@ enum { NFSPROC4_CLNT_FS_LOCATIONS, NFSPROC4_CLNT_RELEASE_LOCKOWNER, NFSPROC4_CLNT_SECINFO, + NFSPROC4_CLNT_FSID_PRESENT, /* nfs41 */ NFSPROC4_CLNT_EXCHANGE_ID, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 405dfad..8fe5b94 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1088,6 +1088,19 @@ struct nfs4_secinfo_res { struct nfs4_secinfo_flavors *flavors; }; +struct nfs4_fsid_present_arg { + struct nfs4_sequence_args seq_args; + const struct nfs_fh *fh; + clientid4 clientid; + unsigned char renew:1; +}; + +struct nfs4_fsid_present_res { + struct nfs4_sequence_res seq_res; + struct nfs_fh *fh; + unsigned char renew:1; +}; + #endif /* CONFIG_NFS_V4 */ struct nfstime4 { -- cgit v0.10.2 From b7f7a66e420a9f3ec82a5124720013cee317e73a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:35 -0400 Subject: NFS: Support NFS4ERR_LEASE_MOVED recovery in state manager A migration on the FSID in play for the current NFS operation is reported via the error status code NFS4ERR_MOVED. "Lease moved" means that a migration has occurred on some other FSID than the one for the current operation. It's a signal that the client should take action immediately to handle a migration that it may not have noticed otherwise. This is so that the client's lease does not expire unnoticed on the destination server. In NFSv4.0, a moved lease is reported with the NFS4ERR_LEASE_MOVED error status code. To recover from NFS4ERR_LEASE_MOVED, check each FSID for that server to see if it is still present. Invoke nfs4_try_migration() if the FSID is no longer present on the server. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 2f0f8c2..210e44e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -30,6 +30,7 @@ enum nfs4_client_state { NFS4CLNT_PURGE_STATE, NFS4CLNT_BIND_CONN_TO_SESSION, NFS4CLNT_MOVED, + NFS4CLNT_LEASE_MOVED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -425,6 +426,7 @@ extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern int nfs4_schedule_migration_recovery(const struct nfs_server *); +extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ba0db18..552706d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1233,6 +1233,22 @@ int nfs4_schedule_migration_recovery(const struct nfs_server *server) } EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); +/** + * nfs4_schedule_lease_moved_recovery - start lease-moved recovery + * + * @clp: server to check for moved leases + * + */ +void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp) +{ + dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n", + __func__, clp->cl_clientid, clp->cl_hostname); + + set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery); + int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -1661,7 +1677,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) nfs4_state_end_reclaim_reboot(clp); break; case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); @@ -1975,6 +1990,55 @@ restart: return 0; } +/* + * Test each nfs_server on the clp's cl_superblocks list to see + * if it's moved to another server. Stop when the server no longer + * returns NFS4ERR_LEASE_MOVED. + */ +static int nfs4_handle_lease_moved(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: lease moved reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + struct inode *inode; + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + rcu_read_unlock(); + + inode = server->super->s_root->d_inode; + status = nfs4_proc_fsid_present(inode, cred); + if (status != -NFS4ERR_MOVED) + goto restart; /* wasn't this one */ + if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED) + goto restart; /* there are more */ + goto out; + } + rcu_read_unlock(); + +out: + put_rpccred(cred); + return 0; +} + /** * nfs4_discover_server_trunking - Detect server IP address trunking * @@ -2312,6 +2376,13 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } + if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) { + section = "lease moved"; + status = nfs4_handle_lease_moved(clp); + if (status < 0) + goto out_error; + } + /* First recover reboot state... */ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { section = "reclaim reboot"; -- cgit v0.10.2 From 8ef2f8d46aca7ff2e7e2b926bb563212aa63a4af Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:41 -0400 Subject: NFS: Implement support for NFS4ERR_LEASE_MOVED Trigger lease-moved recovery when a request returns NFS4ERR_LEASE_MOVED. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 01b90bd..2564e1c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -389,6 +389,9 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc if (ret < 0) break; goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -1578,6 +1581,9 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -NFS4ERR_MOVED: nfs4_schedule_migration_recovery(server); return -EAGAIN; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(server->nfs_client); + return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -4758,6 +4764,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (nfs4_schedule_migration_recovery(server) < 0) goto recovery_failed; goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: -- cgit v0.10.2 From 60ea68129942dc36cd1a3a9bdaec783369ee5a6d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:47 -0400 Subject: NFS: Migration support for RELEASE_LOCKOWNER Currently the Linux NFS client ignores the operation status code for the RELEASE_LOCKOWNER operation. Like NFSv3's UMNT operation, RELEASE_LOCKOWNER is a courtesy to help servers manage their resources, and the outcome is not consequential for the client. During a migration, a server may report NFS4ERR_LEASE_MOVED, in which case the client really should retry, since typically LEASE_MOVED has nothing to do with the current operation, but does prevent it from going forward. Also, it's important for a client to respond as soon as possible to a moved lease condition, since the client's lease could expire on the destination without further action by the client. NFS4ERR_DELAY is not included in the list of valid status codes for RELEASE_LOCKOWNER in RFC 3530bis. However, rfc3530-migration-update does permit migration-capable servers to return DELAY to clients, but only in the context of an ongoing migration. In this case the server has frozen lock state in preparation for migration, and a client retry would help the destination server purge unneeded state once migration recovery is complete. Interestly, NFS4ERR_MOVED is not valid for RELEASE_LOCKOWNER, even though lock owners can be migrated with Transparent State Migration. Note that RFC 3530bis section 9.5 includes RELEASE_LOCKOWNER in the list of operations that renew a client's lease on the server if they succeed. Now that our client pays attention to the operation's status code, we can note that renewal appropriately. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2564e1c..9f2ccf7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5800,6 +5800,7 @@ struct nfs_release_lockowner_data { struct nfs_release_lockowner_args args; struct nfs4_sequence_args seq_args; struct nfs4_sequence_res seq_res; + unsigned long timestamp; }; static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) @@ -5807,12 +5808,27 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata struct nfs_release_lockowner_data *data = calldata; nfs40_setup_sequence(data->server, &data->seq_args, &data->seq_res, task); + data->timestamp = jiffies; } static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) { struct nfs_release_lockowner_data *data = calldata; + struct nfs_server *server = data->server; + nfs40_sequence_done(task, &data->seq_res); + + switch (task->tk_status) { + case 0: + renew_lease(server, data->timestamp); + break; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_LEASE_MOVED: + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } } static void nfs4_release_lockowner_release(void *calldata) -- cgit v0.10.2 From f8aba1e8d509c0db7a82893e595a7743ce07ea83 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:53 -0400 Subject: NFS: Handle NFS4ERR_LEASE_MOVED during async RENEW With NFSv4 minor version 0, the asynchronous lease RENEW heartbeat can return NFS4ERR_LEASE_MOVED. Error recovery logic for async RENEW is a separate code path from the generic NFS proc paths, so it must be updated to handle NFS4ERR_LEASE_MOVED as well. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9f2ccf7..8aa8ff3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4235,7 +4235,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) unsigned long timestamp = data->timestamp; trace_nfs4_renew_async(clp, task->tk_status); - if (task->tk_status < 0) { + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + break; + default: /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) return; -- cgit v0.10.2 From d1c2331e75d7a5a22e2910a8bd042ea90b543db1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:13:58 -0400 Subject: NFS: Handle SEQ4_STATUS_LEASE_MOVED With the advent of NFSv4 sessions in NFSv4.1 and following, a "lease moved" condition is reported differently than it is in NFSv4.0. NFSv4 minor version 0 servers return an error status code, NFS4ERR_LEASE_MOVED, to signal that a lease has moved. This error causes the whole compound operation to fail. Normal compounds against this server continue to fail until the client performs migration recovery on the migrated share. Minor version 1 and later servers assert a bit flag in the reply to a compound's SEQUENCE operation to signal LEASE_MOVED. This is not a fatal condition: operations against this server continue normally. The server asserts this flag until the client performs migration recovery on the migrated share. Note that servers MUST NOT return NFS4ERR_LEASE_MOVED to NFSv4 clients not using NFSv4.0. After the server asserts any of the sr_status_flags in the SEQUENCE operation in a typical compound, our client initiates standard lease recovery. For NFSv4.1+, a stand-alone SEQUENCE operation is performed to discover what recovery is needed. If SEQ4_STATUS_LEASE_MOVED is asserted in this stand-alone SEQUENCE operation, our client attempts to discover which FSIDs have been migrated, and then performs migration recovery on each. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 552706d..62c08bf 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2227,9 +2227,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs41_handle_server_reboot(clp); if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | - SEQ4_STATUS_ADMIN_STATE_REVOKED | - SEQ4_STATUS_LEASE_MOVED)) + SEQ4_STATUS_ADMIN_STATE_REVOKED)) nfs41_handle_state_revoked(clp); + if (flags & SEQ4_STATUS_LEASE_MOVED) + nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) -- cgit v0.10.2 From cd3fadece2b97462583500371a3eb9b091c49603 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:14:04 -0400 Subject: NFS: Set EXCHGID4_FLAG_SUPP_MOVED_MIGR Broadly speaking, v4.1 migration is untested. There are no servers in the wild that support NFSv4.1 migration. However, as server implementations become available, we do want to enable testing by developers, while leaving it disabled for environments for which broken migration support would be an unpleasant surprise. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b5e80b0..38c1768 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN If the NFS client is unchanged from the upstream kernel, this option should be set to the default "kernel.org". +config NFS_V4_1_MIGRATION + bool "NFSv4.1 client support for migration" + depends on NFS_V4_1 + default n + help + This option makes the NFS client advertise to NFSv4.1 servers that + it can support NFSv4 migration. + + The NFSv4.1 pieces of the Linux NFSv4 migration implementation are + still experimental. If you are not an NFSv4 developer, say N here. + config NFS_V4_SECURITY_LABEL bool depends on NFS_V4_2 && SECURITY diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8aa8ff3..1463c71 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6596,8 +6596,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, struct nfs41_exchange_id_args args = { .verifier = &verifier, .client = clp, +#ifdef CONFIG_NFS_V4_1_MIGRATION .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif }; struct nfs41_exchange_id_res res = { 0 -- cgit v0.10.2 From 0625c2dd6ae32a221871ffc04bb752216b823001 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2013 14:14:10 -0400 Subject: NFS: Fix possible endless state recovery wait In nfs4_wait_clnt_recover(), hold a reference to the clp being waited on. The state manager can reduce clp->cl_count to 1, in which case the nfs_put_client() in nfs4_run_state_manager() can free *clp before wait_on_bit() returns and allows nfs4_wait_clnt_recover() to run again. The behavior at that point is non-deterministic. If the waited-on bit still happens to be zero, wait_on_bit() will wake the waiter as expected. If the bit is set again (say, if the memory was poisoned when freed) wait_on_bit() can leave the waiter asleep. This is a narrow fix which ensures the safety of accessing *clp in nfs4_wait_clnt_recover(), but does not address the continued use of a possibly freed *clp after nfs4_wait_clnt_recover() returns (see nfs_end_delegation_return(), for example). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 62c08bf..452f4c8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1255,14 +1255,16 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); + atomic_inc(&clp->cl_count); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); if (res) - return res; - + goto out; if (clp->cl_cons_state < 0) - return clp->cl_cons_state; - return 0; + res = clp->cl_cons_state; +out: + nfs_put_client(clp); + return res; } int nfs4_client_recover_expired_lease(struct nfs_client *clp) -- cgit v0.10.2 From 47fd88e6b79c55e6acccaf832078ed1a340672fa Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 18 Oct 2013 15:15:15 -0400 Subject: NFSv4: make nfs_find_best_sec static It's not used outside of nfs4namespace.c anymore. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 210e44e..3ce79b0 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -222,7 +222,6 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index ebd8b06..4ec0eea 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -145,7 +145,7 @@ static size_t nfs_parse_server_name(char *string, size_t len, * is searched in the order returned from the server, per RFC 3530 * recommendation. */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { rpc_authflavor_t pseudoflavor; struct nfs4_secinfo4 *secinfo; -- cgit v0.10.2 From a3f73c27afff9590a4432879b7145289cb89cf0a Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 18 Oct 2013 15:15:16 -0400 Subject: NFS: separate passed security flavs from selected When filling parsed_mount_data, store the parsed sec= mount option in the new struct nfs_auth_info and the chosen flavor in selected_flavor. This patch lays the groundwork for supporting multiple sec= options. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 692fd0e..f5a7f7f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -787,7 +787,8 @@ static int nfs_init_server(struct nfs_server *server, server->port = data->nfs_server.port; - error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); if (error < 0) goto error; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e5a6bd1..c8cd044 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -88,8 +88,8 @@ struct nfs_parsed_mount_data { unsigned int namlen; unsigned int options; unsigned int bsize; - unsigned int auth_flavor_len; - rpc_authflavor_t auth_flavors[1]; + struct nfs_auth_info auth_info; + rpc_authflavor_t selected_flavor; char *client_address; unsigned int version; unsigned int minorversion; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 0cde95e..d65090e 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -949,9 +949,8 @@ out: * Create a version 4 volume record */ static int nfs4_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data) + struct nfs_parsed_mount_data *data) { - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; struct rpc_timeout timeparms; int error; @@ -964,8 +963,10 @@ static int nfs4_init_server(struct nfs_server *server, server->flags = data->flags; server->options = data->options; - if (data->auth_flavor_len >= 1) - pseudoflavor = data->auth_flavors[0]; + if (data->auth_info.flavor_len >= 1) + data->selected_flavor = data->auth_info.flavors[0]; + else + data->selected_flavor = RPC_AUTH_UNIX; /* Get a client record */ error = nfs4_set_client(server, @@ -973,7 +974,7 @@ static int nfs4_init_server(struct nfs_server *server, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, data->client_address, - pseudoflavor, + data->selected_flavor, data->nfs_server.protocol, &timeparms, data->minorversion, @@ -993,7 +994,8 @@ static int nfs4_init_server(struct nfs_server *server, server->port = data->nfs_server.port; - error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor); + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); error: /* Done */ @@ -1020,7 +1022,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); - auth_probe = mount_info->parsed->auth_flavor_len < 1; + auth_probe = mount_info->parsed->auth_info.flavor_len < 1; /* set up the general RPC client */ error = nfs4_init_server(server, mount_info->parsed); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e26647b..b87744f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -923,8 +923,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; data->nfs_server.protocol = XPRT_TRANSPORT_TCP; - data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; - data->auth_flavor_len = 0; + data->selected_flavor = RPC_AUTH_MAXFLAVOR; data->minorversion = 0; data->need_mount = true; data->net = current->nsproxy->net_ns; @@ -1019,13 +1018,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) } } -static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data, - rpc_authflavor_t pseudoflavor) -{ - data->auth_flavors[0] = pseudoflavor; - data->auth_flavor_len = 1; -} - /* * Parse the value of the 'sec=' option. */ @@ -1076,7 +1068,8 @@ static int nfs_parse_security_flavors(char *value, } mnt->flags |= NFS_MOUNT_SECFLAVOUR; - nfs_set_auth_parsed_mount_data(mnt, pseudoflavor); + mnt->auth_info.flavors[0] = pseudoflavor; + mnt->auth_info.flavor_len = 1; return 1; } @@ -1623,7 +1616,7 @@ out_security_failure: } /* - * Ensure that the specified authtype in args->auth_flavors[0] is supported by + * Ensure that the specified authtype in args->auth_info is supported by * the server. Returns 0 if it's ok, and -EACCES if not. */ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, @@ -1640,17 +1633,18 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, * can be used. */ for (i = 0; i < count; i++) { - if (args->auth_flavors[0] == server_authlist[i] || + if (args->auth_info.flavors[0] == server_authlist[i] || server_authlist[i] == RPC_AUTH_NULL) goto out; } dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n", - args->auth_flavors[0]); + args->auth_info.flavors[0]); return -EACCES; out: - dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + args->selected_flavor = args->auth_info.flavors[0]; + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); return 0; } @@ -1738,9 +1732,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf * Was a sec= authflavor specified in the options? First, verify * whether the server supports it, and then just try to use it if so. */ - if (args->auth_flavor_len > 0) { + if (args->auth_info.flavor_len > 0) { status = nfs_verify_authflavor(args, authlist, authlist_len); - dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", + args->selected_flavor); if (status) return ERR_PTR(status); return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); @@ -1769,7 +1764,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Fallthrough */ } dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); - nfs_set_auth_parsed_mount_data(args, flavor); + args->selected_flavor = flavor; server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); if (!IS_ERR(server)) return server; @@ -1785,7 +1780,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Last chance! Try AUTH_UNIX */ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); } @@ -1972,9 +1967,9 @@ static int nfs23_validate_mount_data(void *options, args->bsize = data->bsize; if (data->flags & NFS_MOUNT_SECFLAVOUR) - nfs_set_auth_parsed_mount_data(args, data->pseudoflavor); + args->selected_flavor = data->pseudoflavor; else - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; if (!args->nfs_server.hostname) goto out_nomem; @@ -2108,7 +2103,7 @@ static int nfs_validate_text_mount_data(void *options, nfs_set_port(sap, &args->nfs_server.port, port); - if (args->auth_flavor_len > 1) + if (args->auth_info.flavor_len > 1) goto out_bad_auth; return nfs_parse_devname(dev_name, @@ -2146,7 +2141,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->version != nfss->nfs_client->rpc_ops->version || data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->selected_flavor != nfss->client->cl_auth->au_flavor || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2191,7 +2186,9 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->rsize = nfss->rsize; data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; - nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor); + data->selected_flavor = nfss->client->cl_auth->au_flavor; + data->auth_info.flavors[0] = nfss->client->cl_auth->au_flavor; + data->auth_info.flavor_len = 1; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; @@ -2718,9 +2715,9 @@ static int nfs4_validate_mount_data(void *options, data->auth_flavours, sizeof(pseudoflavor))) return -EFAULT; - nfs_set_auth_parsed_mount_data(args, pseudoflavor); + args->selected_flavor = pseudoflavor; } else - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8fe5b94..658104a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -591,6 +591,12 @@ struct nfs_renameres { struct nfs_fattr *new_fattr; }; +/* parsed sec= options */ +struct nfs_auth_info { + unsigned int flavor_len; + rpc_authflavor_t flavors[1]; +}; + /* * Argument struct for decode_entry function */ -- cgit v0.10.2 From 0f5f49b8b3593309fd3c3a2080a5fd465afdbe16 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 18 Oct 2013 15:15:17 -0400 Subject: NFS: cache parsed auth_info in nfs_server Cache the auth_info structure in nfs_server and pass these values to submounts. This lays the groundwork for supporting multiple sec= options. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f5a7f7f..1d09289 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -786,6 +786,7 @@ static int nfs_init_server(struct nfs_server *server, goto error; server->port = data->nfs_server.port; + server->auth_info = data->auth_info; error = nfs_init_server_rpcclient(server, &timeparms, data->selected_flavor); @@ -929,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour target->acdirmax = source->acdirmax; target->caps = source->caps; target->options = source->options; + target->auth_info = source->auth_info; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d65090e..04131c8 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -962,6 +962,7 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->options = data->options; + server->auth_info = data->auth_info; if (data->auth_info.flavor_len >= 1) data->selected_flavor = data->auth_info.flavors[0]; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b87744f..de1e5e8 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2187,8 +2187,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; data->selected_flavor = nfss->client->cl_auth->au_flavor; - data->auth_info.flavors[0] = nfss->client->cl_auth->au_flavor; - data->auth_info.flavor_len = 1; + data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 186ec49..1150ea4 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -151,6 +151,7 @@ struct nfs_server { unsigned long mount_time; /* when this fs was mounted */ struct super_block *super; /* VFS super block */ dev_t s_dev; /* superblock dev numbers */ + struct nfs_auth_info auth_info; /* parsed auth flavors */ #ifdef CONFIG_NFS_FSCACHE struct nfs_fscache_key *fscache_key; /* unique key for superblock */ -- cgit v0.10.2 From 5837f6dfcb00f764976ddc178933e612702cbf54 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 18 Oct 2013 15:15:18 -0400 Subject: NFS: stop using NFS_MOUNT_SECFLAVOUR server flag Since the parsed sec= flavor is now stored in nfs_server->auth_info, we no longer need an nfs_server flag to determine if a sec= option was used. This flag has not been completely removed because it is still needed for the (old but still supported) non-text parsed mount options ABI compatability. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 04131c8..f6cc77c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1051,6 +1051,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, { struct nfs_client *parent_client; struct nfs_server *server, *parent_server; + bool auth_probe; int error; dprintk("--> nfs4_create_referral_server()\n"); @@ -1083,8 +1084,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - error = nfs4_server_common_setup(server, mntfh, - !(parent_server->flags & NFS_MOUNT_SECFLAVOUR)); + auth_probe = parent_server->auth_info.flavor_len < 1; + + error = nfs4_server_common_setup(server, mntfh, auth_probe); if (error < 0) goto error; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 4ec0eea..b947054 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -390,7 +390,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, if (client->cl_auth->au_flavor != flavor) flavor = client->cl_auth->au_flavor; - else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) { + else if (server->auth_info.flavor_len == 0) { rpc_authflavor_t new = nfs4_negotiate_security(dir, name); if ((int)new >= 0) flavor = new; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1463c71..0eb8dc5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2920,7 +2920,7 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, if (status != -NFS4ERR_WRONGSEC) break; /* Did user force a 'sec=' mount option? */ - if (server->flags & NFS_MOUNT_SECFLAVOUR) + if (server->auth_info.flavor_len > 0) break; default: status = nfs4_do_find_root_sec(server, fhandle, info); @@ -3180,7 +3180,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, if (client != *clnt) goto out; /* No security negotiation if the user specified 'sec=' */ - if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR) + if (NFS_SERVER(dir)->auth_info.flavor_len > 0) goto out; client = nfs4_create_sec_client(client, dir, name); if (IS_ERR(client)) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index de1e5e8..3a4f8bf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1067,7 +1067,6 @@ static int nfs_parse_security_flavors(char *value, return 0; } - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_info.flavors[0] = pseudoflavor; mnt->auth_info.flavor_len = 1; return 1; @@ -2332,7 +2331,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (b->flags & NFS_MOUNT_SECFLAVOUR && + if (b->auth_info.flavor_len > 0 && clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h index 576bddd..64b0f22 100644 --- a/include/uapi/linux/nfs_mount.h +++ b/include/uapi/linux/nfs_mount.h @@ -60,7 +60,7 @@ struct nfs_mount_data { #define NFS_MOUNT_BROKEN_SUID 0x0400 /* 4 */ #define NFS_MOUNT_NOACL 0x0800 /* 4 */ #define NFS_MOUNT_STRICTLOCK 0x1000 /* reserved for NFSv4 */ -#define NFS_MOUNT_SECFLAVOUR 0x2000 /* 5 */ +#define NFS_MOUNT_SECFLAVOUR 0x2000 /* 5 non-text parsed mount data only */ #define NFS_MOUNT_NORDIRPLUS 0x4000 /* 5 */ #define NFS_MOUNT_UNSHARED 0x8000 /* 5 */ #define NFS_MOUNT_FLAGMASK 0xFFFF -- cgit v0.10.2 From 4d4b69dd847a098cdca341c45326f6c6f61b8691 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 18 Oct 2013 15:15:19 -0400 Subject: NFS: add support for multiple sec= mount options This patch adds support for multiple security options which can be specified using a colon-delimited list of security flavors (the same syntax as nfsd's exports file). This is useful, for instance, when NFSv4.x mounts cross SECINFO boundaries. With this patch a user can use "sec=krb5i,krb5p" to mount a remote filesystem using krb5i, but can still cross into krb5p-only exports. New mounts will try all security options before failing. NFSv4.x SECINFO results will be compared against the sec= flavors to find the first flavor in both lists or if no match is found will return -EPERM. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c8cd044..bca6a3e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -326,6 +326,7 @@ extern struct file_system_type nfs_xdev_fs_type; extern struct file_system_type nfs4_xdev_fs_type; extern struct file_system_type nfs4_referral_fs_type; #endif +bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); void nfs_initialise_sb(struct super_block *); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index f6cc77c..b4a160a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -964,6 +964,9 @@ static int nfs4_init_server(struct nfs_server *server, server->options = data->options; server->auth_info = data->auth_info; + /* Use the first specified auth flavor. If this flavor isn't + * allowed by the server, use the SECINFO path to try the + * other specified flavors */ if (data->auth_info.flavor_len >= 1) data->selected_flavor = data->auth_info.flavors[0]; else diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index b947054..c08cbf4 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len, /** * nfs_find_best_sec - Find a security mechanism supported locally + * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * * Return the pseudoflavor of the first security mechanism in @@ -145,7 +146,8 @@ static size_t nfs_parse_server_name(char *string, size_t len, * is searched in the order returned from the server, per RFC 3530 * recommendation. */ -static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, + struct nfs4_secinfo_flavors *flavors) { rpc_authflavor_t pseudoflavor; struct nfs4_secinfo4 *secinfo; @@ -160,12 +162,19 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) case RPC_AUTH_GSS: pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, &secinfo->flavor_info); - if (pseudoflavor != RPC_AUTH_MAXFLAVOR) + /* make sure pseudoflavor matches sec= mount opt */ + if (pseudoflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, + pseudoflavor)) return pseudoflavor; break; } } + /* if there were any sec= options then nothing matched */ + if (server->auth_info.flavor_len > 0) + return -EPERM; + return RPC_AUTH_UNIX; } @@ -187,7 +196,7 @@ static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr goto out; } - flavor = nfs_find_best_sec(flavors); + flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors); out: put_page(page); @@ -390,7 +399,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, if (client->cl_auth->au_flavor != flavor) flavor = client->cl_auth->au_flavor; - else if (server->auth_info.flavor_len == 0) { + else { rpc_authflavor_t new = nfs4_negotiate_security(dir, name); if ((int)new >= 0) flavor = new; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0eb8dc5..b02c4cc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2873,11 +2873,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, int status = -EPERM; size_t i; - for (i = 0; i < ARRAY_SIZE(flav_array); i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); - if (status == -NFS4ERR_WRONGSEC || status == -EACCES) - continue; - break; + if (server->auth_info.flavor_len > 0) { + /* try each flavor specified by user */ + for (i = 0; i < server->auth_info.flavor_len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + server->auth_info.flavors[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + } else { + /* no flavors specified by user, try default list */ + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + flav_array[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } } /* @@ -2919,9 +2932,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, status = nfs4_lookup_root(server, fhandle, info); if (status != -NFS4ERR_WRONGSEC) break; - /* Did user force a 'sec=' mount option? */ - if (server->auth_info.flavor_len > 0) - break; default: status = nfs4_do_find_root_sec(server, fhandle, info); } @@ -3179,9 +3189,6 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, err = -EPERM; if (client != *clnt) goto out; - /* No security negotiation if the user specified 'sec=' */ - if (NFS_SERVER(dir)->auth_info.flavor_len > 0) - goto out; client = nfs4_create_sec_client(client, dir, name); if (IS_ERR(client)) return PTR_ERR(client); @@ -7942,6 +7949,9 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, break; } + if (!nfs_auth_info_match(&server->auth_info, flavor)) + flavor = RPC_AUTH_MAXFLAVOR; + if (flavor != RPC_AUTH_MAXFLAVOR) { err = nfs4_lookup_root_sec(server, fhandle, info, flavor); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3a4f8bf..317d6fc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) static const struct { rpc_authflavor_t flavour; const char *str; - } sec_flavours[] = { + } sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = { + /* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */ { RPC_AUTH_NULL, "null" }, { RPC_AUTH_UNIX, "sys" }, { RPC_AUTH_GSS_KRB5, "krb5" }, @@ -1019,6 +1020,52 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) } /* + * Add 'flavor' to 'auth_info' if not already present. + * Returns true if 'flavor' ends up in the list, false otherwise + */ +static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, + rpc_authflavor_t flavor) +{ + unsigned int i; + unsigned int max_flavor_len = (sizeof(auth_info->flavors) / + sizeof(auth_info->flavors[0])); + + /* make sure this flavor isn't already in the list */ + for (i = 0; i < auth_info->flavor_len; i++) { + if (flavor == auth_info->flavors[i]) + return true; + } + + if (auth_info->flavor_len + 1 >= max_flavor_len) { + dfprintk(MOUNT, "NFS: too many sec= flavors\n"); + return false; + } + + auth_info->flavors[auth_info->flavor_len++] = flavor; + return true; +} + +/* + * Return true if 'match' is in auth_info or auth_info is empty. + * Return false otherwise. + */ +bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, + rpc_authflavor_t match) +{ + int i; + + if (!auth_info->flavor_len) + return true; + + for (i = 0; i < auth_info->flavor_len; i++) { + if (auth_info->flavors[i] == match) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(nfs_auth_info_match); + +/* * Parse the value of the 'sec=' option. */ static int nfs_parse_security_flavors(char *value, @@ -1026,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value, { substring_t args[MAX_OPT_ARGS]; rpc_authflavor_t pseudoflavor; + char *p; dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); - switch (match_token(value, nfs_secflavor_tokens, args)) { - case Opt_sec_none: - pseudoflavor = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - pseudoflavor = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - pseudoflavor = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - pseudoflavor = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - pseudoflavor = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - pseudoflavor = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - pseudoflavor = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - pseudoflavor = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - pseudoflavor = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - pseudoflavor = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - pseudoflavor = RPC_AUTH_GSS_SPKMP; - break; - default: - return 0; + while ((p = strsep(&value, ":")) != NULL) { + switch (match_token(p, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + pseudoflavor = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + pseudoflavor = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + pseudoflavor = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + pseudoflavor = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + pseudoflavor = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + pseudoflavor = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + pseudoflavor = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + pseudoflavor = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + pseudoflavor = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + pseudoflavor = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + pseudoflavor = RPC_AUTH_GSS_SPKMP; + break; + default: + dfprintk(MOUNT, + "NFS: sec= option '%s' not recognized\n", p); + return 0; + } + + if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) + return 0; } - mnt->auth_info.flavors[0] = pseudoflavor; - mnt->auth_info.flavor_len = 1; return 1; } @@ -1615,12 +1668,14 @@ out_security_failure: } /* - * Ensure that the specified authtype in args->auth_info is supported by - * the server. Returns 0 if it's ok, and -EACCES if not. + * Ensure that a specified authtype in args->auth_info is supported by + * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * -EACCES if not. */ -static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, +static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, rpc_authflavor_t *server_authlist, unsigned int count) { + rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; unsigned int i; /* @@ -1632,17 +1687,19 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, * can be used. */ for (i = 0; i < count; i++) { - if (args->auth_info.flavors[0] == server_authlist[i] || - server_authlist[i] == RPC_AUTH_NULL) + flavor = server_authlist[i]; + + if (nfs_auth_info_match(&args->auth_info, flavor) || + flavor == RPC_AUTH_NULL) goto out; } - dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n", - args->auth_info.flavors[0]); + dfprintk(MOUNT, + "NFS: specified auth flavors not supported by server\n"); return -EACCES; out: - args->selected_flavor = args->auth_info.flavors[0]; + args->selected_flavor = flavor; dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); return 0; } @@ -1732,7 +1789,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf * whether the server supports it, and then just try to use it if so. */ if (args->auth_info.flavor_len > 0) { - status = nfs_verify_authflavor(args, authlist, authlist_len); + status = nfs_verify_authflavors(args, authlist, authlist_len); dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); if (status) @@ -2102,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options, nfs_set_port(sap, &args->nfs_server.port, port); - if (args->auth_info.flavor_len > 1) - goto out_bad_auth; - return nfs_parse_devname(dev_name, &args->nfs_server.hostname, max_namelen, @@ -2124,10 +2178,6 @@ out_invalid_transport_udp: out_no_address: dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); return -EINVAL; - -out_bad_auth: - dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n"); - return -EINVAL; } static int diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 658104a..3ccfcec 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -592,9 +592,10 @@ struct nfs_renameres { }; /* parsed sec= options */ +#define NFS_AUTH_INFO_MAX_FLAVORS 12 /* see fs/nfs/super.c */ struct nfs_auth_info { unsigned int flavor_len; - rpc_authflavor_t flavors[1]; + rpc_authflavor_t flavors[NFS_AUTH_INFO_MAX_FLAVORS]; }; /* -- cgit v0.10.2 From 34751b9d04a221da2a74b27ba439f01c0ae30069 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Oct 2013 16:42:44 -0400 Subject: SUNRPC: Add correct rcu_dereference annotation in rpc_clnt_set_transport rpc_clnt_set_transport should use rcu_derefence_protected(), as it is only safe to be called with the rpc_clnt::cl_lock held. Cc: Chuck Lever Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f167d9c..759b78b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -272,7 +272,8 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, struct rpc_xprt *old; spin_lock(&clnt->cl_lock); - old = clnt->cl_xprt; + old = rcu_dereference_protected(clnt->cl_xprt, + lockdep_is_held(&clnt->cl_lock)); if (!xprt_bound(xprt)) clnt->cl_autobind = 1; -- cgit v0.10.2 From e3bfab18483a26649ff9cb605aa1410386b1e498 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 2 Oct 2013 09:48:15 -0400 Subject: sunrpc: comment typo fix Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9928ba1..9deed17 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2872,8 +2872,8 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) if (args->bc_xprt->xpt_bc_xprt) { /* * This server connection already has a backchannel - * export; we can't create a new one, as we wouldn't be - * able to match replies based on xid any more. So, + * transport; we can't create a new one, as we wouldn't + * be able to match replies based on xid any more. So, * reuse the already-existing one: */ return args->bc_xprt->xpt_bc_xprt; -- cgit v0.10.2 From 4f5829d72636513e23a5f76355536776bf390a8a Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Fri, 11 Oct 2013 17:15:54 -0300 Subject: nfs: Remove useless 'error' assignment the 'error' variable was been assigned twice in vain. Signed-off-by: Geyslan G. Bem Signed-off-by: Trond Myklebust diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index bb939ed..0c29b1b 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -493,7 +493,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) unsigned long long fileid; struct dentry *sdentry; struct rpc_task *task; - int error = -EIO; + int error = -EBUSY; dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -503,7 +503,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) /* * We don't allow a dentry to be silly-renamed twice. */ - error = -EBUSY; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; -- cgit v0.10.2 From 54bcfa6682cfc7ee19dee0e94587f8f55a440df6 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Mon, 14 Oct 2013 17:24:15 -0300 Subject: nfs: Use PTR_ERR_OR_ZERO in 'nfs41_callback_up' function Use 'PTR_ERR_OR_ZERO()' rather than 'IS_ERR(...) ? PTR_ERR(...) : 0'. Signed-off-by: Geyslan G. Bem Signed-off-by: Trond Myklebust diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 67cd732..073b4cf 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv) svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; } - dprintk("--> %s return %ld\n", __func__, - IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); + dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; } -- cgit v0.10.2 From 6706246b22358b70b49e35a988a9bbfa830619ee Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Mon, 14 Oct 2013 17:24:16 -0300 Subject: nfs: Use PTR_ERR_OR_ZERO in 'nfs/nfs4super.c' Use 'PTR_ERR_OR_ZERO()' rather than 'IS_ERR(...) ? PTR_ERR(...) : 0'. Signed-off-by: Geyslan G. Bem Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index e26acdd..65ab0a0 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -261,9 +261,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name, res = nfs_follow_remote_path(root_mnt, export_path); - dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); return res; } @@ -319,9 +319,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, data->mnt_path = export_path; res = nfs_follow_remote_path(root_mnt, export_path); - dprintk("<-- nfs4_referral_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); + dprintk("<-- nfs4_referral_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); return res; } -- cgit v0.10.2 From 8313164c36473193c8034de643dc32f35a22bf59 Mon Sep 17 00:00:00 2001 From: wangweidong Date: Tue, 15 Oct 2013 11:44:30 +0800 Subject: SUNRPC: remove an unnecessary if statement If req allocated failed just goto out_free, no need to check the 'i < num_prealloc'. There is just code simplification, no functional changes. Signed-off-by: Wang Weidong Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 49535505..04199bc 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1104,11 +1104,9 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, for (i = 0; i < num_prealloc; i++) { req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); if (!req) - break; + goto out_free; list_add(&req->rq_list, &xprt->free); } - if (i < num_prealloc) - goto out_free; if (max_alloc > num_prealloc) xprt->max_reqs = max_alloc; else -- cgit v0.10.2 From 5fccc5b52ee07d07a74ce53c6f174bff81e26a16 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Oct 2013 18:41:44 -0400 Subject: SUNRPC: gss_alloc_msg - choose _either_ a v0 message or a v1 message Add the missing 'break' to ensure that we don't corrupt a legacy 'v0' type message by appending the 'v1'. Cc: Bruce Fields Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 0846566..cc24323 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -482,6 +482,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, switch (vers) { case 0: gss_encode_v0_msg(gss_msg); + break; default: gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); }; -- cgit v0.10.2 From 9d3a2260f0f4bd6379b0a0f131c743fff25b0029 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Oct 2013 18:18:00 -0400 Subject: SUNRPC: Fix buffer overflow checking in gss_encode_v0_msg/gss_encode_v1_msg In gss_encode_v1_msg, it is pointless to BUG() after the overflow has happened. Replace the existing sprintf()-based code with scnprintf(), and warn if an overflow is ever triggered. In gss_encode_v0_msg, replace the runtime BUG_ON() with an appropriate compile-time BUILD_BUG_ON. Reported-by: Bruce Fields Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index cc24323..97912b4 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -420,41 +420,53 @@ static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) memcpy(gss_msg->databuf, &uid, sizeof(uid)); gss_msg->msg.data = gss_msg->databuf; gss_msg->msg.len = sizeof(uid); - BUG_ON(sizeof(uid) > UPCALL_BUF_LEN); + + BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf)); } -static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, +static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, const char *target_name) { struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; - int len = 0; - - gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ", - mech->gm_name, - from_kuid(&init_user_ns, gss_msg->uid)); - p += gss_msg->msg.len; + size_t buflen = sizeof(gss_msg->databuf); + int len; + + len = scnprintf(p, buflen, "mech=%s uid=%d ", mech->gm_name, + from_kuid(&init_user_ns, gss_msg->uid)); + buflen -= len; + p += len; + gss_msg->msg.len = len; if (target_name) { - len = sprintf(p, "target=%s ", target_name); + len = scnprintf(p, buflen, "target=%s ", target_name); + buflen -= len; p += len; gss_msg->msg.len += len; } if (service_name != NULL) { - len = sprintf(p, "service=%s ", service_name); + len = scnprintf(p, buflen, "service=%s ", service_name); + buflen -= len; p += len; gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { - len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes); + len = scnprintf(p, buflen, "enctypes=%s ", + mech->gm_upcall_enctypes); + buflen -= len; p += len; gss_msg->msg.len += len; } - len = sprintf(p, "\n"); + len = scnprintf(p, buflen, "\n"); + if (len == 0) + goto out_overflow; gss_msg->msg.len += len; gss_msg->msg.data = gss_msg->databuf; - BUG_ON(gss_msg->msg.len > UPCALL_BUF_LEN); + return 0; +out_overflow: + WARN_ON_ONCE(1); + return -ENOMEM; } static struct gss_upcall_msg * @@ -463,15 +475,15 @@ gss_alloc_msg(struct gss_auth *gss_auth, { struct gss_upcall_msg *gss_msg; int vers; + int err = -ENOMEM; gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS); if (gss_msg == NULL) - return ERR_PTR(-ENOMEM); + goto err; vers = get_pipe_version(gss_auth->net); - if (vers < 0) { - kfree(gss_msg); - return ERR_PTR(vers); - } + err = vers; + if (err < 0) + goto err_free_msg; gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe; INIT_LIST_HEAD(&gss_msg->list); rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); @@ -484,9 +496,15 @@ gss_alloc_msg(struct gss_auth *gss_auth, gss_encode_v0_msg(gss_msg); break; default: - gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); + err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); + if (err) + goto err_free_msg; }; return gss_msg; +err_free_msg: + kfree(gss_msg); +err: + return ERR_PTR(err); } static struct gss_upcall_msg * -- cgit v0.10.2 From a3f432bfd06a4ec3b812e32d3266e0d1ad75d008 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 15 Oct 2013 17:03:16 -0400 Subject: nfs: use IS_ROOT not DCACHE_DISCONNECTED This check was added by Al Viro with d9e80b7de91db05c1c4d2e5ebbfd70b3b3ba0e0f "nfs d_revalidate() is too trigger-happy with d_drop()", with the explanation that we don't want to remove the root of a disconnected tree, which will still be included on the s_anon list. But DCACHE_DISCONNECTED does *not* actually identify dentries that are disconnected from the dentry tree or hashed on s_anon. IS_ROOT() is the way to do that. Also add a comment from Al's commit to remind us why this check is there. Signed-off-by: J. Bruce Fields Reviewed-by: Christoph Hellwig Signed-off-by: Trond Myklebust diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 02b0df7..6cc51ae 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1139,7 +1139,13 @@ out_zap_parent: if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ nfs_zap_caches(inode); - if (dentry->d_flags & DCACHE_DISCONNECTED) + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) goto out_valid; } /* If we have submounts, don't unhash ! */ -- cgit v0.10.2 From 09c3e54635c85b3da44d3bc156619c1f1af3bb43 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 30 Oct 2013 13:32:15 +0800 Subject: SUNRPC: remove duplicated include from clnt.c Remove duplicated include. Signed-off-by: Wei Yongjun Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 759b78b..dab09da 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include -- cgit v0.10.2 From 93dc41bdc5c853916610576c6b48a1704959c70d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 Oct 2013 16:14:36 +1100 Subject: SUNRPC: close a rare race in xs_tcp_setup_socket. We have one report of a crash in xs_tcp_setup_socket. The call path to the crash is: xs_tcp_setup_socket -> inet_stream_connect -> lock_sock_nested. The 'sock' passed to that last function is NULL. The only way I can see this happening is a concurrent call to xs_close: xs_close -> xs_reset_transport -> sock_release -> inet_release inet_release sets: sock->sk = NULL; inet_stream_connect calls lock_sock(sock->sk); which gets NULL. All calls to xs_close are protected by XPRT_LOCKED as are most activations of the workqueue which runs xs_tcp_setup_socket. The exception is xs_tcp_schedule_linger_timeout. So presumably the timeout queued by the later fires exactly when some other code runs xs_close(). To protect against this we can move the cancel_delayed_work_sync() call from xs_destory() to xs_close(). As xs_close is never called from the worker scheduled on ->connect_worker, this can never deadlock. Signed-off-by: NeilBrown [Trond: Make it safe to call cancel_delayed_work_sync() on AF_LOCAL sockets] Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9deed17..a4709bb 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -835,6 +835,8 @@ static void xs_close(struct rpc_xprt *xprt) dprintk("RPC: xs_close xprt %p\n", xprt); + cancel_delayed_work_sync(&transport->connect_worker); + xs_reset_transport(transport); xprt->reestablish_timeout = 0; @@ -869,12 +871,8 @@ static void xs_local_destroy(struct rpc_xprt *xprt) */ static void xs_destroy(struct rpc_xprt *xprt) { - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - dprintk("RPC: xs_destroy xprt %p\n", xprt); - cancel_delayed_work_sync(&transport->connect_worker); - xs_local_destroy(xprt); } @@ -1817,6 +1815,10 @@ static inline void xs_reclassify_socket(int family, struct socket *sock) } #endif +static void xs_dummy_setup_socket(struct work_struct *work) +{ +} + static struct socket *xs_create_sock(struct rpc_xprt *xprt, struct sock_xprt *transport, int family, int type, int protocol) { @@ -2668,6 +2670,9 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; + INIT_DELAYED_WORK(&transport->connect_worker, + xs_dummy_setup_socket); + switch (sun->sun_family) { case AF_LOCAL: if (sun->sun_path[0] != '/') { -- cgit v0.10.2 From a1311d87fa034e0de580e3a65f2d5c2e7a1f55f3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 31 Oct 2013 09:18:49 -0400 Subject: SUNRPC: Cleanup xs_destroy() There is no longer any need for a separate xs_local_destroy() helper. Signed-off-by: Trond Myklebust diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a4709bb..17c8892 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -856,14 +856,6 @@ static void xs_tcp_close(struct rpc_xprt *xprt) xs_tcp_shutdown(xprt); } -static void xs_local_destroy(struct rpc_xprt *xprt) -{ - xs_close(xprt); - xs_free_peer_addresses(xprt); - xprt_free(xprt); - module_put(THIS_MODULE); -} - /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport @@ -873,7 +865,10 @@ static void xs_destroy(struct rpc_xprt *xprt) { dprintk("RPC: xs_destroy xprt %p\n", xprt); - xs_local_destroy(xprt); + xs_close(xprt); + xs_free_peer_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); } static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) @@ -2513,7 +2508,7 @@ static struct rpc_xprt_ops xs_local_ops = { .send_request = xs_local_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, - .destroy = xs_local_destroy, + .destroy = xs_destroy, .print_stats = xs_local_print_stats, }; -- cgit v0.10.2 From 1acd1c301f4faae80f4d2c7bbd9a4553b131c0e3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 31 Oct 2013 13:03:04 -0400 Subject: nfs: fix inverted test for delegation in nfs4_reclaim_open_state commit 6686390bab6a0e0 (NFS: remove incorrect "Lock reclaim failed!" warning.) added a test for a delegation before checking to see if any reclaimed locks failed. The test however is backward and is only doing that check when a delegation is held instead of when one isn't. Cc: NeilBrown Signed-off-by: Jeff Layton Fixes: 6686390bab6a: NFS: remove incorrect "Lock reclaim failed!" warning. Cc: stable@vger.kernel.org # 3.12 Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 452f4c8..c8e729d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1476,7 +1476,7 @@ restart: if (status >= 0) { status = nfs4_reclaim_locks(state, ops); if (status >= 0) { - if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) { + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) -- cgit v0.10.2 From 12207f69b3ef4d6eea6a5568281e49f671977ab1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 1 Nov 2013 10:49:32 -0400 Subject: nfs: fix oops when trying to set SELinux label Chao reported the following oops when testing labeled NFS: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4] PGD 277bbd067 PUD 2777ea067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache sg coretemp kvm_intel kvm crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul iTCO_wdt glue_helper ablk_helper cryptd iTCO_vendor_support bnx2 pcspkr serio_raw i7core_edac cdc_ether microcode usbnet edac_core mii lpc_ich i2c_i801 mfd_core shpchp ioatdma dca acpi_cpufreq mperf nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sr_mod sd_mod cdrom crc_t10dif mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ata_generic ttm pata_acpi drm ata_piix libata megaraid_sas i2c_core dm_mirror dm_region_hash dm_log dm_mod CPU: 4 PID: 25657 Comm: chcon Not tainted 3.10.0-33.el7.x86_64 #1 Hardware name: IBM System x3550 M3 -[7944OEJ]-/90Y4784 , BIOS -[D6E150CUS-1.11]- 02/08/2011 task: ffff880178397220 ti: ffff8801595d2000 task.ti: ffff8801595d2000 RIP: 0010:[] [] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4] RSP: 0018:ffff8801595d3888 EFLAGS: 00010296 RAX: 0000000000000000 RBX: ffff8801595d3b30 RCX: 0000000000000b4c RDX: ffff8801595d3b30 RSI: ffff8801595d38e0 RDI: ffff880278b6ec00 RBP: ffff8801595d38c8 R08: ffff8801595d3b30 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801595d38e0 R13: ffff880277a4a780 R14: ffffffffa05686c0 R15: ffff8802765f206c FS: 00007f2c68486800(0000) GS:ffff88027fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000027651a000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 ffff880277865800 ffff880278b6ec00 ffff880277a4a780 ffff8801595d3948 ffffffffa02ad926 ffff8801595d3b30 ffff8802765f206c Call Trace: [] rpcauth_wrap_req+0x86/0xd0 [sunrpc] [] ? call_connect+0xb0/0xb0 [sunrpc] [] ? call_connect+0xb0/0xb0 [sunrpc] [] call_transmit+0x18b/0x290 [sunrpc] [] ? call_connect+0xb0/0xb0 [sunrpc] [] __rpc_execute+0x84/0x400 [sunrpc] [] rpc_execute+0x5e/0xa0 [sunrpc] [] rpc_run_task+0x70/0x90 [sunrpc] [] rpc_call_sync+0x43/0xa0 [sunrpc] [] _nfs4_do_set_security_label+0x11d/0x170 [nfsv4] [] nfs4_set_security_label.isra.69+0xf1/0x1d0 [nfsv4] [] ? avc_alloc_node+0x24/0x125 [] ? avc_compute_av+0x1a3/0x1b5 [] nfs4_xattr_set_nfs4_label+0x3b/0x50 [nfsv4] [] generic_setxattr+0x62/0x80 [] __vfs_setxattr_noperm+0x63/0x1b0 [] vfs_setxattr+0xb5/0xc0 [] setxattr+0x12e/0x1c0 [] ? final_putname+0x22/0x50 [] ? putname+0x2b/0x40 [] ? user_path_at_empty+0x5f/0x90 [] ? __sb_start_write+0x49/0x100 [] SyS_lsetxattr+0x8f/0xd0 [] system_call_fastpath+0x16/0x1b Code: 48 8b 02 48 c7 45 c0 00 00 00 00 48 c7 45 c8 00 00 00 00 48 c7 45 d0 00 00 00 00 48 c7 45 d8 00 00 00 00 48 c7 45 e0 00 00 00 00 <48> 8b 00 48 8b 00 48 85 c0 0f 84 ae 00 00 00 48 8b 80 b8 03 00 RIP [] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4] RSP CR2: 0000000000000000 The problem is that _nfs4_do_set_security_label calls rpc_call_sync() directly which fails to do any setup of the SEQUENCE call. Have it use nfs4_call_sync() instead which does the right thing. While we're at it change the name of "args" to "arg" to better match the pattern in _nfs4_do_setattr. Reported-by: Chao Ye Cc: David Quigley Signed-off-by: Jeff Layton Cc: stable@vger.kernel.org # 3.11+ Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b02c4cc..d436c57 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4657,7 +4657,7 @@ static int _nfs4_do_set_security_label(struct inode *inode, struct iattr sattr = {0}; struct nfs_server *server = NFS_SERVER(inode); const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; - struct nfs_setattrargs args = { + struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = &sattr, .server = server, @@ -4671,14 +4671,14 @@ static int _nfs4_do_set_security_label(struct inode *inode, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &args, + .rpc_argp = &arg, .rpc_resp = &res, }; int status; - nfs4_stateid_copy(&args.stateid, &zero_stateid); + nfs4_stateid_copy(&arg.stateid, &zero_stateid); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status) dprintk("%s failed: %d\n", __func__, status); -- cgit v0.10.2 From fcb63a9bd8427fc584229048ea14f1453dc9a2e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 1 Nov 2013 12:42:25 -0400 Subject: NFS: Fix a missing initialisation when reading the SELinux label Ensure that _nfs4_do_get_security_label() also initialises the SEQUENCE call correctly, by having it call into nfs4_call_sync(). Reported-by: Jeff Layton Cc: stable@vger.kernel.org # 3.11+ Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d436c57..7e28b5c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4602,7 +4602,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, struct nfs4_label label = {0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; - struct nfs4_getattr_arg args = { + struct nfs4_getattr_arg arg = { .fh = NFS_FH(inode), .bitmask = bitmask, }; @@ -4613,14 +4613,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], - .rpc_argp = &args, + .rpc_argp = &arg, .rpc_resp = &res, }; int ret; nfs_fattr_init(&fattr); - ret = rpc_call_sync(server->client, &msg, 0); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); if (ret) return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) -- cgit v0.10.2 From f3f5a0f8cc40b942f4c0ae117df82eeb65f07d4d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2013 14:38:05 -0500 Subject: NFSv4.2: Fix a mismatch between Linux labeled NFS and the NFSv4.2 spec In the spec, the security label attribute id is '80', which means that it should be bit number 80-64 == 16 in the 3rd word of the bitmap. Fixes: 4488cc96c581: NFS: Add NFSv4.2 protocol constants Cc: J. Bruce Fields Cc: Steve Dickson Cc: stable@vger.kernel.org # 3.11+ Signed-off-by: Trond Myklebust diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index c56fa8f..bfe6c37 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -395,7 +395,7 @@ enum lock_type4 { #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) -#define FATTR4_WORD2_SECURITY_LABEL (1UL << 17) +#define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) -- cgit v0.10.2 From 3da580aab9482f5bf88ea79a4bf0a511eea6fa44 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 2 Nov 2013 06:57:18 -0400 Subject: nfs: set security label when revalidating inode Currently, we fetch the security label when revalidating an inode's attributes, but don't apply it. This is in contrast to the readdir() codepath where we do apply label changes. Cc: Dave Quigley Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 471ba59..26e77d8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -923,6 +923,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); + nfs_setsecurity(inode, fattr, label); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); -- cgit v0.10.2 From d204c5d2b8f7614350cd4609f4d4cdcf25494d74 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2013 13:23:59 -0500 Subject: NFSv4.2: encode_readdir - only ask for labels when doing readdirplus Currently, if the server is doing NFSv4.2 and supports labeled NFS, then our on-the-wire READDIR request ends up asking for the label information, which is then ignored unless we're doing readdirplus. This patch ensures that READDIR doesn't ask the server for label information at all unless the readdir->bitmask contains the FATTR4_WORD2_SECURITY_LABEL attribute, and the readdir->plus flag is set. While we're at it, optimise away the 3rd bitmap field if it is zero. Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f903389..5be2868 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int); #ifdef CONFIG_NFS_V4_SECURITY_LABEL /* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ #define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) -#define encode_readdir_space 24 -#define encode_readdir_bitmask_sz 3 #else #define nfs4_label_maxsz 0 -#define encode_readdir_space 20 -#define encode_readdir_bitmask_sz 2 #endif /* We support only one layout type per file system */ #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) @@ -1581,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; uint32_t dircount = readdir->count >> 1; __be32 *p, verf[2]; + uint32_t attrlen = 0; + unsigned int i; if (readdir->plus) { attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| @@ -1589,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; dircount >>= 1; } /* Use mounted_on_fileid only if the server supports it */ if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) attrs[0] |= FATTR4_WORD0_FILEID; + for (i = 0; i < ARRAY_SIZE(attrs); i++) { + attrs[i] &= readdir->bitmask[i]; + if (attrs[i] != 0) + attrlen = i+1; + } encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); encode_uint64(xdr, readdir->cookie); encode_nfs4_verifier(xdr, &readdir->verifier); - p = reserve_space(xdr, encode_readdir_space); + p = reserve_space(xdr, 12 + (attrlen << 2)); *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); - *p++ = cpu_to_be32(encode_readdir_bitmask_sz); - *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); - *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); - if (encode_readdir_bitmask_sz > 2) { - if (hdr->minorversion > 1) - attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; - p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]); - } + *p++ = cpu_to_be32(attrlen); + for (i = 0; i < attrlen; i++) + *p++ = cpu_to_be32(attrs[i]); memcpy(verf, readdir->verifier.data, sizeof(verf)); dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", -- cgit v0.10.2 From b944dba31d2c23f1cf5d69a66cc3449e0ba78503 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2013 15:20:20 -0500 Subject: NFSv4: Sanity check the server reply in _nfs4_server_capabilities We don't want to be setting capabilities and/or requesting attributes that are not appropriate for the NFSv4 minor version. - Ensure that we clear the NFS_CAP_SECURITY_LABEL capability when appropriate - Ensure that we limit the attribute bitmasks to the mounted_on_fileid attribute and less for NFSv4.0 - Ensure that we limit the attribute bitmasks to suppattr_exclcreat and less for NFSv4.1 - Ensure that we limit it to change_sec_label or less for NFSv4.2 Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7e28b5c..ed2a0e6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2706,6 +2706,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) nfs4_close_state(ctx->state, ctx->mode); } +#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) +#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL) + static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { struct nfs4_server_caps_arg args = { @@ -2721,12 +2725,25 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { + /* Sanity check the server answers */ + switch (server->nfs_client->cl_minorversion) { + case 0: + res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; + res.attr_bitmask[2] = 0; + break; + case 1: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; + break; + case 2: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; + } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| - NFS_CAP_CTIME|NFS_CAP_MTIME); + NFS_CAP_CTIME|NFS_CAP_MTIME| + NFS_CAP_SECURITY_LABEL); if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) @@ -2755,14 +2772,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f #endif memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - if (server->caps & NFS_CAP_SECURITY_LABEL) { - server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - } memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + server->cache_consistency_bitmask[2] = 0; server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index bfe6c37..c6f41b6 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -396,6 +396,8 @@ enum lock_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) +#define FATTR4_WORD2_CHANGE_SECURITY_LABEL \ + (1UL << 17) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) -- cgit v0.10.2 From fab99ebe39fe7d11fbd9b5fb84f07432af9ba36f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2013 13:46:40 -0500 Subject: NFSv4.2: Remove redundant checks in nfs_setsecurity+nfs4_label_init_security We already check for nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) in nfs4_label_alloc() We check the minor version in _nfs4_server_capabilities before setting NFS_CAP_SECURITY_LABEL. Signed-off-by: Trond Myklebust diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 26e77d8..18ab2da 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -274,12 +274,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, if (label == NULL) return; - if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0) - return; - - if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2) - return; - if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { error = security_inode_notifysecctx(inode, label->label, label->len); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed2a0e6..5ab33c0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) return NULL; - if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2) - return NULL; - err = security_dentry_init_security(dentry, sattr->ia_mode, &dentry->d_name, (void **)&label->label, &label->len); if (err == 0) -- cgit v0.10.2