From 8fb883f3e30065529e4f35d4b4f355193dcdb7a2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 21 Sep 2013 00:09:31 +0100
Subject: FS-Cache: Add use/unuse/wake cookie wrappers

Add wrapper functions for dealing with cookie->n_active:

 (*) __fscache_use_cookie() to increment it.

 (*) __fscache_unuse_cookie() to decrement and test against zero.

 (*) __fscache_wake_unused_cookie() to wake up anyone waiting for it to reach
     zero.

The second and third are split so that the third can be done after cookie->lock
has been released in case the waiter wakes up whilst we're still holding it and
tries to get it.

We will need to wake-on-zero once the cookie disablement patch is applied
because it will then be possible to see n_active become zero without the cookie
being relinquished.

Also move the cookie usement out of fscache_attr_changed_op() and into
fscache_attr_changed() and the operation struct so that cookie disablement
will be able to track it.

Whilst we're at it, only increment n_active if we're about to do
fscache_submit_op() so that we don't have to deal with undoing it if anything
earlier fails.  Possibly this should be moved into fscache_submit_op() which
could look at FSCACHE_OP_UNUSE_COOKIE.

Signed-off-by: David Howells <dhowells@redhat.com>

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index b2a86e3..d851aa5 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -568,6 +568,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,", cookie);
@@ -600,7 +601,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
 
-	atomic_inc(&cookie->n_active);
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, op) < 0)
 		goto submit_failed;
 
@@ -622,9 +623,11 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	return ret;
 
 submit_failed:
-	atomic_dec(&cookie->n_active);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 inconsistent:
 	spin_unlock(&cookie->lock);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	kfree(op);
 	_leave(" = -ESTALE");
 	return -ESTALE;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 73899c1..0fe42a6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -163,12 +163,10 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_attr_changed_calls);
 
-	if (fscache_object_is_active(object) &&
-	    fscache_use_cookie(object)) {
+	if (fscache_object_is_active(object)) {
 		fscache_stat(&fscache_n_cop_attr_changed);
 		ret = object->cache->ops->attr_changed(object);
 		fscache_stat_d(&fscache_n_cop_attr_changed);
-		fscache_unuse_cookie(object);
 		if (ret < 0)
 			fscache_abort_object(object);
 	}
@@ -184,6 +182,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
+	bool wake_cookie;
 
 	_enter("%p", cookie);
 
@@ -199,7 +198,9 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	}
 
 	fscache_operation_init(op, fscache_attr_changed_op, NULL);
-	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+	op->flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_EXCLUSIVE) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 
@@ -208,6 +209,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_exclusive_op(object, op) < 0)
 		goto nobufs;
 	spin_unlock(&cookie->lock);
@@ -217,8 +219,11 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	return 0;
 
 nobufs:
+	wake_cookie = __fscache_unuse_cookie(cookie);
 	spin_unlock(&cookie->lock);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_attr_changed_nobufs);
 	_leave(" = %d", -ENOBUFS);
 	return -ENOBUFS;
@@ -263,7 +268,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	}
 
 	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
-	atomic_inc(&cookie->n_active);
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
@@ -384,6 +388,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%p,,,", cookie, page);
@@ -421,6 +426,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
 
+	__fscache_use_cookie(cookie);
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
 
@@ -475,9 +481,11 @@ error:
 
 nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
@@ -514,6 +522,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,,%d,,,", cookie, *nr_pages);
@@ -547,6 +556,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
 
@@ -601,10 +611,12 @@ error:
 
 nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
 	_leave(" = -ENOBUFS");
@@ -626,6 +638,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%p,,,", cookie, page);
@@ -658,8 +671,9 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, &op->op) < 0)
-		goto nobufs_unlock;
+		goto nobufs_unlock_dec;
 	spin_unlock(&cookie->lock);
 
 	fscache_stat(&fscache_n_alloc_ops);
@@ -689,10 +703,13 @@ error:
 	_leave(" = %d", ret);
 	return ret;
 
+nobufs_unlock_dec:
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 nobufs:
 	fscache_stat(&fscache_n_allocs_nobufs);
 	_leave(" = -ENOBUFS");
@@ -889,6 +906,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 {
 	struct fscache_storage *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%x,", cookie, (u32) page->flags);
@@ -957,7 +975,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
 	op->store_limit = object->store_limit;
 
-	atomic_inc(&cookie->n_active);
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto submit_failed;
 
@@ -984,10 +1002,10 @@ already_pending:
 	return 0;
 
 submit_failed:
-	atomic_dec(&cookie->n_active);
 	spin_lock(&cookie->stores_lock);
 	radix_tree_delete(&cookie->stores, page->index);
 	spin_unlock(&cookie->stores_lock);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 	page_cache_release(page);
 	ret = -ENOBUFS;
 	goto nobufs;
@@ -999,6 +1017,8 @@ nobufs:
 	spin_unlock(&cookie->lock);
 	radix_tree_preload_end();
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_stores_nobufs);
 	_leave(" = -ENOBUFS");
 	return -ENOBUFS;
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 7823e9e..96a2b66 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -511,6 +511,11 @@ static inline void fscache_end_io(struct fscache_retrieval *op,
 	op->end_io_func(page, op->context, error);
 }
 
+static inline void __fscache_use_cookie(struct fscache_cookie *cookie)
+{
+	atomic_inc(&cookie->n_active);
+}
+
 /**
  * fscache_use_cookie - Request usage of cookie attached to an object
  * @object: Object description
@@ -524,6 +529,16 @@ static inline bool fscache_use_cookie(struct fscache_object *object)
 	return atomic_inc_not_zero(&cookie->n_active) != 0;
 }
 
+static inline bool __fscache_unuse_cookie(struct fscache_cookie *cookie)
+{
+	return atomic_dec_and_test(&cookie->n_active);
+}
+
+static inline void __fscache_wake_unused_cookie(struct fscache_cookie *cookie)
+{
+	wake_up_atomic_t(&cookie->n_active);
+}
+
 /**
  * fscache_unuse_cookie - Cease usage of cookie attached to an object
  * @object: Object description
@@ -534,8 +549,8 @@ static inline bool fscache_use_cookie(struct fscache_object *object)
 static inline void fscache_unuse_cookie(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
-	if (atomic_dec_and_test(&cookie->n_active))
-		wake_up_atomic_t(&cookie->n_active);
+	if (__fscache_unuse_cookie(cookie))
+		__fscache_wake_unused_cookie(cookie);
 }
 
 /*
-- 
cgit v0.10.2


From 94d30ae90a00cafe686c1057be57f4885f963abf Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 21 Sep 2013 00:09:31 +0100
Subject: FS-Cache: Provide the ability to enable/disable cookies

Provide the ability to enable and disable fscache cookies.  A disabled cookie
will reject or ignore further requests to:

	Acquire a child cookie
	Invalidate and update backing objects
	Check the consistency of a backing object
	Allocate storage for backing page
	Read backing pages
	Write to backing pages

but still allows:

	Checks/waits on the completion of already in-progress objects
	Uncaching of pages
	Relinquishment of cookies

Two new operations are provided:

 (1) Disable a cookie:

	void fscache_disable_cookie(struct fscache_cookie *cookie,
				    bool invalidate);

     If the cookie is not already disabled, this locks the cookie against other
     dis/enablement ops, marks the cookie as being disabled, discards or
     invalidates any backing objects and waits for cessation of activity on any
     associated object.

     This is a wrapper around a chunk split out of fscache_relinquish_cookie(),
     but it reinitialises the cookie such that it can be reenabled.

     All possible failures are handled internally.  The caller should consider
     calling fscache_uncache_all_inode_pages() afterwards to make sure all page
     markings are cleared up.

 (2) Enable a cookie:

	void fscache_enable_cookie(struct fscache_cookie *cookie,
				   bool (*can_enable)(void *data),
				   void *data)

     If the cookie is not already enabled, this locks the cookie against other
     dis/enablement ops, invokes can_enable() and, if the cookie is not an
     index cookie, will begin the procedure of acquiring backing objects.

     The optional can_enable() function is passed the data argument and returns
     a ruling as to whether or not enablement should actually be permitted to
     begin.

     All possible failures are handled internally.  The cookie will only be
     marked as enabled if provisional backing objects are allocated.

A later patch will introduce these to NFS.  Cookie enablement during nfs_open()
is then contingent on i_writecount <= 0.  can_enable() checks for a race
between open(O_RDONLY) and open(O_WRONLY/O_RDWR).  This simplifies NFS's cookie
handling and allows us to get rid of open(O_RDONLY) accidentally introducing
caching to an inode that's open for writing already.

One operation has its API modified:

 (3) Acquire a cookie.

	struct fscache_cookie *fscache_acquire_cookie(
		struct fscache_cookie *parent,
		const struct fscache_cookie_def *def,
		void *netfs_data,
		bool enable);

     This now has an additional argument that indicates whether the requested
     cookie should be enabled by default.  It doesn't need the can_enable()
     function because the caller must prevent multiple calls for the same netfs
     object and it doesn't need to take the enablement lock because no one else
     can get at the cookie before this returns.

Signed-off-by: David Howells <dhowells@redhat.com

diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 11a0a40..aed6b94 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -29,15 +29,16 @@ This document contains the following sections:
 	 (6) Index registration
 	 (7) Data file registration
 	 (8) Miscellaneous object registration
-	 (9) Setting the data file size
+ 	 (9) Setting the data file size
 	(10) Page alloc/read/write
 	(11) Page uncaching
 	(12) Index and data file consistency
-	(13) Miscellaneous cookie operations
-	(14) Cookie unregistration
-	(15) Index invalidation
-	(16) Data file invalidation
-	(17) FS-Cache specific page flags.
+	(13) Cookie enablement
+	(14) Miscellaneous cookie operations
+	(15) Cookie unregistration
+	(16) Index invalidation
+	(17) Data file invalidation
+	(18) FS-Cache specific page flags.
 
 
 =============================
@@ -334,7 +335,8 @@ the path to the file:
 	struct fscache_cookie *
 	fscache_acquire_cookie(struct fscache_cookie *parent,
 			       const struct fscache_object_def *def,
-			       void *netfs_data);
+			       void *netfs_data,
+			       bool enable);
 
 This function creates an index entry in the index represented by parent,
 filling in the index entry by calling the operations pointed to by def.
@@ -350,6 +352,10 @@ object needs to be created somewhere down the hierarchy.  Furthermore, an index
 may be created in several different caches independently at different times.
 This is all handled transparently, and the netfs doesn't see any of it.
 
+A cookie will be created in the disabled state if enabled is false.  A cookie
+must be enabled to do anything with it.  A disabled cookie can be enabled by
+calling fscache_enable_cookie() (see below).
+
 For example, with AFS, a cell would be added to the primary index.  This index
 entry would have a dependent inode containing a volume location index for the
 volume mappings within this cell:
@@ -357,7 +363,7 @@ volume mappings within this cell:
 	cell->cache =
 		fscache_acquire_cookie(afs_cache_netfs.primary_index,
 				       &afs_cell_cache_index_def,
-				       cell);
+				       cell, true);
 
 Then when a volume location was accessed, it would be entered into the cell's
 index and an inode would be allocated that acts as a volume type and hash chain
@@ -366,7 +372,7 @@ combination:
 	vlocation->cache =
 		fscache_acquire_cookie(cell->cache,
 				       &afs_vlocation_cache_index_def,
-				       vlocation);
+				       vlocation, true);
 
 And then a particular flavour of volume (R/O for example) could be added to
 that index, creating another index for vnodes (AFS inode equivalents):
@@ -374,7 +380,7 @@ that index, creating another index for vnodes (AFS inode equivalents):
 	volume->cache =
 		fscache_acquire_cookie(vlocation->cache,
 				       &afs_volume_cache_index_def,
-				       volume);
+				       volume, true);
 
 
 ======================
@@ -388,7 +394,7 @@ the object definition should be something other than index type.
 	vnode->cache =
 		fscache_acquire_cookie(volume->cache,
 				       &afs_vnode_cache_object_def,
-				       vnode);
+				       vnode, true);
 
 
 =================================
@@ -404,7 +410,7 @@ it would be some other type of object such as a data file.
 	xattr->cache =
 		fscache_acquire_cookie(vnode->cache,
 				       &afs_xattr_cache_object_def,
-				       xattr);
+				       xattr, true);
 
 Miscellaneous objects might be used to store extended attributes or directory
 entries for example.
@@ -733,6 +739,47 @@ Note that partial updates may happen automatically at other times, such as when
 data blocks are added to a data file object.
 
 
+=================
+COOKIE ENABLEMENT
+=================
+
+Cookies exist in one of two states: enabled and disabled.  If a cookie is
+disabled, it ignores all attempts to acquire child cookies; check, update or
+invalidate its state; allocate, read or write backing pages - though it is
+still possible to uncache pages and relinquish the cookie.
+
+The initial enablement state is set by fscache_acquire_cookie(), but the cookie
+can be enabled or disabled later.  To disable a cookie, call:
+    
+	void fscache_disable_cookie(struct fscache_cookie *cookie,
+    				    bool invalidate);
+    
+If the cookie is not already disabled, this locks the cookie against other
+enable and disable ops, marks the cookie as being disabled, discards or
+invalidates any backing objects and waits for cessation of activity on any
+associated object before unlocking the cookie.
+
+All possible failures are handled internally.  The caller should consider
+calling fscache_uncache_all_inode_pages() afterwards to make sure all page
+markings are cleared up.
+    
+Cookies can be enabled or reenabled with:
+    
+    	void fscache_enable_cookie(struct fscache_cookie *cookie,
+    				   bool (*can_enable)(void *data),
+    				   void *data)
+    
+If the cookie is not already enabled, this locks the cookie against other
+enable and disable ops, invokes can_enable() and, if the cookie is not an index
+cookie, will begin the procedure of acquiring backing objects.
+
+The optional can_enable() function is passed the data argument and returns a
+ruling as to whether or not enablement should actually be permitted to begin.
+
+All possible failures are handled internally.  The cookie will only be marked
+as enabled if provisional backing objects are allocated.
+
+
 ===============================
 MISCELLANEOUS COOKIE OPERATIONS
 ===============================
@@ -778,7 +825,7 @@ COOKIE UNREGISTRATION
 To get rid of a cookie, this function should be called.
 
 	void fscache_relinquish_cookie(struct fscache_cookie *cookie,
-				       int retire);
+				       bool retire);
 
 If retire is non-zero, then the object will be marked for recycling, and all
 copies of it will be removed from all active caches in which it is present.
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a9ea73d..2b7a032 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -90,7 +90,7 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
 
 	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
 						&v9fs_cache_session_index_def,
-						v9ses);
+						v9ses, true);
 	p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
 		 v9ses, v9ses->fscache);
 }
@@ -204,7 +204,7 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
 	v9ses = v9fs_inode2v9ses(inode);
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  v9inode);
+						  v9inode, true);
 
 	p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
 		 inode, v9inode->fscache);
@@ -271,7 +271,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
 	v9ses = v9fs_inode2v9ses(inode);
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  v9inode);
+						  v9inode, true);
 	p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
 		 inode, old, v9inode->fscache);
 
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 3c090b7..ca0a3cf 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
 	/* put it up for caching (this never returns an error) */
 	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
 					     &afs_cell_cache_index_def,
-					     cell);
+					     cell, true);
 #endif
 
 	/* add to the cell lists */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 789bc25..ce25d75 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -259,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 #ifdef CONFIG_AFS_FSCACHE
 	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
 					      &afs_vnode_cache_index_def,
-					      vnode);
+					      vnode, true);
 #endif
 
 	ret = afs_inode_map_status(vnode, key);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 57bcb15..b6df2e8 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
 #ifdef CONFIG_AFS_FSCACHE
 	vl->cache = fscache_acquire_cookie(vl->cell->cache,
-					   &afs_vlocation_cache_index_def, vl);
+					   &afs_vlocation_cache_index_def, vl,
+					   true);
 #endif
 
 	if (vl->valid) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 401eeb2..2b60725 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 #ifdef CONFIG_AFS_FSCACHE
 	volume->cache = fscache_acquire_cookie(vlocation->cache,
 					       &afs_volume_cache_index_def,
-					       volume);
+					       volume, true);
 #endif
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 43eb559..00baf14 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -270,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 #endif
 
 	/* delete retired objects */
-	if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
+	if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
 	    _object != cache->cache.fsdef
 	    ) {
 		_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e..7db2e6c 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
 {
 	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
 					      &ceph_fscache_fsid_object_def,
-					      fsc);
+					      fsc, true);
 
 	if (fsc->fscache == NULL) {
 		pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
@@ -204,7 +204,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
 
 	ci->fscache = fscache_acquire_cookie(fsc->fscache,
 					     &ceph_fscache_inode_object_def,
-					     ci);
+					     ci, true);
 done:
 	mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 2f4bc5a..fe2492d 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -27,7 +27,7 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
 {
 	server->fscache =
 		fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
-				&cifs_fscache_server_index_def, server);
+				&cifs_fscache_server_index_def, server, true);
 	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
 		 __func__, server, server->fscache);
 }
@@ -46,7 +46,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
 
 	tcon->fscache =
 		fscache_acquire_cookie(server->fscache,
-				&cifs_fscache_super_index_def, tcon);
+				&cifs_fscache_super_index_def, tcon, true);
 	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
 		 __func__, server->fscache, tcon->fscache);
 }
@@ -69,7 +69,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
 		cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
-				&cifs_fscache_inode_object_def, cifsi);
+				&cifs_fscache_inode_object_def, cifsi, true);
 		cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
 			 __func__, tcon->fscache, cifsi->fscache);
 	}
@@ -119,7 +119,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 		cifsi->fscache = fscache_acquire_cookie(
 					cifs_sb_master_tcon(cifs_sb)->fscache,
 					&cifs_fscache_inode_object_def,
-					cifsi);
+					cifsi, true);
 		cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
 			 __func__, cifsi->fscache, old);
 	}
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d851aa5..29d7feb 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -58,15 +58,16 @@ void fscache_cookie_init_once(void *_cookie)
 struct fscache_cookie *__fscache_acquire_cookie(
 	struct fscache_cookie *parent,
 	const struct fscache_cookie_def *def,
-	void *netfs_data)
+	void *netfs_data,
+	bool enable)
 {
 	struct fscache_cookie *cookie;
 
 	BUG_ON(!def);
 
-	_enter("{%s},{%s},%p",
+	_enter("{%s},{%s},%p,%u",
 	       parent ? (char *) parent->def->name : "<no-parent>",
-	       def->name, netfs_data);
+	       def->name, netfs_data, enable);
 
 	fscache_stat(&fscache_n_acquires);
 
@@ -106,7 +107,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
 	cookie->def		= def;
 	cookie->parent		= parent;
 	cookie->netfs_data	= netfs_data;
-	cookie->flags		= 0;
+	cookie->flags		= (1 << FSCACHE_COOKIE_NO_DATA_YET);
 
 	/* radix tree insertion won't use the preallocation pool unless it's
 	 * told it may not wait */
@@ -124,16 +125,22 @@ struct fscache_cookie *__fscache_acquire_cookie(
 		break;
 	}
 
-	/* if the object is an index then we need do nothing more here - we
-	 * create indices on disk when we need them as an index may exist in
-	 * multiple caches */
-	if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
-		if (fscache_acquire_non_index_cookie(cookie) < 0) {
-			atomic_dec(&parent->n_children);
-			__fscache_cookie_put(cookie);
-			fscache_stat(&fscache_n_acquires_nobufs);
-			_leave(" = NULL");
-			return NULL;
+	if (enable) {
+		/* if the object is an index then we need do nothing more here
+		 * - we create indices on disk when we need them as an index
+		 * may exist in multiple caches */
+		if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+			if (fscache_acquire_non_index_cookie(cookie) == 0) {
+				set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+			} else {
+				atomic_dec(&parent->n_children);
+				__fscache_cookie_put(cookie);
+				fscache_stat(&fscache_n_acquires_nobufs);
+				_leave(" = NULL");
+				return NULL;
+			}
+		} else {
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
 		}
 	}
 
@@ -144,6 +151,39 @@ struct fscache_cookie *__fscache_acquire_cookie(
 EXPORT_SYMBOL(__fscache_acquire_cookie);
 
 /*
+ * Enable a cookie to permit it to accept new operations.
+ */
+void __fscache_enable_cookie(struct fscache_cookie *cookie,
+			     bool (*can_enable)(void *data),
+			     void *data)
+{
+	_enter("%p", cookie);
+
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+
+	if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock;
+
+	if (can_enable && !can_enable(data)) {
+		/* The netfs decided it didn't want to enable after all */
+	} else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		/* Wait for outstanding disablement to complete */
+		__fscache_wait_on_invalidate(cookie);
+
+		if (fscache_acquire_non_index_cookie(cookie) == 0)
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	} else {
+		set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	}
+
+out_unlock:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+}
+EXPORT_SYMBOL(__fscache_enable_cookie);
+
+/*
  * acquire a non-index cookie
  * - this must make sure the index chain is instantiated and instantiate the
  *   object representation too
@@ -157,7 +197,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	_enter("");
 
-	cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 	/* now we need to see whether the backing objects for this cookie yet
 	 * exist, if not there'll be nothing to search */
@@ -180,9 +220,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	_debug("cache %s", cache->tag->name);
 
-	cookie->flags =
-		(1 << FSCACHE_COOKIE_LOOKING_UP) |
-		(1 << FSCACHE_COOKIE_NO_DATA_YET);
+	set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 
 	/* ask the cache to allocate objects for this cookie and its parent
 	 * chain */
@@ -398,7 +436,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
 	if (!hlist_empty(&cookie->backing_objects)) {
 		spin_lock(&cookie->lock);
 
-		if (!hlist_empty(&cookie->backing_objects) &&
+		if (fscache_cookie_enabled(cookie) &&
+		    !hlist_empty(&cookie->backing_objects) &&
 		    !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
 				      &cookie->flags)) {
 			object = hlist_entry(cookie->backing_objects.first,
@@ -452,10 +491,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
 
 	spin_lock(&cookie->lock);
 
-	/* update the index entry on disk in each cache backing this cookie */
-	hlist_for_each_entry(object,
-			     &cookie->backing_objects, cookie_link) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+	if (fscache_cookie_enabled(cookie)) {
+		/* update the index entry on disk in each cache backing this
+		 * cookie.
+		 */
+		hlist_for_each_entry(object,
+				     &cookie->backing_objects, cookie_link) {
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+		}
 	}
 
 	spin_unlock(&cookie->lock);
@@ -464,28 +507,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
 EXPORT_SYMBOL(__fscache_update_cookie);
 
 /*
- * release a cookie back to the cache
- * - the object will be marked as recyclable on disk if retire is true
- * - all dependents of this cookie must have already been unregistered
- *   (indices/files/pages)
+ * Disable a cookie to stop it from accepting new requests from the netfs.
  */
-void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 {
 	struct fscache_object *object;
+	bool awaken = false;
 
-	fscache_stat(&fscache_n_relinquishes);
-	if (retire)
-		fscache_stat(&fscache_n_relinquishes_retire);
-
-	if (!cookie) {
-		fscache_stat(&fscache_n_relinquishes_null);
-		_leave(" [no cookie]");
-		return;
-	}
-
-	_enter("%p{%s,%p,%d},%d",
-	       cookie, cookie->def->name, cookie->netfs_data,
-	       atomic_read(&cookie->n_active), retire);
+	_enter("%p,%u", cookie, invalidate);
 
 	ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
 
@@ -495,24 +524,82 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 		BUG();
 	}
 
-	/* No further netfs-accessing operations on this cookie permitted */
-	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
-	if (retire)
-		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+	if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock_enable;
+
+	/* If the cookie is being invalidated, wait for that to complete first
+	 * so that we can reuse the flag.
+	 */
+	__fscache_wait_on_invalidate(cookie);
+
+	/* Dispose of the backing objects */
+	set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags);
 
 	spin_lock(&cookie->lock);
-	hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+	if (!hlist_empty(&cookie->backing_objects)) {
+		hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+			if (invalidate)
+				set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+		}
+	} else {
+		if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+			awaken = true;
 	}
 	spin_unlock(&cookie->lock);
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
 
 	/* Wait for cessation of activity requiring access to the netfs (when
-	 * n_active reaches 0).
+	 * n_active reaches 0).  This makes sure outstanding reads and writes
+	 * have completed.
 	 */
 	if (!atomic_dec_and_test(&cookie->n_active))
 		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
 				 TASK_UNINTERRUPTIBLE);
 
+	/* Reset the cookie state if it wasn't relinquished */
+	if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
+		atomic_inc(&cookie->n_active);
+		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+	}
+
+out_unlock_enable:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_disable_cookie);
+
+/*
+ * release a cookie back to the cache
+ * - the object will be marked as recyclable on disk if retire is true
+ * - all dependents of this cookie must have already been unregistered
+ *   (indices/files/pages)
+ */
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
+{
+	fscache_stat(&fscache_n_relinquishes);
+	if (retire)
+		fscache_stat(&fscache_n_relinquishes_retire);
+
+	if (!cookie) {
+		fscache_stat(&fscache_n_relinquishes_null);
+		_leave(" [no cookie]");
+		return;
+	}
+
+	_enter("%p{%s,%p,%d},%d",
+	       cookie, cookie->def->name, cookie->netfs_data,
+	       atomic_read(&cookie->n_active), retire);
+
+	/* No further netfs-accessing operations on this cookie permitted */
+	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+
+	__fscache_disable_cookie(cookie, retire);
+
 	/* Clear pointers back to the netfs */
 	cookie->netfs_data	= NULL;
 	cookie->def		= NULL;
@@ -592,7 +679,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto inconsistent;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index 10a2ade..5a117df 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -59,6 +59,7 @@ struct fscache_cookie fscache_fsdef_index = {
 	.lock		= __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
 	.backing_objects = HLIST_HEAD_INIT,
 	.def		= &fscache_fsdef_index_def,
+	.flags		= 1 << FSCACHE_COOKIE_ENABLED,
 };
 EXPORT_SYMBOL(fscache_fsdef_index);
 
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index b1bb611..989f394 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -45,6 +45,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
 	netfs->primary_index->parent		= &fscache_fsdef_index;
 	netfs->primary_index->netfs_data	= netfs;
+	netfs->primary_index->flags		= 1 << FSCACHE_COOKIE_ENABLED;
 
 	atomic_inc(&netfs->primary_index->parent->usage);
 	atomic_inc(&netfs->primary_index->parent->n_children);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 86d75a6..dcb8216 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -495,6 +495,7 @@ void fscache_object_lookup_negative(struct fscache_object *object)
 		 * returning ENODATA.
 		 */
 		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 		_debug("wake up lookup %p", &cookie->flags);
 		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
@@ -527,6 +528,7 @@ void fscache_obtained_object(struct fscache_object *object)
 
 		/* We do (presumably) have data */
 		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 		/* Allow write requests to begin stacking up and read requests
 		 * to begin shovelling data.
@@ -679,7 +681,8 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
 	 */
 	spin_lock(&cookie->lock);
 	hlist_del_init(&object->cookie_link);
-	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+	if (hlist_empty(&cookie->backing_objects) &&
+	    test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
 		awaken = true;
 	spin_unlock(&cookie->lock);
 
@@ -927,7 +930,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
 	 */
 	if (!fscache_use_cookie(object)) {
 		ASSERT(object->cookie->stores.rnode == NULL);
-		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+		set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
 		_leave(" [no cookie]");
 		return transit_to(KILL_OBJECT);
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 0fe42a6..7f5c658 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -204,7 +204,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -410,7 +411,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 		return -ERESTARTSYS;
 
 	op = fscache_alloc_retrieval(cookie, page->mapping,
-				     end_io_func,context);
+				     end_io_func, context);
 	if (!op) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
@@ -419,7 +420,8 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -551,7 +553,8 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -666,7 +669,8 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -938,7 +942,8 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	ret = -ENOBUFS;
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 24d1d1c..cd6e7ef 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)
 	/* create a cache index for looking up filehandles */
 	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
 					      &nfs_fscache_server_index_def,
-					      clp);
+					      clp, true);
 	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
 		 clp, clp->fscache);
 }
@@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
 	/* create a cache index for looking up filehandles */
 	nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
 					       &nfs_fscache_super_index_def,
-					       nfss);
+					       nfss, true);
 	dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
 		 nfss, nfss->fscache);
 	return;
@@ -200,7 +200,7 @@ static void nfs_fscache_enable_inode_cookie(struct inode *inode)
 		nfsi->fscache = fscache_acquire_cookie(
 			NFS_SB(sb)->fscache,
 			&nfs_fscache_inode_object_def,
-			nfsi);
+			nfsi, true);
 
 		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
 			 sb, nfsi, nfsi->fscache);
@@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
 		nfsi->fscache = fscache_acquire_cookie(
 			nfss->nfs_client->fscache,
 			&nfs_fscache_inode_object_def,
-			nfsi);
+			nfsi, true);
 
 		dfprintk(FSCACHE,
 			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 96a2b66..7714849 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -308,36 +308,6 @@ struct fscache_cache_ops {
 	void (*dissociate_pages)(struct fscache_cache *cache);
 };
 
-/*
- * data file or index object cookie
- * - a file will only appear in one cache
- * - a request to cache a file may or may not be honoured, subject to
- *   constraints such as disk space
- * - indices are created on disk just-in-time
- */
-struct fscache_cookie {
-	atomic_t			usage;		/* number of users of this cookie */
-	atomic_t			n_children;	/* number of children of this cookie */
-	atomic_t			n_active;	/* number of active users of netfs ptrs */
-	spinlock_t			lock;
-	spinlock_t			stores_lock;	/* lock on page store tree */
-	struct hlist_head		backing_objects; /* object(s) backing this file/index */
-	const struct fscache_cookie_def	*def;		/* definition */
-	struct fscache_cookie		*parent;	/* parent of this entry */
-	void				*netfs_data;	/* back pointer to netfs */
-	struct radix_tree_root		stores;		/* pages to be stored on this cookie */
-#define FSCACHE_COOKIE_PENDING_TAG	0		/* pages tag: pending write to cache */
-#define FSCACHE_COOKIE_STORING_TAG	1		/* pages tag: writing to cache */
-
-	unsigned long			flags;
-#define FSCACHE_COOKIE_LOOKING_UP	0	/* T if non-index cookie being looked up still */
-#define FSCACHE_COOKIE_NO_DATA_YET	1	/* T if new object with no cached data yet */
-#define FSCACHE_COOKIE_UNAVAILABLE	2	/* T if cookie is unavailable (error, etc) */
-#define FSCACHE_COOKIE_INVALIDATING	3	/* T if cookie is being invalidated */
-#define FSCACHE_COOKIE_RELINQUISHED	4	/* T if cookie has been relinquished */
-#define FSCACHE_COOKIE_RETIRED		5	/* T if cookie was retired */
-};
-
 extern struct fscache_cookie fscache_fsdef_index;
 
 /*
@@ -400,6 +370,7 @@ struct fscache_object {
 #define FSCACHE_OBJECT_IS_LIVE		3	/* T if object is not withdrawn or relinquished */
 #define FSCACHE_OBJECT_IS_LOOKED_UP	4	/* T if object has been looked up */
 #define FSCACHE_OBJECT_IS_AVAILABLE	5	/* T if object has become active */
+#define FSCACHE_OBJECT_RETIRED		6	/* T if object was retired on relinquishment */
 
 	struct list_head	cache_link;	/* link in cache->object_list */
 	struct hlist_node	cookie_link;	/* link in cookie->backing_objects */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 19b4645..115bb81 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -167,6 +167,42 @@ struct fscache_netfs {
 };
 
 /*
+ * data file or index object cookie
+ * - a file will only appear in one cache
+ * - a request to cache a file may or may not be honoured, subject to
+ *   constraints such as disk space
+ * - indices are created on disk just-in-time
+ */
+struct fscache_cookie {
+	atomic_t			usage;		/* number of users of this cookie */
+	atomic_t			n_children;	/* number of children of this cookie */
+	atomic_t			n_active;	/* number of active users of netfs ptrs */
+	spinlock_t			lock;
+	spinlock_t			stores_lock;	/* lock on page store tree */
+	struct hlist_head		backing_objects; /* object(s) backing this file/index */
+	const struct fscache_cookie_def	*def;		/* definition */
+	struct fscache_cookie		*parent;	/* parent of this entry */
+	void				*netfs_data;	/* back pointer to netfs */
+	struct radix_tree_root		stores;		/* pages to be stored on this cookie */
+#define FSCACHE_COOKIE_PENDING_TAG	0		/* pages tag: pending write to cache */
+#define FSCACHE_COOKIE_STORING_TAG	1		/* pages tag: writing to cache */
+
+	unsigned long			flags;
+#define FSCACHE_COOKIE_LOOKING_UP	0	/* T if non-index cookie being looked up still */
+#define FSCACHE_COOKIE_NO_DATA_YET	1	/* T if new object with no cached data yet */
+#define FSCACHE_COOKIE_UNAVAILABLE	2	/* T if cookie is unavailable (error, etc) */
+#define FSCACHE_COOKIE_INVALIDATING	3	/* T if cookie is being invalidated */
+#define FSCACHE_COOKIE_RELINQUISHED	4	/* T if cookie has been relinquished */
+#define FSCACHE_COOKIE_ENABLED		5	/* T if cookie is enabled */
+#define FSCACHE_COOKIE_ENABLEMENT_LOCK	6	/* T if cookie is being en/disabled */
+};
+
+static inline bool fscache_cookie_enabled(struct fscache_cookie *cookie)
+{
+	return test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+}
+
+/*
  * slow-path functions for when there is actually caching available, and the
  * netfs does actually have a valid token
  * - these are not to be called directly
@@ -181,8 +217,8 @@ extern void __fscache_release_cache_tag(struct fscache_cache_tag *);
 extern struct fscache_cookie *__fscache_acquire_cookie(
 	struct fscache_cookie *,
 	const struct fscache_cookie_def *,
-	void *);
-extern void __fscache_relinquish_cookie(struct fscache_cookie *, int);
+	void *, bool);
+extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
 extern int __fscache_check_consistency(struct fscache_cookie *);
 extern void __fscache_update_cookie(struct fscache_cookie *);
 extern int __fscache_attr_changed(struct fscache_cookie *);
@@ -211,6 +247,9 @@ extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *,
 					      struct inode *);
 extern void __fscache_readpages_cancel(struct fscache_cookie *cookie,
 				       struct list_head *pages);
+extern void __fscache_disable_cookie(struct fscache_cookie *, bool);
+extern void __fscache_enable_cookie(struct fscache_cookie *,
+				    bool (*)(void *), void *);
 
 /**
  * fscache_register_netfs - Register a filesystem as desiring caching services
@@ -289,6 +328,7 @@ void fscache_release_cache_tag(struct fscache_cache_tag *tag)
  * @def: A description of the cache object, including callback operations
  * @netfs_data: An arbitrary piece of data to be kept in the cookie to
  * represent the cache object to the netfs
+ * @enable: Whether or not to enable a data cookie immediately
  *
  * This function is used to inform FS-Cache about part of an index hierarchy
  * that can be used to locate files.  This is done by requesting a cookie for
@@ -301,10 +341,12 @@ static inline
 struct fscache_cookie *fscache_acquire_cookie(
 	struct fscache_cookie *parent,
 	const struct fscache_cookie_def *def,
-	void *netfs_data)
+	void *netfs_data,
+	bool enable)
 {
-	if (fscache_cookie_valid(parent))
-		return __fscache_acquire_cookie(parent, def, netfs_data);
+	if (fscache_cookie_valid(parent) && fscache_cookie_enabled(parent))
+		return __fscache_acquire_cookie(parent, def, netfs_data,
+						enable);
 	else
 		return NULL;
 }
@@ -322,7 +364,7 @@ struct fscache_cookie *fscache_acquire_cookie(
  * description.
  */
 static inline
-void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
 {
 	if (fscache_cookie_valid(cookie))
 		__fscache_relinquish_cookie(cookie, retire);
@@ -341,7 +383,7 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 static inline
 int fscache_check_consistency(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_check_consistency(cookie);
 	else
 		return 0;
@@ -360,7 +402,7 @@ int fscache_check_consistency(struct fscache_cookie *cookie)
 static inline
 void fscache_update_cookie(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		__fscache_update_cookie(cookie);
 }
 
@@ -407,7 +449,7 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie)
 static inline
 int fscache_attr_changed(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_attr_changed(cookie);
 	else
 		return -ENOBUFS;
@@ -429,7 +471,7 @@ int fscache_attr_changed(struct fscache_cookie *cookie)
 static inline
 void fscache_invalidate(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		__fscache_invalidate(cookie);
 }
 
@@ -503,7 +545,7 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 			       void *context,
 			       gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_read_or_alloc_page(cookie, page, end_io_func,
 						    context, gfp);
 	else
@@ -554,7 +596,7 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 				void *context,
 				gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_read_or_alloc_pages(cookie, mapping, pages,
 						     nr_pages, end_io_func,
 						     context, gfp);
@@ -585,7 +627,7 @@ int fscache_alloc_page(struct fscache_cookie *cookie,
 		       struct page *page,
 		       gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_alloc_page(cookie, page, gfp);
 	else
 		return -ENOBUFS;
@@ -634,7 +676,7 @@ int fscache_write_page(struct fscache_cookie *cookie,
 		       struct page *page,
 		       gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_write_page(cookie, page, gfp);
 	else
 		return -ENOBUFS;
@@ -744,4 +786,47 @@ void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
 		__fscache_uncache_all_inode_pages(cookie, inode);
 }
 
+/**
+ * fscache_disable_cookie - Disable a cookie
+ * @cookie: The cookie representing the cache object
+ * @invalidate: Invalidate the backing object
+ *
+ * Disable a cookie from accepting further alloc, read, write, invalidate,
+ * update or acquire operations.  Outstanding operations can still be waited
+ * upon and pages can still be uncached and the cookie relinquished.
+ *
+ * This will not return until all outstanding operations have completed.
+ *
+ * If @invalidate is set, then the backing object will be invalidated and
+ * detached, otherwise it will just be detached.
+ */
+static inline
+void fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
+{
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
+		__fscache_disable_cookie(cookie, invalidate);
+}
+
+/**
+ * fscache_enable_cookie - Reenable a cookie
+ * @cookie: The cookie representing the cache object
+ * @can_enable: A function to permit enablement once lock is held
+ * @data: Data for can_enable()
+ *
+ * Reenable a previously disabled cookie, allowing it to accept further alloc,
+ * read, write, invalidate, update or acquire operations.  An attempt will be
+ * made to immediately reattach the cookie to a backing object.
+ *
+ * The can_enable() function is called (if not NULL) once the enablement lock
+ * is held to rule on whether enablement is still permitted to go ahead.
+ */
+static inline
+void fscache_enable_cookie(struct fscache_cookie *cookie,
+			   bool (*can_enable)(void *data),
+			   void *data)
+{
+	if (fscache_cookie_valid(cookie) && !fscache_cookie_enabled(cookie))
+		__fscache_enable_cookie(cookie, can_enable, data);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v0.10.2


From f1fe29b4a02d0805aa7d0ff6b73410a9f9316d69 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 Sep 2013 11:20:03 +0100
Subject: NFS: Use i_writecount to control whether to get an fscache cookie in
 nfs_open()

Use i_writecount to control whether to get an fscache cookie in nfs_open() as
NFS does not do write caching yet.  I *think* this is the cause of a problem
encountered by Mark Moseley whereby __fscache_uncache_page() gets a NULL
pointer dereference because cookie->def is NULL:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
IP: [<ffffffff812a1903>] __fscache_uncache_page+0x23/0x160
PGD 0
Thread overran stack, or stack corrupted
Oops: 0000 [#1] SMP
Modules linked in: ...
CPU: 7 PID: 18993 Comm: php Not tainted 3.11.1 #1
Hardware name: Dell Inc. PowerEdge R420/072XWF, BIOS 1.3.5 08/21/2012
task: ffff8804203460c0 ti: ffff880420346640
RIP: 0010:[<ffffffff812a1903>] __fscache_uncache_page+0x23/0x160
RSP: 0018:ffff8801053af878 EFLAGS: 00210286
RAX: 0000000000000000 RBX: ffff8800be2f8780 RCX: ffff88022ffae5e8
RDX: 0000000000004c66 RSI: ffffea00055ff440 RDI: ffff8800be2f8780
RBP: ffff8801053af898 R08: 0000000000000001 R09: 0000000000000003
R10: 0000000000000000 R11: 0000000000000000 R12: ffffea00055ff440
R13: 0000000000001000 R14: ffff8800c50be538 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff88042fc60000(0063) knlGS:00000000e439c700
CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000000000010 CR3: 0000000001d8f000 CR4: 00000000000607f0
Stack:
...
Call Trace:
[<ffffffff81365a72>] __nfs_fscache_invalidate_page+0x42/0x70
[<ffffffff813553d5>] nfs_invalidate_page+0x75/0x90
[<ffffffff811b8f5e>] truncate_inode_page+0x8e/0x90
[<ffffffff811b90ad>] truncate_inode_pages_range.part.12+0x14d/0x620
[<ffffffff81d6387d>] ? __mutex_lock_slowpath+0x1fd/0x2e0
[<ffffffff811b95d3>] truncate_inode_pages_range+0x53/0x70
[<ffffffff811b969d>] truncate_inode_pages+0x2d/0x40
[<ffffffff811b96ff>] truncate_pagecache+0x4f/0x70
[<ffffffff81356840>] nfs_setattr_update_inode+0xa0/0x120
[<ffffffff81368de4>] nfs3_proc_setattr+0xc4/0xe0
[<ffffffff81357f78>] nfs_setattr+0xc8/0x150
[<ffffffff8122d95b>] notify_change+0x1cb/0x390
[<ffffffff8120a55b>] do_truncate+0x7b/0xc0
[<ffffffff8121f96c>] do_last+0xa4c/0xfd0
[<ffffffff8121ffbc>] path_openat+0xcc/0x670
[<ffffffff81220a0e>] do_filp_open+0x4e/0xb0
[<ffffffff8120ba1f>] do_sys_open+0x13f/0x2b0
[<ffffffff8126aaf6>] compat_SyS_open+0x36/0x50
[<ffffffff81d7204c>] sysenter_dispatch+0x7/0x24

The code at the instruction pointer was disassembled:

> (gdb) disas __fscache_uncache_page
> Dump of assembler code for function __fscache_uncache_page:
> ...
> 0xffffffff812a18ff <+31>: mov 0x48(%rbx),%rax
> 0xffffffff812a1903 <+35>: cmpb $0x0,0x10(%rax)
> 0xffffffff812a1907 <+39>: je 0xffffffff812a19cd <__fscache_uncache_page+237>

These instructions make up:

	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);

That cmpb is the faulting instruction (%rax is 0).  So cookie->def is NULL -
which presumably means that the cookie has already been at least partway
through __fscache_relinquish_cookie().

What I think may be happening is something like a three-way race on the same
file:

	PROCESS 1	PROCESS 2	PROCESS 3
	===============	===============	===============
	open(O_TRUNC|O_WRONLY)
			open(O_RDONLY)
					open(O_WRONLY)
	-->nfs_open()
	-->nfs_fscache_set_inode_cookie()
	nfs_fscache_inode_lock()
	nfs_fscache_disable_inode_cookie()
	__fscache_relinquish_cookie()
	nfs_inode->fscache = NULL
	<--nfs_fscache_set_inode_cookie()

			-->nfs_open()
			-->nfs_fscache_set_inode_cookie()
			nfs_fscache_inode_lock()
			nfs_fscache_enable_inode_cookie()
			__fscache_acquire_cookie()
			nfs_inode->fscache = cookie
			<--nfs_fscache_set_inode_cookie()
	<--nfs_open()
	-->nfs_setattr()
	...
	...
	-->nfs_invalidate_page()
	-->__nfs_fscache_invalidate_page()
	cookie = nfsi->fscache
					-->nfs_open()
					-->nfs_fscache_set_inode_cookie()
					nfs_fscache_inode_lock()
					nfs_fscache_disable_inode_cookie()
					-->__fscache_relinquish_cookie()
	-->__fscache_uncache_page(cookie)
	<crash>
					<--__fscache_relinquish_cookie()
					nfs_inode->fscache = NULL
					<--nfs_fscache_set_inode_cookie()

What is needed is something to prevent process #2 from reacquiring the cookie
- and I think checking i_writecount should do the trick.

It's also possible to have a two-way race on this if the file is opened
O_TRUNC|O_RDONLY instead.

Reported-by: Mark Moseley <moseleymark@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 854a8f0..4c5edcc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1381,7 +1381,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i
 
 static int do_open(struct inode *inode, struct file *filp)
 {
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	return 0;
 }
 
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index cd6e7ef..3ef01f0 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
 /*
  * Initialise the per-inode cache cookie pointer for an NFS inode.
  */
-void nfs_fscache_init_inode_cookie(struct inode *inode)
+void nfs_fscache_init_inode(struct inode *inode)
 {
-	NFS_I(inode)->fscache = NULL;
-	if (S_ISREG(inode->i_mode))
-		set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-}
-
-/*
- * Get the per-inode cache cookie for an NFS inode.
- */
-static void nfs_fscache_enable_inode_cookie(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (nfsi->fscache || !NFS_FSCACHE(inode))
+	nfsi->fscache = NULL;
+	if (!S_ISREG(inode->i_mode))
 		return;
-
-	if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
-		nfsi->fscache = fscache_acquire_cookie(
-			NFS_SB(sb)->fscache,
-			&nfs_fscache_inode_object_def,
-			nfsi, true);
-
-		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
-			 sb, nfsi, nfsi->fscache);
-	}
+	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+					       &nfs_fscache_inode_object_def,
+					       nfsi, false);
 }
 
 /*
  * Release a per-inode cookie.
  */
-void nfs_fscache_release_inode_cookie(struct inode *inode)
+void nfs_fscache_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
-	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
-		 nfsi, nfsi->fscache);
+	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
 
-	fscache_relinquish_cookie(nfsi->fscache, 0);
+	fscache_relinquish_cookie(cookie, false);
 	nfsi->fscache = NULL;
 }
 
-/*
- * Retire a per-inode cookie, destroying the data attached to it.
- */
-void nfs_fscache_zap_inode_cookie(struct inode *inode)
+static bool nfs_fscache_can_enable(void *data)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct inode *inode = data;
 
-	dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
-		 nfsi, nfsi->fscache);
-
-	fscache_relinquish_cookie(nfsi->fscache, 1);
-	nfsi->fscache = NULL;
+	return !inode_is_open_for_write(inode);
 }
 
 /*
- * Turn off the cache with regard to a per-inode cookie if opened for writing,
- * invalidating all the pages in the page cache relating to the associated
- * inode to clear the per-page caching.
- */
-static void nfs_fscache_disable_inode_cookie(struct inode *inode)
-{
-	clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-
-	if (NFS_I(inode)->fscache) {
-		dfprintk(FSCACHE,
-			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
-
-		/* Need to uncache any pages attached to this inode that
-		 * fscache knows about before turning off the cache.
-		 */
-		fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode);
-		nfs_fscache_zap_inode_cookie(inode);
-	}
-}
-
-/*
- * wait_on_bit() sleep function for uninterruptible waiting
- */
-static int nfs_fscache_wait_bit(void *flags)
-{
-	schedule();
-	return 0;
-}
-
-/*
- * Lock against someone else trying to also acquire or relinquish a cookie
- */
-static inline void nfs_fscache_inode_lock(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
-		wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
-			    nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * Unlock cookie management lock
- */
-static inline void nfs_fscache_inode_unlock(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	smp_mb__before_clear_bit();
-	clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
-}
-
-/*
- * Decide if we should enable or disable local caching for this inode.
- * - For now, with NFS, only regular files that are open read-only will be able
- *   to use the cache.
- * - May be invoked multiple times in parallel by parallel nfs_open() functions.
- */
-void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
-{
-	if (NFS_FSCACHE(inode)) {
-		nfs_fscache_inode_lock(inode);
-		if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
-			nfs_fscache_disable_inode_cookie(inode);
-		else
-			nfs_fscache_enable_inode_cookie(inode);
-		nfs_fscache_inode_unlock(inode);
-	}
-}
-EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
-
-/*
- * Replace a per-inode cookie due to revalidation detecting a file having
- * changed on the server.
+ * Enable or disable caching for a file that is being opened as appropriate.
+ * The cookie is allocated when the inode is initialised, but is not enabled at
+ * that time.  Enablement is deferred to file-open time to avoid stat() and
+ * access() thrashing the cache.
+ *
+ * For now, with NFS, only regular files that are open read-only will be able
+ * to use the cache.
+ *
+ * We enable the cache for an inode if we open it read-only and it isn't
+ * currently open for writing.  We disable the cache if the inode is open
+ * write-only.
+ *
+ * The caller uses the file struct to pin i_writecount on the inode before
+ * calling us when a file is opened for writing, so we can make use of that.
+ *
+ * Note that this may be invoked multiple times in parallel by parallel
+ * nfs_open() functions.
  */
-void nfs_fscache_reset_inode_cookie(struct inode *inode)
+void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_server *nfss = NFS_SERVER(inode);
-	NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
-	nfs_fscache_inode_lock(inode);
-	if (nfsi->fscache) {
-		/* retire the current fscache cache and get a new one */
-		fscache_relinquish_cookie(nfsi->fscache, 1);
-
-		nfsi->fscache = fscache_acquire_cookie(
-			nfss->nfs_client->fscache,
-			&nfs_fscache_inode_object_def,
-			nfsi, true);
+	if (!fscache_cookie_valid(cookie))
+		return;
 
-		dfprintk(FSCACHE,
-			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
-			 nfss, nfsi, old, nfsi->fscache);
+	if (inode_is_open_for_write(inode)) {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
+		clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
+		fscache_disable_cookie(cookie, true);
+		fscache_uncache_all_inode_pages(cookie, inode);
+	} else {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
+		fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode);
+		if (fscache_cookie_enabled(cookie))
+			set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
 	}
-	nfs_fscache_inode_unlock(inode);
 }
+EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
 
 /*
  * Release the caching state associated with a page, if the page isn't busy
@@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
 int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
 	if (PageFsCache(page)) {
-		struct nfs_inode *nfsi = NFS_I(page->mapping->host);
-		struct fscache_cookie *cookie = nfsi->fscache;
+		struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);
 
 		BUG_ON(!cookie);
 		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
-			 cookie, page, nfsi);
+			 cookie, page, NFS_I(page->mapping->host));
 
 		if (!fscache_maybe_release_page(cookie, page, gfp))
 			return 0;
@@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
  */
 void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct fscache_cookie *cookie = nfsi->fscache;
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
 	BUG_ON(!cookie);
 
 	dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
-		 cookie, page, nfsi);
+		 cookie, page, NFS_I(inode));
 
 	fscache_wait_on_page_write(cookie, page);
 
@@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
 
 	dfprintk(FSCACHE,
 		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
-		 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+		 nfs_i_fscache(inode), page, page->index, page->flags, inode);
 
-	ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+	ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),
 					 page,
 					 nfs_readpage_from_fscache_complete,
 					 ctx,
@@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
 	int ret;
 
 	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
-		 NFS_I(inode)->fscache, npages, inode);
+		 nfs_i_fscache(inode), npages, inode);
 
-	ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+	ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),
 					  mapping, pages, nr_pages,
 					  nfs_readpage_from_fscache_complete,
 					  ctx,
@@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
 
 	dfprintk(FSCACHE,
 		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
-		 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+		 nfs_i_fscache(inode), page, page->index, page->flags, sync);
 
-	ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+	ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);
 	dfprintk(FSCACHE,
 		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
 		 page, page->index, page->flags, ret);
 
 	if (ret != 0) {
-		fscache_uncache_page(NFS_I(inode)->fscache, page);
+		fscache_uncache_page(nfs_i_fscache(inode), page);
 		nfs_add_fscache_stats(inode,
 				      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
 		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 4ecb766..d7fe3e7 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *);
 extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
 extern void nfs_fscache_release_super_cookie(struct super_block *);
 
-extern void nfs_fscache_init_inode_cookie(struct inode *);
-extern void nfs_fscache_release_inode_cookie(struct inode *);
-extern void nfs_fscache_zap_inode_cookie(struct inode *);
-extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
-extern void nfs_fscache_reset_inode_cookie(struct inode *);
+extern void nfs_fscache_init_inode(struct inode *);
+extern void nfs_fscache_clear_inode(struct inode *);
+extern void nfs_fscache_open_file(struct inode *, struct file *);
 
 extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
 extern int nfs_fscache_release_page(struct page *, gfp_t);
@@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
 
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
-static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
-						struct file *filp) {}
-static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_init_inode(struct inode *inode) {}
+static inline void nfs_fscache_clear_inode(struct inode *inode) {}
+static inline void nfs_fscache_open_file(struct inode *inode,
+					 struct file *filp) {}
 
 static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eda8879..bb90bff 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -122,7 +122,7 @@ void nfs_clear_inode(struct inode *inode)
 	WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
-	nfs_fscache_release_inode_cookie(inode);
+	nfs_fscache_clear_inode(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_clear_inode);
 
@@ -459,7 +459,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		nfsi->attrtimeo_timestamp = now;
 		nfsi->access_cache = RB_ROOT;
 
-		nfs_fscache_init_inode_cookie(inode);
+		nfs_fscache_init_inode(inode);
 
 		unlock_new_inode(inode);
 	} else
@@ -854,7 +854,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 		return PTR_ERR(ctx);
 	nfs_file_set_open_context(filp, ctx);
 	put_nfs_open_context(ctx);
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e5b804d..5b8a618 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -74,7 +74,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	nfs_file_set_open_context(filp, ctx);
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	err = 0;
 
 out_put_ctx:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f40547..955dff5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2292,6 +2292,11 @@ static inline void allow_write_access(struct file *file)
 	if (file)
 		atomic_inc(&file_inode(file)->i_writecount);
 }
+static inline bool inode_is_open_for_write(const struct inode *inode)
+{
+	return atomic_read(&inode->i_writecount) > 0;
+}
+
 #ifdef CONFIG_IMA
 static inline void i_readcount_dec(struct inode *inode)
 {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 3ea4cde..14a4820 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -269,9 +269,13 @@ static inline int NFS_STALE(const struct inode *inode)
 	return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
 }
 
-static inline int NFS_FSCACHE(const struct inode *inode)
+static inline struct fscache_cookie *nfs_i_fscache(struct inode *inode)
 {
-	return test_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+#ifdef CONFIG_NFS_FSCACHE
+	return NFS_I(inode)->fscache;
+#else
+	return NULL;
+#endif
 }
 
 static inline __u64 NFS_FILEID(const struct inode *inode)
-- 
cgit v0.10.2


From a6f951ddbdfb7bd87d31a44f61abe202ed6ce57f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 Oct 2013 14:24:58 -0400
Subject: NFSv4: Fix a use-after-free situation in _nfs4_proc_getlk()

In nfs4_proc_getlk(), when some error causes a retry of the call to
_nfs4_proc_getlk(), we can end up with Oopses of the form

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000134
 IP: [<ffffffff8165270e>] _raw_spin_lock+0xe/0x30
<snip>
 Call Trace:
  [<ffffffff812f287d>] _atomic_dec_and_lock+0x4d/0x70
  [<ffffffffa053c4f2>] nfs4_put_lock_state+0x32/0xb0 [nfsv4]
  [<ffffffffa053c585>] nfs4_fl_release_lock+0x15/0x20 [nfsv4]
  [<ffffffffa0522c06>] _nfs4_proc_getlk.isra.40+0x146/0x170 [nfsv4]
  [<ffffffffa052ad99>] nfs4_proc_lock+0x399/0x5a0 [nfsv4]

The problem is that we don't clear the request->fl_ops after the first
try and so when we retry, nfs4_set_lock_state() exits early without
setting the lock stateid.
Regression introduced by commit 70cc6487a4e08b8698c0e2ec935fb48d10490162
(locks: make ->lock release private data before returning in GETLK case)

Reported-by: Weston Andros Adamson <dros@netapp.com>
Reported-by: Jorge Mora <mora@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: <stable@vger.kernel.org> #2.6.22+

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d53d678..d2b4845 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5106,6 +5106,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 			status = 0;
 	}
 	request->fl_ops->fl_release_private(request);
+	request->fl_ops = NULL;
 out:
 	return status;
 }
-- 
cgit v0.10.2


From 7f260e8575bf53b93b77978c1e39f8e67612759c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 24 Sep 2013 11:25:22 -0400
Subject: SUNRPC: Enable the keepalive option for TCP sockets

For NFSv4 we want to avoid retransmitting RPC calls unless the TCP
connection breaks. However we still want to detect TCP connection
breakage as soon as possible. Do this by setting the keepalive option
with the idle timeout and count set to the 'timeo' and 'retrans' mount
options.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ee03d35..208a763 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2112,6 +2112,19 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
 	if (!transport->inet) {
 		struct sock *sk = sock->sk;
+		unsigned int keepidle = xprt->timeout->to_initval / HZ;
+		unsigned int keepcnt = xprt->timeout->to_retries + 1;
+		unsigned int opt_on = 1;
+
+		/* TCP Keepalive options */
+		kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+				(char *)&opt_on, sizeof(opt_on));
+		kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+				(char *)&keepidle, sizeof(keepidle));
+		kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+				(char *)&keepidle, sizeof(keepidle));
+		kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+				(char *)&keepcnt, sizeof(keepcnt));
 
 		write_lock_bh(&sk->sk_callback_lock);
 
-- 
cgit v0.10.2


From 8b71798c0d389d4cadc884fc7d68c61ee8cd4f45 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 26 Sep 2013 10:18:04 -0400
Subject: SUNRPC: Only update the TCP connect cookie on a successful connect

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 208a763..9928ba1 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1511,6 +1511,7 @@ static void xs_tcp_state_change(struct sock *sk)
 			transport->tcp_copied = 0;
 			transport->tcp_flags =
 				TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
+			xprt->connect_cookie++;
 
 			xprt_wake_pending_tasks(xprt, -EAGAIN);
 		}
@@ -2164,7 +2165,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 	case 0:
 	case -EINPROGRESS:
 		/* SYN_SENT! */
-		xprt->connect_cookie++;
 		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
 			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
 	}
-- 
cgit v0.10.2


From 0a6605213040dd2fb479f0d1a9a87a1d7fa70904 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 25 Sep 2013 11:31:54 -0400
Subject: SUNRPC: Don't set the request connect_cookie until a successful
 transmit

We're using the request connect_cookie to track whether or not a
request was successfully transmitted on the current transport
connection or not. For that reason we should ensure that it is
only set after we've successfully transmitted the request.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 095363e..e9ee7bf 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -912,7 +912,6 @@ void xprt_transmit(struct rpc_task *task)
 	} else if (!req->rq_bytes_sent)
 		return;
 
-	req->rq_connect_cookie = xprt->connect_cookie;
 	req->rq_xtime = ktime_get();
 	status = xprt->ops->send_request(task);
 	if (status != 0) {
@@ -938,12 +937,14 @@ void xprt_transmit(struct rpc_task *task)
 	/* Don't race with disconnect */
 	if (!xprt_connected(xprt))
 		task->tk_status = -ENOTCONN;
-	else if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) {
+	else {
 		/*
 		 * Sleep on the pending queue since
 		 * we're expecting a reply.
 		 */
-		rpc_sleep_on(&xprt->pending, task, xprt_timer);
+		if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task))
+			rpc_sleep_on(&xprt->pending, task, xprt_timer);
+		req->rq_connect_cookie = xprt->connect_cookie;
 	}
 	spin_unlock_bh(&xprt->transport_lock);
 }
@@ -1186,6 +1187,7 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
 	req->rq_xprt    = xprt;
 	req->rq_buffer  = NULL;
 	req->rq_xid     = xprt_alloc_xid(xprt);
+	req->rq_connect_cookie = xprt->connect_cookie - 1;
 	req->rq_release_snd_buf = NULL;
 	xprt_reset_majortimeo(req);
 	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
-- 
cgit v0.10.2


From ee071eff0f1afafa9917254a6e4ee19d28085f1d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Sep 2013 11:09:53 -0400
Subject: SUNRPC: Clear the request rq_bytes_sent field in xprt_release_write

Otherwise the tests of req->rq_bytes_sent in xprt_prepare_transmit
will fail if we're dealing with a resend.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e9ee7bf..8cc5c8b 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -358,6 +358,11 @@ out_unlock:
 void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	if (xprt->snd_task == task) {
+		if (task != NULL) {
+			struct rpc_rqst *req = task->tk_rqstp;
+			if (req != NULL)
+				req->rq_bytes_sent = 0;
+		}
 		xprt_clear_locked(xprt);
 		__xprt_lock_write_next(xprt);
 	}
@@ -375,6 +380,11 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
 void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	if (xprt->snd_task == task) {
+		if (task != NULL) {
+			struct rpc_rqst *req = task->tk_rqstp;
+			if (req != NULL)
+				req->rq_bytes_sent = 0;
+		}
 		xprt_clear_locked(xprt);
 		__xprt_lock_write_next_cong(xprt);
 	}
-- 
cgit v0.10.2


From 90051ea774613ffc6b8aad3dc665c8505d6205a8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 25 Sep 2013 12:17:18 -0400
Subject: SUNRPC: Clean up - convert xprt_prepare_transmit to return a bool

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index cec7b9b..8097b9d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -288,7 +288,7 @@ int			xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
 int			xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
 void			xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task);
 void			xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task);
-int			xprt_prepare_transmit(struct rpc_task *task);
+bool			xprt_prepare_transmit(struct rpc_task *task);
 void			xprt_transmit(struct rpc_task *task);
 void			xprt_end_transmit(struct rpc_task *task);
 int			xprt_adjust_timeout(struct rpc_rqst *req);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7747960..fa50e42 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1722,8 +1722,7 @@ call_transmit(struct rpc_task *task)
 	task->tk_action = call_status;
 	if (task->tk_status < 0)
 		return;
-	task->tk_status = xprt_prepare_transmit(task);
-	if (task->tk_status != 0)
+	if (!xprt_prepare_transmit(task))
 		return;
 	task->tk_action = call_transmit_status;
 	/* Encode here so that rpcsec_gss can use correct sequence number. */
@@ -1811,8 +1810,7 @@ call_bc_transmit(struct rpc_task *task)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
 
-	task->tk_status = xprt_prepare_transmit(task);
-	if (task->tk_status == -EAGAIN) {
+	if (!xprt_prepare_transmit(task)) {
 		/*
 		 * Could not reserve the transport. Try again after the
 		 * transport is released.
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8cc5c8b..2326af5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -864,24 +864,27 @@ static inline int xprt_has_timer(struct rpc_xprt *xprt)
  * @task: RPC task about to send a request
  *
  */
-int xprt_prepare_transmit(struct rpc_task *task)
+bool xprt_prepare_transmit(struct rpc_task *task)
 {
 	struct rpc_rqst	*req = task->tk_rqstp;
 	struct rpc_xprt	*xprt = req->rq_xprt;
-	int err = 0;
+	bool ret = false;
 
 	dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
 
 	spin_lock_bh(&xprt->transport_lock);
 	if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) {
-		err = req->rq_reply_bytes_recvd;
+		task->tk_status = req->rq_reply_bytes_recvd;
 		goto out_unlock;
 	}
-	if (!xprt->ops->reserve_xprt(xprt, task))
-		err = -EAGAIN;
+	if (!xprt->ops->reserve_xprt(xprt, task)) {
+		task->tk_status = -EAGAIN;
+		goto out_unlock;
+	}
+	ret = true;
 out_unlock:
 	spin_unlock_bh(&xprt->transport_lock);
-	return err;
+	return ret;
 }
 
 void xprt_end_transmit(struct rpc_task *task)
-- 
cgit v0.10.2


From 8a19a0b6cb2e2216afd68ef2047f30260cc8a220 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 24 Sep 2013 12:00:27 -0400
Subject: SUNRPC: Add RPC task and client level options to disable the resend
 timeout

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 6740801..943ee89 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -49,6 +49,7 @@ struct rpc_clnt {
 
 	unsigned int		cl_softrtry : 1,/* soft timeouts */
 				cl_discrtry : 1,/* disconnect before retry */
+				cl_noretranstimeo: 1,/* No retransmit timeouts */
 				cl_autobind : 1,/* use getport() */
 				cl_chatty   : 1;/* be verbose */
 
@@ -126,6 +127,7 @@ struct rpc_create_args {
 #define RPC_CLNT_CREATE_QUIET		(1UL << 6)
 #define RPC_CLNT_CREATE_INFINITE_SLOTS	(1UL << 7)
 #define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT	(1UL << 8)
+#define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT	(1UL << 9)
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 096ee58..3a847de 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -122,6 +122,7 @@ struct rpc_task_setup {
 #define RPC_TASK_SENT		0x0800		/* message was sent */
 #define RPC_TASK_TIMEOUT	0x1000		/* fail with ETIMEDOUT on timeout */
 #define RPC_TASK_NOCONNECT	0x2000		/* return ENOTCONN if not connected */
+#define RPC_TASK_NO_RETRANS_TIMEOUT	0x4000		/* wait forever for a reply */
 
 #define RPC_IS_ASYNC(t)		((t)->tk_flags & RPC_TASK_ASYNC)
 #define RPC_IS_SWAPPER(t)	((t)->tk_flags & RPC_TASK_SWAPPER)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fa50e42..b585250 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -772,6 +772,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
 		atomic_inc(&clnt->cl_count);
 		if (clnt->cl_softrtry)
 			task->tk_flags |= RPC_TASK_SOFT;
+		if (clnt->cl_noretranstimeo)
+			task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
 		if (sk_memalloc_socks()) {
 			struct rpc_xprt *xprt;
 
@@ -1898,7 +1900,8 @@ call_status(struct rpc_task *task)
 		rpc_delay(task, 3*HZ);
 	case -ETIMEDOUT:
 		task->tk_action = call_timeout;
-		if (task->tk_client->cl_discrtry)
+		if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
+		    && task->tk_client->cl_discrtry)
 			xprt_conditional_disconnect(req->rq_xprt,
 					req->rq_connect_cookie);
 		break;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2326af5..d166d99 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -873,9 +873,18 @@ bool xprt_prepare_transmit(struct rpc_task *task)
 	dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
 
 	spin_lock_bh(&xprt->transport_lock);
-	if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) {
-		task->tk_status = req->rq_reply_bytes_recvd;
-		goto out_unlock;
+	if (!req->rq_bytes_sent) {
+		if (req->rq_reply_bytes_recvd) {
+			task->tk_status = req->rq_reply_bytes_recvd;
+			goto out_unlock;
+		}
+		if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
+		    && xprt_connected(xprt)
+		    && req->rq_connect_cookie == xprt->connect_cookie) {
+			xprt->ops->set_retrans_timeout(task);
+			rpc_sleep_on(&xprt->pending, task, xprt_timer);
+			goto out_unlock;
+		}
 	}
 	if (!xprt->ops->reserve_xprt(xprt, task)) {
 		task->tk_status = -EAGAIN;
-- 
cgit v0.10.2


From 99875249bfbfb6d9a2aba020ce65da2862d0dafa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 24 Sep 2013 12:06:07 -0400
Subject: NFSv4: Ensure that we disable the resend timeout for NFSv4

The spec states that the client should not resend requests because
the server will disconnect if it needs to drop an RPC request.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2dceee4..af03258 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 
 	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+	if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
 	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index a860ab5..511cdce 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -368,6 +368,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 	if (clp->cl_minorversion != 0)
 		__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
 	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+	__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
 	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
 	if (error == -EINVAL)
 		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b8cedce..f9c0a6c 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -41,6 +41,7 @@ struct nfs_client {
 #define NFS_CS_DISCRTRY		1		/* - disconnect on RPC retry */
 #define NFS_CS_MIGRATION	2		/* - transparent state migr */
 #define NFS_CS_INFINITE_SLOTS	3		/* - don't limit TCP slots */
+#define NFS_CS_NO_RETRANS_TIMEOUT	4	/* - Disable retransmit timeouts */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
-- 
cgit v0.10.2


From ca7f33aa5b8051f17eec81766b8f39c83caf4196 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 25 Sep 2013 13:39:45 -0400
Subject: SUNRPC: Fix RPC call retransmission statistics

A retransmit should be when you successfully transmit an RPC call to
the server a second time.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b585250..bde3115 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1719,6 +1719,8 @@ call_connect_status(struct rpc_task *task)
 static void
 call_transmit(struct rpc_task *task)
 {
+	int is_retrans = RPC_WAS_SENT(task);
+
 	dprint_status(task);
 
 	task->tk_action = call_status;
@@ -1743,6 +1745,8 @@ call_transmit(struct rpc_task *task)
 	xprt_transmit(task);
 	if (task->tk_status < 0)
 		return;
+	if (is_retrans)
+		task->tk_client->cl_stats->rpcretrans++;
 	/*
 	 * On success, ensure that we call xprt_end_transmit() before sleeping
 	 * in order to allow access to the socket to other RPC requests.
@@ -1983,7 +1987,6 @@ call_timeout(struct rpc_task *task)
 	rpcauth_invalcred(task);
 
 retry:
-	clnt->cl_stats->rpcretrans++;
 	task->tk_action = call_bind;
 	task->tk_status = 0;
 }
@@ -2026,7 +2029,6 @@ call_decode(struct rpc_task *task)
 	if (req->rq_rcv_buf.len < 12) {
 		if (!RPC_IS_SOFT(task)) {
 			task->tk_action = call_bind;
-			clnt->cl_stats->rpcretrans++;
 			goto out_retry;
 		}
 		dprintk("RPC:       %s: too small RPC reply size (%d bytes)\n",
-- 
cgit v0.10.2


From 92551948174d079b12541437f51cbe3e17d9dd24 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Sep 2013 11:28:40 -0400
Subject: SUNRPC: Remove redundant initialisations of request rq_bytes_sent

Now that we clear the rq_bytes_sent field on unlock, we don't need
to set it on lock, so we just set it once when initialising the request.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index d166d99..49535505 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -205,10 +205,8 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 		goto out_sleep;
 	}
 	xprt->snd_task = task;
-	if (req != NULL) {
-		req->rq_bytes_sent = 0;
+	if (req != NULL)
 		req->rq_ntrans++;
-	}
 
 	return 1;
 
@@ -263,7 +261,6 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 	}
 	if (__xprt_get_cong(xprt, task)) {
 		xprt->snd_task = task;
-		req->rq_bytes_sent = 0;
 		req->rq_ntrans++;
 		return 1;
 	}
@@ -300,10 +297,8 @@ static bool __xprt_lock_write_func(struct rpc_task *task, void *data)
 
 	req = task->tk_rqstp;
 	xprt->snd_task = task;
-	if (req) {
-		req->rq_bytes_sent = 0;
+	if (req)
 		req->rq_ntrans++;
-	}
 	return true;
 }
 
@@ -329,7 +324,6 @@ static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data)
 	}
 	if (__xprt_get_cong(xprt, task)) {
 		xprt->snd_task = task;
-		req->rq_bytes_sent = 0;
 		req->rq_ntrans++;
 		return true;
 	}
@@ -1210,6 +1204,11 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
 	req->rq_buffer  = NULL;
 	req->rq_xid     = xprt_alloc_xid(xprt);
 	req->rq_connect_cookie = xprt->connect_cookie - 1;
+	req->rq_bytes_sent = 0;
+	req->rq_snd_buf.len = 0;
+	req->rq_snd_buf.buflen = 0;
+	req->rq_rcv_buf.len = 0;
+	req->rq_rcv_buf.buflen = 0;
 	req->rq_release_snd_buf = NULL;
 	xprt_reset_majortimeo(req);
 	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
-- 
cgit v0.10.2


From 561ec1603171cd9b38dcf6cac53e8710f437a48d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 26 Sep 2013 15:22:45 -0400
Subject: SUNRPC: call_connect_status should recheck bind and connect status on
 error

Currently, we go directly to call_transmit which sends us to call_status
on error. If we know that the connect attempt failed, we should rather
just jump straight back to call_bind and call_connect.

Ditto for EAGAIN, except do not delay.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index bde3115..7352aef 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1692,6 +1692,7 @@ call_connect_status(struct rpc_task *task)
 	dprint_status(task);
 
 	trace_rpc_connect_status(task, status);
+	task->tk_status = 0;
 	switch (status) {
 		/* if soft mounted, test if we've timed out */
 	case -ETIMEDOUT:
@@ -1700,12 +1701,14 @@ call_connect_status(struct rpc_task *task)
 	case -ECONNREFUSED:
 	case -ECONNRESET:
 	case -ENETUNREACH:
+		/* retry with existing socket, after a delay */
+		rpc_delay(task, 3*HZ);
 		if (RPC_IS_SOFTCONN(task))
 			break;
-		/* retry with existing socket, after a delay */
-	case 0:
 	case -EAGAIN:
-		task->tk_status = 0;
+		task->tk_action = call_bind;
+		return;
+	case 0:
 		clnt->cl_stats->netreconn++;
 		task->tk_action = call_transmit;
 		return;
-- 
cgit v0.10.2


From 3660cd4322fce986689b06225d9c12d77193c252 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 3 Oct 2013 12:23:24 -0400
Subject: NFSv4 Remove zeroing state kern warnings

As of commit 5d422301f97b821301efcdb6fc9d1a83a5c102d6 we no longer zero the
state.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cc14cbb..fa2706a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1375,8 +1375,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 				goto out;
 			default:
-				printk(KERN_ERR "NFS: %s: unhandled error %d. "
-					"Zeroing state\n", __func__, status);
+				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+					 __func__, status);
 			case -ENOMEM:
 			case -NFS4ERR_DENIED:
 			case -NFS4ERR_RECLAIM_BAD:
@@ -1439,15 +1439,12 @@ restart:
 		}
 		switch (status) {
 			default:
-				printk(KERN_ERR "NFS: %s: unhandled error %d. "
-					"Zeroing state\n", __func__, status);
+				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+					__func__, status);
 			case -ENOENT:
 			case -ENOMEM:
 			case -ESTALE:
-				/*
-				 * Open state on this file cannot be recovered
-				 * All we can do is revert to using the zero stateid.
-				 */
+				/* Open state on this file cannot be recovered */
 				nfs4_state_mark_recovery_failed(state, status);
 				break;
 			case -EAGAIN:
-- 
cgit v0.10.2


From 57acc40d73407159727b3a1456f0a498133831ba Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 18 Oct 2013 10:03:37 -0400
Subject: nfs: reject version and minorversion changes on remount attempts

Reported-by: Eric Doutreleau <edoutreleau@genoscope.cns.fr>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a03b9c6..137572b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2143,6 +2143,8 @@ nfs_compare_remount_data(struct nfs_server *nfss,
 	if (data->flags != nfss->flags ||
 	    data->rsize != nfss->rsize ||
 	    data->wsize != nfss->wsize ||
+	    data->version != nfss->nfs_client->rpc_ops->version ||
+	    data->minorversion != nfss->nfs_client->cl_minorversion ||
 	    data->retrans != nfss->client->cl_timeout->to_retries ||
 	    data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
 	    data->acregmin != nfss->acregmin / HZ ||
@@ -2197,6 +2199,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
 	data->nfs_server.port = nfss->port;
 	data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+	data->version = nfsvers;
+	data->minorversion = nfss->nfs_client->cl_minorversion;
 	memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
 		data->nfs_server.addrlen);
 
-- 
cgit v0.10.2


From 1966903f8e28b31ff82de2e2180f0c066399288d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 21 Oct 2013 09:52:19 -0400
Subject: nfs: fix handling of invalid mount options in nfs_remount

nfs_parse_mount_options returns 0 on error, not -errno.

Reported-by: Karel Zak <kzak@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 137572b..3f5a7a8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2205,8 +2205,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 		data->nfs_server.addrlen);
 
 	/* overwrite those values with any that were specified */
-	error = nfs_parse_mount_options((char *)options, data);
-	if (error < 0)
+	error = -EINVAL;
+	if (!nfs_parse_mount_options((char *)options, data))
 		goto out;
 
 	/*
-- 
cgit v0.10.2


From 83c78eb0425f587a34a6644da79bdcde987dc974 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Oct 2013 14:50:38 -0400
Subject: NFSv4.1: Don't change the security label as part of open reclaim.

The current caching model calls for the security label to be set on
first lookup and/or on any subsequent label changes. There is no
need to do it as part of an open reclaim.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2b4845..8dd61eb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1337,8 +1337,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	if (ret)
 		goto err;
 
-	nfs_setsecurity(inode, &data->f_attr, data->f_label);
-
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
-- 
cgit v0.10.2


From f494a6071d31e3294a3b51ad7a3684f983953f9f Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Mon, 21 Oct 2013 13:10:10 -0400
Subject: NFSv4: fix NULL dereference in open recover

_nfs4_opendata_reclaim_to_nfs4_state doesn't expect to see a cached
open CLAIM_PREVIOUS, but this can happen. An example is when there are
RDWR openers and RDONLY openers on a delegation stateid. The recovery
path will first try an open CLAIM_PREVIOUS for the RDWR openers, this
marks the delegation as not needing RECLAIM anymore, so the open
CLAIM_PREVIOUS for the RDONLY openers will not actually send an rpc.

The NULL dereference is due to _nfs4_opendata_reclaim_to_nfs4_state
returning PTR_ERR(rpc_status) when !rpc_done. When the open is
cached, rpc_done == 0 and rpc_status == 0, thus
_nfs4_opendata_reclaim_to_nfs4_state returns NULL - this is unexpected
by callers of nfs4_opendata_to_nfs4_state().

This can be reproduced easily by opening the same file two times on an
NFSv4.0 mount with delegations enabled, once as RDWR and once as RDONLY then
sleeping for a long time.  While the files are held open, kick off state
recovery and this NULL dereference will be hit every time.

An example OOPS:

[   65.003602] BUG: unable to handle kernel NULL pointer dereference at 00000000
00000030
[   65.005312] IP: [<ffffffffa037d6ee>] __nfs4_close+0x1e/0x160 [nfsv4]
[   65.006820] PGD 7b0ea067 PUD 791ff067 PMD 0
[   65.008075] Oops: 0000 [#1] SMP
[   65.008802] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache
snd_ens1371 gameport nfsd snd_rawmidi snd_ac97_codec ac97_bus btusb snd_seq snd
_seq_device snd_pcm ppdev bluetooth auth_rpcgss coretemp snd_page_alloc crc32_pc
lmul crc32c_intel ghash_clmulni_intel microcode rfkill nfs_acl vmw_balloon serio
_raw snd_timer lockd parport_pc e1000 snd soundcore parport i2c_piix4 shpchp vmw
_vmci sunrpc ata_generic mperf pata_acpi mptspi vmwgfx ttm scsi_transport_spi dr
m mptscsih mptbase i2c_core
[   65.018684] CPU: 0 PID: 473 Comm: 192.168.10.85-m Not tainted 3.11.2-201.fc19
.x86_64 #1
[   65.020113] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop
Reference Platform, BIOS 6.00 07/31/2013
[   65.022012] task: ffff88003707e320 ti: ffff88007b906000 task.ti: ffff88007b906000
[   65.023414] RIP: 0010:[<ffffffffa037d6ee>]  [<ffffffffa037d6ee>] __nfs4_close+0x1e/0x160 [nfsv4]
[   65.025079] RSP: 0018:ffff88007b907d10  EFLAGS: 00010246
[   65.026042] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[   65.027321] RDX: 0000000000000050 RSI: 0000000000000001 RDI: 0000000000000000
[   65.028691] RBP: ffff88007b907d38 R08: 0000000000016f60 R09: 0000000000000000
[   65.029990] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
[   65.031295] R13: 0000000000000050 R14: 0000000000000000 R15: 0000000000000001
[   65.032527] FS:  0000000000000000(0000) GS:ffff88007f600000(0000) knlGS:0000000000000000
[   65.033981] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   65.035177] CR2: 0000000000000030 CR3: 000000007b27f000 CR4: 00000000000407f0
[   65.036568] Stack:
[   65.037011]  0000000000000000 0000000000000001 ffff88007b907d90 ffff88007a880220
[   65.038472]  ffff88007b768de8 ffff88007b907d48 ffffffffa037e4a5 ffff88007b907d80
[   65.039935]  ffffffffa036a6c8 ffff880037020e40 ffff88007a880000 ffff880037020e40
[   65.041468] Call Trace:
[   65.042050]  [<ffffffffa037e4a5>] nfs4_close_state+0x15/0x20 [nfsv4]
[   65.043209]  [<ffffffffa036a6c8>] nfs4_open_recover_helper+0x148/0x1f0 [nfsv4]
[   65.044529]  [<ffffffffa036a886>] nfs4_open_recover+0x116/0x150 [nfsv4]
[   65.045730]  [<ffffffffa036d98d>] nfs4_open_reclaim+0xad/0x150 [nfsv4]
[   65.046905]  [<ffffffffa037d979>] nfs4_do_reclaim+0x149/0x5f0 [nfsv4]
[   65.048071]  [<ffffffffa037e1dc>] nfs4_run_state_manager+0x3bc/0x670 [nfsv4]
[   65.049436]  [<ffffffffa037de20>] ? nfs4_do_reclaim+0x5f0/0x5f0 [nfsv4]
[   65.050686]  [<ffffffffa037de20>] ? nfs4_do_reclaim+0x5f0/0x5f0 [nfsv4]
[   65.051943]  [<ffffffff81088640>] kthread+0xc0/0xd0
[   65.052831]  [<ffffffff81088580>] ? insert_kthread_work+0x40/0x40
[   65.054697]  [<ffffffff8165686c>] ret_from_fork+0x7c/0xb0
[   65.056396]  [<ffffffff81088580>] ? insert_kthread_work+0x40/0x40
[   65.058208] Code: 5c 41 5d 5d c3 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 89 d5 41 54 53 48 89 fb <4c> 8b 67 30 f0 41 ff 44 24 44 49 8d 7c 24 40 e8 0e 0a 2d e1 44
[   65.065225] RIP  [<ffffffffa037d6ee>] __nfs4_close+0x1e/0x160 [nfsv4]
[   65.067175]  RSP <ffff88007b907d10>
[   65.068570] CR2: 0000000000000030
[   65.070098] ---[ end trace 0d1fe4f5c7dd6f8b ]---

Cc: <stable@vger.kernel.org> #3.7+
Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dd61eb..acd20c5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1317,7 +1317,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	struct nfs4_state *state = data->state;
 	int ret;
 
-	if (!data->rpc_done) {
+	/* allow cached opens (!rpc_done && !rpc_status) */
+	if (!data->rpc_done && data->rpc_status) {
 		ret = data->rpc_status;
 		goto err;
 	}
-- 
cgit v0.10.2


From a43ec98b72aae3e330f0673438f58316c3769b84 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Mon, 21 Oct 2013 13:10:11 -0400
Subject: NFSv4: don't fail on missing fattr in open recover

This is an unneeded check that could cause the client to fail to recover
opens.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index acd20c5..e76fd21 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1323,12 +1323,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 		goto err;
 	}
 
-	ret = -ESTALE;
-	if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) ||
-	    !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) ||
-	    !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE))
-		goto err;
-
 	ret = -ENOMEM;
 	state = nfs4_get_open_state(inode, data->owner);
 	if (state == NULL)
-- 
cgit v0.10.2


From d49f042aeec99c5f87160bb52dd52088b1051311 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Oct 2013 14:57:12 -0400
Subject: NFSv4: Fix state reference counting in
 _nfs4_opendata_reclaim_to_nfs4_state

Currently, if the call to nfs_refresh_inode fails, then we end up leaking
a reference count, due to the call to nfs4_get_open_state.
While we're at it, replace nfs4_get_open_state with a simple call to
atomic_inc(); there is no need to do a full lookup of the struct nfs_state
since it is passed as an argument in the struct nfs4_opendata, and
is already assigned to the variable 'state'.

Cc: stable@vger.kernel.org # 3.7.x: a43ec98b72a: NFSv4: don't fail on missing
Cc: stable@vger.kernel.org # 3.7.x
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e76fd21..d3c4255 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1323,11 +1323,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 		goto err;
 	}
 
-	ret = -ENOMEM;
-	state = nfs4_get_open_state(inode, data->owner);
-	if (state == NULL)
-		goto err;
-
 	ret = nfs_refresh_inode(inode, &data->f_attr);
 	if (ret)
 		goto err;
@@ -1336,6 +1331,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			    data->o_arg.fmode);
+	atomic_inc(&state->count);
 
 	return state;
 err:
-- 
cgit v0.10.2


From d2bfda2e7aa036f90ccea610a657064b1e267913 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Mon, 21 Oct 2013 13:10:13 -0400
Subject: NFSv4: don't reprocess cached open CLAIM_PREVIOUS

Cached opens have already been handled by _nfs4_opendata_reclaim_to_nfs4_state
and can safely skip being reprocessed, but must still call update_open_stateid
to make sure that all active fmodes are recovered.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Cc: stable@vger.kernel.org # 3.7.x: f494a6071d3: NFSv4: fix NULL dereference
Cc: stable@vger.kernel.org # 3.7.x: a43ec98b72a: NFSv4: don't fail on missin
Cc: stable@vger.kernel.org # 3.7.x
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d3c4255..259f8be 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1317,10 +1317,13 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	struct nfs4_state *state = data->state;
 	int ret;
 
-	/* allow cached opens (!rpc_done && !rpc_status) */
-	if (!data->rpc_done && data->rpc_status) {
-		ret = data->rpc_status;
-		goto err;
+	if (!data->rpc_done) {
+		if (data->rpc_status) {
+			ret = data->rpc_status;
+			goto err;
+		}
+		/* cached opens have already been processed */
+		goto update;
 	}
 
 	ret = nfs_refresh_inode(inode, &data->f_attr);
@@ -1329,6 +1332,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
+update:
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			    data->o_arg.fmode);
 	atomic_inc(&state->count);
-- 
cgit v0.10.2


From d746e54522e52275865ca23917c16b3fdc226e51 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:17 -0400
Subject: SUNRPC: Modify synopsis of rpc_client_register()

The rpc_client_register() helper was added in commit e73f4cc0,
"SUNRPC: split client creation routine into setup and registration,"
Mon Jun 24 11:52:52 2013.  In a subsequent patch, I'd like to invoke
rpc_client_register() from a context where a struct rpc_create_args
is not available.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7352aef..587f31e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -272,12 +272,13 @@ static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename)
 	memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen);
 }
 
-static int rpc_client_register(const struct rpc_create_args *args,
-			       struct rpc_clnt *clnt)
+static int rpc_client_register(struct rpc_clnt *clnt,
+			       rpc_authflavor_t pseudoflavor,
+			       const char *client_name)
 {
 	struct rpc_auth_create_args auth_args = {
-		.pseudoflavor = args->authflavor,
-		.target_name = args->client_name,
+		.pseudoflavor = pseudoflavor,
+		.target_name = client_name,
 	};
 	struct rpc_auth *auth;
 	struct net *net = rpc_net_ns(clnt);
@@ -298,7 +299,7 @@ static int rpc_client_register(const struct rpc_create_args *args,
 	auth = rpcauth_create(&auth_args, clnt);
 	if (IS_ERR(auth)) {
 		dprintk("RPC:       Couldn't create auth handle (flavor %u)\n",
-				args->authflavor);
+				pseudoflavor);
 		err = PTR_ERR(auth);
 		goto err_auth;
 	}
@@ -398,7 +399,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
 	/* save the nodename */
 	rpc_clnt_set_nodename(clnt, utsname()->nodename);
 
-	err = rpc_client_register(args, clnt);
+	err = rpc_client_register(clnt, args->authflavor, args->client_name);
 	if (err)
 		goto out_no_path;
 	if (parent)
-- 
cgit v0.10.2


From 40b00b6b17c412ff9ff28631250d32ee29ff0006 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 17 Oct 2013 14:12:23 -0400
Subject: SUNRPC: Add a helper to switch the transport of an rpc_clnt

Add an RPC client API to redirect an rpc_clnt's transport from a
source server to a destination server during a migration event.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
[ cel: forward ported to 3.12 ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 943ee89..8af2804 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -136,6 +136,10 @@ void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *,
 				rpc_authflavor_t);
+int		rpc_switch_client_transport(struct rpc_clnt *,
+				struct xprt_create *,
+				const struct rpc_timeout *);
+
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
 void		rpc_task_release_client(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 587f31e..f167d9c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -25,6 +25,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
+#include <linux/rcupdate.h>
 #include <linux/utsname.h>
 #include <linux/workqueue.h>
 #include <linux/in.h>
@@ -264,6 +265,25 @@ void rpc_clients_notifier_unregister(void)
 	return rpc_pipefs_notifier_unregister(&rpc_clients_block);
 }
 
+static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt,
+		struct rpc_xprt *xprt,
+		const struct rpc_timeout *timeout)
+{
+	struct rpc_xprt *old;
+
+	spin_lock(&clnt->cl_lock);
+	old = clnt->cl_xprt;
+
+	if (!xprt_bound(xprt))
+		clnt->cl_autobind = 1;
+
+	clnt->cl_timeout = timeout;
+	rcu_assign_pointer(clnt->cl_xprt, xprt);
+	spin_unlock(&clnt->cl_lock);
+
+	return old;
+}
+
 static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename)
 {
 	clnt->cl_nodelen = strlen(nodename);
@@ -338,7 +358,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
 {
 	const struct rpc_program *program = args->program;
 	const struct rpc_version *version;
-	struct rpc_clnt		*clnt = NULL;
+	struct rpc_clnt *clnt = NULL;
+	const struct rpc_timeout *timeout;
 	int err;
 
 	/* sanity check the name before trying to print it */
@@ -366,7 +387,6 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
 	if (err)
 		goto out_no_clid;
 
-	rcu_assign_pointer(clnt->cl_xprt, xprt);
 	clnt->cl_procinfo = version->procs;
 	clnt->cl_maxproc  = version->nrprocs;
 	clnt->cl_prog     = args->prognumber ? : program->number;
@@ -381,16 +401,15 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
 	INIT_LIST_HEAD(&clnt->cl_tasks);
 	spin_lock_init(&clnt->cl_lock);
 
-	if (!xprt_bound(xprt))
-		clnt->cl_autobind = 1;
-
-	clnt->cl_timeout = xprt->timeout;
+	timeout = xprt->timeout;
 	if (args->timeout != NULL) {
 		memcpy(&clnt->cl_timeout_default, args->timeout,
 				sizeof(clnt->cl_timeout_default));
-		clnt->cl_timeout = &clnt->cl_timeout_default;
+		timeout = &clnt->cl_timeout_default;
 	}
 
+	rpc_clnt_set_transport(clnt, xprt, timeout);
+
 	clnt->cl_rtt = &clnt->cl_rtt_default;
 	rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
 
@@ -601,6 +620,80 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 }
 EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth);
 
+/**
+ * rpc_switch_client_transport: switch the RPC transport on the fly
+ * @clnt: pointer to a struct rpc_clnt
+ * @args: pointer to the new transport arguments
+ * @timeout: pointer to the new timeout parameters
+ *
+ * This function allows the caller to switch the RPC transport for the
+ * rpc_clnt structure 'clnt' to allow it to connect to a mirrored NFS
+ * server, for instance.  It assumes that the caller has ensured that
+ * there are no active RPC tasks by using some form of locking.
+ *
+ * Returns zero if "clnt" is now using the new xprt.  Otherwise a
+ * negative errno is returned, and "clnt" continues to use the old
+ * xprt.
+ */
+int rpc_switch_client_transport(struct rpc_clnt *clnt,
+		struct xprt_create *args,
+		const struct rpc_timeout *timeout)
+{
+	const struct rpc_timeout *old_timeo;
+	rpc_authflavor_t pseudoflavor;
+	struct rpc_xprt *xprt, *old;
+	struct rpc_clnt *parent;
+	int err;
+
+	xprt = xprt_create_transport(args);
+	if (IS_ERR(xprt)) {
+		dprintk("RPC:       failed to create new xprt for clnt %p\n",
+			clnt);
+		return PTR_ERR(xprt);
+	}
+
+	pseudoflavor = clnt->cl_auth->au_flavor;
+
+	old_timeo = clnt->cl_timeout;
+	old = rpc_clnt_set_transport(clnt, xprt, timeout);
+
+	rpc_unregister_client(clnt);
+	__rpc_clnt_remove_pipedir(clnt);
+
+	/*
+	 * A new transport was created.  "clnt" therefore
+	 * becomes the root of a new cl_parent tree.  clnt's
+	 * children, if it has any, still point to the old xprt.
+	 */
+	parent = clnt->cl_parent;
+	clnt->cl_parent = clnt;
+
+	/*
+	 * The old rpc_auth cache cannot be re-used.  GSS
+	 * contexts in particular are between a single
+	 * client and server.
+	 */
+	err = rpc_client_register(clnt, pseudoflavor, NULL);
+	if (err)
+		goto out_revert;
+
+	synchronize_rcu();
+	if (parent != clnt)
+		rpc_release_client(parent);
+	xprt_put(old);
+	dprintk("RPC:       replaced xprt for clnt %p\n", clnt);
+	return 0;
+
+out_revert:
+	rpc_clnt_set_transport(clnt, old, old_timeo);
+	clnt->cl_parent = parent;
+	rpc_client_register(clnt, pseudoflavor, NULL);
+	xprt_put(xprt);
+	dprintk("RPC:       failed to switch xprt for clnt %p\n", clnt);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
+
 /*
  * Kill all tasks for the given client.
  * XXX: kill their descendants as well?
-- 
cgit v0.10.2


From 32e62b7c3ef095eccbb6a8c96fddf05dacc749df Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:28 -0400
Subject: NFS: Add nfs4_update_server

New function nfs4_update_server() moves an nfs_server to a different
nfs_client.  This is done as part of migration recovery.

Though it may be appealing to think of them as the same thing,
migration recovery is not the same as following a referral.

For a referral, the client has not descended into the file system
yet: it has no nfs_server, no super block, no inodes or open state.
It is enough to simply instantiate the nfs_server and super block,
and perform a referral mount.

For a migration, however, we have all of those things already, and
they have to be moved to a different nfs_client.  No local namespace
changes are needed here.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index af03258..692fd0e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -945,7 +945,7 @@ void nfs_server_insert_lists(struct nfs_server *server)
 }
 EXPORT_SYMBOL_GPL(nfs_server_insert_lists);
 
-static void nfs_server_remove_lists(struct nfs_server *server)
+void nfs_server_remove_lists(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs_net *nn;
@@ -962,6 +962,7 @@ static void nfs_server_remove_lists(struct nfs_server *server)
 
 	synchronize_rcu();
 }
+EXPORT_SYMBOL_GPL(nfs_server_remove_lists);
 
 /*
  * Allocate and initialise a server record
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 38da8c2..e5a6bd1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
 				  rpc_authflavor_t);
 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
 void nfs_server_insert_lists(struct nfs_server *);
+void nfs_server_remove_lists(struct nfs_server *);
 void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
 int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
 		rpc_authflavor_t);
@@ -174,6 +175,8 @@ extern struct nfs_server *nfs4_create_server(
 					struct nfs_subversion *);
 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
 						      struct nfs_fh *);
+extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
+					struct sockaddr *sap, size_t salen);
 extern void nfs_free_server(struct nfs_server *server);
 extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fh *,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 511cdce..c6eee81 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1092,3 +1092,111 @@ error:
 	dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
 	return ERR_PTR(error);
 }
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+static int nfs_probe_destination(struct nfs_server *server)
+{
+	struct inode *inode = server->super->s_root->d_inode;
+	struct nfs_fattr *fattr;
+	int error;
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return -ENOMEM;
+
+	/* Sanity: the probe won't work if the destination server
+	 * does not recognize the migrated FH. */
+	error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr);
+
+	nfs_free_fattr(fattr);
+	return error;
+}
+
+/**
+ * nfs4_update_server - Move an nfs_server to a different nfs_client
+ *
+ * @server: represents FSID to be moved
+ * @hostname: new end-point's hostname
+ * @sap: new end-point's socket address
+ * @salen: size of "sap"
+ *
+ * The nfs_server must be quiescent before this function is invoked.
+ * Either its session is drained (NFSv4.1+), or its transport is
+ * plugged and drained (NFSv4.0).
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_update_server(struct nfs_server *server, const char *hostname,
+		       struct sockaddr *sap, size_t salen)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct xprt_create xargs = {
+		.ident		= clp->cl_proto,
+		.net		= &init_net,
+		.dstaddr	= sap,
+		.addrlen	= salen,
+		.servername	= hostname,
+	};
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct sockaddr_storage address;
+	struct sockaddr *localaddr = (struct sockaddr *)&address;
+	int error;
+
+	dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			hostname);
+
+	error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_switch_client_transport returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = rpc_localaddr(clnt, localaddr, sizeof(address));
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_localaddr returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = -EAFNOSUPPORT;
+	if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) {
+		dprintk("<-- %s(): rpc_ntop returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	nfs_server_remove_lists(server);
+	error = nfs4_set_client(server, hostname, sap, salen, buf,
+				clp->cl_rpcclient->cl_auth->au_flavor,
+				clp->cl_proto, clnt->cl_timeout,
+				clp->cl_minorversion, clp->cl_net);
+	nfs_put_client(clp);
+	if (error != 0) {
+		nfs_server_insert_lists(server);
+		dprintk("<-- %s(): nfs4_set_client returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	if (server->nfs_client->cl_hostname == NULL)
+		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+	nfs_server_insert_lists(server);
+
+	error = nfs_probe_destination(server);
+	if (error < 0)
+		goto out;
+
+	dprintk("<-- %s() succeeded\n", __func__);
+
+out:
+	return error;
+}
-- 
cgit v0.10.2


From 800c06a5bf497e0587ba8382a761f31a7825e2cd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:34 -0400
Subject: NFS: Add functions to swap transports during migration recovery

Introduce functions that can walk through an array of returned
fs_locations information and connect a transport to one of the
destination servers listed therein.

Note that NFS minor version 1 introduces "fs_locations_info" which
extends the locations array sorting criteria available to clients.
This is not supported yet.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 28842ab..fcae728 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,8 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
 struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 			       struct nfs_fh *, struct nfs_fattr *);
+int nfs4_replace_transport(struct nfs_server *server,
+				const struct nfs4_fs_locations *locations);
 
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2288cd3..ebd8b06 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -400,3 +400,104 @@ out:
 	rpc_shutdown_client(client);
 	return mnt;
 }
+
+/*
+ * Try one location from the fs_locations array.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+static int nfs4_try_replacing_one_location(struct nfs_server *server,
+		char *page, char *page2,
+		const struct nfs4_fs_location *location)
+{
+	const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+	struct sockaddr *sap;
+	unsigned int s;
+	size_t salen;
+	int error;
+
+	sap = kmalloc(addr_bufsize, GFP_KERNEL);
+	if (sap == NULL)
+		return -ENOMEM;
+
+	error = -ENOENT;
+	for (s = 0; s < location->nservers; s++) {
+		const struct nfs4_string *buf = &location->servers[s];
+		char *hostname;
+
+		if (buf->len <= 0 || buf->len > PAGE_SIZE)
+			continue;
+
+		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL)
+			continue;
+
+		salen = nfs_parse_server_name(buf->data, buf->len,
+						sap, addr_bufsize, server);
+		if (salen == 0)
+			continue;
+		rpc_set_port(sap, NFS_PORT);
+
+		error = -ENOMEM;
+		hostname = kstrndup(buf->data, buf->len, GFP_KERNEL);
+		if (hostname == NULL)
+			break;
+
+		error = nfs4_update_server(server, hostname, sap, salen);
+		kfree(hostname);
+		if (error == 0)
+			break;
+	}
+
+	kfree(sap);
+	return error;
+}
+
+/**
+ * nfs4_replace_transport - set up transport to destination server
+ *
+ * @server: export being migrated
+ * @locations: fs_locations array
+ *
+ * Returns zero on success, or a negative errno value.
+ *
+ * The client tries all the entries in the "locations" array, in the
+ * order returned by the server, until one works or the end of the
+ * array is reached.
+ */
+int nfs4_replace_transport(struct nfs_server *server,
+			   const struct nfs4_fs_locations *locations)
+{
+	char *page = NULL, *page2 = NULL;
+	int loc, error;
+
+	error = -ENOENT;
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	error = -ENOMEM;
+	page = (char *) __get_free_page(GFP_USER);
+	if (!page)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (!page2)
+		goto out;
+
+	for (loc = 0; loc < locations->nlocations; loc++) {
+		const struct nfs4_fs_location *location =
+						&locations->locations[loc];
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0)
+			continue;
+
+		error = nfs4_try_replacing_one_location(server, page,
+							page2, location);
+		if (error == 0)
+			break;
+	}
+
+out:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+	return error;
+}
-- 
cgit v0.10.2


From ec011fe847347b40c60fdb5085f65227762e2e08 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:39 -0400
Subject: NFS: Introduce a vector of migration recovery ops

The differences between minor version 0 and minor version 1
migration will be abstracted by the addition of a set of migration
recovery ops.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index fcae728..4f48000 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -50,6 +50,7 @@ struct nfs4_minor_version_ops {
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
+	const struct nfs4_mig_recovery_ops *mig_recovery_ops;
 };
 
 #define NFS_SEQID_CONFIRMED 1
@@ -203,6 +204,9 @@ struct nfs4_state_maintenance_ops {
 	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
 };
 
+struct nfs4_mig_recovery_ops {
+};
+
 extern const struct dentry_operations nfs4_dentry_operations;
 
 /* dir.c */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 259f8be..2aacd13 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7881,6 +7881,14 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
 
+static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+};
+#endif	/* CONFIG_NFS_V4_1 */
+
 static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.init_caps = NFS_CAP_READDIRPLUS
@@ -7896,6 +7904,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
 	.state_renewal_ops = &nfs40_state_renewal_ops,
+	.mig_recovery_ops = &nfs40_mig_recovery_ops,
 };
 
 #if defined(CONFIG_NFS_V4_1)
@@ -7916,6 +7925,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
+	.mig_recovery_ops = &nfs41_mig_recovery_ops,
 };
 #endif
 
-- 
cgit v0.10.2


From 9e6ee76dfb7cd747fac5679542a9b45930173eec Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:45 -0400
Subject: NFS: Export _nfs_display_fhandle()

Allow code in nfsv4.ko to use _nfs_display_fhandle().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eda8879..4bc7538 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1209,6 +1209,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
 	 * not on the result */
 	return nfs_fhandle_hash(fh);
 }
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);
 
 /*
  * _nfs_display_fhandle - display an NFS file handle on the console
@@ -1253,6 +1254,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
 #endif
 
 /**
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index c6eee81..a2cba66 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -925,7 +925,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 	dprintk("Server FSID: %llx:%llx\n",
 			(unsigned long long) server->fsid.major,
 			(unsigned long long) server->fsid.minor);
-	dprintk("Mount FH: %d\n", mntfh->size);
+	nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
 
 	nfs4_session_set_rwsize(server);
 
-- 
cgit v0.10.2


From b03d735b4ca2375d2251195cd848713bc55e7d79 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:50 -0400
Subject: NFS: Add method to retrieve fs_locations during migration recovery

The nfs4_proc_fs_locations() function is invoked during referral
processing to perform a GETATTR(fs_locations) on an object's parent
directory in order to discover the target of the referral.  It
performs a LOOKUP in the compound, so the client needs to know the
parent's file handle a priori.

Unfortunately this function is not adequate for handling migration
recovery.  We need to probe fs_locations information on an FSID, but
there's no parent directory available for many operations that
can return NFS4ERR_MOVED.

Another subtlety: recovering from NFS4ERR_LEASE_MOVED is a process
of walking over a list of known FSIDs that reside on the server, and
probing whether they have migrated.  Once the server has detected
that the client has probed all migrated file systems, it stops
returning NFS4ERR_LEASE_MOVED.

A minor version zero server needs to know what client ID is
requesting fs_locations information so it can clear the flag that
forces it to continue returning NFS4ERR_LEASE_MOVED.  This flag is
set per client ID and per FSID.  However, the client ID is not an
argument of either the PUTFH or GETATTR operations.  Later minor
versions have client ID information embedded in the compound's
SEQUENCE operation.

Therefore, by convention, minor version zero clients send a RENEW
operation in the same compound as the GETATTR(fs_locations), since
RENEW's one argument is a clientid4.  This allows a minor version
zero server to identify correctly the client that is probing for a
migration.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4f48000..e59d3b4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -205,6 +205,8 @@ struct nfs4_state_maintenance_ops {
 };
 
 struct nfs4_mig_recovery_ops {
+	int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
+		struct page *, struct rpc_cred *);
 };
 
 extern const struct dentry_operations nfs4_dentry_operations;
@@ -237,6 +239,8 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
 				  struct nfs4_fs_locations *, struct page *);
+extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
+		struct page *page, struct rpc_cred *);
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2aacd13..c71c16e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5984,6 +5984,157 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
 	return err;
 }
 
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop returning
+ * NFS4ERR_LEASE_MOVED to this client.  A RENEW operation is
+ * appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.clientid	= server->nfs_client->cl_clientid,
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status)
+		return status;
+
+	renew_lease(server, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID
+ * performing this operation is identified in the SEQUENCE
+ * operation in this compound.
+ *
+ * When the client supports GETATTR(fs_locations_info), it can
+ * be plumbed in here.
+ */
+static int _nfs41_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_get_locations - discover locations for a migrated FSID
+ * @inode: inode on FSID that is migrating
+ * @locations: result of query
+ * @page: buffer
+ * @cred: credential to use for this operation
+ *
+ * Returns NFS4_OK on success, a negative NFS4ERR status code if the
+ * operation failed, or a negative errno if a local error occurred.
+ *
+ * On success, "locations" is filled in, but if the server has
+ * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not
+ * asserted.
+ *
+ * -NFS4ERR_LEASE_MOVED is returned if the server still has leases
+ * from this client that require migration recovery.
+ */
+int nfs4_proc_get_locations(struct inode *inode,
+			    struct nfs4_fs_locations *locations,
+			    struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->get_locations(inode, locations, page, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
 /**
  * If 'use_integrity' is true and the state managment nfs_client
  * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
@@ -7882,10 +8033,12 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 #endif
 
 static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+	.get_locations = _nfs40_proc_get_locations,
 };
 
 #if defined(CONFIG_NFS_V4_1)
 static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+	.get_locations = _nfs41_proc_get_locations,
 };
 #endif	/* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 79210d2..1854b04 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -736,13 +736,15 @@ static int nfs4_stat_to_errno(int);
 				 encode_sequence_maxsz + \
 				 encode_putfh_maxsz + \
 				 encode_lookup_maxsz + \
-				 encode_fs_locations_maxsz)
+				 encode_fs_locations_maxsz + \
+				 encode_renew_maxsz)
 #define NFS4_dec_fs_locations_sz \
 				(compound_decode_hdr_maxsz + \
 				 decode_sequence_maxsz + \
 				 decode_putfh_maxsz + \
 				 decode_lookup_maxsz + \
-				 decode_fs_locations_maxsz)
+				 decode_fs_locations_maxsz + \
+				 decode_renew_maxsz)
 #define NFS4_enc_secinfo_sz 	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -2687,11 +2689,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
-	encode_putfh(xdr, args->dir_fh, &hdr);
-	encode_lookup(xdr, args->name, &hdr);
-	replen = hdr.replen;	/* get the attribute into args->page */
-	encode_fs_locations(xdr, args->bitmask, &hdr);
+	if (args->migration) {
+		encode_putfh(xdr, args->fh, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+		if (args->renew)
+			encode_renew(xdr, args->clientid, &hdr);
+	} else {
+		encode_putfh(xdr, args->dir_fh, &hdr);
+		encode_lookup(xdr, args->name, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+	}
 
+	/* Set up reply kvec to capture returned fs_locations array. */
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			0, PAGE_SIZE);
 	encode_nops(&hdr);
@@ -6824,13 +6835,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lookup(xdr);
-	if (status)
-		goto out;
-	xdr_enter_page(xdr, PAGE_SIZE);
-	status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
+	if (res->migration) {
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
 					 NULL, res->fs_locations,
 					 NULL, res->fs_locations->server);
+		if (status)
+			goto out;
+		if (res->renew)
+			status = decode_renew(xdr);
+	} else {
+		status = decode_lookup(xdr);
+		if (status)
+			goto out;
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
+					 NULL, res->fs_locations,
+					 NULL, res->fs_locations->server);
+	}
 out:
 	return status;
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 49f52c8..405dfad 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1053,14 +1053,18 @@ struct nfs4_fs_locations {
 struct nfs4_fs_locations_arg {
 	struct nfs4_sequence_args	seq_args;
 	const struct nfs_fh *dir_fh;
+	const struct nfs_fh *fh;
 	const struct qstr *name;
 	struct page *page;
 	const u32 *bitmask;
+	clientid4 clientid;
+	unsigned char migration:1, renew:1;
 };
 
 struct nfs4_fs_locations_res {
 	struct nfs4_sequence_res	seq_res;
 	struct nfs4_fs_locations       *fs_locations;
+	unsigned char			migration:1, renew:1;
 };
 
 struct nfs4_secinfo4 {
-- 
cgit v0.10.2


From ce6cda1845cf2332d2c411a5c72cd166256b21da Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:56 -0400
Subject: NFS: Add a super_block backpointer to the nfs_server struct

NFS_SB() returns the pointer to an nfs_server struct, given a
pointer to a super_block.  But we have no way to go back the other
way.

Add a super_block backpointer field so that, given an nfs_server
struct, it is easy to get to the filesystem's root dentry.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3f5a7a8..e26647b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2534,6 +2534,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 			mntroot = ERR_PTR(error);
 			goto error_splat_bdi;
 		}
+		server->super = s;
 	}
 
 	if (!s->s_root) {
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index f9c0a6c..46b0cb4 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -148,6 +148,7 @@ struct nfs_server {
 	__u64			maxfilesize;	/* maximum file size */
 	struct timespec		time_delta;	/* smallest time granularity */
 	unsigned long		mount_time;	/* when this fs was mounted */
+	struct super_block	*super;		/* VFS super block */
 	dev_t			s_dev;		/* superblock dev numbers */
 
 #ifdef CONFIG_NFS_FSCACHE
-- 
cgit v0.10.2


From c9fdeb280b8cc511c6730db9ab3973331ea344e0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:02 -0400
Subject: NFS: Add basic migration support to state manager thread

Migration recovery and state recovery must be serialized, so handle
both in the state manager thread.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index e59d3b4..94e783f 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -29,6 +29,7 @@ enum nfs4_client_state {
 	NFS4CLNT_SERVER_SCOPE_MISMATCH,
 	NFS4CLNT_PURGE_STATE,
 	NFS4CLNT_BIND_CONN_TO_SESSION,
+	NFS4CLNT_MOVED,
 };
 
 #define NFS4_RENEW_TIMEOUT		0x01
@@ -421,6 +422,7 @@ extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index a2cba66..0cde95e 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -197,6 +197,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+	clp->cl_mig_gen = 1;
 	return clp;
 
 error:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index fa2706a..ba0db18 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -239,8 +239,6 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	}
 }
 
-#if defined(CONFIG_NFS_V4_1)
-
 static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
 {
 	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
@@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	return nfs4_drain_slot_tbl(&ses->fc_slot_table);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
 static int nfs41_setup_state_renewal(struct nfs_client *clp)
 {
 	int status;
@@ -1197,6 +1197,42 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
 
+/**
+ * nfs4_schedule_migration_recovery - trigger migration recovery
+ *
+ * @server: FSID that is migrating
+ *
+ * Returns zero if recovery has started, otherwise a negative NFS4ERR
+ * value is returned.
+ */
+int nfs4_schedule_migration_recovery(const struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	if (server->fh_expire_type != NFS4_FH_PERSISTENT) {
+		pr_err("NFS: volatile file handles not supported (server %s)\n",
+				clp->cl_hostname);
+		return -NFS4ERR_IO;
+	}
+
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -NFS4ERR_IO;
+
+	dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n",
+			__func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	set_bit(NFS_MIG_IN_TRANSITION,
+			&((struct nfs_server *)server)->mig_status);
+	set_bit(NFS4CLNT_MOVED, &clp->cl_state);
+
+	nfs4_schedule_state_manager(clp);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
+
 int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
@@ -1826,6 +1862,119 @@ static int nfs4_purge_lease(struct nfs_client *clp)
 	return 0;
 }
 
+/*
+ * Try remote migration of one FSID from a source server to a
+ * destination server.  The source server provides a list of
+ * potential destinations.
+ *
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_fs_locations *locations = NULL;
+	struct inode *inode;
+	struct page *page;
+	int status, result;
+
+	dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	result = 0;
+	page = alloc_page(GFP_KERNEL);
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (page == NULL || locations == NULL) {
+		dprintk("<-- %s: no memory\n", __func__);
+		goto out;
+	}
+
+	inode = server->super->s_root->d_inode;
+	result = nfs4_proc_get_locations(inode, locations, page, cred);
+	if (result) {
+		dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
+			__func__, result);
+		goto out;
+	}
+
+	result = -NFS4ERR_NXIO;
+	if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+		dprintk("<-- %s: No fs_locations data, migration skipped\n",
+			__func__);
+		goto out;
+	}
+
+	nfs4_begin_drain_session(clp);
+
+	status = nfs4_replace_transport(server, locations);
+	if (status != 0) {
+		dprintk("<-- %s: failed to replace transport: %d\n",
+			__func__, status);
+		goto out;
+	}
+
+	result = 0;
+	dprintk("<-- %s: migration succeeded\n", __func__);
+
+out:
+	if (page != NULL)
+		__free_page(page);
+	kfree(locations);
+	if (result) {
+		pr_err("NFS: migration recovery failed (server %s)\n",
+				clp->cl_hostname);
+		set_bit(NFS_MIG_FAILED, &server->mig_status);
+	}
+	return result;
+}
+
+/*
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_handle_migration(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: migration reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION,
+						&server->mig_status))
+			continue;
+
+		rcu_read_unlock();
+		status = nfs4_try_migration(server, cred);
+		if (status < 0) {
+			put_rpccred(cred);
+			return status;
+		}
+		goto restart;
+	}
+	rcu_read_unlock();
+	put_rpccred(cred);
+	return 0;
+}
+
 /**
  * nfs4_discover_server_trunking - Detect server IP address trunking
  *
@@ -2154,7 +2303,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_check_lease(clp);
 			if (status < 0)
 				goto out_error;
-			continue;
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
+			section = "migration";
+			status = nfs4_handle_migration(clp);
+			if (status < 0)
+				goto out_error;
 		}
 
 		/* First recover reboot state... */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 46b0cb4..186ec49 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -79,6 +79,7 @@ struct nfs_client {
 	char			cl_ipaddr[48];
 	u32			cl_cb_ident;	/* v4.0 callback identifier */
 	const struct nfs4_minor_version_ops *cl_mvops;
+	unsigned long		cl_mig_gen;
 
 	/* NFSv4.0 transport blocking */
 	struct nfs4_slot_table	*cl_slot_tbl;
@@ -189,6 +190,12 @@ struct nfs_server {
 	struct list_head	state_owners_lru;
 	struct list_head	layouts;
 	struct list_head	delegations;
+
+	unsigned long		mig_gen;
+	unsigned long		mig_status;
+#define NFS_MIG_IN_TRANSITION		(1)
+#define NFS_MIG_FAILED			(2)
+
 	void (*destroy)(struct nfs_server *);
 
 	atomic_t active; /* Keep trace of any activity to this server */
-- 
cgit v0.10.2


From f1478c13c0a606b960becaa70e82065835b16fbf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:07 -0400
Subject: NFS: Re-use exit code in nfs4_async_handle_error()

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c71c16e..2614b46 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4750,19 +4750,15 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
 			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto restart_call;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
-			task->tk_status = 0;
-			return -EAGAIN;
 		case -NFS4ERR_RETRY_UNCACHED_REP:
 		case -NFS4ERR_OLD_STATEID:
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto restart_call;
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
@@ -4773,6 +4769,7 @@ wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+restart_call:
 	task->tk_status = 0;
 	return -EAGAIN;
 }
-- 
cgit v0.10.2


From 9f51a78e3a637a5923c661d2abe958c63441696f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:13 -0400
Subject: NFS: Rename "stateid_invalid" label

I'm going to use this exit label also for migration recovery
failures.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2614b46..fa87f81 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4728,12 +4728,12 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			if (state == NULL)
 				break;
 			if (nfs4_schedule_stateid_recovery(server, state) < 0)
-				goto stateid_invalid;
+				goto recovery_failed;
 			goto wait_on_recovery;
 		case -NFS4ERR_EXPIRED:
 			if (state != NULL) {
 				if (nfs4_schedule_stateid_recovery(server, state) < 0)
-					goto stateid_invalid;
+					goto recovery_failed;
 			}
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
@@ -4762,7 +4762,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
-stateid_invalid:
+recovery_failed:
 	task->tk_status = -EIO;
 	return 0;
 wait_on_recovery:
-- 
cgit v0.10.2


From 519ae255d403219fd5375de13c95dea3ef1cfda0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:19 -0400
Subject: NFS: Add migration recovery callouts in nfs4proc.c

When a server returns NFS4ERR_MOVED, trigger the new migration
recovery logic in the state manager.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fa87f81..a1f6209 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -384,6 +384,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			ret = nfs4_schedule_migration_recovery(server);
+			if (ret < 0)
+				break;
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 	return nfs4_map_errors(ret);
 wait_on_recovery:
 	ret = nfs4_wait_clnt_recover(clp);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -EIO;
 	if (ret == 0)
 		exception->retry = 1;
 	return ret;
@@ -2974,11 +2981,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
 	status = nfs4_proc_fs_locations(client, dir, name, locations, page);
 	if (status != 0)
 		goto out;
-	/* Make sure server returned a different fsid for the referral */
+
+	/*
+	 * If the fsid didn't change, this is a migration event, not a
+	 * referral.  Cause us to drop into the exception handler, which
+	 * will kick off migration recovery.
+	 */
 	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
 		dprintk("%s: server did not return a different fsid for"
 			" a referral at %s\n", __func__, name->name);
-		status = -EIO;
+		status = -NFS4ERR_MOVED;
 		goto out;
 	}
 	/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
@@ -4739,6 +4751,10 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			if (nfs4_schedule_migration_recovery(server) < 0)
+				goto recovery_failed;
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -4769,6 +4785,8 @@ wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		goto recovery_failed;
 restart_call:
 	task->tk_status = 0;
 	return -EAGAIN;
-- 
cgit v0.10.2


From 352297b917d8a3e61a778ffb0f0195ce8550d7f5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:24 -0400
Subject: NFS: Handle NFS4ERR_MOVED during delegation recall

When a server returns NFS4ERR_MOVED during a delegation recall,
trigger the new migration recovery logic in the state manager.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a1f6209..552e4f7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1575,6 +1575,9 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 			/* Don't recall a delegation if it was lost */
 			nfs4_schedule_lease_recovery(server->nfs_client);
 			return -EAGAIN;
+		case -NFS4ERR_MOVED:
+			nfs4_schedule_migration_recovery(server);
+			return -EAGAIN;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-- 
cgit v0.10.2


From 44c9993384e9311cd56acf6ead3baffab616ae50 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:30 -0400
Subject: NFS: Add method to detect whether an FSID is still on the server

Introduce a mechanism for probing a server to determine if an FSID
is present or absent.

The on-the-wire compound is different between minor version 0 and 1.
Minor version 0 appends a RENEW operation to identify which client
ID is probing.  Minor version 1 has a SEQUENCE operation in the
compound which effectively carries the same information.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 94e783f..2f0f8c2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -208,6 +208,7 @@ struct nfs4_state_maintenance_ops {
 struct nfs4_mig_recovery_ops {
 	int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
 		struct page *, struct rpc_cred *);
+	int (*fsid_present)(struct inode *, struct rpc_cred *);
 };
 
 extern const struct dentry_operations nfs4_dentry_operations;
@@ -242,6 +243,7 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc
 				  struct nfs4_fs_locations *, struct page *);
 extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
 		struct page *page, struct rpc_cred *);
+extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 552e4f7..01b90bd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6153,6 +6153,132 @@ int nfs4_proc_get_locations(struct inode *inode,
 	return status;
 }
 
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop
+ * returning NFS4ERR_LEASE_MOVED to this client.  A RENEW operation
+ * is appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+		.clientid	= clp->cl_clientid,
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fsid_present_res res = {
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status)
+		return status;
+
+	do_renew_lease(clp, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID performing
+ * this operation is identified in the SEQUENCE operation in this
+ * compound.
+ */
+static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+	};
+	struct nfs4_fsid_present_res res = {
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_fsid_present - Is this FSID present or absent on server?
+ * @inode: inode on FSID to check
+ * @cred: credential to use for this operation
+ *
+ * Server indicates whether the FSID is present, moved, or not
+ * recognized.  This operation is necessary to clear a LEASE_MOVED
+ * condition for this client ID.
+ *
+ * Returns NFS4_OK if the FSID is present on this server,
+ * -NFS4ERR_MOVED if the FSID is no longer present, a negative
+ *  NFS4ERR code if some error occurred on the server, or a
+ *  negative errno if a local failure occurred.
+ */
+int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->fsid_present(inode, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
 /**
  * If 'use_integrity' is true and the state managment nfs_client
  * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
@@ -8052,11 +8178,13 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 
 static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
 	.get_locations = _nfs40_proc_get_locations,
+	.fsid_present = _nfs40_proc_fsid_present,
 };
 
 #if defined(CONFIG_NFS_V4_1)
 static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
 	.get_locations = _nfs41_proc_get_locations,
+	.fsid_present = _nfs41_proc_fsid_present,
 };
 #endif	/* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1854b04..f903389 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -595,11 +595,13 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_getattr_maxsz)
+				encode_getattr_maxsz + \
+				encode_renew_maxsz)
 #define NFS4_dec_getattr_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_getattr_maxsz)
+				decode_getattr_maxsz + \
+				decode_renew_maxsz)
 #define NFS4_enc_lookup_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -753,6 +755,18 @@ static int nfs4_stat_to_errno(int);
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_secinfo_maxsz)
+#define NFS4_enc_fsid_present_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getfh_maxsz + \
+				 encode_renew_maxsz)
+#define NFS4_dec_fsid_present_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_getfh_maxsz + \
+				 decode_renew_maxsz)
 #if defined(CONFIG_NFS_V4_1)
 #define NFS4_enc_bind_conn_to_session_sz \
 				(compound_encode_hdr_maxsz + \
@@ -2726,6 +2740,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode FSID_PRESENT request
+ */
+static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fsid_present_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfh(xdr, &hdr);
+	if (args->renew)
+		encode_renew(xdr, args->clientid, &hdr);
+	encode_nops(&hdr);
+}
+
 #if defined(CONFIG_NFS_V4_1)
 /*
  * BIND_CONN_TO_SESSION request
@@ -6883,6 +6917,34 @@ out:
 	return status;
 }
 
+/*
+ * Decode FSID_PRESENT response
+ */
+static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs4_fsid_present_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, res->fh);
+	if (status)
+		goto out;
+	if (res->renew)
+		status = decode_renew(xdr);
+out:
+	return status;
+}
+
 #if defined(CONFIG_NFS_V4_1)
 /*
  * Decode BIND_CONN_TO_SESSION response
@@ -7397,6 +7459,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations),
 	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner),
 	PROC(SECINFO,		enc_secinfo,		dec_secinfo),
+	PROC(FSID_PRESENT,	enc_fsid_present,	dec_fsid_present),
 #if defined(CONFIG_NFS_V4_1)
 	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
 	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e36dee5..c56fa8f 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -460,6 +460,7 @@ enum {
 	NFSPROC4_CLNT_FS_LOCATIONS,
 	NFSPROC4_CLNT_RELEASE_LOCKOWNER,
 	NFSPROC4_CLNT_SECINFO,
+	NFSPROC4_CLNT_FSID_PRESENT,
 
 	/* nfs41 */
 	NFSPROC4_CLNT_EXCHANGE_ID,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 405dfad..8fe5b94 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1088,6 +1088,19 @@ struct nfs4_secinfo_res {
 	struct nfs4_secinfo_flavors	*flavors;
 };
 
+struct nfs4_fsid_present_arg {
+	struct nfs4_sequence_args	seq_args;
+	const struct nfs_fh		*fh;
+	clientid4			clientid;
+	unsigned char			renew:1;
+};
+
+struct nfs4_fsid_present_res {
+	struct nfs4_sequence_res	seq_res;
+	struct nfs_fh			*fh;
+	unsigned char			renew:1;
+};
+
 #endif /* CONFIG_NFS_V4 */
 
 struct nfstime4 {
-- 
cgit v0.10.2


From b7f7a66e420a9f3ec82a5124720013cee317e73a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:35 -0400
Subject: NFS: Support NFS4ERR_LEASE_MOVED recovery in state manager

A migration on the FSID in play for the current NFS operation
is reported via the error status code NFS4ERR_MOVED.

"Lease moved" means that a migration has occurred on some other
FSID than the one for the current operation.  It's a signal that
the client should take action immediately to handle a migration
that it may not have noticed otherwise.  This is so that the
client's lease does not expire unnoticed on the destination server.

In NFSv4.0, a moved lease is reported with the NFS4ERR_LEASE_MOVED
error status code.

To recover from NFS4ERR_LEASE_MOVED, check each FSID for that server
to see if it is still present.  Invoke nfs4_try_migration() if the
FSID is no longer present on the server.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 2f0f8c2..210e44e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -30,6 +30,7 @@ enum nfs4_client_state {
 	NFS4CLNT_PURGE_STATE,
 	NFS4CLNT_BIND_CONN_TO_SESSION,
 	NFS4CLNT_MOVED,
+	NFS4CLNT_LEASE_MOVED,
 };
 
 #define NFS4_RENEW_TIMEOUT		0x01
@@ -425,6 +426,7 @@ extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
+extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ba0db18..552706d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1233,6 +1233,22 @@ int nfs4_schedule_migration_recovery(const struct nfs_server *server)
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
 
+/**
+ * nfs4_schedule_lease_moved_recovery - start lease-moved recovery
+ *
+ * @clp: server to check for moved leases
+ *
+ */
+void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp)
+{
+	dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n",
+		__func__, clp->cl_clientid, clp->cl_hostname);
+
+	set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery);
+
 int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
@@ -1661,7 +1677,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			nfs4_state_end_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_STALE_CLIENTID:
-		case -NFS4ERR_LEASE_MOVED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 			nfs4_state_clear_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
@@ -1975,6 +1990,55 @@ restart:
 	return 0;
 }
 
+/*
+ * Test each nfs_server on the clp's cl_superblocks list to see
+ * if it's moved to another server.  Stop when the server no longer
+ * returns NFS4ERR_LEASE_MOVED.
+ */
+static int nfs4_handle_lease_moved(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: lease moved reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		struct inode *inode;
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		rcu_read_unlock();
+
+		inode = server->super->s_root->d_inode;
+		status = nfs4_proc_fsid_present(inode, cred);
+		if (status != -NFS4ERR_MOVED)
+			goto restart;	/* wasn't this one */
+		if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED)
+			goto restart;	/* there are more */
+		goto out;
+	}
+	rcu_read_unlock();
+
+out:
+	put_rpccred(cred);
+	return 0;
+}
+
 /**
  * nfs4_discover_server_trunking - Detect server IP address trunking
  *
@@ -2312,6 +2376,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
 				goto out_error;
 		}
 
+		if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) {
+			section = "lease moved";
+			status = nfs4_handle_lease_moved(clp);
+			if (status < 0)
+				goto out_error;
+		}
+
 		/* First recover reboot state... */
 		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			section = "reclaim reboot";
-- 
cgit v0.10.2


From 8ef2f8d46aca7ff2e7e2b926bb563212aa63a4af Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:41 -0400
Subject: NFS: Implement support for NFS4ERR_LEASE_MOVED

Trigger lease-moved recovery when a request returns
NFS4ERR_LEASE_MOVED.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 01b90bd..2564e1c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -389,6 +389,9 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 			if (ret < 0)
 				break;
 			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -1578,6 +1581,9 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 		case -NFS4ERR_MOVED:
 			nfs4_schedule_migration_recovery(server);
 			return -EAGAIN;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(server->nfs_client);
+			return -EAGAIN;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
@@ -4758,6 +4764,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			if (nfs4_schedule_migration_recovery(server) < 0)
 				goto recovery_failed;
 			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
-- 
cgit v0.10.2


From 60ea68129942dc36cd1a3a9bdaec783369ee5a6d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:47 -0400
Subject: NFS: Migration support for RELEASE_LOCKOWNER

Currently the Linux NFS client ignores the operation status code for
the RELEASE_LOCKOWNER operation.  Like NFSv3's UMNT operation,
RELEASE_LOCKOWNER is a courtesy to help servers manage their
resources, and the outcome is not consequential for the client.

During a migration, a server may report NFS4ERR_LEASE_MOVED, in
which case the client really should retry, since typically
LEASE_MOVED has nothing to do with the current operation, but does
prevent it from going forward.

Also, it's important for a client to respond as soon as possible to
a moved lease condition, since the client's lease could expire on
the destination without further action by the client.

NFS4ERR_DELAY is not included in the list of valid status codes for
RELEASE_LOCKOWNER in RFC 3530bis.  However, rfc3530-migration-update
does permit migration-capable servers to return DELAY to clients,
but only in the context of an ongoing migration.  In this case the
server has frozen lock state in preparation for migration, and a
client retry would help the destination server purge unneeded state
once migration recovery is complete.

Interestly, NFS4ERR_MOVED is not valid for RELEASE_LOCKOWNER, even
though lock owners can be migrated with Transparent State Migration.

Note that RFC 3530bis section 9.5 includes RELEASE_LOCKOWNER in the
list of operations that renew a client's lease on the server if they
succeed.  Now that our client pays attention to the operation's
status code, we can note that renewal appropriately.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2564e1c..9f2ccf7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5800,6 +5800,7 @@ struct nfs_release_lockowner_data {
 	struct nfs_release_lockowner_args args;
 	struct nfs4_sequence_args seq_args;
 	struct nfs4_sequence_res seq_res;
+	unsigned long timestamp;
 };
 
 static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
@@ -5807,12 +5808,27 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
 	struct nfs_release_lockowner_data *data = calldata;
 	nfs40_setup_sequence(data->server,
 				&data->seq_args, &data->seq_res, task);
+	data->timestamp = jiffies;
 }
 
 static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs_release_lockowner_data *data = calldata;
+	struct nfs_server *server = data->server;
+
 	nfs40_sequence_done(task, &data->seq_res);
+
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(server, data->timestamp);
+		break;
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_LEASE_MOVED:
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+			rpc_restart_call_prepare(task);
+	}
 }
 
 static void nfs4_release_lockowner_release(void *calldata)
-- 
cgit v0.10.2


From f8aba1e8d509c0db7a82893e595a7743ce07ea83 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:53 -0400
Subject: NFS: Handle NFS4ERR_LEASE_MOVED during async RENEW

With NFSv4 minor version 0, the asynchronous lease RENEW
heartbeat can return NFS4ERR_LEASE_MOVED.  Error recovery logic for
async RENEW is a separate code path from the generic NFS proc paths,
so it must be updated to handle NFS4ERR_LEASE_MOVED as well.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9f2ccf7..8aa8ff3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4235,7 +4235,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 	unsigned long timestamp = data->timestamp;
 
 	trace_nfs4_renew_async(clp, task->tk_status);
-	if (task->tk_status < 0) {
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_LEASE_MOVED:
+		nfs4_schedule_lease_moved_recovery(clp);
+		break;
+	default:
 		/* Unless we're shutting down, schedule state recovery! */
 		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
 			return;
-- 
cgit v0.10.2


From d1c2331e75d7a5a22e2910a8bd042ea90b543db1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:58 -0400
Subject: NFS: Handle SEQ4_STATUS_LEASE_MOVED

With the advent of NFSv4 sessions in NFSv4.1 and following, a "lease
moved" condition is reported differently than it is in NFSv4.0.

NFSv4 minor version 0 servers return an error status code,
NFS4ERR_LEASE_MOVED, to signal that a lease has moved.  This error
causes the whole compound operation to fail.  Normal compounds
against this server continue to fail until the client performs
migration recovery on the migrated share.

Minor version 1 and later servers assert a bit flag in the reply to
a compound's SEQUENCE operation to signal LEASE_MOVED.  This is not
a fatal condition: operations against this server continue normally.
The server asserts this flag until the client performs migration
recovery on the migrated share.

Note that servers MUST NOT return NFS4ERR_LEASE_MOVED to NFSv4
clients not using NFSv4.0.

After the server asserts any of the sr_status_flags in the SEQUENCE
operation in a typical compound, our client initiates standard lease
recovery.  For NFSv4.1+, a stand-alone SEQUENCE operation is
performed to discover what recovery is needed.

If SEQ4_STATUS_LEASE_MOVED is asserted in this stand-alone SEQUENCE
operation, our client attempts to discover which FSIDs have been
migrated, and then performs migration recovery on each.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 552706d..62c08bf 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2227,9 +2227,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 		nfs41_handle_server_reboot(clp);
 	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
 			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
-			    SEQ4_STATUS_ADMIN_STATE_REVOKED |
-			    SEQ4_STATUS_LEASE_MOVED))
+			    SEQ4_STATUS_ADMIN_STATE_REVOKED))
 		nfs41_handle_state_revoked(clp);
+	if (flags & SEQ4_STATUS_LEASE_MOVED)
+		nfs4_schedule_lease_moved_recovery(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
 		nfs41_handle_recallable_state_revoked(clp);
 	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
-- 
cgit v0.10.2


From cd3fadece2b97462583500371a3eb9b091c49603 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:14:04 -0400
Subject: NFS: Set EXCHGID4_FLAG_SUPP_MOVED_MIGR

Broadly speaking, v4.1 migration is untested.  There are no servers
in the wild that support NFSv4.1 migration.  However, as server
implementations become available, we do want to enable testing by
developers, while leaving it disabled for environments for which
broken migration support would be an unpleasant surprise.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index b5e80b0..38c1768 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
 	  If the NFS client is unchanged from the upstream kernel, this
 	  option should be set to the default "kernel.org".
 
+config NFS_V4_1_MIGRATION
+	bool "NFSv4.1 client support for migration"
+	depends on NFS_V4_1
+	default n
+	help
+	  This option makes the NFS client advertise to NFSv4.1 servers that
+          it can support NFSv4 migration.
+
+          The NFSv4.1 pieces of the Linux NFSv4 migration implementation are
+          still experimental.  If you are not an NFSv4 developer, say N here.
+
 config NFS_V4_SECURITY_LABEL
 	bool
 	depends on NFS_V4_2 && SECURITY
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8aa8ff3..1463c71 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6596,8 +6596,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	struct nfs41_exchange_id_args args = {
 		.verifier = &verifier,
 		.client = clp,
+#ifdef CONFIG_NFS_V4_1_MIGRATION
 		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-			EXCHGID4_FLAG_BIND_PRINC_STATEID,
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID |
+			 EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
+		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
 	};
 	struct nfs41_exchange_id_res res = {
 		0
-- 
cgit v0.10.2


From 0625c2dd6ae32a221871ffc04bb752216b823001 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:14:10 -0400
Subject: NFS: Fix possible endless state recovery wait

In nfs4_wait_clnt_recover(), hold a reference to the clp being
waited on.  The state manager can reduce clp->cl_count to 1, in
which case the nfs_put_client() in nfs4_run_state_manager() can
free *clp before wait_on_bit() returns and allows
nfs4_wait_clnt_recover() to run again.

The behavior at that point is non-deterministic.  If the waited-on
bit still happens to be zero, wait_on_bit() will wake the waiter as
expected.  If the bit is set again (say, if the memory was poisoned
when freed) wait_on_bit() can leave the waiter asleep.

This is a narrow fix which ensures the safety of accessing *clp in
nfs4_wait_clnt_recover(), but does not address the continued use
of a possibly freed *clp after nfs4_wait_clnt_recover() returns
(see nfs_end_delegation_return(), for example).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 62c08bf..452f4c8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1255,14 +1255,16 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
 
 	might_sleep();
 
+	atomic_inc(&clp->cl_count);
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
 			nfs_wait_bit_killable, TASK_KILLABLE);
 	if (res)
-		return res;
-
+		goto out;
 	if (clp->cl_cons_state < 0)
-		return clp->cl_cons_state;
-	return 0;
+		res = clp->cl_cons_state;
+out:
+	nfs_put_client(clp);
+	return res;
 }
 
 int nfs4_client_recover_expired_lease(struct nfs_client *clp)
-- 
cgit v0.10.2


From 47fd88e6b79c55e6acccaf832078ed1a340672fa Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:15 -0400
Subject: NFSv4: make nfs_find_best_sec static

It's not used outside of nfs4namespace.c anymore.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 210e44e..3ce79b0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -222,7 +222,6 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
 extern struct file_system_type nfs4_fs_type;
 
 /* nfs4namespace.c */
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
 struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 			       struct nfs_fh *, struct nfs_fattr *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index ebd8b06..4ec0eea 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -145,7 +145,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
  * is searched in the order returned from the server, per RFC 3530
  * recommendation.
  */
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 {
 	rpc_authflavor_t pseudoflavor;
 	struct nfs4_secinfo4 *secinfo;
-- 
cgit v0.10.2


From a3f73c27afff9590a4432879b7145289cb89cf0a Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:16 -0400
Subject: NFS: separate passed security flavs from selected

When filling parsed_mount_data, store the parsed sec= mount option in
the new struct nfs_auth_info and the chosen flavor in selected_flavor.

This patch lays the groundwork for supporting multiple sec= options.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 692fd0e..f5a7f7f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -787,7 +787,8 @@ static int nfs_init_server(struct nfs_server *server,
 
 	server->port = data->nfs_server.port;
 
-	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e5a6bd1..c8cd044 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -88,8 +88,8 @@ struct nfs_parsed_mount_data {
 	unsigned int		namlen;
 	unsigned int		options;
 	unsigned int		bsize;
-	unsigned int		auth_flavor_len;
-	rpc_authflavor_t	auth_flavors[1];
+	struct nfs_auth_info	auth_info;
+	rpc_authflavor_t	selected_flavor;
 	char			*client_address;
 	unsigned int		version;
 	unsigned int		minorversion;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0cde95e..d65090e 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -949,9 +949,8 @@ out:
  * Create a version 4 volume record
  */
 static int nfs4_init_server(struct nfs_server *server,
-		const struct nfs_parsed_mount_data *data)
+		struct nfs_parsed_mount_data *data)
 {
-	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
 	struct rpc_timeout timeparms;
 	int error;
 
@@ -964,8 +963,10 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->flags = data->flags;
 	server->options = data->options;
 
-	if (data->auth_flavor_len >= 1)
-		pseudoflavor = data->auth_flavors[0];
+	if (data->auth_info.flavor_len >= 1)
+		data->selected_flavor = data->auth_info.flavors[0];
+	else
+		data->selected_flavor = RPC_AUTH_UNIX;
 
 	/* Get a client record */
 	error = nfs4_set_client(server,
@@ -973,7 +974,7 @@ static int nfs4_init_server(struct nfs_server *server,
 			(const struct sockaddr *)&data->nfs_server.address,
 			data->nfs_server.addrlen,
 			data->client_address,
-			pseudoflavor,
+			data->selected_flavor,
 			data->nfs_server.protocol,
 			&timeparms,
 			data->minorversion,
@@ -993,7 +994,8 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	server->port = data->nfs_server.port;
 
-	error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor);
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
 
 error:
 	/* Done */
@@ -1020,7 +1022,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
-	auth_probe = mount_info->parsed->auth_flavor_len < 1;
+	auth_probe = mount_info->parsed->auth_info.flavor_len < 1;
 
 	/* set up the general RPC client */
 	error = nfs4_init_server(server, mount_info->parsed);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e26647b..b87744f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -923,8 +923,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 		data->mount_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-		data->auth_flavors[0]	= RPC_AUTH_MAXFLAVOR;
-		data->auth_flavor_len	= 0;
+		data->selected_flavor	= RPC_AUTH_MAXFLAVOR;
 		data->minorversion	= 0;
 		data->need_mount	= true;
 		data->net		= current->nsproxy->net_ns;
@@ -1019,13 +1018,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
 	}
 }
 
-static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data,
-		rpc_authflavor_t pseudoflavor)
-{
-	data->auth_flavors[0] = pseudoflavor;
-	data->auth_flavor_len = 1;
-}
-
 /*
  * Parse the value of the 'sec=' option.
  */
@@ -1076,7 +1068,8 @@ static int nfs_parse_security_flavors(char *value,
 	}
 
 	mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-	nfs_set_auth_parsed_mount_data(mnt, pseudoflavor);
+	mnt->auth_info.flavors[0] = pseudoflavor;
+	mnt->auth_info.flavor_len = 1;
 	return 1;
 }
 
@@ -1623,7 +1616,7 @@ out_security_failure:
 }
 
 /*
- * Ensure that the specified authtype in args->auth_flavors[0] is supported by
+ * Ensure that the specified authtype in args->auth_info is supported by
  * the server. Returns 0 if it's ok, and -EACCES if not.
  */
 static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
@@ -1640,17 +1633,18 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
 	 * can be used.
 	 */
 	for (i = 0; i < count; i++) {
-		if (args->auth_flavors[0] == server_authlist[i] ||
+		if (args->auth_info.flavors[0] == server_authlist[i] ||
 		    server_authlist[i] == RPC_AUTH_NULL)
 			goto out;
 	}
 
 	dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
-		args->auth_flavors[0]);
+		args->auth_info.flavors[0]);
 	return -EACCES;
 
 out:
-	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+	args->selected_flavor = args->auth_info.flavors[0];
+	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);
 	return 0;
 }
 
@@ -1738,9 +1732,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 	 * Was a sec= authflavor specified in the options? First, verify
 	 * whether the server supports it, and then just try to use it if so.
 	 */
-	if (args->auth_flavor_len > 0) {
+	if (args->auth_info.flavor_len > 0) {
 		status = nfs_verify_authflavor(args, authlist, authlist_len);
-		dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+		dfprintk(MOUNT, "NFS: using auth flavor %u\n",
+			 args->selected_flavor);
 		if (status)
 			return ERR_PTR(status);
 		return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
@@ -1769,7 +1764,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 			/* Fallthrough */
 		}
 		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
-		nfs_set_auth_parsed_mount_data(args, flavor);
+		args->selected_flavor = flavor;
 		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 		if (!IS_ERR(server))
 			return server;
@@ -1785,7 +1780,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 
 	/* Last chance! Try AUTH_UNIX */
 	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
-	nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+	args->selected_flavor = RPC_AUTH_UNIX;
 	return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 }
 
@@ -1972,9 +1967,9 @@ static int nfs23_validate_mount_data(void *options,
 		args->bsize		= data->bsize;
 
 		if (data->flags & NFS_MOUNT_SECFLAVOUR)
-			nfs_set_auth_parsed_mount_data(args, data->pseudoflavor);
+			args->selected_flavor = data->pseudoflavor;
 		else
-			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+			args->selected_flavor = RPC_AUTH_UNIX;
 		if (!args->nfs_server.hostname)
 			goto out_nomem;
 
@@ -2108,7 +2103,7 @@ static int nfs_validate_text_mount_data(void *options,
 
 	nfs_set_port(sap, &args->nfs_server.port, port);
 
-	if (args->auth_flavor_len > 1)
+	if (args->auth_info.flavor_len > 1)
 		goto out_bad_auth;
 
 	return nfs_parse_devname(dev_name,
@@ -2146,7 +2141,7 @@ nfs_compare_remount_data(struct nfs_server *nfss,
 	    data->version != nfss->nfs_client->rpc_ops->version ||
 	    data->minorversion != nfss->nfs_client->cl_minorversion ||
 	    data->retrans != nfss->client->cl_timeout->to_retries ||
-	    data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
+	    data->selected_flavor != nfss->client->cl_auth->au_flavor ||
 	    data->acregmin != nfss->acregmin / HZ ||
 	    data->acregmax != nfss->acregmax / HZ ||
 	    data->acdirmin != nfss->acdirmin / HZ ||
@@ -2191,7 +2186,9 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->rsize = nfss->rsize;
 	data->wsize = nfss->wsize;
 	data->retrans = nfss->client->cl_timeout->to_retries;
-	nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor);
+	data->selected_flavor = nfss->client->cl_auth->au_flavor;
+	data->auth_info.flavors[0] = nfss->client->cl_auth->au_flavor;
+	data->auth_info.flavor_len = 1;
 	data->acregmin = nfss->acregmin / HZ;
 	data->acregmax = nfss->acregmax / HZ;
 	data->acdirmin = nfss->acdirmin / HZ;
@@ -2718,9 +2715,9 @@ static int nfs4_validate_mount_data(void *options,
 					   data->auth_flavours,
 					   sizeof(pseudoflavor)))
 				return -EFAULT;
-			nfs_set_auth_parsed_mount_data(args, pseudoflavor);
+			args->selected_flavor = pseudoflavor;
 		} else
-			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+			args->selected_flavor = RPC_AUTH_UNIX;
 
 		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
 		if (IS_ERR(c))
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8fe5b94..658104a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -591,6 +591,12 @@ struct nfs_renameres {
 	struct nfs_fattr		*new_fattr;
 };
 
+/* parsed sec= options */
+struct nfs_auth_info {
+	unsigned int            flavor_len;
+	rpc_authflavor_t        flavors[1];
+};
+
 /*
  * Argument struct for decode_entry function
  */
-- 
cgit v0.10.2


From 0f5f49b8b3593309fd3c3a2080a5fd465afdbe16 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:17 -0400
Subject: NFS: cache parsed auth_info in nfs_server

Cache the auth_info structure in nfs_server and pass these values to submounts.

This lays the groundwork for supporting multiple sec= options.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f5a7f7f..1d09289 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -786,6 +786,7 @@ static int nfs_init_server(struct nfs_server *server,
 		goto error;
 
 	server->port = data->nfs_server.port;
+	server->auth_info = data->auth_info;
 
 	error = nfs_init_server_rpcclient(server, &timeparms,
 					  data->selected_flavor);
@@ -929,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour
 	target->acdirmax = source->acdirmax;
 	target->caps = source->caps;
 	target->options = source->options;
+	target->auth_info = source->auth_info;
 }
 EXPORT_SYMBOL_GPL(nfs_server_copy_userdata);
 
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index d65090e..04131c8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -962,6 +962,7 @@ static int nfs4_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->options = data->options;
+	server->auth_info = data->auth_info;
 
 	if (data->auth_info.flavor_len >= 1)
 		data->selected_flavor = data->auth_info.flavors[0];
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b87744f..de1e5e8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2187,8 +2187,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->wsize = nfss->wsize;
 	data->retrans = nfss->client->cl_timeout->to_retries;
 	data->selected_flavor = nfss->client->cl_auth->au_flavor;
-	data->auth_info.flavors[0] = nfss->client->cl_auth->au_flavor;
-	data->auth_info.flavor_len = 1;
+	data->auth_info = nfss->auth_info;
 	data->acregmin = nfss->acregmin / HZ;
 	data->acregmax = nfss->acregmax / HZ;
 	data->acdirmin = nfss->acdirmin / HZ;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 186ec49..1150ea4 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -151,6 +151,7 @@ struct nfs_server {
 	unsigned long		mount_time;	/* when this fs was mounted */
 	struct super_block	*super;		/* VFS super block */
 	dev_t			s_dev;		/* superblock dev numbers */
+	struct nfs_auth_info	auth_info;	/* parsed auth flavors */
 
 #ifdef CONFIG_NFS_FSCACHE
 	struct nfs_fscache_key	*fscache_key;	/* unique key for superblock */
-- 
cgit v0.10.2


From 5837f6dfcb00f764976ddc178933e612702cbf54 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:18 -0400
Subject: NFS: stop using NFS_MOUNT_SECFLAVOUR server flag

Since the parsed sec= flavor is now stored in nfs_server->auth_info,
we no longer need an nfs_server flag to determine if a sec= option was
used.

This flag has not been completely removed because it is still needed for
the (old but still supported) non-text parsed mount options ABI
compatability.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 04131c8..f6cc77c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1051,6 +1051,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
+	bool auth_probe;
 	int error;
 
 	dprintk("--> nfs4_create_referral_server()\n");
@@ -1083,8 +1084,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (error < 0)
 		goto error;
 
-	error = nfs4_server_common_setup(server, mntfh,
-			!(parent_server->flags & NFS_MOUNT_SECFLAVOUR));
+	auth_probe = parent_server->auth_info.flavor_len < 1;
+
+	error = nfs4_server_common_setup(server, mntfh, auth_probe);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 4ec0eea..b947054 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -390,7 +390,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
 
 	if (client->cl_auth->au_flavor != flavor)
 		flavor = client->cl_auth->au_flavor;
-	else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) {
+	else if (server->auth_info.flavor_len == 0) {
 		rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
 		if ((int)new >= 0)
 			flavor = new;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1463c71..0eb8dc5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2920,7 +2920,7 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 		if (status != -NFS4ERR_WRONGSEC)
 			break;
 		/* Did user force a 'sec=' mount option? */
-		if (server->flags & NFS_MOUNT_SECFLAVOUR)
+		if (server->auth_info.flavor_len > 0)
 			break;
 	default:
 		status = nfs4_do_find_root_sec(server, fhandle, info);
@@ -3180,7 +3180,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			if (client != *clnt)
 				goto out;
 			/* No security negotiation if the user specified 'sec=' */
-			if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR)
+			if (NFS_SERVER(dir)->auth_info.flavor_len > 0)
 				goto out;
 			client = nfs4_create_sec_client(client, dir, name);
 			if (IS_ERR(client))
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index de1e5e8..3a4f8bf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1067,7 +1067,6 @@ static int nfs_parse_security_flavors(char *value,
 		return 0;
 	}
 
-	mnt->flags |= NFS_MOUNT_SECFLAVOUR;
 	mnt->auth_info.flavors[0] = pseudoflavor;
 	mnt->auth_info.flavor_len = 1;
 	return 1;
@@ -2332,7 +2331,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
 		goto Ebusy;
 	if (a->acdirmax != b->acdirmax)
 		goto Ebusy;
-	if (b->flags & NFS_MOUNT_SECFLAVOUR &&
+	if (b->auth_info.flavor_len > 0 &&
 	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
 	return 1;
diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h
index 576bddd..64b0f22 100644
--- a/include/uapi/linux/nfs_mount.h
+++ b/include/uapi/linux/nfs_mount.h
@@ -60,7 +60,7 @@ struct nfs_mount_data {
 #define NFS_MOUNT_BROKEN_SUID	0x0400	/* 4 */
 #define NFS_MOUNT_NOACL		0x0800	/* 4 */
 #define NFS_MOUNT_STRICTLOCK	0x1000	/* reserved for NFSv4 */
-#define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 */
+#define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 non-text parsed mount data only */
 #define NFS_MOUNT_NORDIRPLUS	0x4000	/* 5 */
 #define NFS_MOUNT_UNSHARED	0x8000	/* 5 */
 #define NFS_MOUNT_FLAGMASK	0xFFFF
-- 
cgit v0.10.2


From 4d4b69dd847a098cdca341c45326f6c6f61b8691 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:19 -0400
Subject: NFS: add support for multiple sec= mount options

This patch adds support for multiple security options which can be
specified using a colon-delimited list of security flavors (the same
syntax as nfsd's exports file).

This is useful, for instance, when NFSv4.x mounts cross SECINFO
boundaries. With this patch a user can use "sec=krb5i,krb5p"
to mount a remote filesystem using krb5i, but can still cross
into krb5p-only exports.

New mounts will try all security options before failing.  NFSv4.x
SECINFO results will be compared against the sec= flavors to
find the first flavor in both lists or if no match is found will
return -EPERM.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c8cd044..bca6a3e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -326,6 +326,7 @@ extern struct file_system_type nfs_xdev_fs_type;
 extern struct file_system_type nfs4_xdev_fs_type;
 extern struct file_system_type nfs4_referral_fs_type;
 #endif
+bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
 struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *,
 			struct nfs_subversion *);
 void nfs_initialise_sb(struct super_block *);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index f6cc77c..b4a160a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -964,6 +964,9 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->options = data->options;
 	server->auth_info = data->auth_info;
 
+	/* Use the first specified auth flavor. If this flavor isn't
+	 * allowed by the server, use the SECINFO path to try the
+	 * other specified flavors */
 	if (data->auth_info.flavor_len >= 1)
 		data->selected_flavor = data->auth_info.flavors[0];
 	else
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index b947054..c08cbf4 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
 
 /**
  * nfs_find_best_sec - Find a security mechanism supported locally
+ * @server: NFS server struct
  * @flavors: List of security tuples returned by SECINFO procedure
  *
  * Return the pseudoflavor of the first security mechanism in
@@ -145,7 +146,8 @@ static size_t nfs_parse_server_name(char *string, size_t len,
  * is searched in the order returned from the server, per RFC 3530
  * recommendation.
  */
-static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server,
+					  struct nfs4_secinfo_flavors *flavors)
 {
 	rpc_authflavor_t pseudoflavor;
 	struct nfs4_secinfo4 *secinfo;
@@ -160,12 +162,19 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 		case RPC_AUTH_GSS:
 			pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
 							&secinfo->flavor_info);
-			if (pseudoflavor != RPC_AUTH_MAXFLAVOR)
+			/* make sure pseudoflavor matches sec= mount opt */
+			if (pseudoflavor != RPC_AUTH_MAXFLAVOR &&
+			    nfs_auth_info_match(&server->auth_info,
+						pseudoflavor))
 				return pseudoflavor;
 			break;
 		}
 	}
 
+	/* if there were any sec= options then nothing matched */
+	if (server->auth_info.flavor_len > 0)
+		return -EPERM;
+
 	return RPC_AUTH_UNIX;
 }
 
@@ -187,7 +196,7 @@ static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr
 		goto out;
 	}
 
-	flavor = nfs_find_best_sec(flavors);
+	flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors);
 
 out:
 	put_page(page);
@@ -390,7 +399,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
 
 	if (client->cl_auth->au_flavor != flavor)
 		flavor = client->cl_auth->au_flavor;
-	else if (server->auth_info.flavor_len == 0) {
+	else {
 		rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
 		if ((int)new >= 0)
 			flavor = new;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0eb8dc5..b02c4cc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2873,11 +2873,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	int status = -EPERM;
 	size_t i;
 
-	for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
-		status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
-		if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
-			continue;
-		break;
+	if (server->auth_info.flavor_len > 0) {
+		/* try each flavor specified by user */
+		for (i = 0; i < server->auth_info.flavor_len; i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						server->auth_info.flavors[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
+	} else {
+		/* no flavors specified by user, try default list */
+		for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						      flav_array[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
 	}
 
 	/*
@@ -2919,9 +2932,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 		status = nfs4_lookup_root(server, fhandle, info);
 		if (status != -NFS4ERR_WRONGSEC)
 			break;
-		/* Did user force a 'sec=' mount option? */
-		if (server->auth_info.flavor_len > 0)
-			break;
 	default:
 		status = nfs4_do_find_root_sec(server, fhandle, info);
 	}
@@ -3179,9 +3189,6 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			err = -EPERM;
 			if (client != *clnt)
 				goto out;
-			/* No security negotiation if the user specified 'sec=' */
-			if (NFS_SERVER(dir)->auth_info.flavor_len > 0)
-				goto out;
 			client = nfs4_create_sec_client(client, dir, name);
 			if (IS_ERR(client))
 				return PTR_ERR(client);
@@ -7942,6 +7949,9 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 			break;
 		}
 
+		if (!nfs_auth_info_match(&server->auth_info, flavor))
+			flavor = RPC_AUTH_MAXFLAVOR;
+
 		if (flavor != RPC_AUTH_MAXFLAVOR) {
 			err = nfs4_lookup_root_sec(server, fhandle,
 						   info, flavor);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3a4f8bf..317d6fc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
 	static const struct {
 		rpc_authflavor_t flavour;
 		const char *str;
-	} sec_flavours[] = {
+	} sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = {
+		/* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */
 		{ RPC_AUTH_NULL, "null" },
 		{ RPC_AUTH_UNIX, "sys" },
 		{ RPC_AUTH_GSS_KRB5, "krb5" },
@@ -1019,6 +1020,52 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
 }
 
 /*
+ * Add 'flavor' to 'auth_info' if not already present.
+ * Returns true if 'flavor' ends up in the list, false otherwise
+ */
+static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
+			      rpc_authflavor_t flavor)
+{
+	unsigned int i;
+	unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
+				       sizeof(auth_info->flavors[0]));
+
+	/* make sure this flavor isn't already in the list */
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (flavor == auth_info->flavors[i])
+			return true;
+	}
+
+	if (auth_info->flavor_len + 1 >= max_flavor_len) {
+		dfprintk(MOUNT, "NFS: too many sec= flavors\n");
+		return false;
+	}
+
+	auth_info->flavors[auth_info->flavor_len++] = flavor;
+	return true;
+}
+
+/*
+ * Return true if 'match' is in auth_info or auth_info is empty.
+ * Return false otherwise.
+ */
+bool nfs_auth_info_match(const struct nfs_auth_info *auth_info,
+			 rpc_authflavor_t match)
+{
+	int i;
+
+	if (!auth_info->flavor_len)
+		return true;
+
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (auth_info->flavors[i] == match)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs_auth_info_match);
+
+/*
  * Parse the value of the 'sec=' option.
  */
 static int nfs_parse_security_flavors(char *value,
@@ -1026,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value,
 {
 	substring_t args[MAX_OPT_ARGS];
 	rpc_authflavor_t pseudoflavor;
+	char *p;
 
 	dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
 
-	switch (match_token(value, nfs_secflavor_tokens, args)) {
-	case Opt_sec_none:
-		pseudoflavor = RPC_AUTH_NULL;
-		break;
-	case Opt_sec_sys:
-		pseudoflavor = RPC_AUTH_UNIX;
-		break;
-	case Opt_sec_krb5:
-		pseudoflavor = RPC_AUTH_GSS_KRB5;
-		break;
-	case Opt_sec_krb5i:
-		pseudoflavor = RPC_AUTH_GSS_KRB5I;
-		break;
-	case Opt_sec_krb5p:
-		pseudoflavor = RPC_AUTH_GSS_KRB5P;
-		break;
-	case Opt_sec_lkey:
-		pseudoflavor = RPC_AUTH_GSS_LKEY;
-		break;
-	case Opt_sec_lkeyi:
-		pseudoflavor = RPC_AUTH_GSS_LKEYI;
-		break;
-	case Opt_sec_lkeyp:
-		pseudoflavor = RPC_AUTH_GSS_LKEYP;
-		break;
-	case Opt_sec_spkm:
-		pseudoflavor = RPC_AUTH_GSS_SPKM;
-		break;
-	case Opt_sec_spkmi:
-		pseudoflavor = RPC_AUTH_GSS_SPKMI;
-		break;
-	case Opt_sec_spkmp:
-		pseudoflavor = RPC_AUTH_GSS_SPKMP;
-		break;
-	default:
-		return 0;
+	while ((p = strsep(&value, ":")) != NULL) {
+		switch (match_token(p, nfs_secflavor_tokens, args)) {
+		case Opt_sec_none:
+			pseudoflavor = RPC_AUTH_NULL;
+			break;
+		case Opt_sec_sys:
+			pseudoflavor = RPC_AUTH_UNIX;
+			break;
+		case Opt_sec_krb5:
+			pseudoflavor = RPC_AUTH_GSS_KRB5;
+			break;
+		case Opt_sec_krb5i:
+			pseudoflavor = RPC_AUTH_GSS_KRB5I;
+			break;
+		case Opt_sec_krb5p:
+			pseudoflavor = RPC_AUTH_GSS_KRB5P;
+			break;
+		case Opt_sec_lkey:
+			pseudoflavor = RPC_AUTH_GSS_LKEY;
+			break;
+		case Opt_sec_lkeyi:
+			pseudoflavor = RPC_AUTH_GSS_LKEYI;
+			break;
+		case Opt_sec_lkeyp:
+			pseudoflavor = RPC_AUTH_GSS_LKEYP;
+			break;
+		case Opt_sec_spkm:
+			pseudoflavor = RPC_AUTH_GSS_SPKM;
+			break;
+		case Opt_sec_spkmi:
+			pseudoflavor = RPC_AUTH_GSS_SPKMI;
+			break;
+		case Opt_sec_spkmp:
+			pseudoflavor = RPC_AUTH_GSS_SPKMP;
+			break;
+		default:
+			dfprintk(MOUNT,
+				 "NFS: sec= option '%s' not recognized\n", p);
+			return 0;
+		}
+
+		if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor))
+			return 0;
 	}
 
-	mnt->auth_info.flavors[0] = pseudoflavor;
-	mnt->auth_info.flavor_len = 1;
 	return 1;
 }
 
@@ -1615,12 +1668,14 @@ out_security_failure:
 }
 
 /*
- * Ensure that the specified authtype in args->auth_info is supported by
- * the server. Returns 0 if it's ok, and -EACCES if not.
+ * Ensure that a specified authtype in args->auth_info is supported by
+ * the server. Returns 0 and sets args->selected_flavor if it's ok, and
+ * -EACCES if not.
  */
-static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 			rpc_authflavor_t *server_authlist, unsigned int count)
 {
+	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
 	unsigned int i;
 
 	/*
@@ -1632,17 +1687,19 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
 	 * can be used.
 	 */
 	for (i = 0; i < count; i++) {
-		if (args->auth_info.flavors[0] == server_authlist[i] ||
-		    server_authlist[i] == RPC_AUTH_NULL)
+		flavor = server_authlist[i];
+
+		if (nfs_auth_info_match(&args->auth_info, flavor) ||
+		    flavor == RPC_AUTH_NULL)
 			goto out;
 	}
 
-	dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
-		args->auth_info.flavors[0]);
+	dfprintk(MOUNT,
+		 "NFS: specified auth flavors not supported by server\n");
 	return -EACCES;
 
 out:
-	args->selected_flavor = args->auth_info.flavors[0];
+	args->selected_flavor = flavor;
 	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);
 	return 0;
 }
@@ -1732,7 +1789,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 	 * whether the server supports it, and then just try to use it if so.
 	 */
 	if (args->auth_info.flavor_len > 0) {
-		status = nfs_verify_authflavor(args, authlist, authlist_len);
+		status = nfs_verify_authflavors(args, authlist, authlist_len);
 		dfprintk(MOUNT, "NFS: using auth flavor %u\n",
 			 args->selected_flavor);
 		if (status)
@@ -2102,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options,
 
 	nfs_set_port(sap, &args->nfs_server.port, port);
 
-	if (args->auth_info.flavor_len > 1)
-		goto out_bad_auth;
-
 	return nfs_parse_devname(dev_name,
 				   &args->nfs_server.hostname,
 				   max_namelen,
@@ -2124,10 +2178,6 @@ out_invalid_transport_udp:
 out_no_address:
 	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
 	return -EINVAL;
-
-out_bad_auth:
-	dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n");
-	return -EINVAL;
 }
 
 static int
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 658104a..3ccfcec 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -592,9 +592,10 @@ struct nfs_renameres {
 };
 
 /* parsed sec= options */
+#define NFS_AUTH_INFO_MAX_FLAVORS 12 /* see fs/nfs/super.c */
 struct nfs_auth_info {
 	unsigned int            flavor_len;
-	rpc_authflavor_t        flavors[1];
+	rpc_authflavor_t        flavors[NFS_AUTH_INFO_MAX_FLAVORS];
 };
 
 /*
-- 
cgit v0.10.2


From 34751b9d04a221da2a74b27ba439f01c0ae30069 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Oct 2013 16:42:44 -0400
Subject: SUNRPC: Add correct rcu_dereference annotation in
 rpc_clnt_set_transport

rpc_clnt_set_transport should use rcu_derefence_protected(), as it is
only safe to be called with the rpc_clnt::cl_lock held.

Cc: Chuck Lever <Chuck.Lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f167d9c..759b78b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -272,7 +272,8 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt,
 	struct rpc_xprt *old;
 
 	spin_lock(&clnt->cl_lock);
-	old = clnt->cl_xprt;
+	old = rcu_dereference_protected(clnt->cl_xprt,
+			lockdep_is_held(&clnt->cl_lock));
 
 	if (!xprt_bound(xprt))
 		clnt->cl_autobind = 1;
-- 
cgit v0.10.2


From e3bfab18483a26649ff9cb605aa1410386b1e498 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 2 Oct 2013 09:48:15 -0400
Subject: sunrpc: comment typo fix

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 9928ba1..9deed17 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2872,8 +2872,8 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
 	if (args->bc_xprt->xpt_bc_xprt) {
 		/*
 		 * This server connection already has a backchannel
-		 * export; we can't create a new one, as we wouldn't be
-		 * able to match replies based on xid any more.  So,
+		 * transport; we can't create a new one, as we wouldn't
+		 * be able to match replies based on xid any more.  So,
 		 * reuse the already-existing one:
 		 */
 		 return args->bc_xprt->xpt_bc_xprt;
-- 
cgit v0.10.2


From 4f5829d72636513e23a5f76355536776bf390a8a Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Fri, 11 Oct 2013 17:15:54 -0300
Subject: nfs: Remove useless 'error' assignment

the 'error' variable was been assigned twice in vain.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index bb939ed..0c29b1b 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -493,7 +493,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	unsigned long long fileid;
 	struct dentry *sdentry;
 	struct rpc_task *task;
-	int            error = -EIO;
+	int            error = -EBUSY;
 
 	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -503,7 +503,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	/*
 	 * We don't allow a dentry to be silly-renamed twice.
 	 */
-	error = -EBUSY;
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
 		goto out;
 
-- 
cgit v0.10.2


From 54bcfa6682cfc7ee19dee0e94587f8f55a440df6 Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Mon, 14 Oct 2013 17:24:15 -0300
Subject: nfs: Use PTR_ERR_OR_ZERO in 'nfs41_callback_up' function

Use 'PTR_ERR_OR_ZERO()' rather than 'IS_ERR(...) ? PTR_ERR(...) : 0'.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 67cd732..073b4cf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv)
 		svc_xprt_put(serv->sv_bc_xprt);
 		serv->sv_bc_xprt = NULL;
 	}
-	dprintk("--> %s return %ld\n", __func__,
-		IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
+	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
 	return rqstp;
 }
 
-- 
cgit v0.10.2


From 6706246b22358b70b49e35a988a9bbfa830619ee Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Mon, 14 Oct 2013 17:24:16 -0300
Subject: nfs: Use PTR_ERR_OR_ZERO in 'nfs/nfs4super.c'

Use 'PTR_ERR_OR_ZERO()' rather than 'IS_ERR(...) ? PTR_ERR(...) : 0'.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index e26acdd..65ab0a0 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -261,9 +261,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 
 	res = nfs_follow_remote_path(root_mnt, export_path);
 
-	dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
-			IS_ERR(res) ? PTR_ERR(res) : 0,
-			IS_ERR(res) ? " [error]" : "");
+	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n",
+		 PTR_ERR_OR_ZERO(res),
+		 IS_ERR(res) ? " [error]" : "");
 	return res;
 }
 
@@ -319,9 +319,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
 	data->mnt_path = export_path;
 
 	res = nfs_follow_remote_path(root_mnt, export_path);
-	dprintk("<-- nfs4_referral_mount() = %ld%s\n",
-			IS_ERR(res) ? PTR_ERR(res) : 0,
-			IS_ERR(res) ? " [error]" : "");
+	dprintk("<-- nfs4_referral_mount() = %d%s\n",
+		PTR_ERR_OR_ZERO(res),
+		IS_ERR(res) ? " [error]" : "");
 	return res;
 }
 
-- 
cgit v0.10.2


From 8313164c36473193c8034de643dc32f35a22bf59 Mon Sep 17 00:00:00 2001
From: wangweidong <wangweidong1@huawei.com>
Date: Tue, 15 Oct 2013 11:44:30 +0800
Subject: SUNRPC: remove an unnecessary if statement

If req allocated failed just goto out_free, no need to check the
'i < num_prealloc'. There is just code simplification, no
functional changes.

Signed-off-by: Wang Weidong <wangweidong1@huawei.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 49535505..04199bc 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1104,11 +1104,9 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
 	for (i = 0; i < num_prealloc; i++) {
 		req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
 		if (!req)
-			break;
+			goto out_free;
 		list_add(&req->rq_list, &xprt->free);
 	}
-	if (i < num_prealloc)
-		goto out_free;
 	if (max_alloc > num_prealloc)
 		xprt->max_reqs = max_alloc;
 	else
-- 
cgit v0.10.2


From 5fccc5b52ee07d07a74ce53c6f174bff81e26a16 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Oct 2013 18:41:44 -0400
Subject: SUNRPC: gss_alloc_msg - choose _either_ a v0 message or a v1 message

Add the missing 'break' to ensure that we don't corrupt a legacy 'v0' type
message by appending the 'v1'.

Cc: Bruce Fields <bfields@fieldses.org>
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 0846566..cc24323 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -482,6 +482,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
 	switch (vers) {
 	case 0:
 		gss_encode_v0_msg(gss_msg);
+		break;
 	default:
 		gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
 	};
-- 
cgit v0.10.2


From 9d3a2260f0f4bd6379b0a0f131c743fff25b0029 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Oct 2013 18:18:00 -0400
Subject: SUNRPC: Fix buffer overflow checking in
 gss_encode_v0_msg/gss_encode_v1_msg

In gss_encode_v1_msg, it is pointless to BUG() after the overflow has
happened. Replace the existing sprintf()-based code with scnprintf(),
and warn if an overflow is ever triggered.

In gss_encode_v0_msg, replace the runtime BUG_ON() with an appropriate
compile-time BUILD_BUG_ON.

Reported-by: Bruce Fields <bfields@fieldses.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index cc24323..97912b4 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -420,41 +420,53 @@ static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg)
 	memcpy(gss_msg->databuf, &uid, sizeof(uid));
 	gss_msg->msg.data = gss_msg->databuf;
 	gss_msg->msg.len = sizeof(uid);
-	BUG_ON(sizeof(uid) > UPCALL_BUF_LEN);
+
+	BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf));
 }
 
-static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
+static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 				const char *service_name,
 				const char *target_name)
 {
 	struct gss_api_mech *mech = gss_msg->auth->mech;
 	char *p = gss_msg->databuf;
-	int len = 0;
-
-	gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ",
-				   mech->gm_name,
-				   from_kuid(&init_user_ns, gss_msg->uid));
-	p += gss_msg->msg.len;
+	size_t buflen = sizeof(gss_msg->databuf);
+	int len;
+
+	len = scnprintf(p, buflen, "mech=%s uid=%d ", mech->gm_name,
+			from_kuid(&init_user_ns, gss_msg->uid));
+	buflen -= len;
+	p += len;
+	gss_msg->msg.len = len;
 	if (target_name) {
-		len = sprintf(p, "target=%s ", target_name);
+		len = scnprintf(p, buflen, "target=%s ", target_name);
+		buflen -= len;
 		p += len;
 		gss_msg->msg.len += len;
 	}
 	if (service_name != NULL) {
-		len = sprintf(p, "service=%s ", service_name);
+		len = scnprintf(p, buflen, "service=%s ", service_name);
+		buflen -= len;
 		p += len;
 		gss_msg->msg.len += len;
 	}
 	if (mech->gm_upcall_enctypes) {
-		len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes);
+		len = scnprintf(p, buflen, "enctypes=%s ",
+				mech->gm_upcall_enctypes);
+		buflen -= len;
 		p += len;
 		gss_msg->msg.len += len;
 	}
-	len = sprintf(p, "\n");
+	len = scnprintf(p, buflen, "\n");
+	if (len == 0)
+		goto out_overflow;
 	gss_msg->msg.len += len;
 
 	gss_msg->msg.data = gss_msg->databuf;
-	BUG_ON(gss_msg->msg.len > UPCALL_BUF_LEN);
+	return 0;
+out_overflow:
+	WARN_ON_ONCE(1);
+	return -ENOMEM;
 }
 
 static struct gss_upcall_msg *
@@ -463,15 +475,15 @@ gss_alloc_msg(struct gss_auth *gss_auth,
 {
 	struct gss_upcall_msg *gss_msg;
 	int vers;
+	int err = -ENOMEM;
 
 	gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS);
 	if (gss_msg == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto err;
 	vers = get_pipe_version(gss_auth->net);
-	if (vers < 0) {
-		kfree(gss_msg);
-		return ERR_PTR(vers);
-	}
+	err = vers;
+	if (err < 0)
+		goto err_free_msg;
 	gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe;
 	INIT_LIST_HEAD(&gss_msg->list);
 	rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq");
@@ -484,9 +496,15 @@ gss_alloc_msg(struct gss_auth *gss_auth,
 		gss_encode_v0_msg(gss_msg);
 		break;
 	default:
-		gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
+		err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
+		if (err)
+			goto err_free_msg;
 	};
 	return gss_msg;
+err_free_msg:
+	kfree(gss_msg);
+err:
+	return ERR_PTR(err);
 }
 
 static struct gss_upcall_msg *
-- 
cgit v0.10.2


From a3f432bfd06a4ec3b812e32d3266e0d1ad75d008 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 15 Oct 2013 17:03:16 -0400
Subject: nfs: use IS_ROOT not DCACHE_DISCONNECTED

This check was added by Al Viro with
d9e80b7de91db05c1c4d2e5ebbfd70b3b3ba0e0f "nfs d_revalidate() is too
trigger-happy with d_drop()", with the explanation that we don't want to
remove the root of a disconnected tree, which will still be included on
the s_anon list.

But DCACHE_DISCONNECTED does *not* actually identify dentries that are
disconnected from the dentry tree or hashed on s_anon.  IS_ROOT() is the
way to do that.

Also add a comment from Al's commit to remind us why this check is
there.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 02b0df7..6cc51ae 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1139,7 +1139,13 @@ out_zap_parent:
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
 		nfs_zap_caches(inode);
-		if (dentry->d_flags & DCACHE_DISCONNECTED)
+		/*
+		 * We can't d_drop the root of a disconnected tree:
+		 * its d_hash is on the s_anon list and d_drop() would hide
+		 * it from shrink_dcache_for_unmount(), leading to busy
+		 * inodes on unmount and further oopses.
+		 */
+		if (IS_ROOT(dentry))
 			goto out_valid;
 	}
 	/* If we have submounts, don't unhash ! */
-- 
cgit v0.10.2


From 09c3e54635c85b3da44d3bc156619c1f1af3bb43 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Wed, 30 Oct 2013 13:32:15 +0800
Subject: SUNRPC: remove duplicated include from clnt.c

Remove duplicated include.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 759b78b..dab09da 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -31,7 +31,6 @@
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/un.h>
-#include <linux/rcupdate.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/addr.h>
-- 
cgit v0.10.2


From 93dc41bdc5c853916610576c6b48a1704959c70d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 31 Oct 2013 16:14:36 +1100
Subject: SUNRPC: close a rare race in xs_tcp_setup_socket.

We have one report of a crash in xs_tcp_setup_socket.
The call path to the crash is:

  xs_tcp_setup_socket -> inet_stream_connect -> lock_sock_nested.

The 'sock' passed to that last function is NULL.

The only way I can see this happening is a concurrent call to
xs_close:

  xs_close -> xs_reset_transport -> sock_release -> inet_release

inet_release sets:
   sock->sk = NULL;
inet_stream_connect calls
   lock_sock(sock->sk);
which gets NULL.

All calls to xs_close are protected by XPRT_LOCKED as are most
activations of the workqueue which runs xs_tcp_setup_socket.
The exception is xs_tcp_schedule_linger_timeout.

So presumably the timeout queued by the later fires exactly when some
other code runs xs_close().

To protect against this we can move the cancel_delayed_work_sync()
call from xs_destory() to xs_close().

As xs_close is never called from the worker scheduled on
->connect_worker, this can never deadlock.

Signed-off-by: NeilBrown <neilb@suse.de>
[Trond: Make it safe to call cancel_delayed_work_sync() on AF_LOCAL sockets]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 9deed17..a4709bb 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -835,6 +835,8 @@ static void xs_close(struct rpc_xprt *xprt)
 
 	dprintk("RPC:       xs_close xprt %p\n", xprt);
 
+	cancel_delayed_work_sync(&transport->connect_worker);
+
 	xs_reset_transport(transport);
 	xprt->reestablish_timeout = 0;
 
@@ -869,12 +871,8 @@ static void xs_local_destroy(struct rpc_xprt *xprt)
  */
 static void xs_destroy(struct rpc_xprt *xprt)
 {
-	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
-
 	dprintk("RPC:       xs_destroy xprt %p\n", xprt);
 
-	cancel_delayed_work_sync(&transport->connect_worker);
-
 	xs_local_destroy(xprt);
 }
 
@@ -1817,6 +1815,10 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
 }
 #endif
 
+static void xs_dummy_setup_socket(struct work_struct *work)
+{
+}
+
 static struct socket *xs_create_sock(struct rpc_xprt *xprt,
 		struct sock_xprt *transport, int family, int type, int protocol)
 {
@@ -2668,6 +2670,9 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
 	xprt->ops = &xs_local_ops;
 	xprt->timeout = &xs_local_default_timeout;
 
+	INIT_DELAYED_WORK(&transport->connect_worker,
+			xs_dummy_setup_socket);
+
 	switch (sun->sun_family) {
 	case AF_LOCAL:
 		if (sun->sun_path[0] != '/') {
-- 
cgit v0.10.2


From a1311d87fa034e0de580e3a65f2d5c2e7a1f55f3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 31 Oct 2013 09:18:49 -0400
Subject: SUNRPC: Cleanup xs_destroy()

There is no longer any need for a separate xs_local_destroy() helper.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a4709bb..17c8892 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -856,14 +856,6 @@ static void xs_tcp_close(struct rpc_xprt *xprt)
 		xs_tcp_shutdown(xprt);
 }
 
-static void xs_local_destroy(struct rpc_xprt *xprt)
-{
-	xs_close(xprt);
-	xs_free_peer_addresses(xprt);
-	xprt_free(xprt);
-	module_put(THIS_MODULE);
-}
-
 /**
  * xs_destroy - prepare to shutdown a transport
  * @xprt: doomed transport
@@ -873,7 +865,10 @@ static void xs_destroy(struct rpc_xprt *xprt)
 {
 	dprintk("RPC:       xs_destroy xprt %p\n", xprt);
 
-	xs_local_destroy(xprt);
+	xs_close(xprt);
+	xs_free_peer_addresses(xprt);
+	xprt_free(xprt);
+	module_put(THIS_MODULE);
 }
 
 static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
@@ -2513,7 +2508,7 @@ static struct rpc_xprt_ops xs_local_ops = {
 	.send_request		= xs_local_send_request,
 	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
 	.close			= xs_close,
-	.destroy		= xs_local_destroy,
+	.destroy		= xs_destroy,
 	.print_stats		= xs_local_print_stats,
 };
 
-- 
cgit v0.10.2


From 1acd1c301f4faae80f4d2c7bbd9a4553b131c0e3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 31 Oct 2013 13:03:04 -0400
Subject: nfs: fix inverted test for delegation in nfs4_reclaim_open_state

commit 6686390bab6a0e0 (NFS: remove incorrect "Lock reclaim failed!"
warning.) added a test for a delegation before checking to see if any
reclaimed locks failed. The test however is backward and is only doing
that check when a delegation is held instead of when one isn't.

Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Fixes: 6686390bab6a: NFS: remove incorrect "Lock reclaim failed!" warning.
Cc: stable@vger.kernel.org # 3.12
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 452f4c8..c8e729d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1476,7 +1476,7 @@ restart:
 		if (status >= 0) {
 			status = nfs4_reclaim_locks(state, ops);
 			if (status >= 0) {
-				if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) {
+				if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
 					spin_lock(&state->state_lock);
 					list_for_each_entry(lock, &state->lock_states, ls_locks) {
 						if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
-- 
cgit v0.10.2


From 12207f69b3ef4d6eea6a5568281e49f671977ab1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 1 Nov 2013 10:49:32 -0400
Subject: nfs: fix oops when trying to set SELinux label

Chao reported the following oops when testing labeled NFS:

BUG: unable to handle kernel NULL pointer dereference at           (null)
IP: [<ffffffffa0568703>] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4]
PGD 277bbd067 PUD 2777ea067 PMD 0
Oops: 0000 [#1] SMP
Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache sg coretemp kvm_intel kvm crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul iTCO_wdt glue_helper ablk_helper cryptd iTCO_vendor_support bnx2 pcspkr serio_raw i7core_edac cdc_ether microcode usbnet edac_core mii lpc_ich i2c_i801 mfd_core shpchp ioatdma dca acpi_cpufreq mperf nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sr_mod sd_mod cdrom crc_t10dif mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ata_generic ttm pata_acpi drm ata_piix libata megaraid_sas i2c_core dm_mirror dm_region_hash dm_log dm_mod
CPU: 4 PID: 25657 Comm: chcon Not tainted 3.10.0-33.el7.x86_64 #1
Hardware name: IBM System x3550 M3 -[7944OEJ]-/90Y4784     , BIOS -[D6E150CUS-1.11]- 02/08/2011
task: ffff880178397220 ti: ffff8801595d2000 task.ti: ffff8801595d2000
RIP: 0010:[<ffffffffa0568703>]  [<ffffffffa0568703>] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4]
RSP: 0018:ffff8801595d3888  EFLAGS: 00010296
RAX: 0000000000000000 RBX: ffff8801595d3b30 RCX: 0000000000000b4c
RDX: ffff8801595d3b30 RSI: ffff8801595d38e0 RDI: ffff880278b6ec00
RBP: ffff8801595d38c8 R08: ffff8801595d3b30 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801595d38e0
R13: ffff880277a4a780 R14: ffffffffa05686c0 R15: ffff8802765f206c
FS:  00007f2c68486800(0000) GS:ffff88027fc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000027651a000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Stack:
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 ffff880277865800 ffff880278b6ec00 ffff880277a4a780
 ffff8801595d3948 ffffffffa02ad926 ffff8801595d3b30 ffff8802765f206c
Call Trace:
 [<ffffffffa02ad926>] rpcauth_wrap_req+0x86/0xd0 [sunrpc]
 [<ffffffffa02a1d40>] ? call_connect+0xb0/0xb0 [sunrpc]
 [<ffffffffa02a1d40>] ? call_connect+0xb0/0xb0 [sunrpc]
 [<ffffffffa02a1ecb>] call_transmit+0x18b/0x290 [sunrpc]
 [<ffffffffa02a1d40>] ? call_connect+0xb0/0xb0 [sunrpc]
 [<ffffffffa02aae14>] __rpc_execute+0x84/0x400 [sunrpc]
 [<ffffffffa02ac40e>] rpc_execute+0x5e/0xa0 [sunrpc]
 [<ffffffffa02a2ea0>] rpc_run_task+0x70/0x90 [sunrpc]
 [<ffffffffa02a2f03>] rpc_call_sync+0x43/0xa0 [sunrpc]
 [<ffffffffa055284d>] _nfs4_do_set_security_label+0x11d/0x170 [nfsv4]
 [<ffffffffa0558861>] nfs4_set_security_label.isra.69+0xf1/0x1d0 [nfsv4]
 [<ffffffff815fca8b>] ? avc_alloc_node+0x24/0x125
 [<ffffffff815fcd2f>] ? avc_compute_av+0x1a3/0x1b5
 [<ffffffffa055897b>] nfs4_xattr_set_nfs4_label+0x3b/0x50 [nfsv4]
 [<ffffffff811bc772>] generic_setxattr+0x62/0x80
 [<ffffffff811bcfc3>] __vfs_setxattr_noperm+0x63/0x1b0
 [<ffffffff811bd1c5>] vfs_setxattr+0xb5/0xc0
 [<ffffffff811bd2fe>] setxattr+0x12e/0x1c0
 [<ffffffff811a4d22>] ? final_putname+0x22/0x50
 [<ffffffff811a4f2b>] ? putname+0x2b/0x40
 [<ffffffff811aa1cf>] ? user_path_at_empty+0x5f/0x90
 [<ffffffff8119bc29>] ? __sb_start_write+0x49/0x100
 [<ffffffff811bd66f>] SyS_lsetxattr+0x8f/0xd0
 [<ffffffff8160cf99>] system_call_fastpath+0x16/0x1b
Code: 48 8b 02 48 c7 45 c0 00 00 00 00 48 c7 45 c8 00 00 00 00 48 c7 45 d0 00 00 00 00 48 c7 45 d8 00 00 00 00 48 c7 45 e0 00 00 00 00 <48> 8b 00 48 8b 00 48 85 c0 0f 84 ae 00 00 00 48 8b 80 b8 03 00
RIP  [<ffffffffa0568703>] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4]
 RSP <ffff8801595d3888>
CR2: 0000000000000000

The problem is that _nfs4_do_set_security_label calls rpc_call_sync()
directly which fails to do any setup of the SEQUENCE call. Have it use
nfs4_call_sync() instead which does the right thing. While we're at it
change the name of "args" to "arg" to better match the pattern in
_nfs4_do_setattr.

Reported-by: Chao Ye <cye@redhat.com>
Cc: David Quigley <dpquigl@davequigley.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Cc: stable@vger.kernel.org # 3.11+
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b02c4cc..d436c57 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4657,7 +4657,7 @@ static int _nfs4_do_set_security_label(struct inode *inode,
 	struct iattr sattr = {0};
 	struct nfs_server *server = NFS_SERVER(inode);
 	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
-	struct nfs_setattrargs args = {
+	struct nfs_setattrargs arg = {
 		.fh             = NFS_FH(inode),
 		.iap            = &sattr,
 		.server		= server,
@@ -4671,14 +4671,14 @@ static int _nfs4_do_set_security_label(struct inode *inode,
 	};
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
-		.rpc_argp       = &args,
+		.rpc_argp       = &arg,
 		.rpc_resp       = &res,
 	};
 	int status;
 
-	nfs4_stateid_copy(&args.stateid, &zero_stateid);
+	nfs4_stateid_copy(&arg.stateid, &zero_stateid);
 
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (status)
 		dprintk("%s failed: %d\n", __func__, status);
 
-- 
cgit v0.10.2


From fcb63a9bd8427fc584229048ea14f1453dc9a2e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 1 Nov 2013 12:42:25 -0400
Subject: NFS: Fix a missing initialisation when reading the SELinux label

Ensure that _nfs4_do_get_security_label() also initialises the
SEQUENCE call correctly, by having it call into nfs4_call_sync().

Reported-by: Jeff Layton <jlayton@redhat.com>
Cc: stable@vger.kernel.org # 3.11+
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d436c57..7e28b5c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4602,7 +4602,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
 	struct nfs4_label label = {0, 0, buflen, buf};
 
 	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
-	struct nfs4_getattr_arg args = {
+	struct nfs4_getattr_arg arg = {
 		.fh		= NFS_FH(inode),
 		.bitmask	= bitmask,
 	};
@@ -4613,14 +4613,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
-		.rpc_argp	= &args,
+		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
 	int ret;
 
 	nfs_fattr_init(&fattr);
 
-	ret = rpc_call_sync(server->client, &msg, 0);
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);
 	if (ret)
 		return ret;
 	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
-- 
cgit v0.10.2


From f3f5a0f8cc40b942f4c0ae117df82eeb65f07d4d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Nov 2013 14:38:05 -0500
Subject: NFSv4.2: Fix a mismatch between Linux labeled NFS and the NFSv4.2
 spec

In the spec, the security label attribute id is '80', which means that
it should be bit number 80-64 == 16 in the 3rd word of the bitmap.

Fixes: 4488cc96c581: NFS: Add NFSv4.2 protocol constants
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Steve Dickson <steved@redhat.com>
Cc: stable@vger.kernel.org # 3.11+
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index c56fa8f..bfe6c37 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -395,7 +395,7 @@ enum lock_type4 {
 #define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
-#define FATTR4_WORD2_SECURITY_LABEL     (1UL << 17)
+#define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
 
 /* MDS threshold bitmap bits */
 #define THRESHOLD_RD                    (1UL << 0)
-- 
cgit v0.10.2


From 3da580aab9482f5bf88ea79a4bf0a511eea6fa44 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 2 Nov 2013 06:57:18 -0400
Subject: nfs: set security label when revalidating inode

Currently, we fetch the security label when revalidating an inode's
attributes, but don't apply it. This is in contrast to the readdir()
codepath where we do apply label changes.

Cc: Dave Quigley <dpquigl@davequigley.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 471ba59..26e77d8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -923,6 +923,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
+	nfs_setsecurity(inode, fattr, label);
+
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode));
-- 
cgit v0.10.2


From d204c5d2b8f7614350cd4609f4d4cdcf25494d74 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Nov 2013 13:23:59 -0500
Subject: NFSv4.2: encode_readdir - only ask for labels when doing readdirplus

Currently, if the server is doing NFSv4.2 and supports labeled NFS, then
our on-the-wire READDIR request ends up asking for the label information,
which is then ignored unless we're doing readdirplus.
This patch ensures that READDIR doesn't ask the server for label information
at all unless the readdir->bitmask contains the FATTR4_WORD2_SECURITY_LABEL
attribute, and the readdir->plus flag is set.

While we're at it, optimise away the 3rd bitmap field if it is zero.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f903389..5be2868 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int);
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 /* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
 #define	nfs4_label_maxsz	(4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
-#define encode_readdir_space 24
-#define encode_readdir_bitmask_sz 3
 #else
 #define	nfs4_label_maxsz	0
-#define encode_readdir_space 20
-#define encode_readdir_bitmask_sz 2
 #endif
 /* We support only one layout type per file system */
 #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
@@ -1581,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	};
 	uint32_t dircount = readdir->count >> 1;
 	__be32 *p, verf[2];
+	uint32_t attrlen = 0;
+	unsigned int i;
 
 	if (readdir->plus) {
 		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
@@ -1589,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
 			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
 			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
 		dircount >>= 1;
 	}
 	/* Use mounted_on_fileid only if the server supports it */
 	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
 		attrs[0] |= FATTR4_WORD0_FILEID;
+	for (i = 0; i < ARRAY_SIZE(attrs); i++) {
+		attrs[i] &= readdir->bitmask[i];
+		if (attrs[i] != 0)
+			attrlen = i+1;
+	}
 
 	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
 	encode_uint64(xdr, readdir->cookie);
 	encode_nfs4_verifier(xdr, &readdir->verifier);
-	p = reserve_space(xdr, encode_readdir_space);
+	p = reserve_space(xdr, 12 + (attrlen << 2));
 	*p++ = cpu_to_be32(dircount);
 	*p++ = cpu_to_be32(readdir->count);
-	*p++ = cpu_to_be32(encode_readdir_bitmask_sz);
-	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-	*p   = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
-	if (encode_readdir_bitmask_sz > 2) {
-		if (hdr->minorversion > 1)
-			attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
-		p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
-	}
+	*p++ = cpu_to_be32(attrlen);
+	for (i = 0; i < attrlen; i++)
+		*p++ = cpu_to_be32(attrs[i]);
 	memcpy(verf, readdir->verifier.data, sizeof(verf));
 
 	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
-- 
cgit v0.10.2


From b944dba31d2c23f1cf5d69a66cc3449e0ba78503 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Nov 2013 15:20:20 -0500
Subject: NFSv4: Sanity check the server reply in _nfs4_server_capabilities

We don't want to be setting capabilities and/or requesting attributes
that are not appropriate for the NFSv4 minor version.

- Ensure that we clear the NFS_CAP_SECURITY_LABEL capability when appropriate
- Ensure that we limit the attribute bitmasks to the mounted_on_fileid
  attribute and less for NFSv4.0
- Ensure that we limit the attribute bitmasks to suppattr_exclcreat and
  less for NFSv4.1
- Ensure that we limit it to change_sec_label or less for NFSv4.2

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7e28b5c..ed2a0e6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2706,6 +2706,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 		nfs4_close_state(ctx->state, ctx->mode);
 }
 
+#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
+#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL)
+
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 	struct nfs4_server_caps_arg args = {
@@ -2721,12 +2725,25 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (status == 0) {
+		/* Sanity check the server answers */
+		switch (server->nfs_client->cl_minorversion) {
+		case 0:
+			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
+			res.attr_bitmask[2] = 0;
+			break;
+		case 1:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+			break;
+		case 2:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+		}
 		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
 		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
 				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
 				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
-				NFS_CAP_CTIME|NFS_CAP_MTIME);
+				NFS_CAP_CTIME|NFS_CAP_MTIME|
+				NFS_CAP_SECURITY_LABEL);
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
 			server->caps |= NFS_CAP_ACLS;
 		if (res.has_links != 0)
@@ -2755,14 +2772,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 #endif
 		memcpy(server->attr_bitmask_nl, res.attr_bitmask,
 				sizeof(server->attr_bitmask));
+		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
 
-		if (server->caps & NFS_CAP_SECURITY_LABEL) {
-			server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-			res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-		}
 		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		server->cache_consistency_bitmask[2] = 0;
 		server->acl_bitmask = res.acl_bitmask;
 		server->fh_expire_type = res.fh_expire_type;
 	}
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index bfe6c37..c6f41b6 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -396,6 +396,8 @@ enum lock_type4 {
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
 #define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
+#define FATTR4_WORD2_CHANGE_SECURITY_LABEL \
+					(1UL << 17)
 
 /* MDS threshold bitmap bits */
 #define THRESHOLD_RD                    (1UL << 0)
-- 
cgit v0.10.2


From fab99ebe39fe7d11fbd9b5fb84f07432af9ba36f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Nov 2013 13:46:40 -0500
Subject: NFSv4.2: Remove redundant checks in
 nfs_setsecurity+nfs4_label_init_security

We already check for nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)
in nfs4_label_alloc()
We check the minor version in _nfs4_server_capabilities before setting
NFS_CAP_SECURITY_LABEL.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 26e77d8..18ab2da 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -274,12 +274,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 	if (label == NULL)
 		return;
 
-	if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
-		return;
-
-	if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
-		return;
-
 	if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
 		error = security_inode_notifysecctx(inode, label->label,
 				label->len);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ed2a0e6..5ab33c0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
 	if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
 		return NULL;
 
-	if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
-		return NULL;
-
 	err = security_dentry_init_security(dentry, sattr->ia_mode,
 				&dentry->d_name, (void **)&label->label, &label->len);
 	if (err == 0)
-- 
cgit v0.10.2