From 96f287b0cf512ee537826943c15b0b8647472f70 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 3 Dec 2009 08:09:56 -0500
Subject: NFS: BKL removal from the mount code...

None of the code in nfs_umount_begin() or nfs_remount() has any BKL
dependency.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 90be551..f0188ea 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -714,8 +714,6 @@ static void nfs_umount_begin(struct super_block *sb)
 	struct nfs_server *server;
 	struct rpc_clnt *rpc;
 
-	lock_kernel();
-
 	server = NFS_SB(sb);
 	/* -EIO all pending I/O */
 	rpc = server->client_acl;
@@ -724,8 +722,6 @@ static void nfs_umount_begin(struct super_block *sb)
 	rpc = server->client;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
-
-	unlock_kernel();
 }
 
 static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
@@ -1881,7 +1877,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	if (data == NULL)
 		return -ENOMEM;
 
-	lock_kernel();
 	/* fill out struct with values from existing mount */
 	data->flags = nfss->flags;
 	data->rsize = nfss->rsize;
@@ -1907,7 +1902,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	error = nfs_compare_remount_data(nfss, data);
 out:
 	kfree(data);
-	unlock_kernel();
 	return error;
 }
 
-- 
cgit v0.10.2


From d4e935bd67ca05db4119b67801d9ece6ae139f05 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 3 Dec 2009 15:58:33 -0500
Subject: The rpc server does not require that service threads take the BKL.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 293fa05..e66ec5d 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,11 +78,6 @@ nfs4_callback_svc(void *vrqstp)
 
 	set_freezable();
 
-	/*
-	 * FIXME: do we really need to run this under the BKL? If so, please
-	 * add a comment about what it's intended to protect.
-	 */
-	lock_kernel();
 	while (!kthread_should_stop()) {
 		/*
 		 * Listen for a request on the socket
@@ -104,7 +99,6 @@ nfs4_callback_svc(void *vrqstp)
 		preverr = err;
 		svc_process(rqstp);
 	}
-	unlock_kernel();
 	return 0;
 }
 
@@ -160,11 +154,6 @@ nfs41_callback_svc(void *vrqstp)
 
 	set_freezable();
 
-	/*
-	 * FIXME: do we really need to run this under the BKL? If so, please
-	 * add a comment about what it's intended to protect.
-	 */
-	lock_kernel();
 	while (!kthread_should_stop()) {
 		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
 		spin_lock_bh(&serv->sv_cb_lock);
@@ -183,7 +172,6 @@ nfs41_callback_svc(void *vrqstp)
 		}
 		finish_wait(&serv->sv_cb_waitq, &wq);
 	}
-	unlock_kernel();
 	return 0;
 }
 
-- 
cgit v0.10.2


From ee671b016fbfc26d69c3fe02e28706222beb1149 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: NFS: convert proto= option to use netids rather than a protoname

Solaris uses netids as values for the proto= option, so that when
someone specifies "tcp6" they get traffic over TCP + IPv6. Until
recently, this has never really been an issue for Linux since it didn't
support NFS over IPv6. The netid and the protocol name were generally
always the same (modulo any strange configuration in /etc/netconfig).

The solaris manpage documents their proto= option as:

    proto= _netid_ | rdma

This patch is intended to bring Linux closer to how the Solaris proto=
option works, by declaring a static netid mapping in the kernel and
converting the proto= and mountproto= options to follow it and display
the proper values in /proc/mounts.

Much of this functionality will need to be provided by a userspace
mount.nfs patch. Chuck Lever has a patch to change mount.nfs in
the same way. In principle, we could do *all* of this in userspace but
that would mean that the options in /proc/mounts may not match the
options used by userspace.

The alternative to the static mapping here is to add a mechanism to
upcall to userspace for netid's. I'm not opposed to that option, but
it'll probably mean more overhead (and quite a bit more code). Rather
than shoot for that at first, I figured it was probably better to
start simply.

Comments welcome.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f0188ea..bfad746 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -175,14 +175,16 @@ static const match_table_t nfs_mount_option_tokens = {
 };
 
 enum {
-	Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma,
+	Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
 
 	Opt_xprt_err
 };
 
 static const match_table_t nfs_xprt_protocol_tokens = {
 	{ Opt_xprt_udp, "udp" },
+	{ Opt_xprt_udp6, "udp6" },
 	{ Opt_xprt_tcp, "tcp" },
+	{ Opt_xprt_tcp6, "tcp6" },
 	{ Opt_xprt_rdma, "rdma" },
 
 	{ Opt_xprt_err, NULL }
@@ -492,6 +494,45 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
 	return sec_flavours[i].str;
 }
 
+static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss,
+				  int showdefaults)
+{
+	struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address;
+
+	seq_printf(m, ",mountproto=");
+	switch (sap->sa_family) {
+	case AF_INET:
+		switch (nfss->mountd_protocol) {
+		case IPPROTO_UDP:
+			seq_printf(m, RPCBIND_NETID_UDP);
+			break;
+		case IPPROTO_TCP:
+			seq_printf(m, RPCBIND_NETID_TCP);
+			break;
+		default:
+			if (showdefaults)
+				seq_printf(m, "auto");
+		}
+		break;
+	case AF_INET6:
+		switch (nfss->mountd_protocol) {
+		case IPPROTO_UDP:
+			seq_printf(m, RPCBIND_NETID_UDP6);
+			break;
+		case IPPROTO_TCP:
+			seq_printf(m, RPCBIND_NETID_TCP6);
+			break;
+		default:
+			if (showdefaults)
+				seq_printf(m, "auto");
+		}
+		break;
+	default:
+		if (showdefaults)
+			seq_printf(m, "auto");
+	}
+}
+
 static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 				    int showdefaults)
 {
@@ -518,17 +559,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 	if (nfss->mountd_port || showdefaults)
 		seq_printf(m, ",mountport=%u", nfss->mountd_port);
 
-	switch (nfss->mountd_protocol) {
-	case IPPROTO_UDP:
-		seq_printf(m, ",mountproto=udp");
-		break;
-	case IPPROTO_TCP:
-		seq_printf(m, ",mountproto=tcp");
-		break;
-	default:
-		if (showdefaults)
-			seq_printf(m, ",mountproto=auto");
-	}
+	nfs_show_mountd_netid(m, nfss, showdefaults);
 }
 
 /*
@@ -578,7 +609,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 			seq_puts(m, nfs_infop->nostr);
 	}
 	seq_printf(m, ",proto=%s",
-		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
+		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
 	if (version == 4) {
 		if (nfss->port != NFS_PORT)
 			seq_printf(m, ",port=%u", nfss->port);
@@ -883,6 +914,8 @@ static int nfs_parse_mount_options(char *raw,
 {
 	char *p, *string, *secdata;
 	int rc, sloppy = 0, invalid_option = 0;
+	unsigned short protofamily = AF_UNSPEC;
+	unsigned short mountfamily = AF_UNSPEC;
 
 	if (!raw) {
 		dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -1228,12 +1261,17 @@ static int nfs_parse_mount_options(char *raw,
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
 
+			protofamily = AF_INET;
 			switch (token) {
+			case Opt_xprt_udp6:
+				protofamily = AF_INET6;
 			case Opt_xprt_udp:
 				mnt->flags &= ~NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 				kfree(string);
 				break;
+			case Opt_xprt_tcp6:
+				protofamily = AF_INET6;
 			case Opt_xprt_tcp:
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
@@ -1261,10 +1299,15 @@ static int nfs_parse_mount_options(char *raw,
 					    nfs_xprt_protocol_tokens, args);
 			kfree(string);
 
+			mountfamily = AF_INET;
 			switch (token) {
+			case Opt_xprt_udp6:
+				mountfamily = AF_INET6;
 			case Opt_xprt_udp:
 				mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
 				break;
+			case Opt_xprt_tcp6:
+				mountfamily = AF_INET6;
 			case Opt_xprt_tcp:
 				mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
 				break;
@@ -1363,8 +1406,33 @@ static int nfs_parse_mount_options(char *raw,
 	if (!sloppy && invalid_option)
 		return 0;
 
+	/*
+	 * verify that any proto=/mountproto= options match the address
+	 * familiies in the addr=/mountaddr= options.
+	 */
+	if (protofamily != AF_UNSPEC &&
+	    protofamily != mnt->nfs_server.address.ss_family)
+		goto out_proto_mismatch;
+
+	if (mountfamily != AF_UNSPEC) {
+		if (mnt->mount_server.addrlen) {
+			if (mountfamily != mnt->mount_server.address.ss_family)
+				goto out_mountproto_mismatch;
+		} else {
+			if (mountfamily != mnt->nfs_server.address.ss_family)
+				goto out_mountproto_mismatch;
+		}
+	}
+
 	return 1;
 
+out_mountproto_mismatch:
+	printk(KERN_INFO "NFS: mount server address does not match mountproto= "
+			 "option\n");
+	return 0;
+out_proto_mismatch:
+	printk(KERN_INFO "NFS: server address does not match proto= option\n");
+	return 0;
 out_invalid_address:
 	printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
 	return 0;
-- 
cgit v0.10.2


From a01878aac57eac6eb4bf194788ab2cc440490d0f Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: NFS: reorder nfs4_sequence_regs to remove 8 bytes of padding on 64
 bits

reorder nfs4_sequence_args to remove 8 bytes of padding on 64 bit
builds.

The size of this structure drops to 24 bytes from 32 and reduces the
text size of nfs.ko.
On my x86_64 size reports

		text       data     bss
2.6.32-rc5 	200996	   8512	    432	 209940	  33414	nfs.ko
+patch 		200884	   8512	    432	 209828	  333a4	nfs.ko


Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 62f63fb..8c88037 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -170,8 +170,8 @@ struct nfs4_sequence_args {
 struct nfs4_sequence_res {
 	struct nfs4_session	*sr_session;
 	u8			sr_slotid;	/* slot used to send request */
-	unsigned long		sr_renewal_time;
 	int			sr_status;	/* sequence operation status */
+	unsigned long		sr_renewal_time;
 };
 
 struct nfs4_get_lease_time_args {
-- 
cgit v0.10.2


From dd1fd90fe65e2e642f0e58e2ff4849f317a6c43d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Display compressed (shorthand) IPv6 presentation addresses

Recent changes to snprintf() introduced the %pI6c formatter, which can
display an IPv6 address with standard shorthanding.  Using a
shorthanded address can save us a few bytes of memory for each stored
presentation address, or a few bytes on the wire when sending these in
a universal address.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index c7450c8..6dcdd25 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -55,16 +55,8 @@ static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap,
 
 	/*
 	 * RFC 4291, Section 2.2.1
-	 *
-	 * To keep the result as short as possible, especially
-	 * since we don't shorthand, we don't want leading zeros
-	 * in each halfword, so avoid %pI6.
 	 */
-	return snprintf(buf, buflen, "%x:%x:%x:%x:%x:%x:%x:%x",
-		ntohs(addr->s6_addr16[0]), ntohs(addr->s6_addr16[1]),
-		ntohs(addr->s6_addr16[2]), ntohs(addr->s6_addr16[3]),
-		ntohs(addr->s6_addr16[4]), ntohs(addr->s6_addr16[5]),
-		ntohs(addr->s6_addr16[6]), ntohs(addr->s6_addr16[7]));
+	return snprintf(buf, buflen, "%pI6c", addr);
 }
 
 static size_t rpc_ntop6(const struct sockaddr *sap,
-- 
cgit v0.10.2


From d250e190fb9b06f4c595eade88b3d0b705fb330a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: NFS: Display compressed (shorthand) IPv6 in /proc/mounts

Recent changes to snprintf() introduced the %pI6c formatter, which can
display an IPv6 address with standard shorthanding.  Use this new
formatter when displaying IPv6 server addresses in /proc/mounts.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bfad746..8370327 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -546,7 +546,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 	}
 	case AF_INET6: {
 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-		seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr);
+		seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr);
 		break;
 	}
 	default:
-- 
cgit v0.10.2


From dd47f96c077b4516727e497e4b6fd47a06778c0a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: NFS: Revert default r/wsize behavior

When the "rsize=" or "wsize=" mount options are not specified,
text-based mounts have slightly different behavior than legacy binary
mounts.  Text-based mounts use the smaller of the server's maximum
and the client's maximum, but binary mounts use the smaller of the
server's _preferred_ size and the client's maximum.

This difference is actually pretty subtle.  Most servers advertise
the same value as their maximum and their preferred transfer size, so
the end result is the same in most cases.

The reason for this difference is that for text-based mounts, if
r/wsize are not specified, they are set to the largest value supported
by the client.  For legacy mounts, the values are set to zero if these
options are not specified.

nfs_server_set_fsinfo() can negotiate the transfer size defaults
correctly in any case.  There's no need to specify any particular
value as default in the text-based option parsing logic.

Note that nfs4 doesn't use nfs_server_set_fsinfo(), but the mount.nfs4
command does set rsize and wsize to 0 if the user didn't specify these
options.  So, make the same change for text-based NFSv4 mounts.

Thanks to James Pearson <james-p@moving-picture.com> for reporting and
diagnosing the problem.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 8370327..ce907ef 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -761,8 +761,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data) {
-		data->rsize		= NFS_MAX_FILE_IO_SIZE;
-		data->wsize		= NFS_MAX_FILE_IO_SIZE;
 		data->acregmin		= NFS_DEF_ACREGMIN;
 		data->acregmax		= NFS_DEF_ACREGMAX;
 		data->acdirmin		= NFS_DEF_ACDIRMIN;
-- 
cgit v0.10.2


From 206a134b4d8abf57cd34dffacf993869355b9aac Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Check explicitly for tk_status == 0 in call_transmit_status()

The success case, where task->tk_status == 0, is by far the most
frequent case in call_transmit_status().

The default: arm of the switch statement in call_transmit_status()
handles the 0 case.  default: was moved close to the top of the switch
statement in call_transmit_status() under the theory that the compiler
places object code for the earliest arms of a switch statement first,
making the CPU do less work.

The default: arm of a switch statement, however, is executed only
after all the other cases have been checked.  Even if the compiler
rearranges the object code, the default: arm is the "last resort",
meaning all of the other cases have been explicitly exhausted.  That
makes the current arrangement about as inefficient as it gets for the
common case.

To fix this, add an explicit check for zero before the switch
statement.  That forces the compiler to do the zero check first, no
matter what optimizations it might try to do to the switch statement.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 38829e2..7bcd931 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1180,10 +1180,22 @@ static void
 call_transmit_status(struct rpc_task *task)
 {
 	task->tk_action = call_status;
+
+	/*
+	 * Common case: success.  Force the compiler to put this
+	 * test first.
+	 */
+	if (task->tk_status == 0) {
+		xprt_end_transmit(task);
+		rpc_task_force_reencode(task);
+		return;
+	}
+
 	switch (task->tk_status) {
 	case -EAGAIN:
 		break;
 	default:
+		dprint_status(task);
 		xprt_end_transmit(task);
 		/*
 		 * Special cases: if we've been waiting on the
-- 
cgit v0.10.2


From 09a21c4102c8f7893368553273d39c0cadedf9af Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Allow RPCs to fail quickly if the server is unreachable

The kernel sometimes makes RPC calls to services that aren't running.
Because the kernel's RPC client always assumes the hard retry semantic
when reconnecting a connection-oriented RPC transport, the underlying
reconnect logic takes a long while to time out, even though the remote
may have responded immediately with ECONNREFUSED.

In certain cases, like upcalls to our local rpcbind daemon, or for NFS
mount requests, we'd like the kernel to fail immediately if the remote
service isn't reachable.  This allows another transport to be tried
immediately, or the pending request can be abandoned quickly.

Introduce a per-request flag which controls how call_transmit_status()
behaves when request transmission fails because the server cannot be
reached.

We don't want soft connection semantics to apply to other errors.  The
default case of the switch statement in call_transmit_status() no
longer falls through; the fall through code is copied to the default
case, and a "break;" is added.

The transport's connection re-establishment timeout is also ignored for
such requests.  We want the request to fail immediately, so the
reconnect delay is skipped.  Additionally, we don't want a connect
failure here to further increase the reconnect timeout value, since
this request will not be retried.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 4010977..1906782 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -130,12 +130,14 @@ struct rpc_task_setup {
 #define RPC_TASK_DYNAMIC	0x0080		/* task was kmalloc'ed */
 #define RPC_TASK_KILLED		0x0100		/* task was killed */
 #define RPC_TASK_SOFT		0x0200		/* Use soft timeouts */
+#define RPC_TASK_SOFTCONN	0x0400		/* Fail if can't connect */
 
 #define RPC_IS_ASYNC(t)		((t)->tk_flags & RPC_TASK_ASYNC)
 #define RPC_IS_SWAPPER(t)	((t)->tk_flags & RPC_TASK_SWAPPER)
 #define RPC_DO_ROOTOVERRIDE(t)	((t)->tk_flags & RPC_TASK_ROOTCREDS)
 #define RPC_ASSASSINATED(t)	((t)->tk_flags & RPC_TASK_KILLED)
 #define RPC_IS_SOFT(t)		((t)->tk_flags & RPC_TASK_SOFT)
+#define RPC_IS_SOFTCONN(t)	((t)->tk_flags & RPC_TASK_SOFTCONN)
 
 #define RPC_TASK_RUNNING	0
 #define RPC_TASK_QUEUED		1
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7bcd931..68a2358 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1197,6 +1197,8 @@ call_transmit_status(struct rpc_task *task)
 	default:
 		dprint_status(task);
 		xprt_end_transmit(task);
+		rpc_task_force_reencode(task);
+		break;
 		/*
 		 * Special cases: if we've been waiting on the
 		 * socket's write_space() callback, or if the
@@ -1204,11 +1206,16 @@ call_transmit_status(struct rpc_task *task)
 		 * then hold onto the transport lock.
 		 */
 	case -ECONNREFUSED:
-	case -ECONNRESET:
-	case -ENOTCONN:
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
+		if (RPC_IS_SOFTCONN(task)) {
+			xprt_end_transmit(task);
+			rpc_exit(task, task->tk_status);
+			break;
+		}
+	case -ECONNRESET:
+	case -ENOTCONN:
 	case -EPIPE:
 		rpc_task_force_reencode(task);
 	}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 37c5475..ff312f8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2033,7 +2033,7 @@ static void xs_connect(struct rpc_task *task)
 	if (xprt_test_and_set_connecting(xprt))
 		return;
 
-	if (transport->sock != NULL) {
+	if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
 		dprintk("RPC:       xs_connect delayed xprt %p for %lu "
 				"seconds\n",
 				xprt, xprt->reestablish_timeout / HZ);
-- 
cgit v0.10.2


From 5a46211540a83871196489247f57da2bdde58d87 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Simplify synopsis of rpcb_local_clnt()

Clean up: At one point, rpcb_local_clnt() handled IPv6 loopback
addresses too, but it doesn't any more; only IPv4 loopback is used
now.  Get rid of the @addr and @addrlen arguments to
rpcb_local_clnt().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 830faf4..28f50da 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -163,13 +163,12 @@ static const struct sockaddr_in rpcb_inaddr_loopback = {
 	.sin_port		= htons(RPCBIND_PORT),
 };
 
-static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr,
-					  size_t addrlen, u32 version)
+static struct rpc_clnt *rpcb_create_local(u32 version)
 {
 	struct rpc_create_args args = {
 		.protocol	= XPRT_TRANSPORT_UDP,
-		.address	= addr,
-		.addrsize	= addrlen,
+		.address	= (struct sockaddr *)&rpcb_inaddr_loopback,
+		.addrsize	= sizeof(rpcb_inaddr_loopback),
 		.servername	= "localhost",
 		.program	= &rpcb_program,
 		.version	= version,
@@ -211,14 +210,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 
 static int rpcb_register_call(const u32 version, struct rpc_message *msg)
 {
-	struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback;
-	size_t addrlen = sizeof(rpcb_inaddr_loopback);
 	struct rpc_clnt *rpcb_clnt;
 	int result, error = 0;
 
 	msg->rpc_resp = &result;
 
-	rpcb_clnt = rpcb_create_local(addr, addrlen, version);
+	rpcb_clnt = rpcb_create_local(version);
 	if (!IS_ERR(rpcb_clnt)) {
 		error = rpc_call_sync(rpcb_clnt, msg, 0);
 		rpc_shutdown_client(rpcb_clnt);
-- 
cgit v0.10.2


From c526611dd631b2802b6b0221ffb306c5fa25c86c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Use a cached RPC client and transport for rpcbind upcalls

The kernel's rpcbind client creates and deletes an rpc_clnt and its
underlying transport socket for every upcall to the local rpcbind
daemon.

When starting a typical NFS server on IPv4 and IPv6, the NFS service
itself does three upcalls (one per version) times two upcalls (one
per transport) times two upcalls (one per address family), making 12,
plus another one for the initial call to unregister previous NFS
services.  Starting the NLM service adds an additional 13 upcalls,
for similar reasons.

(Currently the NFS service doesn't start IPv6 listeners, but it will
soon enough).

Instead, let's create an rpc_clnt for rpcbind upcalls during the
first local rpcbind query, and cache it.  This saves the overhead of
creating and destroying an rpc_clnt and a socket for every upcall.

The new logic also prevents the kernel from attempting an RPCB_SET or
RPCB_UNSET if it knows from the start that the local portmapper does
not support rpcbind protocol version 4.  This will cut down on the
number of rpcbind upcalls in legacy environments.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 28f50da..116db74 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -20,6 +20,7 @@
 #include <linux/in6.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <linux/mutex.h>
 #include <net/ipv6.h>
 
 #include <linux/sunrpc/clnt.h>
@@ -110,6 +111,9 @@ static void			rpcb_getport_done(struct rpc_task *, void *);
 static void			rpcb_map_release(void *data);
 static struct rpc_program	rpcb_program;
 
+static struct rpc_clnt *	rpcb_local_clnt;
+static struct rpc_clnt *	rpcb_local_clnt4;
+
 struct rpcbind_args {
 	struct rpc_xprt *	r_xprt;
 
@@ -163,7 +167,13 @@ static const struct sockaddr_in rpcb_inaddr_loopback = {
 	.sin_port		= htons(RPCBIND_PORT),
 };
 
-static struct rpc_clnt *rpcb_create_local(u32 version)
+static DEFINE_MUTEX(rpcb_create_local_mutex);
+
+/*
+ * Returns zero on success, otherwise a negative errno value
+ * is returned.
+ */
+static int rpcb_create_local(void)
 {
 	struct rpc_create_args args = {
 		.protocol	= XPRT_TRANSPORT_UDP,
@@ -171,12 +181,46 @@ static struct rpc_clnt *rpcb_create_local(u32 version)
 		.addrsize	= sizeof(rpcb_inaddr_loopback),
 		.servername	= "localhost",
 		.program	= &rpcb_program,
-		.version	= version,
+		.version	= RPCBVERS_2,
 		.authflavor	= RPC_AUTH_UNIX,
 		.flags		= RPC_CLNT_CREATE_NOPING,
 	};
+	struct rpc_clnt *clnt, *clnt4;
+	int result = 0;
+
+	if (rpcb_local_clnt)
+		return result;
+
+	mutex_lock(&rpcb_create_local_mutex);
+	if (rpcb_local_clnt)
+		goto out;
+
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt)) {
+		dprintk("RPC:       failed to create local rpcbind "
+				"client (errno %ld).\n", PTR_ERR(clnt));
+		result = -PTR_ERR(clnt);
+		goto out;
+	}
 
-	return rpc_create(&args);
+	/*
+	 * This results in an RPC ping.  On systems running portmapper,
+	 * the v4 ping will fail.  Proceed anyway, but disallow rpcb
+	 * v4 upcalls.
+	 */
+	clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
+	if (IS_ERR(clnt4)) {
+		dprintk("RPC:       failed to create local rpcbind v4 "
+				"cleint (errno %ld).\n", PTR_ERR(clnt4));
+		clnt4 = NULL;
+	}
+
+	rpcb_local_clnt = clnt;
+	rpcb_local_clnt4 = clnt4;
+
+out:
+	mutex_unlock(&rpcb_create_local_mutex);
+	return result;
 }
 
 static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
@@ -208,20 +252,13 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 	return rpc_create(&args);
 }
 
-static int rpcb_register_call(const u32 version, struct rpc_message *msg)
+static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg)
 {
-	struct rpc_clnt *rpcb_clnt;
 	int result, error = 0;
 
 	msg->rpc_resp = &result;
 
-	rpcb_clnt = rpcb_create_local(version);
-	if (!IS_ERR(rpcb_clnt)) {
-		error = rpc_call_sync(rpcb_clnt, msg, 0);
-		rpc_shutdown_client(rpcb_clnt);
-	} else
-		error = PTR_ERR(rpcb_clnt);
-
+	error = rpc_call_sync(clnt, msg, 0);
 	if (error < 0) {
 		dprintk("RPC:       failed to contact local rpcbind "
 				"server (errno %d).\n", -error);
@@ -276,6 +313,11 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
 	struct rpc_message msg = {
 		.rpc_argp	= &map,
 	};
+	int error;
+
+	error = rpcb_create_local();
+	if (error)
+		return error;
 
 	dprintk("RPC:       %sregistering (%u, %u, %d, %u) with local "
 			"rpcbind\n", (port ? "" : "un"),
@@ -285,7 +327,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
 	if (port)
 		msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET];
 
-	return rpcb_register_call(RPCBVERS_2, &msg);
+	return rpcb_register_call(rpcb_local_clnt, &msg);
 }
 
 /*
@@ -310,7 +352,7 @@ static int rpcb_register_inet4(const struct sockaddr *sap,
 	if (port)
 		msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
 
-	result = rpcb_register_call(RPCBVERS_4, msg);
+	result = rpcb_register_call(rpcb_local_clnt4, msg);
 	kfree(map->r_addr);
 	return result;
 }
@@ -337,7 +379,7 @@ static int rpcb_register_inet6(const struct sockaddr *sap,
 	if (port)
 		msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
 
-	result = rpcb_register_call(RPCBVERS_4, msg);
+	result = rpcb_register_call(rpcb_local_clnt4, msg);
 	kfree(map->r_addr);
 	return result;
 }
@@ -353,7 +395,7 @@ static int rpcb_unregister_all_protofamilies(struct rpc_message *msg)
 	map->r_addr = "";
 	msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
 
-	return rpcb_register_call(RPCBVERS_4, msg);
+	return rpcb_register_call(rpcb_local_clnt4, msg);
 }
 
 /**
@@ -411,6 +453,13 @@ int rpcb_v4_register(const u32 program, const u32 version,
 	struct rpc_message msg = {
 		.rpc_argp	= &map,
 	};
+	int error;
+
+	error = rpcb_create_local();
+	if (error)
+		return error;
+	if (rpcb_local_clnt4 == NULL)
+		return -EPROTONOSUPPORT;
 
 	if (address == NULL)
 		return rpcb_unregister_all_protofamilies(&msg);
@@ -1024,3 +1073,15 @@ static struct rpc_program rpcb_program = {
 	.version	= rpcb_version,
 	.stats		= &rpcb_stats,
 };
+
+/**
+ * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister
+ *
+ */
+void cleanup_rpcb_clnt(void)
+{
+	if (rpcb_local_clnt4)
+		rpc_shutdown_client(rpcb_local_clnt4);
+	if (rpcb_local_clnt)
+		rpc_shutdown_client(rpcb_local_clnt);
+}
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 8cce921..f438347 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -24,6 +24,8 @@
 
 extern struct cache_detail ip_map_cache, unix_gid_cache;
 
+extern void cleanup_rpcb_clnt(void);
+
 static int __init
 init_sunrpc(void)
 {
@@ -53,6 +55,7 @@ out:
 static void __exit
 cleanup_sunrpc(void)
 {
+	cleanup_rpcb_clnt();
 	rpcauth_remove_module();
 	cleanup_socket_xprt();
 	svc_cleanup_xprt_sock();
-- 
cgit v0.10.2


From 2a76b3bfa22993fc09166bd6a8db0dbe902b6813 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Use TCP for local rpcbind upcalls

Use TCP with the soft connect semantic for local rpcbind upcalls so
the kernel can detect immediately if the local rpcbind daemon is not
running.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 116db74..4011540 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -176,7 +176,7 @@ static DEFINE_MUTEX(rpcb_create_local_mutex);
 static int rpcb_create_local(void)
 {
 	struct rpc_create_args args = {
-		.protocol	= XPRT_TRANSPORT_UDP,
+		.protocol	= XPRT_TRANSPORT_TCP,
 		.address	= (struct sockaddr *)&rpcb_inaddr_loopback,
 		.addrsize	= sizeof(rpcb_inaddr_loopback),
 		.servername	= "localhost",
@@ -258,7 +258,7 @@ static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg)
 
 	msg->rpc_resp = &result;
 
-	error = rpc_call_sync(clnt, msg, 0);
+	error = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);
 	if (error < 0) {
 		dprintk("RPC:       failed to contact local rpcbind "
 				"server (errno %d).\n", -error);
-- 
cgit v0.10.2


From 012da158f636347c4eb28fd1e1cca020fef5e54d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Use soft connects for autobinding over TCP

Autobinding is handled by the rpciod process, not in user processes
that are generating regular RPC requests.  Thus autobinding is usually
not affected by signals targetting user processes, such as KILL or
timer expiration events.

In addition, an RPC request generated by a user process that has
RPC_TASK_SOFTCONN set and needs to perform an autobind will hang if
the remote rpcbind service is not available.

For rpcbind queries on connection-oriented transports, let's use the
new soft connect semantic to return control to the user's process
quickly, if the kernel's rpcbind client can't connect to the remote
rpcbind service.

Logic is introduced in call_bind_status() to handle connection errors
that occurred during an asynchronous rpcbind query.  The logic
abandons the rpcbind query if the RPC request has SOFTCONN set, and
retries after a few seconds in the normal case.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 68a2358..4b76ef93 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1060,7 +1060,7 @@ call_bind_status(struct rpc_task *task)
 		goto retry_timeout;
 	case -EPFNOSUPPORT:
 		/* server doesn't support any rpcbind version we know of */
-		dprintk("RPC: %5u remote rpcbind service unavailable\n",
+		dprintk("RPC: %5u unrecognized remote rpcbind service\n",
 				task->tk_pid);
 		break;
 	case -EPROTONOSUPPORT:
@@ -1069,6 +1069,21 @@ call_bind_status(struct rpc_task *task)
 		task->tk_status = 0;
 		task->tk_action = call_bind;
 		return;
+	case -ECONNREFUSED:		/* connection problems */
+	case -ECONNRESET:
+	case -ENOTCONN:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EPIPE:
+		dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
+				task->tk_pid, task->tk_status);
+		if (!RPC_IS_SOFTCONN(task)) {
+			rpc_delay(task, 5*HZ);
+			goto retry_timeout;
+		}
+		status = task->tk_status;
+		break;
 	default:
 		dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
 				task->tk_pid, -task->tk_status);
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 4011540..3e3772d 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -537,7 +537,7 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
 		.rpc_message = &msg,
 		.callback_ops = &rpcb_getport_ops,
 		.callback_data = map,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_SOFTCONN,
 	};
 
 	return rpc_run_task(&task_setup_data);
-- 
cgit v0.10.2


From caabea8a565fb4e16f8e58e16bb9d535e591b709 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: Use soft connect semantics when performing RPC ping

Currently, if a remote RPC service is unreachable, an RPC ping will
hang until the underlying transport connect attempt times out.  A more
desirable behavior might be to have the ping fail immediately so upper
layers can recover appropriately.

In the case of an NFS mount, for instance, this would mean the
mount(2) system call could fail immediately if the server isn't
listening, rather than hanging uninterruptibly for more than 3
minutes.

Change rpc_ping() so that it fails immediately for connection-oriented
transports.  rpc_create() will then fail immediately for such
transports if an RPC ping was requested.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 4b76ef93..97931d9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -79,7 +79,7 @@ static void	call_connect_status(struct rpc_task *task);
 
 static __be32	*rpc_encode_header(struct rpc_task *task);
 static __be32	*rpc_verify_header(struct rpc_task *task);
-static int	rpc_ping(struct rpc_clnt *clnt, int flags);
+static int	rpc_ping(struct rpc_clnt *clnt);
 
 static void rpc_register_client(struct rpc_clnt *clnt)
 {
@@ -340,7 +340,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 		return clnt;
 
 	if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
-		int err = rpc_ping(clnt, RPC_TASK_SOFT);
+		int err = rpc_ping(clnt);
 		if (err != 0) {
 			rpc_shutdown_client(clnt);
 			return ERR_PTR(err);
@@ -528,7 +528,7 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
 	clnt->cl_prog     = program->number;
 	clnt->cl_vers     = version->number;
 	clnt->cl_stats    = program->stats;
-	err = rpc_ping(clnt, RPC_TASK_SOFT);
+	err = rpc_ping(clnt);
 	if (err != 0) {
 		rpc_shutdown_client(clnt);
 		clnt = ERR_PTR(err);
@@ -1709,14 +1709,14 @@ static struct rpc_procinfo rpcproc_null = {
 	.p_decode = rpcproc_decode_null,
 };
 
-static int rpc_ping(struct rpc_clnt *clnt, int flags)
+static int rpc_ping(struct rpc_clnt *clnt)
 {
 	struct rpc_message msg = {
 		.rpc_proc = &rpcproc_null,
 	};
 	int err;
 	msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
-	err = rpc_call_sync(clnt, &msg, flags);
+	err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN);
 	put_rpccred(msg.rpc_cred);
 	return err;
 }
-- 
cgit v0.10.2


From 3a28becc35e5c8f1fabb707bcd8a473712653de6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: SUNRPC: soft connect semantics for UDP

Introduce soft connect behavior for UDP transports.  In this case, a
major timeout returns ETIMEDOUT instead of EIO.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 97931d9..154034b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1380,6 +1380,10 @@ call_timeout(struct rpc_task *task)
 	dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
 	task->tk_timeouts++;
 
+	if (RPC_IS_SOFTCONN(task)) {
+		rpc_exit(task, -ETIMEDOUT);
+		return;
+	}
 	if (RPC_IS_SOFT(task)) {
 		if (clnt->cl_chatty)
 			printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
-- 
cgit v0.10.2


From 9c4c761a629caa5572c1a29a8288416070d5d6b7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: NFSv4.1: Handle NFSv4.1 session errors in the lock recovery code

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2ef4fec..3004089 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -877,6 +877,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_NO_GRACE:
 			case -NFS4ERR_STALE_CLIENTID:
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 				goto out;
 			default:
 				printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
@@ -959,6 +963,10 @@ restart:
 			case -NFS4ERR_NO_GRACE:
 				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
 			case -NFS4ERR_STALE_CLIENTID:
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 				goto out_err;
 		}
 		nfs4_put_open_state(state);
-- 
cgit v0.10.2


From e48de5ec25b37d42292c876c1d3337766aae89bd Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: nfs: remove unnecessary check from nfs_rename()

VFS already checks if both source and target are directories.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7cb2985..b5fae19 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1601,13 +1601,8 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * silly-rename. If the silly-rename succeeds, the
 	 * copied dentry is hashed and becomes the new target.
 	 */
-	if (!new_inode)
-		goto go_ahead;
-	if (S_ISDIR(new_inode->i_mode)) {
-		error = -EISDIR;
-		if (!S_ISDIR(old_inode->i_mode))
-			goto out;
-	} else if (atomic_read(&new_dentry->d_count) > 2) {
+	if (new_inode && !S_ISDIR(new_inode->i_mode) &&
+	    atomic_read(&new_dentry->d_count) > 2) {
 		int err;
 		/* copy the target dentry's name */
 		dentry = d_alloc(new_dentry->d_parent,
@@ -1627,7 +1622,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out;
 	}
 
-go_ahead:
 	/*
 	 * ... prune child dentries and writebacks if needed.
 	 */
-- 
cgit v0.10.2


From 28f79a1a695e7a5b00af3b6713b449e08581ffbb Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: nfs: fix comments in nfs_rename()

Comments are wrong or out of date.  In particular d_drop() doesn't
free the inode it just unhashes the dentry.  And if target is a
directory then it is not checked for being busy.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b5fae19..11d0c4c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1581,7 +1581,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/*
 	 * To prevent any new references to the target during the rename,
-	 * we unhash the dentry and free the inode in advance.
+	 * we unhash the dentry in advance.
 	 */
 	if (!d_unhashed(new_dentry)) {
 		d_drop(new_dentry);
@@ -1594,12 +1594,10 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 atomic_read(&new_dentry->d_count));
 
 	/*
-	 * First check whether the target is busy ... we can't
-	 * safely do _any_ rename if the target is in use.
-	 *
-	 * For files, make a copy of the dentry and then do a 
-	 * silly-rename. If the silly-rename succeeds, the
-	 * copied dentry is hashed and becomes the new target.
+	 * For non-directories, check whether the target is busy and if so,
+	 * make a copy of the dentry and then do a silly-rename. If the
+	 * silly-rename succeeds, the copied dentry is hashed and becomes
+	 * the new target.
 	 */
 	if (new_inode && !S_ISDIR(new_inode->i_mode) &&
 	    atomic_read(&new_dentry->d_count) > 2) {
-- 
cgit v0.10.2


From 27226104e60964f21717e0f452cecd45c85a64c6 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: nfs: dont unhash target if renaming a directory

Move unhashing the target to after the check for existence and being a
non-directory.

If renaming a directory then the VFS already unhashes the target if it
is not busy.  If it's busy then acquiring more references during the
rename makes no difference.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 11d0c4c..76b7f53 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1579,15 +1579,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *dentry = NULL, *rehash = NULL;
 	int error = -EBUSY;
 
-	/*
-	 * To prevent any new references to the target during the rename,
-	 * we unhash the dentry in advance.
-	 */
-	if (!d_unhashed(new_dentry)) {
-		d_drop(new_dentry);
-		rehash = new_dentry;
-	}
-
 	dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
 		 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
@@ -1599,25 +1590,36 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * silly-rename succeeds, the copied dentry is hashed and becomes
 	 * the new target.
 	 */
-	if (new_inode && !S_ISDIR(new_inode->i_mode) &&
-	    atomic_read(&new_dentry->d_count) > 2) {
-		int err;
-		/* copy the target dentry's name */
-		dentry = d_alloc(new_dentry->d_parent,
-				 &new_dentry->d_name);
-		if (!dentry)
-			goto out;
+	if (new_inode && !S_ISDIR(new_inode->i_mode)) {
+		/*
+		 * To prevent any new references to the target during the
+		 * rename, we unhash the dentry in advance.
+		 */
+		if (!d_unhashed(new_dentry)) {
+			d_drop(new_dentry);
+			rehash = new_dentry;
+		}
 
-		/* silly-rename the existing target ... */
-		err = nfs_sillyrename(new_dir, new_dentry);
-		if (!err) {
-			new_dentry = rehash = dentry;
-			new_inode = NULL;
-			/* instantiate the replacement target */
-			d_instantiate(new_dentry, NULL);
-		} else if (atomic_read(&new_dentry->d_count) > 1)
-			/* dentry still busy? */
-			goto out;
+		if (atomic_read(&new_dentry->d_count) > 2) {
+			int err;
+
+			/* copy the target dentry's name */
+			dentry = d_alloc(new_dentry->d_parent,
+					 &new_dentry->d_name);
+			if (!dentry)
+				goto out;
+
+			/* silly-rename the existing target ... */
+			err = nfs_sillyrename(new_dir, new_dentry);
+			if (!err) {
+				new_dentry = rehash = dentry;
+				new_inode = NULL;
+				/* instantiate the replacement target */
+				d_instantiate(new_dentry, NULL);
+			} else if (atomic_read(&new_dentry->d_count) > 1)
+				/* dentry still busy? */
+				goto out;
+		}
 	}
 
 	/*
-- 
cgit v0.10.2


From 24e93025ee434a58d35e5abb283c5bcc9a13e477 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: nfs: clean up sillyrenaming in nfs_rename()

The d_instantiate(new_dentry, NULL) is superfluous, the dentry is
already negative.  Rehashing this dummy dentry isn't needed either,
d_move() works fine on an unhashed target.

The re-checking for busy after a failed nfs_sillyrename() is bogus
too: new_dentry->d_count < 2 would be a bug here.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 76b7f53..2c5ace4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1611,14 +1611,11 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 			/* silly-rename the existing target ... */
 			err = nfs_sillyrename(new_dir, new_dentry);
-			if (!err) {
-				new_dentry = rehash = dentry;
-				new_inode = NULL;
-				/* instantiate the replacement target */
-				d_instantiate(new_dentry, NULL);
-			} else if (atomic_read(&new_dentry->d_count) > 1)
-				/* dentry still busy? */
+			if (err)
 				goto out;
+
+			new_dentry = dentry;
+			new_inode = NULL;
 		}
 	}
 
-- 
cgit v0.10.2


From 44ed3556bad809797f7b06a4a88918fd8a23d6fe Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 3 Dec 2009 15:58:56 -0500
Subject: NFS4ERR_FILE_OPEN handling in Linux/NFS

NFS4ERR_FILE_OPEN is return by the server when an operation cannot be
performed because the file is currently open and local (to the server)
semantics prohibit the operation while the file is open.
A typical case is a RENAME operation on an MS-Windows platform, which
prevents rename while the file is open.

While it is possible that such a condition is transitory, it is also
very possible that the file will be held open for an extended period
of time thus preventing the operation.

The current behaviour of Linux/NFS is to retry the operation
indefinitely.  This is not appropriate - we do not expect a rename to
take an arbitrary amount of time to complete.

Rather, and error should be returned.  The most obvious error code
would be EBUSY, which is a legal at least for 'rename' and 'unlink',
and accurately captures the reason for the error.

This patch allows a few retries until about 2 seconds have elapsed,
then returns EBUSY.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 741a562..40da0d5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -275,6 +275,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 			/* FALLTHROUGH */
 #endif /* !defined(CONFIG_NFS_V4_1) */
 		case -NFS4ERR_FILE_OPEN:
+			if (exception->timeout > HZ) {
+				/* We have retried a decent amount, time to
+				 * fail
+				 */
+				ret = -EBUSY;
+				break;
+			}
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
 			ret = nfs4_delay(server->client, &exception->timeout);
-- 
cgit v0.10.2