From f2ebb3a921c1ca1e2ddd9242e95a1989a50c4c68 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 27 Feb 2014 09:35:45 -0500
Subject: smarter propagate_mnt()

The current mainline has copies propagated to *all* nodes, then
tears down the copies we made for nodes that do not contain
counterparts of the desired mountpoint.  That sets the right
propagation graph for the copies (at teardown time we move
the slaves of removed node to a surviving peer or directly
to master), but we end up paying a fairly steep price in
useless allocations.  It's fairly easy to create a situation
where N calls of mount(2) create exactly N bindings, with
O(N^2) vfsmounts allocated and freed in process.

Fortunately, it is possible to avoid those allocations/freeings.
The trick is to create copies in the right order and find which
one would've eventually become a master with the current algorithm.
It turns out to be possible in O(nodes getting propagation) time
and with no extra allocations at all.

One part is that we need to make sure that eventual master will be
created before its slaves, so we need to walk the propagation
tree in a different order - by peer groups.  And iterate through
the peers before dealing with the next group.

Another thing is finding the (earlier) copy that will be a master
of one we are about to create; to do that we are (temporary) marking
the masters of mountpoints we are attaching the copies to.

Either we are in a peer of the last mountpoint we'd dealt with,
or we have the following situation: we are attaching to mountpoint M,
the last copy S_0 had been attached to M_0 and there are sequences
S_0...S_n, M_0...M_n such that S_{i+1} is a master of S_{i},
S_{i} mounted on M{i} and we need to create a slave of the first S_{k}
such that M is getting propagation from M_{k}.  It means that the master
of M_{k} will be among the sequence of masters of M.  On the
other hand, the nearest marked node in that sequence will either
be the master of M_{k} or the master of M_{k-1} (the latter -
in the case if M_{k-1} is a slave of something M gets propagation
from, but in a wrong peer group).

So we go through the sequence of masters of M until we find
a marked one (P).  Let N be the one before it.  Then we go through
the sequence of masters of S_0 until we find one (say, S) mounted
on a node D that has P as master and check if D is a peer of N.
If it is, S will be the master of new copy, if not - the master of S
will be.

That's it for the hard part; the rest is fairly simple.  Iterator
is in next_group(), handling of one prospective mountpoint is
propagate_one().

It seems to survive all tests and gives a noticably better performance
than the current mainline for setups that are seriously using shared
subtrees.

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 2ffc5a2..65233a5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -885,7 +885,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 			goto out_free;
 	}
 
-	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
 	/* Don't allow unprivileged users to change mount flags */
 	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
 		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
@@ -1661,9 +1661,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		if (err)
 			goto out;
 		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+		lock_mount_hash();
 		if (err)
 			goto out_cleanup_ids;
-		lock_mount_hash();
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
 			set_mnt_shared(p);
 	} else {
@@ -1690,6 +1690,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	return 0;
 
  out_cleanup_ids:
+	while (!hlist_empty(&tree_list)) {
+		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+		umount_tree(child, 0);
+	}
+	unlock_mount_hash();
 	cleanup_group_ids(source_mnt, NULL);
  out:
 	return err;
@@ -2044,7 +2049,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
+	mnt_flags &= ~MNT_INTERNAL_FLAGS;
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
diff --git a/fs/pnode.c b/fs/pnode.c
index 88396df..302bf22 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m,
 	}
 }
 
-/*
- * return the source mount to be used for cloning
- *
- * @dest 	the current destination mount
- * @last_dest  	the last seen destination mount
- * @last_src  	the last seen source mount
- * @type	return CL_SLAVE if the new mount has to be
- * 		cloned as a slave.
- */
-static struct mount *get_source(struct mount *dest,
-				struct mount *last_dest,
-				struct mount *last_src,
-				int *type)
+static struct mount *next_group(struct mount *m, struct mount *origin)
 {
-	struct mount *p_last_src = NULL;
-	struct mount *p_last_dest = NULL;
-
-	while (last_dest != dest->mnt_master) {
-		p_last_dest = last_dest;
-		p_last_src = last_src;
-		last_dest = last_dest->mnt_master;
-		last_src = last_src->mnt_master;
+	while (1) {
+		while (1) {
+			struct mount *next;
+			if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+				return first_slave(m);
+			next = next_peer(m);
+			if (m->mnt_group_id == origin->mnt_group_id) {
+				if (next == origin)
+					return NULL;
+			} else if (m->mnt_slave.next != &next->mnt_slave)
+				break;
+			m = next;
+		}
+		/* m is the last peer */
+		while (1) {
+			struct mount *master = m->mnt_master;
+			if (m->mnt_slave.next != &master->mnt_slave_list)
+				return next_slave(m);
+			m = next_peer(master);
+			if (master->mnt_group_id == origin->mnt_group_id)
+				break;
+			if (master->mnt_slave.next == &m->mnt_slave)
+				break;
+			m = master;
+		}
+		if (m == origin)
+			return NULL;
 	}
+}
 
-	if (p_last_dest) {
-		do {
-			p_last_dest = next_peer(p_last_dest);
-		} while (IS_MNT_NEW(p_last_dest));
-		/* is that a peer of the earlier? */
-		if (dest == p_last_dest) {
-			*type = CL_MAKE_SHARED;
-			return p_last_src;
+/* all accesses are serialized by namespace_sem */
+static struct user_namespace *user_ns;
+static struct mount *last_dest, *last_source, *dest_master;
+static struct mountpoint *mp;
+static struct hlist_head *list;
+
+static int propagate_one(struct mount *m)
+{
+	struct mount *child;
+	int type;
+	/* skip ones added by this propagate_mnt() */
+	if (IS_MNT_NEW(m))
+		return 0;
+	/* skip if mountpoint isn't covered by it */
+	if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
+		return 0;
+	if (m->mnt_group_id == last_dest->mnt_group_id) {
+		type = CL_MAKE_SHARED;
+	} else {
+		struct mount *n, *p;
+		for (n = m; ; n = p) {
+			p = n->mnt_master;
+			if (p == dest_master || IS_MNT_MARKED(p)) {
+				while (last_dest->mnt_master != p) {
+					last_source = last_source->mnt_master;
+					last_dest = last_source->mnt_parent;
+				}
+				if (n->mnt_group_id != last_dest->mnt_group_id) {
+					last_source = last_source->mnt_master;
+					last_dest = last_source->mnt_parent;
+				}
+				break;
+			}
 		}
+		type = CL_SLAVE;
+		/* beginning of peer group among the slaves? */
+		if (IS_MNT_SHARED(m))
+			type |= CL_MAKE_SHARED;
 	}
-	/* slave of the earlier, then */
-	*type = CL_SLAVE;
-	/* beginning of peer group among the slaves? */
-	if (IS_MNT_SHARED(dest))
-		*type |= CL_MAKE_SHARED;
-	return last_src;
+		
+	/* Notice when we are propagating across user namespaces */
+	if (m->mnt_ns->user_ns != user_ns)
+		type |= CL_UNPRIVILEGED;
+	child = copy_tree(last_source, last_source->mnt.mnt_root, type);
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+	mnt_set_mountpoint(m, mp, child);
+	last_dest = m;
+	last_source = child;
+	if (m->mnt_master != dest_master) {
+		read_seqlock_excl(&mount_lock);
+		SET_MNT_MARK(m->mnt_master);
+		read_sequnlock_excl(&mount_lock);
+	}
+	hlist_add_head(&child->mnt_hash, list);
+	return 0;
 }
 
 /*
@@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest,
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
-	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
-	struct mount *m, *child;
+	struct mount *m, *n;
 	int ret = 0;
-	struct mount *prev_dest_mnt = dest_mnt;
-	struct mount *prev_src_mnt  = source_mnt;
-	HLIST_HEAD(tmp_list);
-
-	for (m = propagation_next(dest_mnt, dest_mnt); m;
-			m = propagation_next(m, dest_mnt)) {
-		int type;
-		struct mount *source;
-
-		if (IS_MNT_NEW(m))
-			continue;
-
-		source =  get_source(m, prev_dest_mnt, prev_src_mnt, &type);
-
-		/* Notice when we are propagating across user namespaces */
-		if (m->mnt_ns->user_ns != user_ns)
-			type |= CL_UNPRIVILEGED;
-
-		child = copy_tree(source, source->mnt.mnt_root, type);
-		if (IS_ERR(child)) {
-			ret = PTR_ERR(child);
-			tmp_list = *tree_list;
-			tmp_list.first->pprev = &tmp_list.first;
-			INIT_HLIST_HEAD(tree_list);
+
+	/*
+	 * we don't want to bother passing tons of arguments to
+	 * propagate_one(); everything is serialized by namespace_sem,
+	 * so globals will do just fine.
+	 */
+	user_ns = current->nsproxy->mnt_ns->user_ns;
+	last_dest = dest_mnt;
+	last_source = source_mnt;
+	mp = dest_mp;
+	list = tree_list;
+	dest_master = dest_mnt->mnt_master;
+
+	/* all peers of dest_mnt, except dest_mnt itself */
+	for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
+		ret = propagate_one(n);
+		if (ret)
 			goto out;
-		}
+	}
 
-		if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
-			mnt_set_mountpoint(m, dest_mp, child);
-			hlist_add_head(&child->mnt_hash, tree_list);
-		} else {
-			/*
-			 * This can happen if the parent mount was bind mounted
-			 * on some subdirectory of a shared/slave mount.
-			 */
-			hlist_add_head(&child->mnt_hash, &tmp_list);
-		}
-		prev_dest_mnt = m;
-		prev_src_mnt  = child;
+	/* all slave groups */
+	for (m = next_group(dest_mnt, dest_mnt); m;
+			m = next_group(m, dest_mnt)) {
+		/* everything in that slave group */
+		n = m;
+		do {
+			ret = propagate_one(n);
+			if (ret)
+				goto out;
+			n = next_peer(n);
+		} while (n != m);
 	}
 out:
-	lock_mount_hash();
-	while (!hlist_empty(&tmp_list)) {
-		child = hlist_entry(tmp_list.first, struct mount, mnt_hash);
-		umount_tree(child, 0);
+	read_seqlock_excl(&mount_lock);
+	hlist_for_each_entry(n, tree_list, mnt_hash) {
+		m = n->mnt_parent;
+		if (m->mnt_master != dest_mnt->mnt_master)
+			CLEAR_MNT_MARK(m->mnt_master);
 	}
-	unlock_mount_hash();
+	read_sequnlock_excl(&mount_lock);
 	return ret;
 }
 
diff --git a/fs/pnode.h b/fs/pnode.h
index fc28a27..4a24635 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -16,6 +16,9 @@
 #define IS_MNT_NEW(m)  (!(m)->mnt_ns)
 #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
 #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
 
 #define CL_EXPIRE    		0x01
 #define CL_SLAVE     		0x02
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 371d346..839bac2 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -44,6 +44,8 @@ struct mnt_namespace;
 #define MNT_SHARED_MASK	(MNT_UNBINDABLE)
 #define MNT_PROPAGATION_MASK	(MNT_SHARED | MNT_UNBINDABLE)
 
+#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
+			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
 
 #define MNT_INTERNAL	0x4000
 
@@ -51,6 +53,7 @@ struct mnt_namespace;
 #define MNT_LOCKED		0x800000
 #define MNT_DOOMED		0x1000000
 #define MNT_SYNC_UMOUNT		0x2000000
+#define MNT_MARKED		0x4000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
-- 
cgit v0.10.2


From c7999c3627bc6d49aa6fb9451063938cfd2c2082 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 27 Feb 2014 14:40:10 -0500
Subject: reduce m_start() cost...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/mount.h b/fs/mount.h
index b29e42f..d55297f 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,7 +10,7 @@ struct mnt_namespace {
 	struct user_namespace	*user_ns;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
-	int event;
+	u64 event;
 };
 
 struct mnt_pcp {
@@ -104,6 +104,9 @@ struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
+	void *cached_mount;
+	u64 cached_event;
+	loff_t cached_index;
 };
 
 #define proc_mounts(p) (container_of((p), struct proc_mounts, m))
diff --git a/fs/namespace.c b/fs/namespace.c
index 65233a5..a66aff5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str)
 }
 __setup("mphash_entries=", set_mphash_entries);
 
-static int event;
+static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 static DEFINE_SPINLOCK(mnt_id_lock);
@@ -1100,14 +1100,29 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	struct proc_mounts *p = proc_mounts(m);
 
 	down_read(&namespace_sem);
-	return seq_list_start(&p->ns->list, *pos);
+	if (p->cached_event == p->ns->event) {
+		void *v = p->cached_mount;
+		if (*pos == p->cached_index)
+			return v;
+		if (*pos == p->cached_index + 1) {
+			v = seq_list_next(v, &p->ns->list, &p->cached_index);
+			return p->cached_mount = v;
+		}
+	}
+
+	p->cached_event = p->ns->event;
+	p->cached_mount = seq_list_start(&p->ns->list, *pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct proc_mounts *p = proc_mounts(m);
 
-	return seq_list_next(v, &p->ns->list, pos);
+	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
 }
 
 static void m_stop(struct seq_file *m, void *v)
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 7be26f0..1a81373 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	p->root = root;
 	p->m.poll_event = ns->event;
 	p->show = show;
+	p->cached_event = ~0ULL;
 
 	return 0;
 
-- 
cgit v0.10.2


From 964ea96ebaa4014a3d4bc9d6950b460899e3814d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Mar 2014 20:33:08 -0500
Subject: usbip: don't open-code sockfd_lookup/sockfd_put

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/staging/usbip/stub_dev.c b/drivers/staging/usbip/stub_dev.c
index 76a1ff0..2e2ccef 100644
--- a/drivers/staging/usbip/stub_dev.c
+++ b/drivers/staging/usbip/stub_dev.c
@@ -86,7 +86,6 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr,
 	struct stub_device *sdev = dev_get_drvdata(dev);
 	int sockfd = 0;
 	struct socket *socket;
-	ssize_t err = -EINVAL;
 
 	if (!sdev) {
 		dev_err(dev, "sdev is null\n");
@@ -96,6 +95,7 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr,
 	sscanf(buf, "%d", &sockfd);
 
 	if (sockfd != -1) {
+		int err;
 		dev_info(dev, "stub up\n");
 
 		spin_lock_irq(&sdev->ud.lock);
@@ -105,7 +105,7 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr,
 			goto err;
 		}
 
-		socket = sockfd_to_socket(sockfd);
+		socket = sockfd_lookup(sockfd, &err);
 		if (!socket)
 			goto err;
 
@@ -138,7 +138,7 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr,
 
 err:
 	spin_unlock_irq(&sdev->ud.lock);
-	return err;
+	return -EINVAL;
 }
 static DEVICE_ATTR(usbip_sockfd, S_IWUSR, NULL, store_sockfd);
 
@@ -208,7 +208,7 @@ static void stub_shutdown_connection(struct usbip_device *ud)
 	 * not touch NULL socket.
 	 */
 	if (ud->tcp_socket) {
-		fput(ud->tcp_socket->file);
+		sockfd_put(ud->tcp_socket);
 		ud->tcp_socket = NULL;
 	}
 
diff --git a/drivers/staging/usbip/usbip_common.c b/drivers/staging/usbip/usbip_common.c
index 96552e3..e010939 100644
--- a/drivers/staging/usbip/usbip_common.c
+++ b/drivers/staging/usbip/usbip_common.c
@@ -400,31 +400,6 @@ err:
 }
 EXPORT_SYMBOL_GPL(usbip_recv);
 
-struct socket *sockfd_to_socket(unsigned int sockfd)
-{
-	struct socket *socket;
-	struct file *file;
-	struct inode *inode;
-
-	file = fget(sockfd);
-	if (!file) {
-		pr_err("invalid sockfd\n");
-		return NULL;
-	}
-
-	inode = file_inode(file);
-
-	if (!inode || !S_ISSOCK(inode->i_mode)) {
-		fput(file);
-		return NULL;
-	}
-
-	socket = SOCKET_I(inode);
-
-	return socket;
-}
-EXPORT_SYMBOL_GPL(sockfd_to_socket);
-
 /* there may be more cases to tweak the flags. */
 static unsigned int tweak_transfer_flags(unsigned int flags)
 {
diff --git a/drivers/staging/usbip/usbip_common.h b/drivers/staging/usbip/usbip_common.h
index 7e6c543..9f86588 100644
--- a/drivers/staging/usbip/usbip_common.h
+++ b/drivers/staging/usbip/usbip_common.h
@@ -314,7 +314,6 @@ void usbip_dump_urb(struct urb *purb);
 void usbip_dump_header(struct usbip_header *pdu);
 
 int usbip_recv(struct socket *sock, void *buf, int size);
-struct socket *sockfd_to_socket(unsigned int sockfd);
 
 void usbip_pack_pdu(struct usbip_header *pdu, struct urb *urb, int cmd,
 		    int pack);
diff --git a/drivers/staging/usbip/vhci_hcd.c b/drivers/staging/usbip/vhci_hcd.c
index 72391ef..99dd2b1 100644
--- a/drivers/staging/usbip/vhci_hcd.c
+++ b/drivers/staging/usbip/vhci_hcd.c
@@ -789,7 +789,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
 
 	/* active connection is closed */
 	if (vdev->ud.tcp_socket) {
-		fput(vdev->ud.tcp_socket->file);
+		sockfd_put(vdev->ud.tcp_socket);
 		vdev->ud.tcp_socket = NULL;
 	}
 	pr_info("release socket\n");
@@ -836,7 +836,7 @@ static void vhci_device_reset(struct usbip_device *ud)
 	vdev->udev = NULL;
 
 	if (ud->tcp_socket) {
-		fput(ud->tcp_socket->file);
+		sockfd_put(ud->tcp_socket);
 		ud->tcp_socket = NULL;
 	}
 	ud->status = VDEV_ST_NULL;
diff --git a/drivers/staging/usbip/vhci_sysfs.c b/drivers/staging/usbip/vhci_sysfs.c
index 0141bc3..baba127 100644
--- a/drivers/staging/usbip/vhci_sysfs.c
+++ b/drivers/staging/usbip/vhci_sysfs.c
@@ -175,6 +175,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr,
 	struct socket *socket;
 	int sockfd = 0;
 	__u32 rhport = 0, devid = 0, speed = 0;
+	int err;
 
 	/*
 	 * @rhport: port number of vhci_hcd
@@ -192,8 +193,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr,
 		return -EINVAL;
 
 	/* Extract socket from fd. */
-	/* The correct way to clean this up is to fput(socket->file). */
-	socket = sockfd_to_socket(sockfd);
+	socket = sockfd_lookup(sockfd, &err);
 	if (!socket)
 		return -EINVAL;
 
@@ -209,7 +209,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr,
 		spin_unlock(&vdev->ud.lock);
 		spin_unlock(&the_controller->lock);
 
-		fput(socket->file);
+		sockfd_put(socket);
 
 		dev_err(dev, "port %d already used\n", rhport);
 		return -EINVAL;
-- 
cgit v0.10.2


From 09aaacf02a3e88870ed5cad038a5bc822c947904 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Mar 2014 20:39:00 -0500
Subject: vhost: don't open-code sockfd_put()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e1e22e0..be414d2 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -818,9 +818,9 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	vhost_dev_cleanup(&n->dev, false);
 	vhost_net_vq_reset(n);
 	if (tx_sock)
-		fput(tx_sock->file);
+		sockfd_put(tx_sock);
 	if (rx_sock)
-		fput(rx_sock->file);
+		sockfd_put(rx_sock);
 	/* Make sure no callbacks are outstanding */
 	synchronize_rcu_bh();
 	/* We do an extra flush before freeing memory,
@@ -860,7 +860,7 @@ static struct socket *get_raw_socket(int fd)
 	}
 	return sock;
 err:
-	fput(sock->file);
+	sockfd_put(sock);
 	return ERR_PTR(r);
 }
 
@@ -966,7 +966,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 
 	if (oldsock) {
 		vhost_net_flush_vq(n, index);
-		fput(oldsock->file);
+		sockfd_put(oldsock);
 	}
 
 	mutex_unlock(&n->dev.mutex);
@@ -978,7 +978,7 @@ err_used:
 	if (ubufs)
 		vhost_net_ubuf_put_wait_and_free(ubufs);
 err_ubufs:
-	fput(sock->file);
+	sockfd_put(sock);
 err_vq:
 	mutex_unlock(&vq->mutex);
 err:
@@ -1009,9 +1009,9 @@ static long vhost_net_reset_owner(struct vhost_net *n)
 done:
 	mutex_unlock(&n->dev.mutex);
 	if (tx_sock)
-		fput(tx_sock->file);
+		sockfd_put(tx_sock);
 	if (rx_sock)
-		fput(rx_sock->file);
+		sockfd_put(rx_sock);
 	return err;
 }
 
-- 
cgit v0.10.2


From e25115786ee540fc428a14872ebd4f56252aba32 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Mar 2014 20:41:36 -0500
Subject: switch nbd to sockfd_lookup/sockfd_put

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 55298db..3a70ea2 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -630,37 +630,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 	}
  
 	case NBD_CLEAR_SOCK: {
-		struct file *file;
-
+		struct socket *sock = nbd->sock;
 		nbd->sock = NULL;
-		file = nbd->file;
-		nbd->file = NULL;
 		nbd_clear_que(nbd);
 		BUG_ON(!list_empty(&nbd->queue_head));
 		BUG_ON(!list_empty(&nbd->waiting_queue));
 		kill_bdev(bdev);
-		if (file)
-			fput(file);
+		if (sock)
+			sockfd_put(sock);
 		return 0;
 	}
 
 	case NBD_SET_SOCK: {
-		struct file *file;
-		if (nbd->file)
+		struct socket *sock;
+		int err;
+		if (nbd->sock)
 			return -EBUSY;
-		file = fget(arg);
-		if (file) {
-			struct inode *inode = file_inode(file);
-			if (S_ISSOCK(inode->i_mode)) {
-				nbd->file = file;
-				nbd->sock = SOCKET_I(inode);
-				if (max_part > 0)
-					bdev->bd_invalidated = 1;
-				nbd->disconnect = 0; /* we're connected now */
-				return 0;
-			} else {
-				fput(file);
-			}
+		sock = sockfd_lookup(arg, &err);
+		if (sock) {
+			nbd->sock = sock;
+			if (max_part > 0)
+				bdev->bd_invalidated = 1;
+			nbd->disconnect = 0; /* we're connected now */
+			return 0;
 		}
 		return -EINVAL;
 	}
@@ -697,12 +689,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 
 	case NBD_DO_IT: {
 		struct task_struct *thread;
-		struct file *file;
+		struct socket *sock;
 		int error;
 
 		if (nbd->pid)
 			return -EBUSY;
-		if (!nbd->file)
+		if (!nbd->sock)
 			return -EINVAL;
 
 		mutex_unlock(&nbd->tx_lock);
@@ -731,15 +723,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (error)
 			return error;
 		sock_shutdown(nbd, 0);
-		file = nbd->file;
-		nbd->file = NULL;
+		sock = nbd->sock;
+		nbd->sock = NULL;
 		nbd_clear_que(nbd);
 		dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
 		kill_bdev(bdev);
 		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 		set_device_ro(bdev, false);
-		if (file)
-			fput(file);
+		if (sock)
+			sockfd_put(sock);
 		nbd->flags = 0;
 		nbd->bytesize = 0;
 		bdev->bd_inode->i_size = 0;
@@ -875,9 +867,7 @@ static int __init nbd_init(void)
 
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
-		nbd_dev[i].file = NULL;
 		nbd_dev[i].magic = NBD_MAGIC;
-		nbd_dev[i].flags = 0;
 		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
 		spin_lock_init(&nbd_dev[i].queue_lock);
 		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index ae4981e..f62f78a 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -24,8 +24,7 @@ struct request;
 struct nbd_device {
 	int flags;
 	int harderror;		/* Code of hard error			*/
-	struct socket * sock;
-	struct file * file; 	/* If == NULL, device is not ready, yet	*/
+	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
 	int magic;
 
 	spinlock_t queue_lock;
-- 
cgit v0.10.2


From 44ba8406d0005400e56c6b6a279a669eb761d1b8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 6 Mar 2014 17:41:01 -0500
Subject: ncpfs: switch to sockfd_lookup()/sockfd_put()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2cf2ebe..ceeca64 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -468,9 +468,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 {
 	struct ncp_mount_data_kernel data;
 	struct ncp_server *server;
-	struct file *ncp_filp;
 	struct inode *root_inode;
-	struct inode *sock_inode;
 	struct socket *sock;
 	int error;
 	int default_bufsize;
@@ -539,18 +537,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
 	    !gid_valid(data.gid))
 		goto out;
-	error = -EBADF;
-	ncp_filp = fget(data.ncp_fd);
-	if (!ncp_filp)
-		goto out;
-	error = -ENOTSOCK;
-	sock_inode = file_inode(ncp_filp);
-	if (!S_ISSOCK(sock_inode->i_mode))
-		goto out_fput;
-	sock = SOCKET_I(sock_inode);
+	sock = sockfd_lookup(data.ncp_fd, &error);
 	if (!sock)
-		goto out_fput;
-		
+		goto out;
+
 	if (sock->type == SOCK_STREAM)
 		default_bufsize = 0xF000;
 	else
@@ -572,27 +562,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (error)
 		goto out_fput;
 
-	server->ncp_filp = ncp_filp;
 	server->ncp_sock = sock;
 	
 	if (data.info_fd != -1) {
-		struct socket *info_sock;
-
-		error = -EBADF;
-		server->info_filp = fget(data.info_fd);
-		if (!server->info_filp)
-			goto out_bdi;
-		error = -ENOTSOCK;
-		sock_inode = file_inode(server->info_filp);
-		if (!S_ISSOCK(sock_inode->i_mode))
-			goto out_fput2;
-		info_sock = SOCKET_I(sock_inode);
+		struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
 		if (!info_sock)
-			goto out_fput2;
+			goto out_bdi;
+		server->info_sock = info_sock;
 		error = -EBADFD;
 		if (info_sock->type != SOCK_STREAM)
 			goto out_fput2;
-		server->info_sock = info_sock;
 	}
 
 /*	server->lock = 0;	*/
@@ -764,17 +743,12 @@ out_nls:
 	mutex_destroy(&server->root_setup_lock);
 	mutex_destroy(&server->mutex);
 out_fput2:
-	if (server->info_filp)
-		fput(server->info_filp);
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
 out_bdi:
 	bdi_destroy(&server->bdi);
 out_fput:
-	/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
-	 * 
-	 * The previously used put_filp(ncp_filp); was bogus, since
-	 * it doesn't perform proper unlocking.
-	 */
-	fput(ncp_filp);
+	sockfd_put(sock);
 out:
 	put_pid(data.wdog_pid);
 	sb->s_fs_info = NULL;
@@ -807,9 +781,9 @@ static void ncp_put_super(struct super_block *sb)
 	mutex_destroy(&server->root_setup_lock);
 	mutex_destroy(&server->mutex);
 
-	if (server->info_filp)
-		fput(server->info_filp);
-	fput(server->ncp_filp);
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
+	sockfd_put(server->ncp_sock);
 	kill_pid(server->m.wdog_pid, SIGTERM, 1);
 	put_pid(server->m.wdog_pid);
 
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index b81e97a..7fa17e4 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -45,9 +45,7 @@ struct ncp_server {
 
 	__u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
 
-	struct file *ncp_filp;	/* File pointer to ncp socket */
 	struct socket *ncp_sock;/* ncp socket */
-	struct file *info_filp;
 	struct socket *info_sock;
 
 	u8 sequence;
-- 
cgit v0.10.2


From dd20908a8a06b22c171f6c3fcdbdbd65bed07505 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 10:56:20 -0400
Subject: don't bother with {get,put}_write_access() on non-regular files

it's pointless and actually leads to wrong behaviour in at least one
moderately convoluted case (pipe(), close one end, try to get to
another via /proc/*/fd and run into ETXTBUSY).

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/file_table.c b/fs/file_table.c
index 5b24008..79ecae6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -209,10 +209,10 @@ static void drop_file_write_access(struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 
-	put_write_access(inode);
-
 	if (special_file(inode->i_mode))
 		return;
+
+	put_write_access(inode);
 	if (file_check_writeable(file) != 0)
 		return;
 	__mnt_drop_write(mnt);
diff --git a/fs/open.c b/fs/open.c
index b9ed8b2..2ed7325 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -641,23 +641,12 @@ out:
 static inline int __get_file_write_access(struct inode *inode,
 					  struct vfsmount *mnt)
 {
-	int error;
-	error = get_write_access(inode);
+	int error = get_write_access(inode);
 	if (error)
 		return error;
-	/*
-	 * Do not take mount writer counts on
-	 * special files since no writes to
-	 * the mount itself will occur.
-	 */
-	if (!special_file(inode->i_mode)) {
-		/*
-		 * Balanced in __fput()
-		 */
-		error = __mnt_want_write(mnt);
-		if (error)
-			put_write_access(inode);
-	}
+	error = __mnt_want_write(mnt);
+	if (error)
+		put_write_access(inode);
 	return error;
 }
 
@@ -690,12 +679,11 @@ static int do_dentry_open(struct file *f,
 
 	path_get(&f->f_path);
 	inode = f->f_inode = f->f_path.dentry->d_inode;
-	if (f->f_mode & FMODE_WRITE) {
+	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
 		error = __get_file_write_access(inode, f->f_path.mnt);
 		if (error)
 			goto cleanup_file;
-		if (!special_file(inode->i_mode))
-			file_take_write(f);
+		file_take_write(f);
 	}
 
 	f->f_mapping = inode->i_mapping;
@@ -742,7 +730,6 @@ static int do_dentry_open(struct file *f,
 cleanup_all:
 	fops_put(f->f_op);
 	if (f->f_mode & FMODE_WRITE) {
-		put_write_access(inode);
 		if (!special_file(inode->i_mode)) {
 			/*
 			 * We don't consider this a real
@@ -750,6 +737,7 @@ cleanup_all:
 			 * because it all happenend right
 			 * here, so just reset the state.
 			 */
+			put_write_access(inode);
 			file_reset_write(f);
 			__mnt_drop_write(f->f_path.mnt);
 		}
-- 
cgit v0.10.2


From 4597e695b8baa3e2620da89c7593be70cf20566b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 10:06:32 -0400
Subject: get rid of DEBUG_WRITECOUNT

it only makes control flow in __fput() and friends more convoluted.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index c2353bf..175a8b9 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1244,7 +1244,6 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y
 CONFIG_DEBUG_HIGHMEM=y
 CONFIG_DEBUG_INFO=y
 CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_WRITECOUNT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_SG=y
 # CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig
index 139a830..fdee37f 100644
--- a/arch/powerpc/configs/ps3_defconfig
+++ b/arch/powerpc/configs/ps3_defconfig
@@ -174,7 +174,6 @@ CONFIG_DETECT_HUNG_TASK=y
 CONFIG_PROVE_LOCKING=y
 CONFIG_DEBUG_LOCKDEP=y
 CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_WRITECOUNT=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_RCU_CPU_STALL_TIMEOUT=60
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index e0af2ee..3f53846 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -550,7 +550,6 @@ CONFIG_LOCK_STAT=y
 CONFIG_DEBUG_LOCKDEP=y
 CONFIG_DEBUG_ATOMIC_SLEEP=y
 CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
-CONFIG_DEBUG_WRITECOUNT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig
index 4e5229b..4723657 100644
--- a/arch/sh/configs/rsk7203_defconfig
+++ b/arch/sh/configs/rsk7203_defconfig
@@ -128,7 +128,6 @@ CONFIG_DEBUG_MUTEXES=y
 CONFIG_DEBUG_SPINLOCK_SLEEP=y
 CONFIG_DEBUG_INFO=y
 CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_WRITECOUNT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_SG=y
 CONFIG_FRAME_POINTER=y
diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig
index 4f23320..711f8aa 100644
--- a/arch/xtensa/configs/iss_defconfig
+++ b/arch/xtensa/configs/iss_defconfig
@@ -627,7 +627,6 @@ CONFIG_SCHED_DEBUG=y
 # CONFIG_DEBUG_KOBJECT is not set
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_DEBUG_VM is not set
-# CONFIG_DEBUG_WRITECOUNT is not set
 # CONFIG_DEBUG_MEMORY_INIT is not set
 # CONFIG_DEBUG_LIST is not set
 # CONFIG_DEBUG_SG is not set
diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig
index d929f77..78318a7 100644
--- a/arch/xtensa/configs/s6105_defconfig
+++ b/arch/xtensa/configs/s6105_defconfig
@@ -569,7 +569,6 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_DEBUG_VM is not set
 CONFIG_DEBUG_NOMMU_REGIONS=y
-# CONFIG_DEBUG_WRITECOUNT is not set
 # CONFIG_DEBUG_MEMORY_INIT is not set
 # CONFIG_DEBUG_LIST is not set
 # CONFIG_DEBUG_SG is not set
diff --git a/fs/file_table.c b/fs/file_table.c
index 79ecae6..ee20658 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head)
 static inline void file_free(struct file *f)
 {
 	percpu_counter_dec(&nr_files);
-	file_check_state(f);
 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
@@ -186,7 +185,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
 	 * that we can do debugging checks at __fput()
 	 */
 	if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
-		file_take_write(file);
 		WARN_ON(mnt_clone_write(path->mnt));
 	}
 	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
@@ -213,10 +211,7 @@ static void drop_file_write_access(struct file *file)
 		return;
 
 	put_write_access(inode);
-	if (file_check_writeable(file) != 0)
-		return;
 	__mnt_drop_write(mnt);
-	file_release_write(file);
 }
 
 /* the real guts of fput() - releasing the last reference to file
diff --git a/fs/open.c b/fs/open.c
index 2ed7325..8d0b6ad 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -683,7 +683,6 @@ static int do_dentry_open(struct file *f,
 		error = __get_file_write_access(inode, f->f_path.mnt);
 		if (error)
 			goto cleanup_file;
-		file_take_write(f);
 	}
 
 	f->f_mapping = inode->i_mapping;
@@ -731,14 +730,7 @@ cleanup_all:
 	fops_put(f->f_op);
 	if (f->f_mode & FMODE_WRITE) {
 		if (!special_file(inode->i_mode)) {
-			/*
-			 * We don't consider this a real
-			 * mnt_want/drop_write() pair
-			 * because it all happenend right
-			 * here, so just reset the state.
-			 */
 			put_write_access(inode);
-			file_reset_write(f);
 			__mnt_drop_write(f->f_path.mnt);
 		}
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 23b2a35..e80659e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -769,9 +769,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 		index <  ra->start + ra->size);
 }
 
-#define FILE_MNT_WRITE_TAKEN	1
-#define FILE_MNT_WRITE_RELEASED	2
-
 struct file {
 	union {
 		struct llist_node	fu_llist;
@@ -809,9 +806,6 @@ struct file {
 	struct list_head	f_tfile_llink;
 #endif /* #ifdef CONFIG_EPOLL */
 	struct address_space	*f_mapping;
-#ifdef CONFIG_DEBUG_WRITECOUNT
-	unsigned long f_mnt_write_state;
-#endif
 } __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */
 
 struct file_handle {
@@ -829,49 +823,6 @@ static inline struct file *get_file(struct file *f)
 #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
-#ifdef CONFIG_DEBUG_WRITECOUNT
-static inline void file_take_write(struct file *f)
-{
-	WARN_ON(f->f_mnt_write_state != 0);
-	f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
-}
-static inline void file_release_write(struct file *f)
-{
-	f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED;
-}
-static inline void file_reset_write(struct file *f)
-{
-	f->f_mnt_write_state = 0;
-}
-static inline void file_check_state(struct file *f)
-{
-	/*
-	 * At this point, either both or neither of these bits
-	 * should be set.
-	 */
-	WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN);
-	WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED);
-}
-static inline int file_check_writeable(struct file *f)
-{
-	if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN)
-		return 0;
-	printk(KERN_WARNING "writeable file with no "
-			    "mnt_want_write()\n");
-	WARN_ON(1);
-	return -EINVAL;
-}
-#else /* !CONFIG_DEBUG_WRITECOUNT */
-static inline void file_take_write(struct file *filp) {}
-static inline void file_release_write(struct file *filp) {}
-static inline void file_reset_write(struct file *filp) {}
-static inline void file_check_state(struct file *filp) {}
-static inline int file_check_writeable(struct file *filp)
-{
-	return 0;
-}
-#endif /* CONFIG_DEBUG_WRITECOUNT */
-
 #define	MAX_NON_LFS	((1UL<<31) - 1)
 
 /* Page cache limit. The filesystems should put that into their s_maxbytes 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a48abea..7a08593 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1030,16 +1030,6 @@ config DEBUG_BUGVERBOSE
 	  of the BUG call as well as the EIP and oops trace.  This aids
 	  debugging but costs about 70-100K of memory.
 
-config DEBUG_WRITECOUNT
-	bool "Debug filesystem writers count"
-	depends on DEBUG_KERNEL
-	help
-	  Enable this to catch wrong use of the writers count in struct
-	  vfsmount.  This will increase the size of each file struct by
-	  32 bits.
-
-	  If unsure, say N.
-
 config DEBUG_LIST
 	bool "Debug linked list manipulation"
 	depends on DEBUG_KERNEL
-- 
cgit v0.10.2


From 0ccb286346c4c0644be17f04a9eb23ad99262882 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 10:40:46 -0400
Subject: fold __get_file_write_access() into its only caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/open.c b/fs/open.c
index 8d0b6ad..ebef0c5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -632,24 +632,6 @@ out:
 	return error;
 }
 
-/*
- * You have to be very careful that these write
- * counts get cleaned up in error cases and
- * upon __fput().  This should probably never
- * be called outside of __dentry_open().
- */
-static inline int __get_file_write_access(struct inode *inode,
-					  struct vfsmount *mnt)
-{
-	int error = get_write_access(inode);
-	if (error)
-		return error;
-	error = __mnt_want_write(mnt);
-	if (error)
-		put_write_access(inode);
-	return error;
-}
-
 int open_check_o_direct(struct file *f)
 {
 	/* NB: we're sure to have correct a_ops only after f_op->open */
@@ -680,9 +662,14 @@ static int do_dentry_open(struct file *f,
 	path_get(&f->f_path);
 	inode = f->f_inode = f->f_path.dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
-		error = __get_file_write_access(inode, f->f_path.mnt);
+		error = get_write_access(inode);
 		if (error)
 			goto cleanup_file;
+		error = __mnt_want_write(f->f_path.mnt);
+		if (error) {
+			put_write_access(inode);
+			goto cleanup_file;
+		}
 	}
 
 	f->f_mapping = inode->i_mapping;
-- 
cgit v0.10.2


From 83f936c75e3689a63253d89c47a4d239c56d7410 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 12:02:47 -0400
Subject: mark struct file that had write access grabbed by open()

new flag in ->f_mode - FMODE_WRITER.  Set by do_dentry_open() in case
when it has grabbed write access, checked by __fput() to decide whether
it wants to drop the sucker.  Allows to stop bothering with mnt_clone_write()
in alloc_file(), along with fewer special_file() checks.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/file_table.c b/fs/file_table.c
index ee20658..ce1504f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -177,43 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode,
 	file->f_mapping = path->dentry->d_inode->i_mapping;
 	file->f_mode = mode;
 	file->f_op = fop;
-
-	/*
-	 * These mounts don't really matter in practice
-	 * for r/o bind mounts.  They aren't userspace-
-	 * visible.  We do this for consistency, and so
-	 * that we can do debugging checks at __fput()
-	 */
-	if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
-		WARN_ON(mnt_clone_write(path->mnt));
-	}
 	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_inc(path->dentry->d_inode);
 	return file;
 }
 EXPORT_SYMBOL(alloc_file);
 
-/**
- * drop_file_write_access - give up ability to write to a file
- * @file: the file to which we will stop writing
- *
- * This is a central place which will give up the ability
- * to write to @file, along with access to write through
- * its vfsmount.
- */
-static void drop_file_write_access(struct file *file)
-{
-	struct vfsmount *mnt = file->f_path.mnt;
-	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-
-	if (special_file(inode->i_mode))
-		return;
-
-	put_write_access(inode);
-	__mnt_drop_write(mnt);
-}
-
 /* the real guts of fput() - releasing the last reference to file
  */
 static void __fput(struct file *file)
@@ -248,8 +217,10 @@ static void __fput(struct file *file)
 	put_pid(file->f_owner.pid);
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_dec(inode);
-	if (file->f_mode & FMODE_WRITE)
-		drop_file_write_access(file);
+	if (file->f_mode & FMODE_WRITER) {
+		put_write_access(inode);
+		__mnt_drop_write(mnt);
+	}
 	file->f_path.dentry = NULL;
 	file->f_path.mnt = NULL;
 	file->f_inode = NULL;
diff --git a/fs/namespace.c b/fs/namespace.c
index a66aff5..20e8696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
  */
 int __mnt_want_write_file(struct file *file)
 {
-	struct inode *inode = file_inode(file);
-
-	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
+	if (!(file->f_mode & FMODE_WRITER))
 		return __mnt_want_write(file->f_path.mnt);
 	else
 		return mnt_clone_write(file->f_path.mnt);
diff --git a/fs/open.c b/fs/open.c
index ebef0c5..dcefb2f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -670,6 +670,7 @@ static int do_dentry_open(struct file *f,
 			put_write_access(inode);
 			goto cleanup_file;
 		}
+		f->f_mode |= FMODE_WRITER;
 	}
 
 	f->f_mapping = inode->i_mapping;
@@ -715,11 +716,9 @@ static int do_dentry_open(struct file *f,
 
 cleanup_all:
 	fops_put(f->f_op);
-	if (f->f_mode & FMODE_WRITE) {
-		if (!special_file(inode->i_mode)) {
-			put_write_access(inode);
-			__mnt_drop_write(f->f_path.mnt);
-		}
+	if (f->f_mode & FMODE_WRITER) {
+		put_write_access(inode);
+		__mnt_drop_write(f->f_path.mnt);
 	}
 cleanup_file:
 	path_put(&f->f_path);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e80659e..d9d88a0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -125,6 +125,8 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 
 /* File needs atomic accesses to f_pos */
 #define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
+/* Write access to underlying fs */
+#define FMODE_WRITER		((__force fmode_t)0x10000)
 
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
-- 
cgit v0.10.2


From 3f4d5a00076b7e340625a2014cb83e10bf0d6dd1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 09:43:29 -0400
Subject: tidy do_dentry_open() up a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/open.c b/fs/open.c
index dcefb2f..37f65fa 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -656,30 +656,28 @@ static int do_dentry_open(struct file *f,
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 
-	if (unlikely(f->f_flags & O_PATH))
-		f->f_mode = FMODE_PATH;
-
 	path_get(&f->f_path);
 	inode = f->f_inode = f->f_path.dentry->d_inode;
+	f->f_mapping = inode->i_mapping;
+
+	if (unlikely(f->f_flags & O_PATH)) {
+		f->f_mode = FMODE_PATH;
+		f->f_op = &empty_fops;
+		return 0;
+	}
+
 	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
 		error = get_write_access(inode);
-		if (error)
+		if (unlikely(error))
 			goto cleanup_file;
 		error = __mnt_want_write(f->f_path.mnt);
-		if (error) {
+		if (unlikely(error)) {
 			put_write_access(inode);
 			goto cleanup_file;
 		}
 		f->f_mode |= FMODE_WRITER;
 	}
 
-	f->f_mapping = inode->i_mapping;
-
-	if (unlikely(f->f_mode & FMODE_PATH)) {
-		f->f_op = &empty_fops;
-		return 0;
-	}
-
 	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
 	if (S_ISREG(inode->i_mode))
 		f->f_mode |= FMODE_ATOMIC_POS;
-- 
cgit v0.10.2


From 0018d8bfc4f41de27e45517380e7d90be3580b44 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 12:09:36 -0400
Subject: get_write_access() is inlined, exporting it is pointless

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 4b491b4..2fb4fe5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4413,7 +4413,6 @@ EXPORT_SYMBOL(user_path_at);
 EXPORT_SYMBOL(follow_down_one);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
-EXPORT_SYMBOL(get_write_access); /* nfsd */
 EXPORT_SYMBOL(lock_rename);
 EXPORT_SYMBOL(lookup_one_len);
 EXPORT_SYMBOL(page_follow_link_light);
-- 
cgit v0.10.2


From 4d359507346a156491300cc193252b525892ae91 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 12:20:17 -0400
Subject: namei.c: move EXPORT_SYMBOL to corresponding definitions

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 2fb4fe5..617de9e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask)
 
 	return -EACCES;
 }
+EXPORT_SYMBOL(generic_permission);
 
 /*
  * We _really_ want to just do "generic_permission()" without
@@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask)
 		return retval;
 	return __inode_permission(inode, mask);
 }
+EXPORT_SYMBOL(inode_permission);
 
 /**
  * path_get - get a reference to a path
@@ -924,6 +926,7 @@ int follow_up(struct path *path)
 	path->mnt = &parent->mnt;
 	return 1;
 }
+EXPORT_SYMBOL(follow_up);
 
 /*
  * Perform an automount
@@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(follow_down_one);
 
 static inline bool managed_dentry_might_block(struct dentry *dentry)
 {
@@ -1223,6 +1227,7 @@ int follow_down(struct path *path)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(follow_down);
 
 /*
  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
@@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
 		*path = nd.path;
 	return res;
 }
+EXPORT_SYMBOL(kern_path);
 
 /**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
@@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		*path = nd.path;
 	return err;
 }
+EXPORT_SYMBOL(vfs_path_lookup);
 
 /*
  * Restricted form of lookup. Doesn't follow links, single-component only,
@@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 
 	return __lookup_hash(&this, base, 0);
 }
+EXPORT_SYMBOL(lookup_one_len);
 
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
@@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
 {
 	return user_path_at_empty(dfd, name, flags, path, NULL);
 }
+EXPORT_SYMBOL(user_path_at);
 
 /*
  * NB: most callers don't do anything directly with the reference to the
@@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
 	return NULL;
 }
+EXPORT_SYMBOL(lock_rename);
 
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
@@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 	}
 }
+EXPORT_SYMBOL(unlock_rename);
 
 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		bool want_excl)
@@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_create);
 
 static int may_open(struct path *path, int acc_mode, int flag)
 {
@@ -3376,6 +3388,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_mknod);
 
 static int may_mknod(umode_t mode)
 {
@@ -3465,6 +3478,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		fsnotify_mkdir(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_mkdir);
 
 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
 {
@@ -3519,6 +3533,7 @@ void dentry_unhash(struct dentry *dentry)
 		__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 }
+EXPORT_SYMBOL(dentry_unhash);
 
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
@@ -3556,6 +3571,7 @@ out:
 		d_delete(dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_rmdir);
 
 static long do_rmdir(int dfd, const char __user *pathname)
 {
@@ -3673,6 +3689,7 @@ out:
 
 	return error;
 }
+EXPORT_SYMBOL(vfs_unlink);
 
 /*
  * Make sure that the actual truncation of the file will occur outside its
@@ -3786,6 +3803,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_symlink);
 
 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 		int, newdfd, const char __user *, newname)
@@ -3894,6 +3912,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		fsnotify_link(dir, inode, new_dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_link);
 
 /*
  * Hardlinks are often used in delicate situations.  We avoid
@@ -4156,6 +4175,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	return error;
 }
+EXPORT_SYMBOL(vfs_rename);
 
 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 		int, newdfd, const char __user *, newname)
@@ -4293,6 +4313,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c
 out:
 	return len;
 }
+EXPORT_SYMBOL(vfs_readlink);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
@@ -4315,6 +4336,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
 	return res;
 }
+EXPORT_SYMBOL(generic_readlink);
 
 /* get the link contents into pagecache */
 static char *page_getlink(struct dentry * dentry, struct page **ppage)
@@ -4342,6 +4364,7 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	}
 	return res;
 }
+EXPORT_SYMBOL(page_readlink);
 
 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 {
@@ -4349,6 +4372,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 	nd_set_link(nd, page_getlink(dentry, &page));
 	return page;
 }
+EXPORT_SYMBOL(page_follow_link_light);
 
 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
@@ -4359,6 +4383,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 		page_cache_release(page);
 	}
 }
+EXPORT_SYMBOL(page_put_link);
 
 /*
  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4396,44 +4421,18 @@ retry:
 fail:
 	return err;
 }
+EXPORT_SYMBOL(__page_symlink);
 
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
 	return __page_symlink(inode, symname, len,
 			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
 }
+EXPORT_SYMBOL(page_symlink);
 
 const struct inode_operations page_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 };
-
-EXPORT_SYMBOL(user_path_at);
-EXPORT_SYMBOL(follow_down_one);
-EXPORT_SYMBOL(follow_down);
-EXPORT_SYMBOL(follow_up);
-EXPORT_SYMBOL(lock_rename);
-EXPORT_SYMBOL(lookup_one_len);
-EXPORT_SYMBOL(page_follow_link_light);
-EXPORT_SYMBOL(page_put_link);
-EXPORT_SYMBOL(page_readlink);
-EXPORT_SYMBOL(__page_symlink);
-EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(kern_path);
-EXPORT_SYMBOL(vfs_path_lookup);
-EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(unlock_rename);
-EXPORT_SYMBOL(vfs_create);
-EXPORT_SYMBOL(vfs_link);
-EXPORT_SYMBOL(vfs_mkdir);
-EXPORT_SYMBOL(vfs_mknod);
-EXPORT_SYMBOL(generic_permission);
-EXPORT_SYMBOL(vfs_readlink);
-EXPORT_SYMBOL(vfs_rename);
-EXPORT_SYMBOL(vfs_rmdir);
-EXPORT_SYMBOL(vfs_symlink);
-EXPORT_SYMBOL(vfs_unlink);
-EXPORT_SYMBOL(dentry_unhash);
-EXPORT_SYMBOL(generic_readlink);
-- 
cgit v0.10.2


From 7f4b36f9bb930b3b2105a9a2cb0121fa7028c432 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 12:45:29 -0400
Subject: get rid of files_defer_init()

the only thing it's doing these days is calculation of
upper limit for fs.nr_open sysctl and that can be done
statically

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/file.c b/fs/file.c
index eb56a13..682103b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,7 +25,10 @@
 
 int sysctl_nr_open __read_mostly = 1024*1024;
 int sysctl_nr_open_min = BITS_PER_LONG;
-int sysctl_nr_open_max = 1024 * 1024; /* raised later */
+/* our max() is unusable in constant expressions ;-/ */
+#define __const_max(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+			 -BITS_PER_LONG;
 
 static void *alloc_fdmem(size_t size)
 {
@@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk)
 	}
 }
 
-void __init files_defer_init(void)
-{
-	sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
-			     -BITS_PER_LONG;
-}
-
 struct files_struct init_files = {
 	.count		= ATOMIC_INIT(1),
 	.fdt		= &init_files.fdtab,
diff --git a/fs/file_table.c b/fs/file_table.c
index ce1504f..718e8e5 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -325,6 +325,5 @@ void __init files_init(unsigned long mempages)
 
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	files_defer_init();
 	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 70e8e21..230f87b 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -63,8 +63,6 @@ struct file_operations;
 struct vfsmount;
 struct dentry;
 
-extern void __init files_defer_init(void);
-
 #define rcu_dereference_check_fdtable(files, fdtfd) \
 	rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))
 
-- 
cgit v0.10.2


From 4efcc9ffcd4fc53f1f7de539842cdffa1f8e5ecc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 12:54:25 -0400
Subject: lustre: generic_readlink() is just fine there, TYVM...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
index ab06891..80d48b5 100644
--- a/drivers/staging/lustre/lustre/llite/symlink.c
+++ b/drivers/staging/lustre/lustre/llite/symlink.c
@@ -115,27 +115,6 @@ failed:
 	return rc;
 }
 
-static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ptlrpc_request *request;
-	char *symname;
-	int rc;
-
-	CDEBUG(D_VFSTRACE, "VFS Op\n");
-
-	ll_inode_size_lock(inode);
-	rc = ll_readlink_internal(inode, &request, &symname);
-	if (rc)
-		GOTO(out, rc);
-
-	rc = vfs_readlink(dentry, buffer, buflen, symname);
- out:
-	ptlrpc_req_finished(request);
-	ll_inode_size_unlock(inode);
-	return rc;
-}
-
 static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -175,7 +154,7 @@ static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cooki
 }
 
 struct inode_operations ll_fast_symlink_inode_operations = {
-	.readlink	= ll_readlink,
+	.readlink	= generic_readlink,
 	.setattr	= ll_setattr,
 	.follow_link	= ll_follow_link,
 	.put_link	= ll_put_link,
-- 
cgit v0.10.2


From 5d826c847b34de6415b4f1becd88a57ff619af50 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Mar 2014 13:42:45 -0400
Subject: new helper: readlink_copy()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 617de9e..4fb52f0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4297,11 +4297,9 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
 	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
 }
 
-int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
+int readlink_copy(char __user *buffer, int buflen, const char *link)
 {
-	int len;
-
-	len = PTR_ERR(link);
+	int len = PTR_ERR(link);
 	if (IS_ERR(link))
 		goto out;
 
@@ -4313,7 +4311,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c
 out:
 	return len;
 }
-EXPORT_SYMBOL(vfs_readlink);
+EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
@@ -4331,7 +4329,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	if (IS_ERR(cookie))
 		return PTR_ERR(cookie);
 
-	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+	res = readlink_copy(buffer, buflen, nd_get_link(&nd));
 	if (dentry->d_inode->i_op->put_link)
 		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
 	return res;
@@ -4356,8 +4354,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
 	struct page *page = NULL;
-	char *s = page_getlink(dentry, &page);
-	int res = vfs_readlink(dentry,buffer,buflen,s);
+	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
 	if (page) {
 		kunmap(page);
 		page_cache_release(page);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 9ae46b8..8902609 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	struct task_struct *task;
 	void *ns;
 	char name[50];
-	int len = -EACCES;
+	int res = -EACCES;
 
 	task = get_proc_task(inode);
 	if (!task)
@@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto out_put_task;
 
-	len = -ENOENT;
+	res = -ENOENT;
 	ns = ns_ops->get(task);
 	if (!ns)
 		goto out_put_task;
 
 	snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
-	len = strlen(name);
-
-	if (len > buflen)
-		len = buflen;
-	if (copy_to_user(buffer, name, len))
-		len = -EFAULT;
-
+	res = readlink_copy(buffer, buflen, name);
 	ns_ops->put(ns);
 out_put_task:
 	put_task_struct(task);
 out:
-	return len;
+	return res;
 }
 
 static const struct inode_operations proc_ns_link_inode_operations = {
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ffeb202..4348bb8 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
 	if (!tgid)
 		return -ENOENT;
 	sprintf(tmp, "%d", tgid);
-	return vfs_readlink(dentry,buffer,buflen,tmp);
+	return readlink_copy(buffer, buflen, tmp);
 }
 
 static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcfe612..0b18776 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -271,32 +271,6 @@ xfs_open_by_handle(
 	return error;
 }
 
-/*
- * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
- * unused first argument.
- */
-STATIC int
-do_readlink(
-	char __user		*buffer,
-	int			buflen,
-	const char		*link)
-{
-        int len;
-
-	len = PTR_ERR(link);
-	if (IS_ERR(link))
-		goto out;
-
-	len = strlen(link);
-	if (len > (unsigned) buflen)
-		len = buflen;
-	if (copy_to_user(buffer, link, len))
-		len = -EFAULT;
- out:
-	return len;
-}
-
-
 int
 xfs_readlink_by_handle(
 	struct file		*parfilp,
@@ -334,7 +308,7 @@ xfs_readlink_by_handle(
 	error = -xfs_readlink(XFS_I(dentry->d_inode), link);
 	if (error)
 		goto out_kfree;
-	error = do_readlink(hreq->ohandle, olen, link);
+	error = readlink_copy(hreq->ohandle, olen, link);
 	if (error)
 		goto out_kfree;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d9d88a0..db181b5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2517,7 +2517,7 @@ extern const struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
 
-extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
+extern int readlink_copy(char __user *, int, const char *);
 extern int page_readlink(struct dentry *, char __user *, int);
 extern void *page_follow_link_light(struct dentry *, struct nameidata *);
 extern void page_put_link(struct dentry *, struct nameidata *, void *);
-- 
cgit v0.10.2


From 05faf3169f039cb03ebabdfee6eda0e7ada5ea11 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Feb 2014 04:41:36 -0500
Subject: ntfs: don't put NULL into ->i_op/->i_fop

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ffb9b36..4de660f 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	iput(bvi);
 skip_large_index_stuff:
 	/* Setup the operations for this index inode. */
-	vi->i_op = NULL;
-	vi->i_fop = NULL;
 	vi->i_mapping->a_ops = &ntfs_mst_aops;
 	vi->i_blocks = ni->allocated_size >> 9;
 	/*
-- 
cgit v0.10.2


From 627bf81ac625f05060db033a0f3791521ad7bd79 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Feb 2014 04:43:32 -0500
Subject: get rid of pointless checks for NULL ->i_op

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 622f469..5b99baf 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
 	/* check parameters */
 	ret = -EOPNOTSUPP;
 	if (!root->d_inode ||
-	    !root->d_inode->i_op ||
 	    !root->d_inode->i_op->lookup ||
 	    !root->d_inode->i_op->mkdir ||
 	    !root->d_inode->i_op->setxattr ||
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca65f39..1b1283b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	}
 
 	ret = -EPERM;
-	if (!subdir->d_inode->i_op ||
-	    !subdir->d_inode->i_op->setxattr ||
+	if (!subdir->d_inode->i_op->setxattr ||
 	    !subdir->d_inode->i_op->getxattr ||
 	    !subdir->d_inode->i_op->lookup ||
 	    !subdir->d_inode->i_op->mkdir ||
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 3bab89e..e90ab0e 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -137,7 +137,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 	int error;
 	int size;
 
-	if (!inode->i_op || !inode->i_op->getxattr)
+	if (!inode->i_op->getxattr)
 		return -EOPNOTSUPP;
 	desc = init_desc(type);
 	if (IS_ERR(desc))
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 336b3dd..bab1c39 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -62,7 +62,7 @@ static int evm_find_protected_xattrs(struct dentry *dentry)
 	int error;
 	int count = 0;
 
-	if (!inode->i_op || !inode->i_op->getxattr)
+	if (!inode->i_op->getxattr)
 		return -EOPNOTSUPP;
 
 	for (xattr = evm_config_xattrnames; *xattr != NULL; xattr++) {
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 80a09c3..a3386d1 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -173,7 +173,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
 		 * Use filesystem name if filesystem does not support rename()
 		 * operation.
 		 */
-		if (inode->i_op && !inode->i_op->rename)
+		if (!inode->i_op->rename)
 			goto prepend_filesystem_name;
 	}
 	/* Prepend device name. */
@@ -282,7 +282,7 @@ char *tomoyo_realpath_from_path(struct path *path)
 		 * Get local name for filesystems without rename() operation
 		 * or dentry without vfsmount.
 		 */
-		if (!path->mnt || (inode->i_op && !inode->i_op->rename))
+		if (!path->mnt || !inode->i_op->rename)
 			pos = tomoyo_get_local_path(path->dentry, buf,
 						    buf_len - 1);
 		/* Get absolute name for the rest. */
-- 
cgit v0.10.2


From 81c5a68478be38816bb5110ae0a5de1320cd2dfd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Feb 2014 08:26:29 -0500
Subject: cifs: ->rename() without ->lookup() makes no sense

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 849f613..f31f9d6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -849,7 +849,6 @@ const struct inode_operations cifs_file_inode_ops = {
 /*	revalidate:cifs_revalidate, */
 	.setattr = cifs_setattr,
 	.getattr = cifs_getattr, /* do we need this anymore? */
-	.rename = cifs_rename,
 	.permission = cifs_permission,
 #ifdef CONFIG_CIFS_XATTR
 	.setxattr = cifs_setxattr,
-- 
cgit v0.10.2


From 3ef120a459260b35175a64a418bdb115d80bf58f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Feb 2014 06:31:19 -0500
Subject: mn10300: kmap_atomic() returns void *, not unsigned long...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index 7c137cd..2fbbe4d 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -70,7 +70,7 @@ static inline void kunmap(struct page *page)
  * be used in IRQ contexts, so in some (very limited) cases we need
  * it.
  */
-static inline unsigned long kmap_atomic(struct page *page)
+static inline void *kmap_atomic(struct page *page)
 {
 	unsigned long vaddr;
 	int idx, type;
@@ -89,7 +89,7 @@ static inline unsigned long kmap_atomic(struct page *page)
 	set_pte(kmap_pte - idx, mk_pte(page, kmap_prot));
 	local_flush_tlb_one(vaddr);
 
-	return vaddr;
+	return (void *)vaddr;
 }
 
 static inline void __kunmap_atomic(unsigned long vaddr)
-- 
cgit v0.10.2


From 8ffcb32e05239f0e53abfb0a1bc4eee4855b7fd2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Jan 2014 12:17:54 +0000
Subject: VFS: Make delayed_free() call free_vfsmnt()

Make delayed_free() call free_vfsmnt() so that we don't have two functions
doing the same job.  This requires the calls to mnt_free_id() in free_vfsmnt()
to be moved into the callers of that function.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 20e8696..182bc41 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -568,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 static void free_vfsmnt(struct mount *mnt)
 {
 	kfree(mnt->mnt_devname);
-	mnt_free_id(mnt);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
 #endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+static void delayed_free_vfsmnt(struct rcu_head *head)
+{
+	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
+}
+
 /* call under rcu_read_lock */
 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 {
@@ -846,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 
 	root = mount_fs(type, flags, name, data);
 	if (IS_ERR(root)) {
+		mnt_free_id(mnt);
 		free_vfsmnt(mnt);
 		return ERR_CAST(root);
 	}
@@ -926,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return mnt;
 
  out_free:
+	mnt_free_id(mnt);
 	free_vfsmnt(mnt);
 	return ERR_PTR(err);
 }
 
-static void delayed_free(struct rcu_head *head)
-{
-	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
-	kfree(mnt->mnt_devname);
-#ifdef CONFIG_SMP
-	free_percpu(mnt->mnt_pcp);
-#endif
-	kmem_cache_free(mnt_cache, mnt);
-}
-
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
@@ -989,7 +985,7 @@ put_again:
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
 	mnt_free_id(mnt);
-	call_rcu(&mnt->mnt_rcu, delayed_free);
+	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
 }
 
 void mntput(struct vfsmount *mnt)
-- 
cgit v0.10.2


From 58bda1da4b3c3ec6da8d0badaba12ce02839a241 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Feb 2014 21:03:40 -0500
Subject: fuse/dev: use atomic maps

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0a648bb..f90af6f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -669,13 +669,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
 		if (!cs->write) {
 			buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
 		} else {
-			kunmap(buf->page);
+			kunmap_atomic(cs->mapaddr);
 			buf->len = PAGE_SIZE - cs->len;
 		}
 		cs->currbuf = NULL;
 		cs->mapaddr = NULL;
 	} else if (cs->mapaddr) {
-		kunmap(cs->pg);
+		kunmap_atomic(cs->mapaddr);
 		if (cs->write) {
 			flush_dcache_page(cs->pg);
 			set_page_dirty_lock(cs->pg);
@@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 
 			BUG_ON(!cs->nr_segs);
 			cs->currbuf = buf;
-			cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
+			cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
 			cs->len = buf->len;
 			cs->buf = cs->mapaddr + buf->offset;
 			cs->pipebufs++;
@@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			buf->len = 0;
 
 			cs->currbuf = buf;
-			cs->mapaddr = kmap(page);
+			cs->mapaddr = kmap_atomic(page);
 			cs->buf = cs->mapaddr;
 			cs->len = PAGE_SIZE;
 			cs->pipebufs++;
@@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			return err;
 		BUG_ON(err != 1);
 		offset = cs->addr % PAGE_SIZE;
-		cs->mapaddr = kmap(cs->pg);
+		cs->mapaddr = kmap_atomic(cs->pg);
 		cs->buf = cs->mapaddr + offset;
 		cs->len = min(PAGE_SIZE - offset, cs->seglen);
 		cs->seglen -= cs->len;
-- 
cgit v0.10.2


From fbb32750a62df75d1ffea547f3908b21c5496d9f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Feb 2014 21:09:54 -0500
Subject: pipe: kill ->map() and ->unmap()

all pipe_buffer_operations have the same instances of those...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 6928d09..60aafb8 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -901,9 +901,9 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 		if (len + offset > PAGE_SIZE)
 			len = PAGE_SIZE - offset;
 
-		src = buf->ops->map(pipe, buf, 1);
+		src = kmap_atomic(buf->page);
 		memcpy(page_address(page) + offset, src + buf->offset, len);
-		buf->ops->unmap(pipe, buf, src);
+		kunmap_atomic(src);
 
 		sg_set_page(&(sgl->sg[sgl->n]), page, len, offset);
 	}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f90af6f..aac71ce 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -667,7 +667,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
 		struct pipe_buffer *buf = cs->currbuf;
 
 		if (!cs->write) {
-			buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
+			kunmap_atomic(cs->mapaddr);
 		} else {
 			kunmap_atomic(cs->mapaddr);
 			buf->len = PAGE_SIZE - cs->len;
@@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 
 			BUG_ON(!cs->nr_segs);
 			cs->currbuf = buf;
-			cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+			cs->mapaddr = kmap_atomic(buf->page);
 			cs->len = buf->len;
 			cs->buf = cs->mapaddr + buf->offset;
 			cs->pipebufs++;
@@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 out_fallback_unlock:
 	unlock_page(newpage);
 out_fallback:
-	cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+	cs->mapaddr = kmap_atomic(buf->page);
 	cs->buf = cs->mapaddr + buf->offset;
 
 	err = lock_request(cs->fc, cs->req);
diff --git a/fs/pipe.c b/fs/pipe.c
index 78fd0d0..6679c95 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -226,52 +226,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 }
 
 /**
- * generic_pipe_buf_map - virtually map a pipe buffer
- * @pipe:	the pipe that the buffer belongs to
- * @buf:	the buffer that should be mapped
- * @atomic:	whether to use an atomic map
- *
- * Description:
- *	This function returns a kernel virtual address mapping for the
- *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
- *	and the caller has to be careful not to fault before calling
- *	the unmap function.
- *
- *	Note that this function calls kmap_atomic() if @atomic != 0.
- */
-void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
-			   struct pipe_buffer *buf, int atomic)
-{
-	if (atomic) {
-		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
-		return kmap_atomic(buf->page);
-	}
-
-	return kmap(buf->page);
-}
-EXPORT_SYMBOL(generic_pipe_buf_map);
-
-/**
- * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
- * @pipe:	the pipe that the buffer belongs to
- * @buf:	the buffer that should be unmapped
- * @map_data:	the data that the mapping function returned
- *
- * Description:
- *	This function undoes the mapping that ->map() provided.
- */
-void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
-			    struct pipe_buffer *buf, void *map_data)
-{
-	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
-		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
-		kunmap_atomic(map_data);
-	} else
-		kunmap(buf->page);
-}
-EXPORT_SYMBOL(generic_pipe_buf_unmap);
-
-/**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
  * @buf:	the buffer to attempt to steal
@@ -351,8 +305,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release);
 
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
 	.steal = generic_pipe_buf_steal,
@@ -361,8 +313,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
 
 static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
 	.steal = generic_pipe_buf_steal,
@@ -410,9 +360,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 
 			atomic = !iov_fault_in_pages_write(iov, chars);
 redo:
-			addr = ops->map(pipe, buf, atomic);
+			if (atomic)
+				addr = kmap_atomic(buf->page);
+			else
+				addr = kmap(buf->page);
 			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
-			ops->unmap(pipe, buf, addr);
+			if (atomic)
+				kunmap_atomic(addr);
+			else
+				kunmap(buf->page);
 			if (unlikely(error)) {
 				/*
 				 * Just retry with the slow path if we failed.
@@ -538,10 +494,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 
 			iov_fault_in_pages_read(iov, chars);
 redo1:
-			addr = ops->map(pipe, buf, atomic);
+			if (atomic)
+				addr = kmap_atomic(buf->page);
+			else
+				addr = kmap(buf->page);
 			error = pipe_iov_copy_from_user(offset + addr, iov,
 							chars, atomic);
-			ops->unmap(pipe, buf, addr);
+			if (atomic)
+				kunmap_atomic(addr);
+			else
+				kunmap(buf->page);
 			ret = error;
 			do_wakeup = 1;
 			if (error) {
diff --git a/fs/splice.c b/fs/splice.c
index 12028fa..ca3bfbd 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -136,8 +136,6 @@ error:
 
 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
 	.confirm = page_cache_pipe_buf_confirm,
 	.release = page_cache_pipe_buf_release,
 	.steal = page_cache_pipe_buf_steal,
@@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = page_cache_pipe_buf_release,
 	.steal = user_page_pipe_buf_steal,
@@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read);
 
 static const struct pipe_buf_operations default_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = generic_pipe_buf_release,
 	.steal = generic_pipe_buf_steal,
@@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
 /* Pipe buffer operations for a socket and similar. */
 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = generic_pipe_buf_release,
 	.steal = generic_pipe_buf_nosteal,
@@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 		goto out;
 
 	if (buf->page != page) {
-		char *src = buf->ops->map(pipe, buf, 1);
+		char *src = kmap_atomic(buf->page);
 		char *dst = kmap_atomic(page);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
 		kunmap_atomic(dst);
-		buf->ops->unmap(pipe, buf, src);
+		kunmap_atomic(src);
 	}
 	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
 				page, fsdata);
@@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	void *data;
 	loff_t tmp = sd->pos;
 
-	data = buf->ops->map(pipe, buf, 0);
+	data = kmap(buf->page);
 	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
-	buf->ops->unmap(pipe, buf, data);
+	kunmap(buf->page);
 
 	return ret;
 }
@@ -1536,10 +1528,10 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	 * pages and doing an atomic copy
 	 */
 	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
-		src = buf->ops->map(pipe, buf, 1);
+		src = kmap_atomic(buf->page);
 		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
 							sd->len);
-		buf->ops->unmap(pipe, buf, src);
+		kunmap_atomic(src);
 		if (!ret) {
 			ret = sd->len;
 			goto out;
@@ -1549,13 +1541,13 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	/*
 	 * No dice, use slow non-atomic map and copy
  	 */
-	src = buf->ops->map(pipe, buf, 0);
+	src = kmap(buf->page);
 
 	ret = sd->len;
 	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
 		ret = -EFAULT;
 
-	buf->ops->unmap(pipe, buf, src);
+	kunmap(buf->page);
 out:
 	if (ret > 0)
 		sd->u.userptr += ret;
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index ab57526..6dffceb 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -83,23 +83,6 @@ struct pipe_buf_operations {
 	int can_merge;
 
 	/*
-	 * ->map() returns a virtual address mapping of the pipe buffer.
-	 * The last integer flag reflects whether this should be an atomic
-	 * mapping or not. The atomic map is faster, however you can't take
-	 * page faults before calling ->unmap() again. So if you need to eg
-	 * access user data through copy_to/from_user(), then you must get
-	 * a non-atomic map. ->map() uses the kmap_atomic slot for
-	 * atomic maps, you have to be careful if mapping another page as
-	 * source or destination for a copy.
-	 */
-	void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int);
-
-	/*
-	 * Undoes ->map(), finishes the virtual mapping of the pipe buffer.
-	 */
-	void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *);
-
-	/*
 	 * ->confirm() verifies that the data in the pipe buffer is there
 	 * and that the contents are good. If the pages in the pipe belong
 	 * to a file system, we may need to wait for IO completion in this
@@ -150,8 +133,6 @@ struct pipe_inode_info *alloc_pipe_info(void);
 void free_pipe_info(struct pipe_inode_info *);
 
 /* Generic pipe buffer ops functions */
-void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int);
-void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *);
 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
diff --git a/kernel/relay.c b/kernel/relay.c
index 5001c98..98833f6 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
 
 static const struct pipe_buf_operations relay_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = relay_pipe_buf_release,
 	.steal = generic_pipe_buf_steal,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 24c1f23..7511de3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4316,8 +4316,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
 
 static const struct pipe_buf_operations tracing_pipe_buf_ops = {
 	.can_merge		= 0,
-	.map			= generic_pipe_buf_map,
-	.unmap			= generic_pipe_buf_unmap,
 	.confirm		= generic_pipe_buf_confirm,
 	.release		= generic_pipe_buf_release,
 	.steal			= generic_pipe_buf_steal,
@@ -5194,8 +5192,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
 /* Pipe buffer operations for a buffer. */
 static const struct pipe_buf_operations buffer_pipe_buf_ops = {
 	.can_merge		= 0,
-	.map			= generic_pipe_buf_map,
-	.unmap			= generic_pipe_buf_unmap,
 	.confirm		= generic_pipe_buf_confirm,
 	.release		= buffer_pipe_buf_release,
 	.steal			= generic_pipe_buf_steal,
-- 
cgit v0.10.2


From c186afb4dbd0050a537b96c7fbee2dba3b57fc38 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Feb 2014 21:16:54 -0500
Subject: switch ->is_partially_uptodate() to saner arguments

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 5b0c083..bb2534b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -198,7 +198,7 @@ prototypes:
 				unsigned long *);
 	int (*migratepage)(struct address_space *, struct page *, struct page *);
 	int (*launder_page)(struct page *);
-	int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
+	int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
 	int (*error_remove_page)(struct address_space *, struct page *);
 	int (*swap_activate)(struct file *);
 	int (*swap_deactivate)(struct file *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index c53784c..419e734 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -580,7 +580,7 @@ struct address_space_operations {
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
-	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
+	int (*is_partially_uptodate) (struct page *, unsigned long,
 					unsigned long);
 	void (*is_dirty_writeback) (struct page *, bool *, bool *);
 	int (*error_remove_page) (struct mapping *mapping, struct page *page);
diff --git a/fs/buffer.c b/fs/buffer.c
index 27265a8..027ae3b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end);
  * Returns true if all buffers which correspond to a file portion
  * we want to read are uptodate.
  */
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
-					unsigned long from)
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+					unsigned long count)
 {
 	unsigned block_start, block_end, blocksize;
 	unsigned to;
@@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 
 	head = page_buffers(page);
 	blocksize = head->b_size;
-	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+	to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
 	to = from + to;
 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
 		return 0;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index d77797a..c40302f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -210,8 +210,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 			struct writeback_control *wbc, bh_end_io_t *handler);
 int block_read_full_page(struct page*, get_block_t*);
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
-				unsigned long from);
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+				unsigned long count);
 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 		unsigned flags, struct page **pagep, get_block_t *get_block);
 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index db181b5..ddfff2e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -385,7 +385,7 @@ struct address_space_operations {
 	int (*migratepage) (struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 	int (*launder_page) (struct page *);
-	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
+	int (*is_partially_uptodate) (struct page *, unsigned long,
 					unsigned long);
 	void (*is_dirty_writeback) (struct page *, bool *, bool *);
 	int (*error_remove_page)(struct address_space *, struct page *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 7a13f6a..46e9801 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1148,7 +1148,7 @@ find_page:
 			if (!page->mapping)
 				goto page_not_up_to_date_locked;
 			if (!mapping->a_ops->is_partially_uptodate(page,
-								desc, offset))
+							offset, desc->count))
 				goto page_not_up_to_date_locked;
 			unlock_page(page);
 		}
-- 
cgit v0.10.2


From 9e8c2af96e0d2d5fe298dd796fb6bc16e888a48d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Feb 2014 22:10:25 -0500
Subject: callers of iov_copy_from_user_atomic() don't need pagecache_disable()

... it does that itself (via kmap_atomic())

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b86..34e0962 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 		struct page *page = prepared_pages[pg];
 		/*
 		 * Copy data from userspace to the current page
-		 *
-		 * Disable pagefault to avoid recursive lock since
-		 * the pages are already locked
 		 */
-		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
-		pagefault_enable();
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 77bcc30..a91d3b4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1003,9 +1003,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
-		pagefault_disable();
 		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
-		pagefault_enable();
 		flush_dcache_page(page);
 
 		mark_page_accessed(page);
diff --git a/mm/filemap.c b/mm/filemap.c
index 46e9801..bfb7a97 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1974,7 +1974,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 	char *kaddr;
 	size_t copied;
 
-	BUG_ON(!in_atomic());
 	kaddr = kmap_atomic(page);
 	if (likely(i->nr_segs == 1)) {
 		int left;
@@ -2348,9 +2347,7 @@ again:
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
-		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
-		pagefault_enable();
 		flush_dcache_page(page);
 
 		mark_page_accessed(page);
-- 
cgit v0.10.2


From 8142c184b8f16d213eb8ba06ccb6222259a51cfc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Feb 2014 22:18:22 -0500
Subject: do_shmem_file_read(): call file_read_actor() directly

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/shmem.c b/mm/shmem.c
index 1f18c9d..9398e6c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1462,7 +1462,7 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	return copied;
 }
 
-static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
+static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc)
 {
 	struct inode *inode = file_inode(filp);
 	struct address_space *mapping = inode->i_mapping;
@@ -1550,7 +1550,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 		 * "pos" here (the actor routine has to update the user buffer
 		 * pointers and the remaining count).
 		 */
-		ret = actor(desc, page, offset, nr);
+		ret = file_read_actor(desc, page, offset, nr);
 		offset += ret;
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
@@ -1588,7 +1588,7 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
 		if (desc.count == 0)
 			continue;
 		desc.error = 0;
-		do_shmem_file_read(filp, ppos, &desc, file_read_actor);
+		do_shmem_file_read(filp, ppos, &desc);
 		retval += desc.written;
 		if (desc.error) {
 			retval = retval ?: desc.error;
-- 
cgit v0.10.2


From 9223687863ffa63fa655f52ef64148ee08dee4d1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 27 Nov 2013 16:29:46 -0800
Subject: iov_iter: Move iov_iter to uio.h

Signed-off-by: Kent Overstreet <kmo@daterainc.com>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ddfff2e..2261ac8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -295,38 +295,6 @@ struct page;
 struct address_space;
 struct writeback_control;
 
-struct iov_iter {
-	const struct iovec *iov;
-	unsigned long nr_segs;
-	size_t iov_offset;
-	size_t count;
-};
-
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
-size_t iov_iter_single_seg_count(const struct iov_iter *i);
-
-static inline void iov_iter_init(struct iov_iter *i,
-			const struct iovec *iov, unsigned long nr_segs,
-			size_t count, size_t written)
-{
-	i->iov = iov;
-	i->nr_segs = nr_segs;
-	i->iov_offset = 0;
-	i->count = count + written;
-
-	iov_iter_advance(i, written);
-}
-
-static inline size_t iov_iter_count(struct iov_iter *i)
-{
-	return i->count;
-}
-
 /*
  * "descriptor" for what we're up to with a read.
  * This allows us to use the same read code yet
diff --git a/include/linux/uio.h b/include/linux/uio.h
index c55ce24..347d70c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -9,14 +9,23 @@
 #ifndef __LINUX_UIO_H
 #define __LINUX_UIO_H
 
+#include <linux/kernel.h>
 #include <uapi/linux/uio.h>
 
+struct page;
 
 struct kvec {
 	void *iov_base; /* and that should *never* hold a userland pointer */
 	size_t iov_len;
 };
 
+struct iov_iter {
+	const struct iovec *iov;
+	unsigned long nr_segs;
+	size_t iov_offset;
+	size_t count;
+};
+
 /*
  * Total number of bytes covered by an iovec.
  *
@@ -34,8 +43,49 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
 	return ret;
 }
 
+static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
+{
+	return (struct iovec) {
+		.iov_base = iter->iov->iov_base + iter->iov_offset,
+		.iov_len = min(iter->count,
+			       iter->iov->iov_len - iter->iov_offset),
+	};
+}
+
+#define iov_for_each(iov, iter, start)				\
+	for (iter = (start);					\
+	     (iter).count &&					\
+	     ((iov = iov_iter_iovec(&(iter))), 1);		\
+	     iov_iter_advance(&(iter), (iov).iov_len))
+
 unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to);
 
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes);
+size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes);
+void iov_iter_advance(struct iov_iter *i, size_t bytes);
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
+size_t iov_iter_single_seg_count(const struct iov_iter *i);
+
+static inline void iov_iter_init(struct iov_iter *i,
+			const struct iovec *iov, unsigned long nr_segs,
+			size_t count, size_t written)
+{
+	i->iov = iov;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+
+static inline size_t iov_iter_count(struct iov_iter *i)
+{
+	return i->count;
+}
+
 int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
 int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len);
+
 #endif
-- 
cgit v0.10.2


From 6e58e79db8a16222b31fc8da1ca2ac2dccfc4237 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Feb 2014 17:07:03 -0500
Subject: introduce copy_page_to_iter, kill loop over iovec in
 generic_file_aio_read()

generic_file_aio_read() was looping over the target iovec, with loop over
(source) pages nested inside that.  Just set an iov_iter up and pass *that*
to do_generic_file_aio_read().  With copy_page_to_iter() doing all work
of mapping and copying a page to iovec and advancing iov_iter.

Switch shmem_file_aio_read() to the same and kill file_read_actor(), while
we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2261ac8..2a5b174 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2390,7 +2390,6 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
 		unsigned long size, pgoff_t pgoff);
-extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 347d70c..199bcc3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -67,6 +67,8 @@ size_t iov_iter_copy_from_user(struct page *page,
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
+size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+			 struct iov_iter *i);
 
 static inline void iov_iter_init(struct iov_iter *i,
 			const struct iovec *iov, unsigned long nr_segs,
diff --git a/mm/filemap.c b/mm/filemap.c
index bfb7a97..a16eb2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1085,11 +1085,90 @@ static void shrink_readahead_size_eio(struct file *filp,
 	ra->ra_pages /= 4;
 }
 
+size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+			 struct iov_iter *i)
+{
+	size_t skip, copy, left, wanted;
+	const struct iovec *iov;
+	char __user *buf;
+	void *kaddr, *from;
+
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	wanted = bytes;
+	iov = i->iov;
+	skip = i->iov_offset;
+	buf = iov->iov_base + skip;
+	copy = min(bytes, iov->iov_len - skip);
+
+	if (!fault_in_pages_writeable(buf, copy)) {
+		kaddr = kmap_atomic(page);
+		from = kaddr + offset;
+
+		/* first chunk, usually the only one */
+		left = __copy_to_user_inatomic(buf, from, copy);
+		copy -= left;
+		skip += copy;
+		from += copy;
+		bytes -= copy;
+
+		while (unlikely(!left && bytes)) {
+			iov++;
+			buf = iov->iov_base;
+			copy = min(bytes, iov->iov_len);
+			left = __copy_to_user_inatomic(buf, from, copy);
+			copy -= left;
+			skip = copy;
+			from += copy;
+			bytes -= copy;
+		}
+		if (likely(!bytes)) {
+			kunmap_atomic(kaddr);
+			goto done;
+		}
+		offset = from - kaddr;
+		buf += copy;
+		kunmap_atomic(kaddr);
+		copy = min(bytes, iov->iov_len - skip);
+	}
+	/* Too bad - revert to non-atomic kmap */
+	kaddr = kmap(page);
+	from = kaddr + offset;
+	left = __copy_to_user(buf, from, copy);
+	copy -= left;
+	skip += copy;
+	from += copy;
+	bytes -= copy;
+	while (unlikely(!left && bytes)) {
+		iov++;
+		buf = iov->iov_base;
+		copy = min(bytes, iov->iov_len);
+		left = __copy_to_user(buf, from, copy);
+		copy -= left;
+		skip = copy;
+		from += copy;
+		bytes -= copy;
+	}
+	kunmap(page);
+done:
+	i->count -= wanted - bytes;
+	i->nr_segs -= iov - i->iov;
+	i->iov = iov;
+	i->iov_offset = skip;
+	return wanted - bytes;
+}
+EXPORT_SYMBOL(copy_page_to_iter);
+
 /**
  * do_generic_file_read - generic file read routine
  * @filp:	the file to read
  * @ppos:	current file position
- * @desc:	read_descriptor
+ * @iter:	data destination
+ * @written:	already copied
  *
  * This is a generic file read routine, and uses the
  * mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1097,8 +1176,8 @@ static void shrink_readahead_size_eio(struct file *filp,
  * This is really ugly. But the goto's actually try to clarify some
  * of the logic when it comes to error handling etc.
  */
-static void do_generic_file_read(struct file *filp, loff_t *ppos,
-		read_descriptor_t *desc)
+static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+		struct iov_iter *iter, ssize_t written)
 {
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1108,12 +1187,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
 	pgoff_t prev_index;
 	unsigned long offset;      /* offset into pagecache page */
 	unsigned int prev_offset;
-	int error;
+	int error = 0;
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
 	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
-	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
 	for (;;) {
@@ -1148,7 +1227,7 @@ find_page:
 			if (!page->mapping)
 				goto page_not_up_to_date_locked;
 			if (!mapping->a_ops->is_partially_uptodate(page,
-							offset, desc->count))
+							offset, iter->count))
 				goto page_not_up_to_date_locked;
 			unlock_page(page);
 		}
@@ -1198,24 +1277,23 @@ page_ok:
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
 		 * now we can copy it to user space...
-		 *
-		 * The file_read_actor routine returns how many bytes were
-		 * actually used..
-		 * NOTE! This may not be the same as how much of a user buffer
-		 * we filled up (we may be padding etc), so we can only update
-		 * "pos" here (the actor routine has to update the user buffer
-		 * pointers and the remaining count).
 		 */
-		ret = file_read_actor(desc, page, offset, nr);
+
+		ret = copy_page_to_iter(page, offset, nr, iter);
 		offset += ret;
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
 		prev_offset = offset;
 
 		page_cache_release(page);
-		if (ret == nr && desc->count)
-			continue;
-		goto out;
+		written += ret;
+		if (!iov_iter_count(iter))
+			goto out;
+		if (ret < nr) {
+			error = -EFAULT;
+			goto out;
+		}
+		continue;
 
 page_not_up_to_date:
 		/* Get exclusive access to the page ... */
@@ -1250,6 +1328,7 @@ readpage:
 		if (unlikely(error)) {
 			if (error == AOP_TRUNCATED_PAGE) {
 				page_cache_release(page);
+				error = 0;
 				goto find_page;
 			}
 			goto readpage_error;
@@ -1280,7 +1359,6 @@ readpage:
 
 readpage_error:
 		/* UHHUH! A synchronous read error occurred. Report it */
-		desc->error = error;
 		page_cache_release(page);
 		goto out;
 
@@ -1291,16 +1369,17 @@ no_cached_page:
 		 */
 		page = page_cache_alloc_cold(mapping);
 		if (!page) {
-			desc->error = -ENOMEM;
+			error = -ENOMEM;
 			goto out;
 		}
 		error = add_to_page_cache_lru(page, mapping,
 						index, GFP_KERNEL);
 		if (error) {
 			page_cache_release(page);
-			if (error == -EEXIST)
+			if (error == -EEXIST) {
+				error = 0;
 				goto find_page;
-			desc->error = error;
+			}
 			goto out;
 		}
 		goto readpage;
@@ -1313,44 +1392,7 @@ out:
 
 	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
 	file_accessed(filp);
-}
-
-int file_read_actor(read_descriptor_t *desc, struct page *page,
-			unsigned long offset, unsigned long size)
-{
-	char *kaddr;
-	unsigned long left, count = desc->count;
-
-	if (size > count)
-		size = count;
-
-	/*
-	 * Faults on the destination of a read are common, so do it before
-	 * taking the kmap.
-	 */
-	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
-		kaddr = kmap_atomic(page);
-		left = __copy_to_user_inatomic(desc->arg.buf,
-						kaddr + offset, size);
-		kunmap_atomic(kaddr);
-		if (left == 0)
-			goto success;
-	}
-
-	/* Do it the slow way */
-	kaddr = kmap(page);
-	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
-	kunmap(page);
-
-	if (left) {
-		size -= left;
-		desc->error = -EFAULT;
-	}
-success:
-	desc->count = count - size;
-	desc->written += size;
-	desc->arg.buf += size;
-	return size;
+	return written ? written : error;
 }
 
 /*
@@ -1408,14 +1450,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *filp = iocb->ki_filp;
 	ssize_t retval;
-	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct iov_iter i;
 
 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;
+	iov_iter_init(&i, iov, nr_segs, count, 0);
 
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
@@ -1437,6 +1480,11 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (retval > 0) {
 			*ppos = pos + retval;
 			count -= retval;
+			/*
+			 * If we did a short DIO read we need to skip the
+			 * section of the iov that we've already read data into.
+			 */
+			iov_iter_advance(&i, retval);
 		}
 
 		/*
@@ -1453,39 +1501,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	count = retval;
-	for (seg = 0; seg < nr_segs; seg++) {
-		read_descriptor_t desc;
-		loff_t offset = 0;
-
-		/*
-		 * If we did a short DIO read we need to skip the section of the
-		 * iov that we've already read data into.
-		 */
-		if (count) {
-			if (count > iov[seg].iov_len) {
-				count -= iov[seg].iov_len;
-				continue;
-			}
-			offset = count;
-			count = 0;
-		}
-
-		desc.written = 0;
-		desc.arg.buf = iov[seg].iov_base + offset;
-		desc.count = iov[seg].iov_len - offset;
-		if (desc.count == 0)
-			continue;
-		desc.error = 0;
-		do_generic_file_read(filp, ppos, &desc);
-		retval += desc.written;
-		if (desc.error) {
-			retval = retval ?: desc.error;
-			break;
-		}
-		if (desc.count > 0)
-			break;
-	}
+	retval = do_generic_file_read(filp, ppos, &i, retval);
 out:
 	return retval;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 9398e6c..17d3799d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1462,13 +1462,25 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	return copied;
 }
 
-static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc)
+static ssize_t shmem_file_aio_read(struct kiocb *iocb,
+		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
 {
-	struct inode *inode = file_inode(filp);
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	pgoff_t index;
 	unsigned long offset;
 	enum sgp_type sgp = SGP_READ;
+	int error;
+	ssize_t retval;
+	size_t count;
+	loff_t *ppos = &iocb->ki_pos;
+	struct iov_iter iter;
+
+	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+	if (retval)
+		return retval;
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
 
 	/*
 	 * Might this read be for a stacking filesystem?  Then when reading
@@ -1496,10 +1508,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 				break;
 		}
 
-		desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
-		if (desc->error) {
-			if (desc->error == -EINVAL)
-				desc->error = 0;
+		error = shmem_getpage(inode, index, &page, sgp, NULL);
+		if (error) {
+			if (error == -EINVAL)
+				error = 0;
 			break;
 		}
 		if (page)
@@ -1543,61 +1555,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
 		 * now we can copy it to user space...
-		 *
-		 * The actor routine returns how many bytes were actually used..
-		 * NOTE! This may not be the same as how much of a user buffer
-		 * we filled up (we may be padding etc), so we can only update
-		 * "pos" here (the actor routine has to update the user buffer
-		 * pointers and the remaining count).
 		 */
-		ret = file_read_actor(desc, page, offset, nr);
+		ret = copy_page_to_iter(page, offset, nr, &iter);
+		retval += ret;
 		offset += ret;
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
 
 		page_cache_release(page);
-		if (ret != nr || !desc->count)
+		if (!iov_iter_count(&iter))
 			break;
-
+		if (ret < nr) {
+			error = -EFAULT;
+			break;
+		}
 		cond_resched();
 	}
 
 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
-	file_accessed(filp);
-}
-
-static ssize_t shmem_file_aio_read(struct kiocb *iocb,
-		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
-{
-	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg;
-	size_t count;
-	loff_t *ppos = &iocb->ki_pos;
-
-	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-	if (retval)
-		return retval;
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		read_descriptor_t desc;
-
-		desc.written = 0;
-		desc.arg.buf = iov[seg].iov_base;
-		desc.count = iov[seg].iov_len;
-		if (desc.count == 0)
-			continue;
-		desc.error = 0;
-		do_shmem_file_read(filp, ppos, &desc);
-		retval += desc.written;
-		if (desc.error) {
-			retval = retval ?: desc.error;
-			break;
-		}
-		if (desc.count > 0)
-			break;
-	}
-	return retval;
+	file_accessed(file);
+	return retval ? retval : error;
 }
 
 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
-- 
cgit v0.10.2


From 74027f4a181754e917853bd1d2e21449f008ab39 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 4 Feb 2014 13:47:26 -0500
Subject: cifs_iovec_read(): resubmit shouldn't restart the loop

... by that point the request we'd just resent is in the
head of the list anyway.  Just return to the beginning of
the loop body...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 834fce7..df414db 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2918,8 +2918,8 @@ error:
 		rc = 0;
 
 	/* the loop below should proceed in the order of increasing offsets */
-restart_loop:
 	list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+	again:
 		if (!rc) {
 			ssize_t copied;
 
@@ -2927,20 +2927,20 @@ restart_loop:
 			rc = wait_for_completion_killable(&rdata->done);
 			if (rc)
 				rc = -EINTR;
-			else if (rdata->result)
+			else if (rdata->result) {
 				rc = rdata->result;
-			else {
+				/* resend call if it's a retryable error */
+				if (rc == -EAGAIN) {
+					rc = cifs_retry_async_readv(rdata);
+					goto again;
+				}
+			} else {
 				rc = cifs_readdata_to_iov(rdata, iov,
 							nr_segs, *poffset,
 							&copied);
 				total_read += copied;
 			}
 
-			/* resend call if it's a retryable error */
-			if (rc == -EAGAIN) {
-				rc = cifs_retry_async_readv(rdata);
-				goto restart_loop;
-			}
 		}
 		list_del_init(&rdata->list);
 		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
-- 
cgit v0.10.2


From 637b58c2887e5e57850865839cc75f59184b23d1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Feb 2014 19:11:42 -0500
Subject: switch pipe_read() to copy_page_to_iter()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/pipe.c b/fs/pipe.c
index 6679c95..034bffa 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
 	return 0;
 }
 
-static int
-pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
-		      int atomic)
-{
-	unsigned long copy;
-
-	while (len > 0) {
-		while (!iov->iov_len)
-			iov++;
-		copy = min_t(unsigned long, len, iov->iov_len);
-
-		if (atomic) {
-			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
-				return -EFAULT;
-		} else {
-			if (copy_to_user(iov->iov_base, from, copy))
-				return -EFAULT;
-		}
-		from += copy;
-		len -= copy;
-		iov->iov_base += copy;
-		iov->iov_len -= copy;
-	}
-	return 0;
-}
-
-/*
- * Attempt to pre-fault in the user memory, so we can use atomic copies.
- * Returns the number of bytes not faulted in.
- */
-static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
-{
-	while (!iov->iov_len)
-		iov++;
-
-	while (len > 0) {
-		unsigned long this_len;
-
-		this_len = min_t(unsigned long, len, iov->iov_len);
-		if (fault_in_pages_writeable(iov->iov_base, this_len))
-			break;
-
-		len -= this_len;
-		iov++;
-	}
-
-	return len;
-}
-
 /*
  * Pre-fault in the user memory, so we can use atomic copies.
  */
@@ -329,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 	ssize_t ret;
 	struct iovec *iov = (struct iovec *)_iov;
 	size_t total_len;
+	struct iov_iter iter;
 
 	total_len = iov_length(iov, nr_segs);
 	/* Null read succeeds. */
 	if (unlikely(total_len == 0))
 		return 0;
 
+	iov_iter_init(&iter, iov, nr_segs, total_len, 0);
+
 	do_wakeup = 0;
 	ret = 0;
 	__pipe_lock(pipe);
@@ -344,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 			int curbuf = pipe->curbuf;
 			struct pipe_buffer *buf = pipe->bufs + curbuf;
 			const struct pipe_buf_operations *ops = buf->ops;
-			void *addr;
 			size_t chars = buf->len;
-			int error, atomic;
+			size_t written;
+			int error;
 
 			if (chars > total_len)
 				chars = total_len;
@@ -358,27 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 				break;
 			}
 
-			atomic = !iov_fault_in_pages_write(iov, chars);
-redo:
-			if (atomic)
-				addr = kmap_atomic(buf->page);
-			else
-				addr = kmap(buf->page);
-			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
-			if (atomic)
-				kunmap_atomic(addr);
-			else
-				kunmap(buf->page);
-			if (unlikely(error)) {
-				/*
-				 * Just retry with the slow path if we failed.
-				 */
-				if (atomic) {
-					atomic = 0;
-					goto redo;
-				}
+			written = copy_page_to_iter(buf->page, buf->offset, chars, &iter);
+			if (unlikely(written < chars)) {
 				if (!ret)
-					ret = error;
+					ret = -EFAULT;
 				break;
 			}
 			ret += chars;
-- 
cgit v0.10.2


From 6130f5315ee80a591285a25957af71621bd0f17e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Feb 2014 18:19:51 -0500
Subject: switch vmsplice_to_user() to copy_page_to_iter()

I've switched the sanity checks on iovec to rw_copy_check_uvector();
we might need to do a local analog, if any behaviour differences are
not actually bugfixes here...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/splice.c b/fs/splice.c
index ca3bfbd..9bc07d2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1520,116 +1520,48 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 			struct splice_desc *sd)
 {
-	char *src;
-	int ret;
-
-	/*
-	 * See if we can use the atomic maps, by prefaulting in the
-	 * pages and doing an atomic copy
-	 */
-	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
-		src = kmap_atomic(buf->page);
-		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
-							sd->len);
-		kunmap_atomic(src);
-		if (!ret) {
-			ret = sd->len;
-			goto out;
-		}
-	}
-
-	/*
-	 * No dice, use slow non-atomic map and copy
- 	 */
-	src = kmap(buf->page);
-
-	ret = sd->len;
-	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
-		ret = -EFAULT;
-
-	kunmap(buf->page);
-out:
-	if (ret > 0)
-		sd->u.userptr += ret;
-	return ret;
+	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
+	return n == sd->len ? n : -EFAULT;
 }
 
 /*
  * For lack of a better implementation, implement vmsplice() to userspace
  * as a simple copy of the pipes pages to the user iov.
  */
-static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
+static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
 			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
 	struct splice_desc sd;
-	ssize_t size;
-	int error;
 	long ret;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+	ssize_t count = 0;
 
 	pipe = get_pipe_info(file);
 	if (!pipe)
 		return -EBADF;
 
-	pipe_lock(pipe);
-
-	error = ret = 0;
-	while (nr_segs) {
-		void __user *base;
-		size_t len;
-
-		/*
-		 * Get user address base and length for this iovec.
-		 */
-		error = get_user(base, &iov->iov_base);
-		if (unlikely(error))
-			break;
-		error = get_user(len, &iov->iov_len);
-		if (unlikely(error))
-			break;
-
-		/*
-		 * Sanity check this iovec. 0 read succeeds.
-		 */
-		if (unlikely(!len))
-			break;
-		if (unlikely(!base)) {
-			error = -EFAULT;
-			break;
-		}
-
-		if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
-			error = -EFAULT;
-			break;
-		}
-
-		sd.len = 0;
-		sd.total_len = len;
-		sd.flags = flags;
-		sd.u.userptr = base;
-		sd.pos = 0;
-
-		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
-		if (size < 0) {
-			if (!ret)
-				ret = size;
-
-			break;
-		}
-
-		ret += size;
+	ret = rw_copy_check_uvector(READ, uiov, nr_segs,
+				    ARRAY_SIZE(iovstack), iovstack, &iov);
+	if (ret <= 0)
+		return ret;
 
-		if (size < len)
-			break;
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
 
-		nr_segs--;
-		iov++;
-	}
+	sd.len = 0;
+	sd.total_len = count;
+	sd.flags = flags;
+	sd.u.data = &iter;
+	sd.pos = 0;
 
+	pipe_lock(pipe);
+	ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
 	pipe_unlock(pipe);
 
-	if (!ret)
-		ret = error;
+	if (iov != iovstack)
+		kfree(iov);
 
 	return ret;
 }
-- 
cgit v0.10.2


From 7f25bba819a38ab7310024a9350655f374707e20 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 4 Feb 2014 14:07:43 -0500
Subject: cifs_iovec_read: keep iov_iter between the calls of
 cifs_readdata_to_iov()

... we are doing them on adjacent parts of file, so what happens is that
each subsequent call works to rebuild the iov_iter to exact state it
had been abandoned in by previous one.  Just keep it through the entire
cifs_iovec_read().  And use copy_page_to_iter() instead of doing
kmap/copy_to_user/kunmap manually...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index df414db..ad63e47 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2727,56 +2727,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
 /**
  * cifs_readdata_to_iov - copy data from pages in response to an iovec
  * @rdata:	the readdata response with list of pages holding data
- * @iov:	vector in which we should copy the data
- * @nr_segs:	number of segments in vector
- * @offset:	offset into file of the first iovec
- * @copied:	used to return the amount of data copied to the iov
+ * @iter:	destination for our data
  *
  * This function copies data from a list of pages in a readdata response into
  * an array of iovecs. It will first calculate where the data should go
  * based on the info in the readdata and then copy the data into that spot.
  */
-static ssize_t
-cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
-			unsigned long nr_segs, loff_t offset, ssize_t *copied)
+static int
+cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
 {
-	int rc = 0;
-	struct iov_iter ii;
-	size_t pos = rdata->offset - offset;
-	ssize_t remaining = rdata->bytes;
-	unsigned char *pdata;
+	size_t remaining = rdata->bytes;
 	unsigned int i;
 
-	/* set up iov_iter and advance to the correct offset */
-	iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
-	iov_iter_advance(&ii, pos);
-
-	*copied = 0;
 	for (i = 0; i < rdata->nr_pages; i++) {
-		ssize_t copy;
 		struct page *page = rdata->pages[i];
-
-		/* copy a whole page or whatever's left */
-		copy = min_t(ssize_t, remaining, PAGE_SIZE);
-
-		/* ...but limit it to whatever space is left in the iov */
-		copy = min_t(ssize_t, copy, iov_iter_count(&ii));
-
-		/* go while there's data to be copied and no errors */
-		if (copy && !rc) {
-			pdata = kmap(page);
-			rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
-						(int)copy);
-			kunmap(page);
-			if (!rc) {
-				*copied += copy;
-				remaining -= copy;
-				iov_iter_advance(&ii, copy);
-			}
-		}
+		size_t copy = min(remaining, PAGE_SIZE);
+		size_t written = copy_page_to_iter(page, 0, copy, iter);
+		remaining -= written;
+		if (written < copy && iov_iter_count(iter) > 0)
+			break;
 	}
-
-	return rc;
+	return remaining ? -EFAULT : 0;
 }
 
 static void
@@ -2851,6 +2822,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 	struct cifsFileInfo *open_file;
 	struct cifs_readdata *rdata, *tmp;
 	struct list_head rdata_list;
+	struct iov_iter to;
 	pid_t pid;
 
 	if (!nr_segs)
@@ -2860,6 +2832,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 	if (!len)
 		return 0;
 
+	iov_iter_init(&to, iov, nr_segs, len, 0);
+
 	INIT_LIST_HEAD(&rdata_list);
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	open_file = file->private_data;
@@ -2917,12 +2891,11 @@ error:
 	if (!list_empty(&rdata_list))
 		rc = 0;
 
+	len = iov_iter_count(&to);
 	/* the loop below should proceed in the order of increasing offsets */
 	list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
 	again:
 		if (!rc) {
-			ssize_t copied;
-
 			/* FIXME: freezable sleep too? */
 			rc = wait_for_completion_killable(&rdata->done);
 			if (rc)
@@ -2935,10 +2908,7 @@ error:
 					goto again;
 				}
 			} else {
-				rc = cifs_readdata_to_iov(rdata, iov,
-							nr_segs, *poffset,
-							&copied);
-				total_read += copied;
+				rc = cifs_readdata_to_iov(rdata, &to);
 			}
 
 		}
@@ -2946,6 +2916,8 @@ error:
 		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
 	}
 
+	total_read = len - iov_iter_count(&to);
+
 	cifs_stats_bytes_read(tcon, total_read);
 	*poffset += total_read;
 
-- 
cgit v0.10.2


From 0165e8100be3b2b81f747ebc25418656404c61b8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 4 Feb 2014 14:19:48 -0500
Subject: fold cifs_iovec_read() into its (only) caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ad63e47..3443b8f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2808,14 +2808,14 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
 	return total_read > 0 ? total_read : result;
 }
 
-static ssize_t
-cifs_iovec_read(struct file *file, const struct iovec *iov,
-		 unsigned long nr_segs, loff_t *poffset)
+ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
 {
+	struct file *file = iocb->ki_filp;
 	ssize_t rc;
 	size_t len, cur_len;
 	ssize_t total_read = 0;
-	loff_t offset = *poffset;
+	loff_t offset = pos;
 	unsigned int npages;
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_tcon *tcon;
@@ -2919,25 +2919,16 @@ error:
 	total_read = len - iov_iter_count(&to);
 
 	cifs_stats_bytes_read(tcon, total_read);
-	*poffset += total_read;
 
 	/* mask nodata case */
 	if (rc == -ENODATA)
 		rc = 0;
 
-	return total_read ? total_read : rc;
-}
-
-ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos)
-{
-	ssize_t read;
-
-	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
-	if (read > 0)
-		iocb->ki_pos = pos;
-
-	return read;
+	if (total_read) {
+		iocb->ki_pos = pos + total_read;
+		return total_read;
+	}
+	return rc;
 }
 
 ssize_t
-- 
cgit v0.10.2


From ec69557982563c97b3a7d68dd271be5105b83869 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 4 Feb 2014 19:08:21 -0500
Subject: read_code(): go through vfs_read() instead of calling the method
 directly

... and don't skip on sanity checks.  It's *not* a hot path, TYVM
(a couple of calls per a.out execve(), for pity sake) and headers of
random a.out binary are not to be trusted.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/exec.c b/fs/exec.c
index 3d78fcc..4cc9453 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -810,7 +810,7 @@ EXPORT_SYMBOL(kernel_read);
 
 ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
 {
-	ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos);
+	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
 	if (res > 0)
 		flush_icache_range(addr, addr + len);
 	return res;
-- 
cgit v0.10.2


From 480402e18def5514c9dc8cb04e3c0e7482ff2b86 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 10:14:01 -0500
Subject: untangling process_vm_..., part 1

we want to massage it to use of iov_iter.  This one is an equivalent
transformation - just introduce a local variable mirroring
lvec + *lvec_current.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index fd26d04..7b8d63e 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -60,6 +60,7 @@ static int process_vm_rw_pages(struct task_struct *task,
 	int ret;
 	ssize_t bytes_to_copy;
 	ssize_t rc = 0;
+	const struct iovec *iov = lvec + *lvec_current;
 
 	*bytes_copied = 0;
 
@@ -81,8 +82,10 @@ static int process_vm_rw_pages(struct task_struct *task,
 	     pgs_copied++) {
 		/* Make sure we have a non zero length iovec */
 		while (*lvec_current < lvec_cnt
-		       && lvec[*lvec_current].iov_len == 0)
+		       && iov->iov_len == 0) {
+			iov++;
 			(*lvec_current)++;
+		}
 		if (*lvec_current == lvec_cnt)
 			break;
 
@@ -94,18 +97,18 @@ static int process_vm_rw_pages(struct task_struct *task,
 		bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
 				      len - *bytes_copied);
 		bytes_to_copy = min_t(ssize_t, bytes_to_copy,
-				      lvec[*lvec_current].iov_len
+				      iov->iov_len
 				      - *lvec_offset);
 
 		target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
 
 		if (vm_write)
 			ret = copy_from_user(target_kaddr,
-					     lvec[*lvec_current].iov_base
+					     iov->iov_base
 					     + *lvec_offset,
 					     bytes_to_copy);
 		else
-			ret = copy_to_user(lvec[*lvec_current].iov_base
+			ret = copy_to_user(iov->iov_base
 					   + *lvec_offset,
 					   target_kaddr, bytes_to_copy);
 		kunmap(process_pages[pgs_copied]);
@@ -117,12 +120,13 @@ static int process_vm_rw_pages(struct task_struct *task,
 		}
 		*bytes_copied += bytes_to_copy;
 		*lvec_offset += bytes_to_copy;
-		if (*lvec_offset == lvec[*lvec_current].iov_len) {
+		if (*lvec_offset == iov->iov_len) {
 			/*
 			 * Need to copy remaining part of page into the
 			 * next iovec if there are any bytes left in page
 			 */
 			(*lvec_current)++;
+			iov++;
 			*lvec_offset = 0;
 			start_offset = (start_offset + bytes_to_copy)
 				% PAGE_SIZE;
-- 
cgit v0.10.2


From c61c70384fad407bfcb066c3fb9271164d630212 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 10:22:50 -0500
Subject: untangling process_vm_..., part 2

move iov to caller's stack frame; the value we assign to it on the
next call of process_vm_rw_pages() is equal to the value it had
when the last time we were leaving process_vm_rw_pages().

drop lvec argument of process_vm_rw_pages() - it's not used anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 7b8d63e..186ec5d 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -45,7 +45,7 @@ static int process_vm_rw_pages(struct task_struct *task,
 			       unsigned long pa,
 			       unsigned long start_offset,
 			       unsigned long len,
-			       const struct iovec *lvec,
+			       const struct iovec **iovp,
 			       unsigned long lvec_cnt,
 			       unsigned long *lvec_current,
 			       size_t *lvec_offset,
@@ -60,7 +60,7 @@ static int process_vm_rw_pages(struct task_struct *task,
 	int ret;
 	ssize_t bytes_to_copy;
 	ssize_t rc = 0;
-	const struct iovec *iov = lvec + *lvec_current;
+	const struct iovec *iov = *iovp;
 
 	*bytes_copied = 0;
 
@@ -149,6 +149,7 @@ end:
 			put_page(process_pages[j]);
 	}
 
+	*iovp = iov;
 	return rc;
 }
 
@@ -192,6 +193,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
 	unsigned long nr_pages_to_copy;
 	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
 		/ sizeof(struct pages *);
+	const struct iovec *iov = lvec + *lvec_current;
 
 	*bytes_copied = 0;
 
@@ -206,7 +208,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
 
 		rc = process_vm_rw_pages(task, mm, process_pages, pa,
 					 start_offset, len,
-					 lvec, lvec_cnt,
+					 &iov, lvec_cnt,
 					 lvec_current, lvec_offset,
 					 vm_write, nr_pages_to_copy,
 					 &bytes_copied_loop);
-- 
cgit v0.10.2


From 12e3004e464f554e0d57ddd0b4185060275e1f12 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 10:28:58 -0500
Subject: untangling process_vm_..., part 3

lift iov one more level out - from process_vm_rw_single_vec to
process_vm_rw_core().  Same story as with the previous commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 186ec5d..40dfb39 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -174,7 +174,7 @@ end:
  */
 static int process_vm_rw_single_vec(unsigned long addr,
 				    unsigned long len,
-				    const struct iovec *lvec,
+				    const struct iovec **iovp,
 				    unsigned long lvec_cnt,
 				    unsigned long *lvec_current,
 				    size_t *lvec_offset,
@@ -193,7 +193,6 @@ static int process_vm_rw_single_vec(unsigned long addr,
 	unsigned long nr_pages_to_copy;
 	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
 		/ sizeof(struct pages *);
-	const struct iovec *iov = lvec + *lvec_current;
 
 	*bytes_copied = 0;
 
@@ -208,7 +207,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
 
 		rc = process_vm_rw_pages(task, mm, process_pages, pa,
 					 start_offset, len,
-					 &iov, lvec_cnt,
+					 iovp, lvec_cnt,
 					 lvec_current, lvec_offset,
 					 vm_write, nr_pages_to_copy,
 					 &bytes_copied_loop);
@@ -319,7 +318,7 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
 	for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
 		rc = process_vm_rw_single_vec(
 			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
-			lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
+			&lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
 			process_pages, mm, task, vm_write, &bytes_copied_loop);
 		bytes_copied += bytes_copied_loop;
 		if (rc != 0) {
-- 
cgit v0.10.2


From 1291afc18115885cf3ff265a5dc836393060e820 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 10:40:15 -0500
Subject: untangling process_vm_..., part 4

instead of passing vector size (by value) and index (by reference),
pass the number of elements remaining.  That's all we care about
in these functions by that point.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 40dfb39..4bbd495 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -46,8 +46,7 @@ static int process_vm_rw_pages(struct task_struct *task,
 			       unsigned long start_offset,
 			       unsigned long len,
 			       const struct iovec **iovp,
-			       unsigned long lvec_cnt,
-			       unsigned long *lvec_current,
+			       unsigned long *left,
 			       size_t *lvec_offset,
 			       int vm_write,
 			       unsigned int nr_pages_to_copy,
@@ -78,15 +77,14 @@ static int process_vm_rw_pages(struct task_struct *task,
 
 	/* Do the copy for each page */
 	for (pgs_copied = 0;
-	     (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
+	     (pgs_copied < nr_pages_to_copy) && *left;
 	     pgs_copied++) {
 		/* Make sure we have a non zero length iovec */
-		while (*lvec_current < lvec_cnt
-		       && iov->iov_len == 0) {
+		while (*left && iov->iov_len == 0) {
 			iov++;
-			(*lvec_current)++;
+			(*left)--;
 		}
-		if (*lvec_current == lvec_cnt)
+		if (!*left)
 			break;
 
 		/*
@@ -125,7 +123,7 @@ static int process_vm_rw_pages(struct task_struct *task,
 			 * Need to copy remaining part of page into the
 			 * next iovec if there are any bytes left in page
 			 */
-			(*lvec_current)++;
+			(*left)--;
 			iov++;
 			*lvec_offset = 0;
 			start_offset = (start_offset + bytes_to_copy)
@@ -175,8 +173,7 @@ end:
 static int process_vm_rw_single_vec(unsigned long addr,
 				    unsigned long len,
 				    const struct iovec **iovp,
-				    unsigned long lvec_cnt,
-				    unsigned long *lvec_current,
+				    unsigned long *left,
 				    size_t *lvec_offset,
 				    struct page **process_pages,
 				    struct mm_struct *mm,
@@ -201,14 +198,14 @@ static int process_vm_rw_single_vec(unsigned long addr,
 		return 0;
 	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
 
-	while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
+	while ((nr_pages_copied < nr_pages) && *left) {
 		nr_pages_to_copy = min(nr_pages - nr_pages_copied,
 				       max_pages_per_loop);
 
 		rc = process_vm_rw_pages(task, mm, process_pages, pa,
 					 start_offset, len,
-					 iovp, lvec_cnt,
-					 lvec_current, lvec_offset,
+					 iovp, left,
+					 lvec_offset,
 					 vm_write, nr_pages_to_copy,
 					 &bytes_copied_loop);
 		start_offset = 0;
@@ -259,7 +256,7 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
 	ssize_t bytes_copied = 0;
 	unsigned long nr_pages = 0;
 	unsigned long nr_pages_iov;
-	unsigned long iov_l_curr_idx = 0;
+	unsigned long left = liovcnt;
 	size_t iov_l_curr_offset = 0;
 	ssize_t iov_len;
 
@@ -315,10 +312,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
 		goto put_task_struct;
 	}
 
-	for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
+	for (i = 0; i < riovcnt && left; i++) {
 		rc = process_vm_rw_single_vec(
 			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
-			&lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
+			&lvec, &left, &iov_l_curr_offset,
 			process_pages, mm, task, vm_write, &bytes_copied_loop);
 		bytes_copied += bytes_copied_loop;
 		if (rc != 0) {
-- 
cgit v0.10.2


From 9f78bdfabf0de0bc8d3cc97d06908651b577a567 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 11:51:53 -0500
Subject: process_vm_access: switch to iov_iter

instead of keeping its pieces in separate variables and passing
pointers to all of them...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 4bbd495..1e3c811 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -45,9 +45,7 @@ static int process_vm_rw_pages(struct task_struct *task,
 			       unsigned long pa,
 			       unsigned long start_offset,
 			       unsigned long len,
-			       const struct iovec **iovp,
-			       unsigned long *left,
-			       size_t *lvec_offset,
+			       struct iov_iter *iter,
 			       int vm_write,
 			       unsigned int nr_pages_to_copy,
 			       ssize_t *bytes_copied)
@@ -59,7 +57,7 @@ static int process_vm_rw_pages(struct task_struct *task,
 	int ret;
 	ssize_t bytes_to_copy;
 	ssize_t rc = 0;
-	const struct iovec *iov = *iovp;
+	const struct iovec *iov = iter->iov;
 
 	*bytes_copied = 0;
 
@@ -77,14 +75,14 @@ static int process_vm_rw_pages(struct task_struct *task,
 
 	/* Do the copy for each page */
 	for (pgs_copied = 0;
-	     (pgs_copied < nr_pages_to_copy) && *left;
+	     (pgs_copied < nr_pages_to_copy) && iter->nr_segs;
 	     pgs_copied++) {
 		/* Make sure we have a non zero length iovec */
-		while (*left && iov->iov_len == 0) {
+		while (iter->nr_segs && iov->iov_len == 0) {
 			iov++;
-			(*left)--;
+			iter->nr_segs--;
 		}
-		if (!*left)
+		if (!iter->nr_segs)
 			break;
 
 		/*
@@ -96,18 +94,18 @@ static int process_vm_rw_pages(struct task_struct *task,
 				      len - *bytes_copied);
 		bytes_to_copy = min_t(ssize_t, bytes_to_copy,
 				      iov->iov_len
-				      - *lvec_offset);
+				      - iter->iov_offset);
 
 		target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
 
 		if (vm_write)
 			ret = copy_from_user(target_kaddr,
 					     iov->iov_base
-					     + *lvec_offset,
+					     + iter->iov_offset,
 					     bytes_to_copy);
 		else
 			ret = copy_to_user(iov->iov_base
-					   + *lvec_offset,
+					   + iter->iov_offset,
 					   target_kaddr, bytes_to_copy);
 		kunmap(process_pages[pgs_copied]);
 		if (ret) {
@@ -117,15 +115,15 @@ static int process_vm_rw_pages(struct task_struct *task,
 			goto end;
 		}
 		*bytes_copied += bytes_to_copy;
-		*lvec_offset += bytes_to_copy;
-		if (*lvec_offset == iov->iov_len) {
+		iter->iov_offset += bytes_to_copy;
+		if (iter->iov_offset == iov->iov_len) {
 			/*
 			 * Need to copy remaining part of page into the
 			 * next iovec if there are any bytes left in page
 			 */
-			(*left)--;
+			iter->nr_segs--;
 			iov++;
-			*lvec_offset = 0;
+			iter->iov_offset = 0;
 			start_offset = (start_offset + bytes_to_copy)
 				% PAGE_SIZE;
 			if (start_offset)
@@ -147,7 +145,7 @@ end:
 			put_page(process_pages[j]);
 	}
 
-	*iovp = iov;
+	iter->iov = iov;
 	return rc;
 }
 
@@ -172,9 +170,7 @@ end:
  */
 static int process_vm_rw_single_vec(unsigned long addr,
 				    unsigned long len,
-				    const struct iovec **iovp,
-				    unsigned long *left,
-				    size_t *lvec_offset,
+				    struct iov_iter *iter,
 				    struct page **process_pages,
 				    struct mm_struct *mm,
 				    struct task_struct *task,
@@ -198,14 +194,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
 		return 0;
 	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
 
-	while ((nr_pages_copied < nr_pages) && *left) {
+	while ((nr_pages_copied < nr_pages) && iter->nr_segs) {
 		nr_pages_to_copy = min(nr_pages - nr_pages_copied,
 				       max_pages_per_loop);
 
 		rc = process_vm_rw_pages(task, mm, process_pages, pa,
-					 start_offset, len,
-					 iovp, left,
-					 lvec_offset,
+					 start_offset, len, iter,
 					 vm_write, nr_pages_to_copy,
 					 &bytes_copied_loop);
 		start_offset = 0;
@@ -240,8 +234,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
  *  return less bytes than expected if an error occurs during the copying
  *  process.
  */
-static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
-				  unsigned long liovcnt,
+static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 				  const struct iovec *rvec,
 				  unsigned long riovcnt,
 				  unsigned long flags, int vm_write)
@@ -256,8 +249,6 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
 	ssize_t bytes_copied = 0;
 	unsigned long nr_pages = 0;
 	unsigned long nr_pages_iov;
-	unsigned long left = liovcnt;
-	size_t iov_l_curr_offset = 0;
 	ssize_t iov_len;
 
 	/*
@@ -312,11 +303,11 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
 		goto put_task_struct;
 	}
 
-	for (i = 0; i < riovcnt && left; i++) {
+	for (i = 0; i < riovcnt && iter->nr_segs; i++) {
 		rc = process_vm_rw_single_vec(
 			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
-			&lvec, &left, &iov_l_curr_offset,
-			process_pages, mm, task, vm_write, &bytes_copied_loop);
+			iter, process_pages, mm, task, vm_write,
+			&bytes_copied_loop);
 		bytes_copied += bytes_copied_loop;
 		if (rc != 0) {
 			/* If we have managed to copy any data at all then
@@ -365,6 +356,7 @@ static ssize_t process_vm_rw(pid_t pid,
 	struct iovec iovstack_r[UIO_FASTIOV];
 	struct iovec *iov_l = iovstack_l;
 	struct iovec *iov_r = iovstack_r;
+	struct iov_iter iter;
 	ssize_t rc;
 
 	if (flags != 0)
@@ -380,13 +372,14 @@ static ssize_t process_vm_rw(pid_t pid,
 	if (rc <= 0)
 		goto free_iovecs;
 
+	iov_iter_init(&iter, iov_l, liovcnt, rc, 0);
+
 	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
 				   iovstack_r, &iov_r);
 	if (rc <= 0)
 		goto free_iovecs;
 
-	rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
-				vm_write);
+	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
 
 free_iovecs:
 	if (iov_r != iovstack_r)
@@ -426,6 +419,7 @@ compat_process_vm_rw(compat_pid_t pid,
 	struct iovec iovstack_r[UIO_FASTIOV];
 	struct iovec *iov_l = iovstack_l;
 	struct iovec *iov_r = iovstack_r;
+	struct iov_iter iter;
 	ssize_t rc = -EFAULT;
 
 	if (flags != 0)
@@ -441,14 +435,14 @@ compat_process_vm_rw(compat_pid_t pid,
 						  &iov_l);
 	if (rc <= 0)
 		goto free_iovecs;
+	iov_iter_init(&iter, iov_l, liovcnt, rc, 0);
 	rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
 					  UIO_FASTIOV, iovstack_r,
 					  &iov_r);
 	if (rc <= 0)
 		goto free_iovecs;
 
-	rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
-			   vm_write);
+	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
 
 free_iovecs:
 	if (iov_r != iovstack_r)
-- 
cgit v0.10.2


From 240f3905f513cd5f27cc3f79bf03bf3f88323f41 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 12:14:11 -0500
Subject: process_vm_access: switch to
 copy_page_to_iter/iov_iter_copy_from_user

... rather than open-coding those.  As a side benefit, we get much saner
loop calling those; we can just feed entire pages, instead of the "copy
would span the iovec boundary, let's do it in two loop iterations" mess.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 1e3c811..70ec315 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -51,13 +51,11 @@ static int process_vm_rw_pages(struct task_struct *task,
 			       ssize_t *bytes_copied)
 {
 	int pages_pinned;
-	void *target_kaddr;
 	int pgs_copied = 0;
 	int j;
 	int ret;
 	ssize_t bytes_to_copy;
 	ssize_t rc = 0;
-	const struct iovec *iov = iter->iov;
 
 	*bytes_copied = 0;
 
@@ -75,77 +73,34 @@ static int process_vm_rw_pages(struct task_struct *task,
 
 	/* Do the copy for each page */
 	for (pgs_copied = 0;
-	     (pgs_copied < nr_pages_to_copy) && iter->nr_segs;
+	     (pgs_copied < nr_pages_to_copy) && iov_iter_count(iter);
 	     pgs_copied++) {
-		/* Make sure we have a non zero length iovec */
-		while (iter->nr_segs && iov->iov_len == 0) {
-			iov++;
-			iter->nr_segs--;
+		struct page *page = process_pages[pgs_copied];
+		bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, len);
+
+		if (vm_write) {
+			if (bytes_to_copy > iov_iter_count(iter))
+				bytes_to_copy = iov_iter_count(iter);
+			ret = iov_iter_copy_from_user(page,
+					iter, start_offset, bytes_to_copy);
+			iov_iter_advance(iter, ret);
+			set_page_dirty_lock(page);
+		} else {
+			ret = copy_page_to_iter(page, start_offset,
+					bytes_to_copy, iter);
 		}
-		if (!iter->nr_segs)
-			break;
-
-		/*
-		 * Will copy smallest of:
-		 * - bytes remaining in page
-		 * - bytes remaining in destination iovec
-		 */
-		bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
-				      len - *bytes_copied);
-		bytes_to_copy = min_t(ssize_t, bytes_to_copy,
-				      iov->iov_len
-				      - iter->iov_offset);
-
-		target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
-
-		if (vm_write)
-			ret = copy_from_user(target_kaddr,
-					     iov->iov_base
-					     + iter->iov_offset,
-					     bytes_to_copy);
-		else
-			ret = copy_to_user(iov->iov_base
-					   + iter->iov_offset,
-					   target_kaddr, bytes_to_copy);
-		kunmap(process_pages[pgs_copied]);
-		if (ret) {
-			*bytes_copied += bytes_to_copy - ret;
-			pgs_copied++;
+		*bytes_copied += ret;
+		len -= ret;
+		if (ret < bytes_to_copy && iov_iter_count(iter)) {
 			rc = -EFAULT;
-			goto end;
-		}
-		*bytes_copied += bytes_to_copy;
-		iter->iov_offset += bytes_to_copy;
-		if (iter->iov_offset == iov->iov_len) {
-			/*
-			 * Need to copy remaining part of page into the
-			 * next iovec if there are any bytes left in page
-			 */
-			iter->nr_segs--;
-			iov++;
-			iter->iov_offset = 0;
-			start_offset = (start_offset + bytes_to_copy)
-				% PAGE_SIZE;
-			if (start_offset)
-				pgs_copied--;
-		} else {
-			start_offset = 0;
+			break;
 		}
+		start_offset = 0;
 	}
 
 end:
-	if (vm_write) {
-		for (j = 0; j < pages_pinned; j++) {
-			if (j < pgs_copied)
-				set_page_dirty_lock(process_pages[j]);
-			put_page(process_pages[j]);
-		}
-	} else {
-		for (j = 0; j < pages_pinned; j++)
-			put_page(process_pages[j]);
-	}
-
-	iter->iov = iov;
+	for (j = 0; j < pages_pinned; j++)
+		put_page(process_pages[j]);
 	return rc;
 }
 
@@ -194,7 +149,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
 		return 0;
 	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
 
-	while ((nr_pages_copied < nr_pages) && iter->nr_segs) {
+	while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) {
 		nr_pages_to_copy = min(nr_pages - nr_pages_copied,
 				       max_pages_per_loop);
 
@@ -303,7 +258,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 		goto put_task_struct;
 	}
 
-	for (i = 0; i < riovcnt && iter->nr_segs; i++) {
+	for (i = 0; i < riovcnt && iov_iter_count(iter); i++) {
 		rc = process_vm_rw_single_vec(
 			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
 			iter, process_pages, mm, task, vm_write,
-- 
cgit v0.10.2


From 70eca12d80a8b51800b4ebfbc629e0ee34166779 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 12:44:24 -0500
Subject: process_vm_access: take get_user_pages/put_pages one level up

... and trim the fuck out of process_vm_rw_pages() argument list.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 70ec315..c2a1916 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -39,69 +39,39 @@
  * @bytes_copied: returns number of bytes successfully copied
  * Returns 0 on success, error code otherwise
  */
-static int process_vm_rw_pages(struct task_struct *task,
-			       struct mm_struct *mm,
-			       struct page **process_pages,
-			       unsigned long pa,
-			       unsigned long start_offset,
+static int process_vm_rw_pages(struct page **pages,
+			       unsigned offset,
 			       unsigned long len,
 			       struct iov_iter *iter,
 			       int vm_write,
 			       unsigned int nr_pages_to_copy,
 			       ssize_t *bytes_copied)
 {
-	int pages_pinned;
-	int pgs_copied = 0;
-	int j;
-	int ret;
-	ssize_t bytes_to_copy;
-	ssize_t rc = 0;
-
 	*bytes_copied = 0;
 
-	/* Get the pages we're interested in */
-	down_read(&mm->mmap_sem);
-	pages_pinned = get_user_pages(task, mm, pa,
-				      nr_pages_to_copy,
-				      vm_write, 0, process_pages, NULL);
-	up_read(&mm->mmap_sem);
-
-	if (pages_pinned != nr_pages_to_copy) {
-		rc = -EFAULT;
-		goto end;
-	}
-
 	/* Do the copy for each page */
-	for (pgs_copied = 0;
-	     (pgs_copied < nr_pages_to_copy) && iov_iter_count(iter);
-	     pgs_copied++) {
-		struct page *page = process_pages[pgs_copied];
-		bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, len);
+	while (iov_iter_count(iter) && nr_pages_to_copy--) {
+		struct page *page = *pages++;
+		size_t copy = min_t(ssize_t, PAGE_SIZE - offset, len);
+		size_t copied;
 
 		if (vm_write) {
-			if (bytes_to_copy > iov_iter_count(iter))
-				bytes_to_copy = iov_iter_count(iter);
-			ret = iov_iter_copy_from_user(page,
-					iter, start_offset, bytes_to_copy);
-			iov_iter_advance(iter, ret);
+			if (copy > iov_iter_count(iter))
+				copy = iov_iter_count(iter);
+			copied = iov_iter_copy_from_user(page, iter,
+					offset, copy);
+			iov_iter_advance(iter, copied);
 			set_page_dirty_lock(page);
 		} else {
-			ret = copy_page_to_iter(page, start_offset,
-					bytes_to_copy, iter);
-		}
-		*bytes_copied += ret;
-		len -= ret;
-		if (ret < bytes_to_copy && iov_iter_count(iter)) {
-			rc = -EFAULT;
-			break;
+			copied = copy_page_to_iter(page, offset, copy, iter);
 		}
-		start_offset = 0;
+		*bytes_copied += copied;
+		len -= copied;
+		if (copied < copy && iov_iter_count(iter))
+			return -EFAULT;
+		offset = 0;
 	}
-
-end:
-	for (j = 0; j < pages_pinned; j++)
-		put_page(process_pages[j]);
-	return rc;
+	return 0;
 }
 
 /* Maximum number of pages kmalloc'd to hold struct page's during copy */
@@ -138,7 +108,6 @@ static int process_vm_rw_single_vec(unsigned long addr,
 	ssize_t bytes_copied_loop;
 	ssize_t rc = 0;
 	unsigned long nr_pages_copied = 0;
-	unsigned long nr_pages_to_copy;
 	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
 		/ sizeof(struct pages *);
 
@@ -150,23 +119,35 @@ static int process_vm_rw_single_vec(unsigned long addr,
 	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
 
 	while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) {
+		int nr_pages_to_copy;
+		int pages_pinned;
 		nr_pages_to_copy = min(nr_pages - nr_pages_copied,
 				       max_pages_per_loop);
 
-		rc = process_vm_rw_pages(task, mm, process_pages, pa,
+		/* Get the pages we're interested in */
+		down_read(&mm->mmap_sem);
+		pages_pinned = get_user_pages(task, mm, pa,
+					      nr_pages_to_copy,
+					      vm_write, 0, process_pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		if (pages_pinned <= 0)
+			return -EFAULT;
+
+		rc = process_vm_rw_pages(process_pages,
 					 start_offset, len, iter,
-					 vm_write, nr_pages_to_copy,
+					 vm_write, pages_pinned,
 					 &bytes_copied_loop);
 		start_offset = 0;
 		*bytes_copied += bytes_copied_loop;
+		len -= bytes_copied_loop;
+		nr_pages_copied += pages_pinned;
+		pa += pages_pinned * PAGE_SIZE;
+		while (pages_pinned)
+			put_page(process_pages[--pages_pinned]);
 
-		if (rc < 0) {
-			return rc;
-		} else {
-			len -= bytes_copied_loop;
-			nr_pages_copied += nr_pages_to_copy;
-			pa += nr_pages_to_copy * PAGE_SIZE;
-		}
+		if (rc < 0)
+			break;
 	}
 
 	return rc;
-- 
cgit v0.10.2


From e21345f9c3f8a806604910cae886e471a860ab86 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 12:55:11 -0500
Subject: process_vm_rw_pages(): pass accurate amount of bytes

... makes passing the amount of pages unnecessary

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index c2a1916..83252d7 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -41,20 +41,22 @@
  */
 static int process_vm_rw_pages(struct page **pages,
 			       unsigned offset,
-			       unsigned long len,
+			       size_t len,
 			       struct iov_iter *iter,
 			       int vm_write,
-			       unsigned int nr_pages_to_copy,
 			       ssize_t *bytes_copied)
 {
 	*bytes_copied = 0;
 
 	/* Do the copy for each page */
-	while (iov_iter_count(iter) && nr_pages_to_copy--) {
+	while (iov_iter_count(iter) && len) {
 		struct page *page = *pages++;
-		size_t copy = min_t(ssize_t, PAGE_SIZE - offset, len);
+		size_t copy = PAGE_SIZE - offset;
 		size_t copied;
 
+		if (copy > len)
+			copy = len;
+
 		if (vm_write) {
 			if (copy > iov_iter_count(iter))
 				copy = iov_iter_count(iter);
@@ -121,6 +123,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
 	while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) {
 		int nr_pages_to_copy;
 		int pages_pinned;
+		size_t n;
 		nr_pages_to_copy = min(nr_pages - nr_pages_copied,
 				       max_pages_per_loop);
 
@@ -134,18 +137,21 @@ static int process_vm_rw_single_vec(unsigned long addr,
 		if (pages_pinned <= 0)
 			return -EFAULT;
 
+		n = pages_pinned * PAGE_SIZE - start_offset;
+		if (n > len)
+			n = len;
+
 		rc = process_vm_rw_pages(process_pages,
-					 start_offset, len, iter,
-					 vm_write, pages_pinned,
+					 start_offset, n, iter,
+					 vm_write,
 					 &bytes_copied_loop);
+		len -= n;
 		start_offset = 0;
 		*bytes_copied += bytes_copied_loop;
-		len -= bytes_copied_loop;
 		nr_pages_copied += pages_pinned;
 		pa += pages_pinned * PAGE_SIZE;
 		while (pages_pinned)
 			put_page(process_pages[--pages_pinned]);
-
 		if (rc < 0)
 			break;
 	}
-- 
cgit v0.10.2


From 9acc1a0f9af244826de0eea2c38b78410eb0df6d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 13:15:28 -0500
Subject: process_vm_access: don't bother with returning the amounts of bytes
 copied

we can calculate that in the caller just fine, TYVM

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 83252d7..15e4c2f 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -43,13 +43,10 @@ static int process_vm_rw_pages(struct page **pages,
 			       unsigned offset,
 			       size_t len,
 			       struct iov_iter *iter,
-			       int vm_write,
-			       ssize_t *bytes_copied)
+			       int vm_write)
 {
-	*bytes_copied = 0;
-
 	/* Do the copy for each page */
-	while (iov_iter_count(iter) && len) {
+	while (len && iov_iter_count(iter)) {
 		struct page *page = *pages++;
 		size_t copy = PAGE_SIZE - offset;
 		size_t copied;
@@ -67,7 +64,6 @@ static int process_vm_rw_pages(struct page **pages,
 		} else {
 			copied = copy_page_to_iter(page, offset, copy, iter);
 		}
-		*bytes_copied += copied;
 		len -= copied;
 		if (copied < copy && iov_iter_count(iter))
 			return -EFAULT;
@@ -101,20 +97,16 @@ static int process_vm_rw_single_vec(unsigned long addr,
 				    struct page **process_pages,
 				    struct mm_struct *mm,
 				    struct task_struct *task,
-				    int vm_write,
-				    ssize_t *bytes_copied)
+				    int vm_write)
 {
 	unsigned long pa = addr & PAGE_MASK;
 	unsigned long start_offset = addr - pa;
 	unsigned long nr_pages;
-	ssize_t bytes_copied_loop;
 	ssize_t rc = 0;
 	unsigned long nr_pages_copied = 0;
 	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
 		/ sizeof(struct pages *);
 
-	*bytes_copied = 0;
-
 	/* Work out address and page range required */
 	if (len == 0)
 		return 0;
@@ -143,11 +135,9 @@ static int process_vm_rw_single_vec(unsigned long addr,
 
 		rc = process_vm_rw_pages(process_pages,
 					 start_offset, n, iter,
-					 vm_write,
-					 &bytes_copied_loop);
+					 vm_write);
 		len -= n;
 		start_offset = 0;
-		*bytes_copied += bytes_copied_loop;
 		nr_pages_copied += pages_pinned;
 		pa += pages_pinned * PAGE_SIZE;
 		while (pages_pinned)
@@ -187,11 +177,10 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 	struct mm_struct *mm;
 	unsigned long i;
 	ssize_t rc = 0;
-	ssize_t bytes_copied_loop;
-	ssize_t bytes_copied = 0;
 	unsigned long nr_pages = 0;
 	unsigned long nr_pages_iov;
 	ssize_t iov_len;
+	size_t total_len = iov_iter_count(iter);
 
 	/*
 	 * Work out how many pages of struct pages we're going to need
@@ -245,24 +234,20 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 		goto put_task_struct;
 	}
 
-	for (i = 0; i < riovcnt && iov_iter_count(iter); i++) {
+	for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
 		rc = process_vm_rw_single_vec(
 			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
-			iter, process_pages, mm, task, vm_write,
-			&bytes_copied_loop);
-		bytes_copied += bytes_copied_loop;
-		if (rc != 0) {
-			/* If we have managed to copy any data at all then
-			   we return the number of bytes copied. Otherwise
-			   we return the error code */
-			if (bytes_copied)
-				rc = bytes_copied;
-			goto put_mm;
-		}
-	}
+			iter, process_pages, mm, task, vm_write);
+
+	/* copied = space before - space after */
+	total_len -= iov_iter_count(iter);
+
+	/* If we have managed to copy any data at all then
+	   we return the number of bytes copied. Otherwise
+	   we return the error code */
+	if (total_len)
+		rc = total_len;
 
-	rc = bytes_copied;
-put_mm:
 	mmput(mm);
 
 put_task_struct:
-- 
cgit v0.10.2


From 4bafbec7bf60ed56ccbb36a96091bdbd162f075d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 13:25:32 -0500
Subject: process_vm_access: tidy up a bit

saner variable names, update linuxdoc comments, etc.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 15e4c2f..d6bd3fd 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -23,20 +23,11 @@
 
 /**
  * process_vm_rw_pages - read/write pages from task specified
- * @task: task to read/write from
- * @mm: mm for task
- * @process_pages: struct pages area that can store at least
- *  nr_pages_to_copy struct page pointers
- * @pa: address of page in task to start copying from/to
+ * @pages: array of pointers to pages we want to copy
  * @start_offset: offset in page to start copying from/to
  * @len: number of bytes to copy
- * @lvec: iovec array specifying where to copy to/from
- * @lvec_cnt: number of elements in iovec array
- * @lvec_current: index in iovec array we are up to
- * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @iter: where to copy to/from locally
  * @vm_write: 0 means copy from, 1 means copy to
- * @nr_pages_to_copy: number of pages to copy
- * @bytes_copied: returns number of bytes successfully copied
  * Returns 0 on success, error code otherwise
  */
 static int process_vm_rw_pages(struct page **pages,
@@ -79,16 +70,12 @@ static int process_vm_rw_pages(struct page **pages,
  * process_vm_rw_single_vec - read/write pages from task specified
  * @addr: start memory address of target process
  * @len: size of area to copy to/from
- * @lvec: iovec array specifying where to copy to/from locally
- * @lvec_cnt: number of elements in iovec array
- * @lvec_current: index in iovec array we are up to
- * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @iter: where to copy to/from locally
  * @process_pages: struct pages area that can store at least
  *  nr_pages_to_copy struct page pointers
  * @mm: mm for task
  * @task: task to read/write from
  * @vm_write: 0 means copy from, 1 means copy to
- * @bytes_copied: returns number of bytes successfully copied
  * Returns 0 on success or on failure error code
  */
 static int process_vm_rw_single_vec(unsigned long addr,
@@ -103,7 +90,6 @@ static int process_vm_rw_single_vec(unsigned long addr,
 	unsigned long start_offset = addr - pa;
 	unsigned long nr_pages;
 	ssize_t rc = 0;
-	unsigned long nr_pages_copied = 0;
 	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
 		/ sizeof(struct pages *);
 
@@ -112,38 +98,32 @@ static int process_vm_rw_single_vec(unsigned long addr,
 		return 0;
 	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
 
-	while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) {
-		int nr_pages_to_copy;
-		int pages_pinned;
-		size_t n;
-		nr_pages_to_copy = min(nr_pages - nr_pages_copied,
-				       max_pages_per_loop);
+	while (!rc && nr_pages && iov_iter_count(iter)) {
+		int pages = min(nr_pages, max_pages_per_loop);
+		size_t bytes;
 
 		/* Get the pages we're interested in */
 		down_read(&mm->mmap_sem);
-		pages_pinned = get_user_pages(task, mm, pa,
-					      nr_pages_to_copy,
-					      vm_write, 0, process_pages, NULL);
+		pages = get_user_pages(task, mm, pa, pages,
+				      vm_write, 0, process_pages, NULL);
 		up_read(&mm->mmap_sem);
 
-		if (pages_pinned <= 0)
+		if (pages <= 0)
 			return -EFAULT;
 
-		n = pages_pinned * PAGE_SIZE - start_offset;
-		if (n > len)
-			n = len;
+		bytes = pages * PAGE_SIZE - start_offset;
+		if (bytes > len)
+			bytes = len;
 
 		rc = process_vm_rw_pages(process_pages,
-					 start_offset, n, iter,
+					 start_offset, bytes, iter,
 					 vm_write);
-		len -= n;
+		len -= bytes;
 		start_offset = 0;
-		nr_pages_copied += pages_pinned;
-		pa += pages_pinned * PAGE_SIZE;
-		while (pages_pinned)
-			put_page(process_pages[--pages_pinned]);
-		if (rc < 0)
-			break;
+		nr_pages -= pages;
+		pa += pages * PAGE_SIZE;
+		while (pages)
+			put_page(process_pages[--pages]);
 	}
 
 	return rc;
@@ -156,8 +136,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
 /**
  * process_vm_rw_core - core of reading/writing pages from task specified
  * @pid: PID of process to read/write from/to
- * @lvec: iovec array specifying where to copy to/from locally
- * @liovcnt: size of lvec array
+ * @iter: where to copy to/from locally
  * @rvec: iovec array specifying where to copy to/from in the other process
  * @riovcnt: size of rvec array
  * @flags: currently unused
-- 
cgit v0.10.2


From 4f18cd317a118c28482f97303600a2fe2ada6c79 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Feb 2014 19:11:33 -0500
Subject: take iov_iter stuff to mm/iov_iter.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/Makefile b/mm/Makefile
index 310c90a..178a434 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o balloon_compaction.o \
-			   interval_tree.o list_lru.o $(mmu-y)
+			   interval_tree.o list_lru.o iov_iter.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/filemap.c b/mm/filemap.c
index a16eb2c..c4730ef 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1085,84 +1085,6 @@ static void shrink_readahead_size_eio(struct file *filp,
 	ra->ra_pages /= 4;
 }
 
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-			 struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-	void *kaddr, *from;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	if (!fault_in_pages_writeable(buf, copy)) {
-		kaddr = kmap_atomic(page);
-		from = kaddr + offset;
-
-		/* first chunk, usually the only one */
-		left = __copy_to_user_inatomic(buf, from, copy);
-		copy -= left;
-		skip += copy;
-		from += copy;
-		bytes -= copy;
-
-		while (unlikely(!left && bytes)) {
-			iov++;
-			buf = iov->iov_base;
-			copy = min(bytes, iov->iov_len);
-			left = __copy_to_user_inatomic(buf, from, copy);
-			copy -= left;
-			skip = copy;
-			from += copy;
-			bytes -= copy;
-		}
-		if (likely(!bytes)) {
-			kunmap_atomic(kaddr);
-			goto done;
-		}
-		offset = from - kaddr;
-		buf += copy;
-		kunmap_atomic(kaddr);
-		copy = min(bytes, iov->iov_len - skip);
-	}
-	/* Too bad - revert to non-atomic kmap */
-	kaddr = kmap(page);
-	from = kaddr + offset;
-	left = __copy_to_user(buf, from, copy);
-	copy -= left;
-	skip += copy;
-	from += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_to_user(buf, from, copy);
-		copy -= left;
-		skip = copy;
-		from += copy;
-		bytes -= copy;
-	}
-	kunmap(page);
-done:
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_to_iter);
-
 /**
  * do_generic_file_read - generic file read routine
  * @filp:	the file to read
@@ -1957,149 +1879,6 @@ struct page *read_cache_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(read_cache_page);
 
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	size_t copied = 0, left = 0;
-
-	while (bytes) {
-		char __user *buf = iov->iov_base + base;
-		int copy = min(bytes, iov->iov_len - base);
-
-		base = 0;
-		left = __copy_from_user_inatomic(vaddr, buf, copy);
-		copied += copy;
-		bytes -= copy;
-		vaddr += copy;
-		iov++;
-
-		if (unlikely(left))
-			break;
-	}
-	return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap_atomic(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap_atomic(kaddr);
-
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap(page);
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
-	BUG_ON(i->count < bytes);
-
-	if (likely(i->nr_segs == 1)) {
-		i->iov_offset += bytes;
-		i->count -= bytes;
-	} else {
-		const struct iovec *iov = i->iov;
-		size_t base = i->iov_offset;
-		unsigned long nr_segs = i->nr_segs;
-
-		/*
-		 * The !iov->iov_len check ensures we skip over unlikely
-		 * zero-length segments (without overruning the iovec).
-		 */
-		while (bytes || unlikely(i->count && !iov->iov_len)) {
-			int copy;
-
-			copy = min(bytes, iov->iov_len - base);
-			BUG_ON(!i->count || i->count < copy);
-			i->count -= copy;
-			bytes -= copy;
-			base += copy;
-			if (iov->iov_len == base) {
-				iov++;
-				nr_segs--;
-				base = 0;
-			}
-		}
-		i->iov = iov;
-		i->iov_offset = base;
-		i->nr_segs = nr_segs;
-	}
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
-	char __user *buf = i->iov->iov_base + i->iov_offset;
-	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
-	return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
-	const struct iovec *iov = i->iov;
-	if (i->nr_segs == 1)
-		return i->count;
-	else
-		return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
 /*
  * Performs necessary checks before doing a write
  *
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
new file mode 100644
index 0000000..10e46cd
--- /dev/null
+++ b/mm/iov_iter.c
@@ -0,0 +1,224 @@
+#include <linux/export.h>
+#include <linux/uio.h>
+#include <linux/pagemap.h>
+
+size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+			 struct iov_iter *i)
+{
+	size_t skip, copy, left, wanted;
+	const struct iovec *iov;
+	char __user *buf;
+	void *kaddr, *from;
+
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	wanted = bytes;
+	iov = i->iov;
+	skip = i->iov_offset;
+	buf = iov->iov_base + skip;
+	copy = min(bytes, iov->iov_len - skip);
+
+	if (!fault_in_pages_writeable(buf, copy)) {
+		kaddr = kmap_atomic(page);
+		from = kaddr + offset;
+
+		/* first chunk, usually the only one */
+		left = __copy_to_user_inatomic(buf, from, copy);
+		copy -= left;
+		skip += copy;
+		from += copy;
+		bytes -= copy;
+
+		while (unlikely(!left && bytes)) {
+			iov++;
+			buf = iov->iov_base;
+			copy = min(bytes, iov->iov_len);
+			left = __copy_to_user_inatomic(buf, from, copy);
+			copy -= left;
+			skip = copy;
+			from += copy;
+			bytes -= copy;
+		}
+		if (likely(!bytes)) {
+			kunmap_atomic(kaddr);
+			goto done;
+		}
+		offset = from - kaddr;
+		buf += copy;
+		kunmap_atomic(kaddr);
+		copy = min(bytes, iov->iov_len - skip);
+	}
+	/* Too bad - revert to non-atomic kmap */
+	kaddr = kmap(page);
+	from = kaddr + offset;
+	left = __copy_to_user(buf, from, copy);
+	copy -= left;
+	skip += copy;
+	from += copy;
+	bytes -= copy;
+	while (unlikely(!left && bytes)) {
+		iov++;
+		buf = iov->iov_base;
+		copy = min(bytes, iov->iov_len);
+		left = __copy_to_user(buf, from, copy);
+		copy -= left;
+		skip = copy;
+		from += copy;
+		bytes -= copy;
+	}
+	kunmap(page);
+done:
+	i->count -= wanted - bytes;
+	i->nr_segs -= iov - i->iov;
+	i->iov = iov;
+	i->iov_offset = skip;
+	return wanted - bytes;
+}
+EXPORT_SYMBOL(copy_page_to_iter);
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);
+
+		base = 0;
+		left = __copy_from_user_inatomic(vaddr, buf, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left))
+			break;
+	}
+	return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap_atomic(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = i->iov->iov_base + i->iov_offset;
+		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr);
+
+	return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+/*
+ * This has the same sideeffects and return value as
+ * iov_iter_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = i->iov->iov_base + i->iov_offset;
+		left = __copy_from_user(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user);
+
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+		i->count -= bytes;
+	} else {
+		const struct iovec *iov = i->iov;
+		size_t base = i->iov_offset;
+		unsigned long nr_segs = i->nr_segs;
+
+		/*
+		 * The !iov->iov_len check ensures we skip over unlikely
+		 * zero-length segments (without overruning the iovec).
+		 */
+		while (bytes || unlikely(i->count && !iov->iov_len)) {
+			int copy;
+
+			copy = min(bytes, iov->iov_len - base);
+			BUG_ON(!i->count || i->count < copy);
+			i->count -= copy;
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				iov++;
+				nr_segs--;
+				base = 0;
+			}
+		}
+		i->iov = iov;
+		i->iov_offset = base;
+		i->nr_segs = nr_segs;
+	}
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	char __user *buf = i->iov->iov_base + i->iov_offset;
+	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+	return fault_in_pages_readable(buf, bytes);
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+	const struct iovec *iov = i->iov;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, iov->iov_len - i->iov_offset);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
-- 
cgit v0.10.2


From 66f5dcef137beef14560f1f6eba6717028b89089 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 8 Feb 2014 13:27:03 -0500
Subject: ocfs2: don't open-code kernel_sendmsg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2cd2406..68d80c3 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -940,33 +940,21 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
 			      size_t veclen, size_t total)
 {
 	int ret;
-	mm_segment_t oldfs;
-	struct msghdr msg = {
-		.msg_iov = (struct iovec *)vec,
-		.msg_iovlen = veclen,
-	};
+	struct msghdr msg;
 
 	if (sock == NULL) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_sendmsg(sock, &msg, total);
-	set_fs(oldfs);
-	if (ret != total) {
-		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
-		     total);
-		if (ret >= 0)
-			ret = -EPIPE; /* should be smarter, I bet */
-		goto out;
-	}
-
-	ret = 0;
+	ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+	if (likely(ret == total))
+		return 0;
+	mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+	if (ret >= 0)
+		ret = -EPIPE; /* should be smarter, I bet */
 out:
-	if (ret < 0)
-		mlog(0, "returning error: %d\n", ret);
+	mlog(0, "returning error: %d\n", ret);
 	return ret;
 }
 
-- 
cgit v0.10.2


From 480f40de91e74190281309fd0ecb2d0414603c2e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 8 Feb 2014 13:44:43 -0500
Subject: lustre: switch to kernel_sendmsg()

(casts are due to misannotations in lustre; it uses iovec where kvec would be
correct type; too much noise to properly annotate right now).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
index 80141aa..c3b6716 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
@@ -99,16 +99,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 		struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
 		unsigned int    niov = tx->tx_niov;
 #endif
-		struct msghdr msg = {
-			.msg_name       = NULL,
-			.msg_namelen    = 0,
-			.msg_iov	= scratchiov,
-			.msg_iovlen     = niov,
-			.msg_control    = NULL,
-			.msg_controllen = 0,
-			.msg_flags      = MSG_DONTWAIT
-		};
-		mm_segment_t oldmm = get_fs();
+		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
 		int  i;
 
 		for (nob = i = 0; i < niov; i++) {
@@ -120,9 +111,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 		    nob < tx->tx_resid)
 			msg.msg_flags |= MSG_MORE;
 
-		set_fs (KERNEL_DS);
-		rc = sock_sendmsg(sock, &msg, nob);
-		set_fs (oldmm);
+		rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob);
 	}
 	return rc;
 }
@@ -174,16 +163,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 		struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
 		unsigned int  niov = tx->tx_nkiov;
 #endif
-		struct msghdr msg = {
-			.msg_name       = NULL,
-			.msg_namelen    = 0,
-			.msg_iov	= scratchiov,
-			.msg_iovlen     = niov,
-			.msg_control    = NULL,
-			.msg_controllen = 0,
-			.msg_flags      = MSG_DONTWAIT
-		};
-		mm_segment_t  oldmm = get_fs();
+		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
 		int	   i;
 
 		for (nob = i = 0; i < niov; i++) {
@@ -196,9 +176,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 		    nob < tx->tx_resid)
 			msg.msg_flags |= MSG_MORE;
 
-		set_fs (KERNEL_DS);
-		rc = sock_sendmsg(sock, &msg, nob);
-		set_fs (oldmm);
+		rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob);
 
 		for (i = 0; i < niov; i++)
 			kunmap(kiov[i].kiov_page);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
index e6069d7..9bd08d6 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
@@ -265,17 +265,11 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
 	 * empty enough to take the whole message immediately */
 
 	for (;;) {
-		struct iovec  iov = {
+		struct kvec  iov = {
 			.iov_base = buffer,
 			.iov_len  = nob
 		};
 		struct msghdr msg = {
-			.msg_name       = NULL,
-			.msg_namelen    = 0,
-			.msg_iov	= &iov,
-			.msg_iovlen     = 1,
-			.msg_control    = NULL,
-			.msg_controllen = 0,
 			.msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
 		};
 
@@ -297,11 +291,9 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
 			}
 		}
 
-		set_fs (KERNEL_DS);
 		then = jiffies;
-		rc = sock_sendmsg (sock, &msg, iov.iov_len);
+		rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
 		ticks -= jiffies - then;
-		set_fs (oldmm);
 
 		if (rc == nob)
 			return 0;
-- 
cgit v0.10.2


From 86d564c84c38b1ec06d9f2120d6a7373dcaeff0c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 8 Feb 2014 20:42:52 -0500
Subject: constify blk_rq_map_user_iov() and friends

sg_iovec array passed to it can be const

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/block/blk-map.c b/block/blk-map.c
index ae4ae10..86d9377 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -188,7 +188,7 @@ EXPORT_SYMBOL(blk_rq_map_user);
  *    unmapping.
  */
 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
-			struct rq_map_data *map_data, struct sg_iovec *iov,
+			struct rq_map_data *map_data, const struct sg_iovec *iov,
 			int iov_count, unsigned int len, gfp_t gfp_mask)
 {
 	struct bio *bio;
diff --git a/fs/bio.c b/fs/bio.c
index 8754e7b..7065837 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1003,7 +1003,7 @@ struct bio_map_data {
 };
 
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     struct sg_iovec *iov, int iov_count,
+			     const struct sg_iovec *iov, int iov_count,
 			     int is_our_pages)
 {
 	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
@@ -1023,7 +1023,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs,
 		       sizeof(struct sg_iovec) * iov_count, gfp_mask);
 }
 
-static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
+static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
 			  int to_user, int from_user, int do_free_page)
 {
 	int ret = 0, i;
@@ -1121,7 +1121,7 @@ EXPORT_SYMBOL(bio_uncopy_user);
  */
 struct bio *bio_copy_user_iov(struct request_queue *q,
 			      struct rq_map_data *map_data,
-			      struct sg_iovec *iov, int iov_count,
+			      const struct sg_iovec *iov, int iov_count,
 			      int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
@@ -1260,7 +1260,7 @@ EXPORT_SYMBOL(bio_copy_user);
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
-				      struct sg_iovec *iov, int iov_count,
+				      const struct sg_iovec *iov, int iov_count,
 				      int write_to_vm, gfp_t gfp_mask)
 {
 	int i, j;
@@ -1408,7 +1408,7 @@ EXPORT_SYMBOL(bio_map_user);
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
-			     struct sg_iovec *iov, int iov_count,
+			     const struct sg_iovec *iov, int iov_count,
 			     int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio *bio;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5a4d39b..21e2720 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -388,7 +388,7 @@ struct sg_iovec;
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    struct block_device *,
-				    struct sg_iovec *, int, int, gfp_t);
+				    const struct sg_iovec *, int, int, gfp_t);
 extern void bio_unmap_user(struct bio *);
 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
 				gfp_t);
@@ -414,7 +414,8 @@ extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
 				 unsigned long, unsigned int, int, gfp_t);
 extern struct bio *bio_copy_user_iov(struct request_queue *,
-				     struct rq_map_data *, struct sg_iovec *,
+				     struct rq_map_data *,
+				     const struct sg_iovec *,
 				     int, int, gfp_t);
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4afa4f8..a639fd8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -823,8 +823,8 @@ extern int blk_rq_map_user(struct request_queue *, struct request *,
 extern int blk_rq_unmap_user(struct bio *);
 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
-			       struct rq_map_data *, struct sg_iovec *, int,
-			       unsigned int, gfp_t);
+			       struct rq_map_data *, const struct sg_iovec *,
+			       int, unsigned int, gfp_t);
 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
-- 
cgit v0.10.2


From f730c848affc05fb7262574b06e0cd7e1fa96096 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 8 Feb 2014 21:07:38 -0500
Subject: drbd: don't open-code kernel_recvmsg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d073305..1385714 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -468,24 +468,14 @@ static void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
 
 static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
 {
-	mm_segment_t oldfs;
 	struct kvec iov = {
 		.iov_base = buf,
 		.iov_len = size,
 	};
 	struct msghdr msg = {
-		.msg_iovlen = 1,
-		.msg_iov = (struct iovec *)&iov,
 		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
 	};
-	int rv;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
-	set_fs(oldfs);
-
-	return rv;
+	return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
 }
 
 static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
-- 
cgit v0.10.2


From 920220c11122952061f2d81a9f06ca077bb744d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 8 Feb 2014 21:09:07 -0500
Subject: ocfs2: don't open-code kernel_recvmsg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 68d80c3..ea63d64 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -916,24 +916,9 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
 
 static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
 {
-	int ret;
-	mm_segment_t oldfs;
-	struct kvec vec = {
-		.iov_len = len,
-		.iov_base = data,
-	};
-	struct msghdr msg = {
-		.msg_iovlen = 1,
-		.msg_iov = (struct iovec *)&vec,
-       		.msg_flags = MSG_DONTWAIT,
-	};
-
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
-	set_fs(oldfs);
-
-	return ret;
+	struct kvec vec = { .iov_len = len, .iov_base = data, };
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+	return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
 }
 
 static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
-- 
cgit v0.10.2


From b2f42cfeeb0452ca3e004c3014cda99b53554d47 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 9 Feb 2014 12:07:46 -0500
Subject: lustre: don't open-code kernel_recvmsg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
index c3b6716..733c79e 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
@@ -215,15 +215,8 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
 #endif
 	struct iovec *iov = conn->ksnc_rx_iov;
 	struct msghdr msg = {
-		.msg_name       = NULL,
-		.msg_namelen    = 0,
-		.msg_iov	= scratchiov,
-		.msg_iovlen     = niov,
-		.msg_control    = NULL,
-		.msg_controllen = 0,
 		.msg_flags      = 0
 	};
-	mm_segment_t oldmm = get_fs();
 	int	  nob;
 	int	  i;
 	int	  rc;
@@ -241,10 +234,8 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
 	}
 	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
 
-	set_fs (KERNEL_DS);
-	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
-	/* NB this is just a boolean..........................^ */
-	set_fs (oldmm);
+	rc = kernel_recvmsg(conn->ksnc_sock, &msg,
+		(struct kvec *)scratchiov, niov, nob, MSG_DONTWAIT);
 
 	saved_csum = 0;
 	if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
@@ -333,14 +324,8 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 #endif
 	lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
 	struct msghdr msg = {
-		.msg_name       = NULL,
-		.msg_namelen    = 0,
-		.msg_iov	= scratchiov,
-		.msg_control    = NULL,
-		.msg_controllen = 0,
 		.msg_flags      = 0
 	};
-	mm_segment_t oldmm = get_fs();
 	int	  nob;
 	int	  i;
 	int	  rc;
@@ -348,12 +333,13 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 	void	*addr;
 	int	  sum;
 	int	  fragnob;
+	int n;
 
 	/* NB we can't trust socket ops to either consume our iovs
 	 * or leave them alone. */
 	if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
 		nob = scratchiov[0].iov_len;
-		msg.msg_iovlen = 1;
+		n = 1;
 
 	} else {
 		for (nob = i = 0; i < niov; i++) {
@@ -361,15 +347,13 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
 						 kiov[i].kiov_offset;
 		}
-		msg.msg_iovlen = niov;
+		n = niov;
 	}
 
 	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
 
-	set_fs (KERNEL_DS);
-	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
-	/* NB this is just a boolean.......................^ */
-	set_fs (oldmm);
+	rc = kernel_recvmsg(conn->ksnc_sock, &msg,
+			(struct kvec *)scratchiov, n, nob, MSG_DONTWAIT);
 
 	if (conn->ksnc_msg.ksm_csum != 0) {
 		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
index 9bd08d6..7539fe1 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
@@ -330,17 +330,11 @@ libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
 	LASSERT (ticks > 0);
 
 	for (;;) {
-		struct iovec  iov = {
+		struct kvec  iov = {
 			.iov_base = buffer,
 			.iov_len  = nob
 		};
 		struct msghdr msg = {
-			.msg_name       = NULL,
-			.msg_namelen    = 0,
-			.msg_iov	= &iov,
-			.msg_iovlen     = 1,
-			.msg_control    = NULL,
-			.msg_controllen = 0,
 			.msg_flags      = 0
 		};
 
@@ -359,11 +353,9 @@ libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
 			return rc;
 		}
 
-		set_fs(KERNEL_DS);
 		then = jiffies;
-		rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
+		rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
 		ticks -= jiffies - then;
-		set_fs(oldmm);
 
 		if (rc < 0)
 			return rc;
-- 
cgit v0.10.2


From 41fc56d573c35a212688b12b48af8c303f9bb6d2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 9 Feb 2014 12:58:52 -0500
Subject: kill the 4th argument of __generic_file_aio_write()

It's always equal to &iocb->ki_pos, where iocb is the value of the 1st
argument.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e86823..764bd3b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1518,7 +1518,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
-	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	ret = __generic_file_aio_write(iocb, iov, nr_segs);
 	if (ret > 0) {
 		ssize_t err;
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a50739..d564bcf 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -146,7 +146,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
 			overwrite = 1;
 	}
 
-	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	ret = __generic_file_aio_write(iocb, iov, nr_segs);
 	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0) {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1037637..d2c170f 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	} else
 		up_write(&iinfo->i_data_sem);
 
-	retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	retval = __generic_file_aio_write(iocb, iov, nr_segs);
 	mutex_unlock(&inode->i_mutex);
 
 	if (retval > 0) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a5b174..e677d1e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2392,8 +2392,7 @@ extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
 		unsigned long size, pgoff_t pgoff);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
-extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
-		loff_t *);
+extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, loff_t *, size_t, size_t);
diff --git a/mm/filemap.c b/mm/filemap.c
index c4730ef..ce2246d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2222,14 +2222,14 @@ EXPORT_SYMBOL(generic_file_buffered_write);
  * avoid syncing under i_mutex.
  */
 ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-				 unsigned long nr_segs, loff_t *ppos)
+				 unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space * mapping = file->f_mapping;
 	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
-	loff_t		pos;
+	loff_t		pos = iocb->ki_pos;
 	ssize_t		written;
 	ssize_t		err;
 
@@ -2239,7 +2239,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		return err;
 
 	count = ocount;
-	pos = *ppos;
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
@@ -2266,7 +2265,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		ssize_t written_buffered;
 
 		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-							ppos, count, ocount);
+							&iocb->ki_pos, count, ocount);
 		if (written < 0 || written == count)
 			goto out;
 		/*
@@ -2276,7 +2275,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		pos += written;
 		count -= written;
 		written_buffered = generic_file_buffered_write(iocb, iov,
-						nr_segs, pos, ppos, count,
+						nr_segs, pos, &iocb->ki_pos, count,
 						written);
 		/*
 		 * If generic_file_buffered_write() retuned a synchronous error
@@ -2310,7 +2309,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 	} else {
 		written = generic_file_buffered_write(iocb, iov, nr_segs,
-				pos, ppos, count, written);
+				pos, &iocb->ki_pos, count, written);
 	}
 out:
 	current->backing_dev_info = NULL;
@@ -2339,7 +2338,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
-	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	ret = __generic_file_aio_write(iocb, iov, nr_segs);
 	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0) {
-- 
cgit v0.10.2


From fcacafd269adc88f41b68cb77a3f957a66563428 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 9 Feb 2014 13:37:49 -0500
Subject: kill the 5th argument of generic_file_buffered_write()

same story - it's &iocb->ki_pos in all cases

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 09c7afe..a798db5 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -978,8 +978,7 @@ retry_snap:
 		 * can not run at the same time
 		 */
 		written = generic_file_buffered_write(iocb, iov, nr_segs,
-						      pos, &iocb->ki_pos,
-						      count, 0);
+						      pos, count, 0);
 		mutex_unlock(&inode->i_mutex);
 	}
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 51632c4..89099cc 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2383,7 +2383,7 @@ relock:
 	} else {
 		current->backing_dev_info = file->f_mapping->backing_dev_info;
 		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
-						      ppos, count, 0);
+						      count, 0);
 		current->backing_dev_info = NULL;
 	}
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 64b48ea..175ce58 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -738,7 +738,7 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, 0);
+			pos, count, 0);
 
 	/*
 	 * If we just got an ENOSPC, try to write back all dirty inodes to
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e677d1e..830e374 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2397,7 +2397,7 @@ extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsi
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, loff_t *, size_t, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
-		unsigned long, loff_t, loff_t *, size_t, ssize_t);
+		unsigned long, loff_t, size_t, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
 extern int generic_segment_checks(const struct iovec *iov,
diff --git a/mm/filemap.c b/mm/filemap.c
index ce2246d..9d515a1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2183,7 +2183,7 @@ again:
 
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		unsigned long nr_segs, loff_t pos,
 		size_t count, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
@@ -2195,7 +2195,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 
 	if (likely(status >= 0)) {
 		written += status;
-		*ppos = pos + status;
+		iocb->ki_pos = pos + status;
   	}
 	
 	return written ? written : status;
@@ -2275,8 +2275,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		pos += written;
 		count -= written;
 		written_buffered = generic_file_buffered_write(iocb, iov,
-						nr_segs, pos, &iocb->ki_pos, count,
-						written);
+						nr_segs, pos, count, written);
 		/*
 		 * If generic_file_buffered_write() retuned a synchronous error
 		 * then we want to return the number of bytes which were
@@ -2309,7 +2308,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 	} else {
 		written = generic_file_buffered_write(iocb, iov, nr_segs,
-				pos, &iocb->ki_pos, count, written);
+				pos, count, written);
 	}
 out:
 	current->backing_dev_info = NULL;
-- 
cgit v0.10.2


From 867c4f9329e1bf7d0967bec761f033373f72b55e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Feb 2014 19:31:06 -0500
Subject: btrfs_file_aio_write(): get rid of ppos

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 34e0962..f6032a2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1631,7 +1631,7 @@ again:
 static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs, loff_t pos,
-				    loff_t *ppos, size_t count, size_t ocount)
+				    size_t count, size_t ocount)
 {
 	struct file *file = iocb->ki_filp;
 	struct iov_iter i;
@@ -1640,7 +1640,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	loff_t endbyte;
 	int err;
 
-	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
+	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, &iocb->ki_pos,
 					    count, ocount);
 
 	if (written < 0 || written == count)
@@ -1659,7 +1659,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	if (err)
 		goto out;
 	written += written_buffered;
-	*ppos = pos + written_buffered;
+	iocb->ki_pos = pos + written_buffered;
 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
 				 endbyte >> PAGE_CACHE_SHIFT);
 out:
@@ -1691,7 +1691,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	loff_t *ppos = &iocb->ki_pos;
 	u64 start_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
@@ -1759,7 +1758,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
-						   pos, ppos, count, ocount);
+						   pos, count, ocount);
 	} else {
 		struct iov_iter i;
 
@@ -1767,7 +1766,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 		num_written = __btrfs_buffered_write(file, &i, pos);
 		if (num_written > 0)
-			*ppos = pos + num_written;
+			iocb->ki_pos = pos + num_written;
 	}
 
 	mutex_unlock(&inode->i_mutex);
-- 
cgit v0.10.2


From 5cb6c6c7eb1ed24744b41fad47d9a25b72207098 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Feb 2014 20:58:20 -0500
Subject: generic_file_direct_write(): get rid of ppos argument

always equal to &iocb->ki_pos.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6032a2..8ed4b16 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1640,7 +1640,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	loff_t endbyte;
 	int err;
 
-	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, &iocb->ki_pos,
+	written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
 					    count, ocount);
 
 	if (written < 0 || written == count)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a91d3b4..fd06d1e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1143,8 +1143,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		goto out;
 
 	if (file->f_flags & O_DIRECT) {
-		written = generic_file_direct_write(iocb, iov, &nr_segs,
-						    pos, &iocb->ki_pos,
+		written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 
 						    count, ocount);
 		if (written < 0 || written == count)
 			goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89099cc..77b8a74 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2375,7 +2375,7 @@ relock:
 
 	if (direct_io) {
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
-						    ppos, count, ocount);
+						    count, ocount);
 		if (written < 0) {
 			ret = written;
 			goto out_dio;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 175ce58..e593554 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -699,7 +699,7 @@ xfs_file_dio_aio_write(
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_direct_write(iocb, iovp,
-			&nr_segs, pos, &iocb->ki_pos, count, ocount);
+			&nr_segs, pos, count, ocount);
 
 out:
 	xfs_rw_iunlock(ip, iolock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 830e374..9dfd7c7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2395,7 +2395,7 @@ extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsig
 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
-		unsigned long *, loff_t, loff_t *, size_t, size_t);
+		unsigned long *, loff_t, size_t, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t, size_t, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9d515a1..93e9cf5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1985,7 +1985,7 @@ EXPORT_SYMBOL(pagecache_write_end);
 
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+		unsigned long *nr_segs, loff_t pos,
 		size_t count, size_t ocount)
 {
 	struct file	*file = iocb->ki_filp;
@@ -2046,7 +2046,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 			i_size_write(inode, pos);
 			mark_inode_dirty(inode);
 		}
-		*ppos = pos;
+		iocb->ki_pos = pos;
 	}
 out:
 	return written;
@@ -2265,7 +2265,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		ssize_t written_buffered;
 
 		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-							&iocb->ki_pos, count, ocount);
+							count, ocount);
 		if (written < 0 || written == count)
 			goto out;
 		/*
-- 
cgit v0.10.2


From 3b93f911d55cf8b3540f82fac4eeddfee7b5de0c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Feb 2014 21:34:08 -0500
Subject: export generic_perform_write(), start getting rid of
 generic_file_buffer_write()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9dfd7c7..87be51a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -48,6 +48,7 @@ struct cred;
 struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
+struct iov_iter;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2396,6 +2397,7 @@ extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, un
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, size_t, size_t);
+extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t, size_t, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
diff --git a/mm/filemap.c b/mm/filemap.c
index 93e9cf5..09bfc9b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2092,7 +2092,7 @@ found:
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
 
-static ssize_t generic_perform_write(struct file *file,
+ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
 	struct address_space *mapping = file->f_mapping;
@@ -2180,6 +2180,7 @@ again:
 
 	return written ? written : status;
 }
+EXPORT_SYMBOL(generic_perform_write);
 
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
@@ -2230,8 +2231,10 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
 	loff_t		pos = iocb->ki_pos;
-	ssize_t		written;
+	ssize_t		written = 0;
 	ssize_t		err;
+	ssize_t		status;
+	struct iov_iter from;
 
 	ocount = 0;
 	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
@@ -2242,8 +2245,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
-	written = 0;
-
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
 		goto out;
@@ -2259,44 +2260,47 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (err)
 		goto out;
 
+	iov_iter_init(&from, iov, nr_segs, count, 0);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		loff_t endbyte;
-		ssize_t written_buffered;
 
-		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+		written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos,
 							count, ocount);
 		if (written < 0 || written == count)
 			goto out;
+		iov_iter_advance(&from, written);
+
 		/*
 		 * direct-io write to a hole: fall through to buffered I/O
 		 * for completing the rest of the request.
 		 */
 		pos += written;
 		count -= written;
-		written_buffered = generic_file_buffered_write(iocb, iov,
-						nr_segs, pos, count, written);
+
+		status = generic_perform_write(file, &from, pos);
 		/*
-		 * If generic_file_buffered_write() retuned a synchronous error
+		 * If generic_perform_write() returned a synchronous error
 		 * then we want to return the number of bytes which were
 		 * direct-written, or the error code if that was zero.  Note
 		 * that this differs from normal direct-io semantics, which
 		 * will return -EFOO even if some bytes were written.
 		 */
-		if (written_buffered < 0) {
-			err = written_buffered;
+		if (unlikely(status < 0) && !written) {
+			err = status;
 			goto out;
 		}
-
+		iocb->ki_pos = pos + status;
 		/*
 		 * We need to ensure that the page cache pages are written to
 		 * disk and invalidated to preserve the expected O_DIRECT
 		 * semantics.
 		 */
-		endbyte = pos + written_buffered - written - 1;
+		endbyte = pos + status - 1;
 		err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
 		if (err == 0) {
-			written = written_buffered;
+			written += status;
 			invalidate_mapping_pages(mapping,
 						 pos >> PAGE_CACHE_SHIFT,
 						 endbyte >> PAGE_CACHE_SHIFT);
@@ -2307,8 +2311,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 */
 		}
 	} else {
-		written = generic_file_buffered_write(iocb, iov, nr_segs,
-				pos, count, written);
+		written = generic_perform_write(file, &from, pos);
+		if (likely(written >= 0))
+			iocb->ki_pos = pos + written;
 	}
 out:
 	current->backing_dev_info = NULL;
-- 
cgit v0.10.2


From 0a64bc2c0474640a850febd5ac3abb8d45b32821 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Feb 2014 22:25:22 -0500
Subject: xfs_file_buffered_aio_write(): switch to generic_perform_write()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e593554..c3f4289 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -715,7 +715,7 @@ xfs_file_buffered_aio_write(
 	const struct iovec	*iovp,
 	unsigned long		nr_segs,
 	loff_t			pos,
-	size_t			ocount)
+	size_t			count)
 {
 	struct file		*file = iocb->ki_filp;
 	struct address_space	*mapping = file->f_mapping;
@@ -724,7 +724,7 @@ xfs_file_buffered_aio_write(
 	ssize_t			ret;
 	int			enospc = 0;
 	int			iolock = XFS_IOLOCK_EXCL;
-	size_t			count = ocount;
+	struct iov_iter		from;
 
 	xfs_rw_ilock(ip, iolock);
 
@@ -732,14 +732,15 @@ xfs_file_buffered_aio_write(
 	if (ret)
 		goto out;
 
+	iov_iter_init(&from, iovp, nr_segs, count, 0);
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
 
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
-	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, count, 0);
-
+	ret = generic_perform_write(file, &from, pos);
+	if (likely(ret >= 0))
+		iocb->ki_pos = pos + ret;
 	/*
 	 * If we just got an ENOSPC, try to write back all dirty inodes to
 	 * convert delalloc space to free up some of the excess reserved
-- 
cgit v0.10.2


From aec605f4297d044a2a0d9ceedcf7d33af679c884 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Feb 2014 22:28:43 -0500
Subject: ceph_aio_write(): switch to generic_perform_write()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a798db5..2d9088b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -970,6 +970,7 @@ retry_snap:
 			goto retry_snap;
 		}
 	} else {
+		struct iov_iter from;
 		/*
 		 * No need to acquire the i_truncate_mutex. Because
 		 * the MDS revokes Fwb caps before sending truncate
@@ -977,8 +978,10 @@ retry_snap:
 		 * are pending vmtruncate. So write and vmtruncate
 		 * can not run at the same time
 		 */
-		written = generic_file_buffered_write(iocb, iov, nr_segs,
-						      pos, count, 0);
+		iov_iter_init(&from, iov, nr_segs, count, 0);
+		written = generic_perform_write(file, &from, pos);
+		if (likely(written >= 0))
+			iocb->ki_pos = pos + written;
 		mutex_unlock(&inode->i_mutex);
 	}
 
-- 
cgit v0.10.2


From 58bfab395b302306baccbd1b5f38a9b890acb4e3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Feb 2014 22:34:52 -0500
Subject: ocfs2_file_aio_write(): switch to generic_perform_write()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b8a74..9c27adf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2381,9 +2381,12 @@ relock:
 			goto out_dio;
 		}
 	} else {
+		struct iov_iter from;
+		iov_iter_init(&from, iov, nr_segs, count, 0);
 		current->backing_dev_info = file->f_mapping->backing_dev_info;
-		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
-						      count, 0);
+		written = generic_perform_write(file, &from, *ppos);
+		if (likely(written >= 0))
+			iocb->ki_pos = *ppos + written;
 		current->backing_dev_info = NULL;
 	}
 
-- 
cgit v0.10.2


From ccad2365668f12336dd497d9039e8584af836070 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Feb 2014 22:36:48 -0500
Subject: kill generic_file_buffered_write()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 87be51a..c309b7a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2398,8 +2398,6 @@ extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsi
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, size_t, size_t);
 extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
-extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
-		unsigned long, loff_t, size_t, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
 extern int generic_segment_checks(const struct iovec *iov,
diff --git a/mm/filemap.c b/mm/filemap.c
index 09bfc9b..1a45514 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -76,7 +76,7 @@
  *  ->mmap_sem
  *    ->lock_page		(access_process_vm)
  *
- *  ->i_mutex			(generic_file_buffered_write)
+ *  ->i_mutex			(generic_perform_write)
  *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
  *
  *  bdi->wb.list_lock
@@ -2182,27 +2182,6 @@ again:
 }
 EXPORT_SYMBOL(generic_perform_write);
 
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos,
-		size_t count, ssize_t written)
-{
-	struct file *file = iocb->ki_filp;
-	ssize_t status;
-	struct iov_iter i;
-
-	iov_iter_init(&i, iov, nr_segs, count, written);
-	status = generic_perform_write(file, &i, pos);
-
-	if (likely(status >= 0)) {
-		written += status;
-		iocb->ki_pos = pos + status;
-  	}
-	
-	return written ? written : status;
-}
-EXPORT_SYMBOL(generic_file_buffered_write);
-
 /**
  * __generic_file_aio_write - write data to a file
  * @iocb:	IO state structure (file, offset, etc.)
-- 
cgit v0.10.2


From eab87235c0f5979503a547f836a93a3d327c4201 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Apr 2014 22:44:19 -0400
Subject: ceph_sync_{,direct_}write: fix an oops on ceph_osdc_new_request()
 failure

ceph_osdc_put_request(ERR_PTR(-error)) oopses.  What we want there
is break, not goto out.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2d9088b..359805b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -600,7 +600,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
 					    false);
 		if (IS_ERR(req)) {
 			ret = PTR_ERR(req);
-			goto out;
+			break;
 		}
 
 		num_pages = calc_pages_for(page_align, len);
@@ -718,7 +718,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
 					    false);
 		if (IS_ERR(req)) {
 			ret = PTR_ERR(req);
-			goto out;
+			break;
 		}
 
 		/*
-- 
cgit v0.10.2


From 19dfc1f5f2ef03a52aa30c8257c5745edef23f55 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Apr 2014 10:27:17 -0400
Subject: cifs: fix the race in cifs_writev()

O_APPEND handling there hadn't been completely fixed by Pavel's
patch; it checks the right value, but it's racy - we can't really
do that until i_mutex has been taken.

Fix by switching to __generic_file_aio_write() (open-coding
generic_file_aio_write(), actually) and pulling mutex_lock() above
inode_size_read().

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3443b8f..5bac276 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
 	ssize_t rc = -EACCES;
-	loff_t lock_pos = pos;
+	loff_t lock_pos = iocb->ki_pos;
 
-	if (file->f_flags & O_APPEND)
-		lock_pos = i_size_read(inode);
 	/*
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents writing.
 	 */
 	down_read(&cinode->lock_sem);
+	mutex_lock(&inode->i_mutex);
+	if (file->f_flags & O_APPEND)
+		lock_pos = i_size_read(inode);
 	if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
 				     server->vals->exclusive_lock_type, NULL,
-				     CIFS_WRITE_OP))
-		rc = generic_file_aio_write(iocb, iov, nr_segs, pos);
+				     CIFS_WRITE_OP)) {
+		rc = __generic_file_aio_write(iocb, iov, nr_segs);
+		mutex_unlock(&inode->i_mutex);
+
+		if (rc > 0) {
+			ssize_t err;
+
+			err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+			if (rc < 0)
+				rc = err;
+		}
+	} else {
+		mutex_unlock(&inode->i_mutex);
+	}
 	up_read(&cinode->lock_sem);
 	return rc;
 }
-- 
cgit v0.10.2


From a786c06d9f2719203c00b3d97b21f9a96980d0b5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 Apr 2014 12:01:03 -0400
Subject: missing bits of "splice: fix racy pipe->buffers uses"

that commit has fixed only the parts of that mess in fs/splice.c itself;
there had been more in several other ->splice_read() instances...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/relay.c b/kernel/relay.c
index 98833f6..7d38607 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1251,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
 	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
-	nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
+	nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
 
 	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
 		unsigned int this_len, this_end, private;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7511de3..27924ca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4410,7 +4410,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	trace_access_lock(iter->cpu_file);
 
 	/* Fill as many pages as possible. */
-	for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+	for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
 		spd.pages[i] = alloc_page(GFP_KERNEL);
 		if (!spd.pages[i])
 			break;
@@ -5267,7 +5267,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	trace_access_lock(iter->cpu_file);
 	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
 
-	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
+	for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 17d3799d..37400a1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1613,7 +1613,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, pipe->buffers);
+	nr_pages = min(req_pages, spd.nr_pages_max);
 
 	spd.nr_pages = find_get_pages_contig(mapping, index,
 						nr_pages, spd.pages);
-- 
cgit v0.10.2