Merge branch 'virtio-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull virtio changes from Rusty Russell: "New workflow: same git trees pulled by linux-next get sent straight to Linus. Git is awkward at shuffling patches compared with quilt or mq, but that doesn't happen often once things get into my -next branch." * 'virtio-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (24 commits) lguest: fix occasional crash in example launcher. virtio-blk: Disable callback in virtblk_done() virtio_mmio: Don't attempt to create empty virtqueues virtio_mmio: fix off by one error allocating queue drivers/virtio/virtio_pci.c: fix error return code virtio: don't crash when device is buggy virtio: remove CONFIG_VIRTIO_RING virtio: add help to CONFIG_VIRTIO option. virtio: support reserved vqs virtio: introduce an API to set affinity for a virtqueue virtio-ring: move queue_index to vring_virtqueue virtio_balloon: not EXPERIMENTAL any more. virtio-balloon: dependency fix virtio-blk: fix NULL checking in virtblk_alloc_req() virtio-blk: Add REQ_FLUSH and REQ_FUA support to bio path virtio-blk: Add bio-based IO path for virtio-blk virtio: console: fix error handling in init() function tools: Fix pthread flag for Makefile of trace-agent used by virtio-trace tools: Add guest trace agent as a user tool virtio/console: Allocate scatterlist according to the current pipe size ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-07 12:04:56 (GMT)
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-07 12:04:56 (GMT)
commit: dc92b1f9ab1e1665dbbc56911782358e7f9a49f9 (patch)
tree: 965ccb4a0f2c24a8b24adce415f6506246d07a90
parent: 5e090ed7af10729a396a25df43d69a236e789736 (diff)
parent: ca16f580a5db7e60bfafe59a50bb133bd3347491 (diff)
download: linux-dc92b1f9ab1e1665dbbc56911782358e7f9a49f9.tar.xz
25 files changed, 1391 insertions, 106 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f9acddd..c8af429 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -656,7 +656,6 @@ config S390_GUEST
 	depends on 64BIT && EXPERIMENTAL
 	select VIRTUALIZATION
 	select VIRTIO
-	select VIRTIO_RING
 	select VIRTIO_CONSOLE
 	help
 	  Enabling this option adds support for virtio based paravirtual device
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 6e121a2..7872a33 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -4,7 +4,6 @@ config LGUEST_GUEST
 	depends on X86_32
 	select VIRTUALIZATION
 	select VIRTIO
-	select VIRTIO_RING
 	select VIRTIO_CONSOLE
 	help
 	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c0bbeb4..0bdde8f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -14,6 +14,9 @@
 
 #define PART_BITS 4
 
+static bool use_bio;
+module_param(use_bio, bool, S_IRUGO);
+
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
@@ -23,6 +26,7 @@ struct virtio_blk
 {
 	struct virtio_device *vdev;
 	struct virtqueue *vq;
+	wait_queue_head_t queue_wait;
 
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
@@ -51,53 +55,244 @@ struct virtio_blk
 struct virtblk_req
 {
 	struct request *req;
+	struct bio *bio;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
+	struct work_struct work;
+	struct virtio_blk *vblk;
+	int flags;
 	u8 status;
+	struct scatterlist sg[];
+};
+
+enum {
+	VBLK_IS_FLUSH		= 1,
+	VBLK_REQ_FLUSH		= 2,
+	VBLK_REQ_DATA		= 4,
+	VBLK_REQ_FUA		= 8,
 };
 
-static void blk_done(struct virtqueue *vq)
+static inline int virtblk_result(struct virtblk_req *vbr)
+{
+	switch (vbr->status) {
+	case VIRTIO_BLK_S_OK:
+		return 0;
+	case VIRTIO_BLK_S_UNSUPP:
+		return -ENOTTY;
+	default:
+		return -EIO;
+	}
+}
+
+static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
+						    gfp_t gfp_mask)
 {
-	struct virtio_blk *vblk = vq->vdev->priv;
 	struct virtblk_req *vbr;
-	unsigned int len;
-	unsigned long flags;
 
-	spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
-	while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
-		int error;
+	vbr = mempool_alloc(vblk->pool, gfp_mask);
+	if (!vbr)
+		return NULL;
 
-		switch (vbr->status) {
-		case VIRTIO_BLK_S_OK:
-			error = 0;
-			break;
-		case VIRTIO_BLK_S_UNSUPP:
-			error = -ENOTTY;
-			break;
-		default:
-			error = -EIO;
+	vbr->vblk = vblk;
+	if (use_bio)
+		sg_init_table(vbr->sg, vblk->sg_elems);
+
+	return vbr;
+}
+
+static void virtblk_add_buf_wait(struct virtio_blk *vblk,
+				 struct virtblk_req *vbr,
+				 unsigned long out,
+				 unsigned long in)
+{
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		spin_lock_irq(vblk->disk->queue->queue_lock);
+		if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
+				      GFP_ATOMIC) < 0) {
+			spin_unlock_irq(vblk->disk->queue->queue_lock);
+			io_schedule();
+		} else {
+			virtqueue_kick(vblk->vq);
+			spin_unlock_irq(vblk->disk->queue->queue_lock);
 			break;
 		}
 
-		switch (vbr->req->cmd_type) {
-		case REQ_TYPE_BLOCK_PC:
-			vbr->req->resid_len = vbr->in_hdr.residual;
-			vbr->req->sense_len = vbr->in_hdr.sense_len;
-			vbr->req->errors = vbr->in_hdr.errors;
-			break;
-		case REQ_TYPE_SPECIAL:
-			vbr->req->errors = (error != 0);
-			break;
-		default:
-			break;
+	}
+
+	finish_wait(&vblk->queue_wait, &wait);
+}
+
+static inline void virtblk_add_req(struct virtblk_req *vbr,
+				   unsigned int out, unsigned int in)
+{
+	struct virtio_blk *vblk = vbr->vblk;
+
+	spin_lock_irq(vblk->disk->queue->queue_lock);
+	if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
+					GFP_ATOMIC) < 0)) {
+		spin_unlock_irq(vblk->disk->queue->queue_lock);
+		virtblk_add_buf_wait(vblk, vbr, out, in);
+		return;
+	}
+	virtqueue_kick(vblk->vq);
+	spin_unlock_irq(vblk->disk->queue->queue_lock);
+}
+
+static int virtblk_bio_send_flush(struct virtblk_req *vbr)
+{
+	unsigned int out = 0, in = 0;
+
+	vbr->flags |= VBLK_IS_FLUSH;
+	vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
+	vbr->out_hdr.sector = 0;
+	vbr->out_hdr.ioprio = 0;
+	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
+	sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
+
+	virtblk_add_req(vbr, out, in);
+
+	return 0;
+}
+
+static int virtblk_bio_send_data(struct virtblk_req *vbr)
+{
+	struct virtio_blk *vblk = vbr->vblk;
+	unsigned int num, out = 0, in = 0;
+	struct bio *bio = vbr->bio;
+
+	vbr->flags &= ~VBLK_IS_FLUSH;
+	vbr->out_hdr.type = 0;
+	vbr->out_hdr.sector = bio->bi_sector;
+	vbr->out_hdr.ioprio = bio_prio(bio);
+
+	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
+
+	num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
+
+	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
+		   sizeof(vbr->status));
+
+	if (num) {
+		if (bio->bi_rw & REQ_WRITE) {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+			out += num;
+		} else {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+			in += num;
 		}
+	}
+
+	virtblk_add_req(vbr, out, in);
+
+	return 0;
+}
+
+static void virtblk_bio_send_data_work(struct work_struct *work)
+{
+	struct virtblk_req *vbr;
+
+	vbr = container_of(work, struct virtblk_req, work);
+
+	virtblk_bio_send_data(vbr);
+}
+
+static void virtblk_bio_send_flush_work(struct work_struct *work)
+{
+	struct virtblk_req *vbr;
+
+	vbr = container_of(work, struct virtblk_req, work);
+
+	virtblk_bio_send_flush(vbr);
+}
+
+static inline void virtblk_request_done(struct virtblk_req *vbr)
+{
+	struct virtio_blk *vblk = vbr->vblk;
+	struct request *req = vbr->req;
+	int error = virtblk_result(vbr);
+
+	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
+		req->resid_len = vbr->in_hdr.residual;
+		req->sense_len = vbr->in_hdr.sense_len;
+		req->errors = vbr->in_hdr.errors;
+	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+		req->errors = (error != 0);
+	}
+
+	__blk_end_request_all(req, error);
+	mempool_free(vbr, vblk->pool);
+}
+
+static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
+{
+	struct virtio_blk *vblk = vbr->vblk;
+
+	if (vbr->flags & VBLK_REQ_DATA) {
+		/* Send out the actual write data */
+		INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
+		queue_work(virtblk_wq, &vbr->work);
+	} else {
+		bio_endio(vbr->bio, virtblk_result(vbr));
+		mempool_free(vbr, vblk->pool);
+	}
+}
+
+static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
+{
+	struct virtio_blk *vblk = vbr->vblk;
 
-		__blk_end_request_all(vbr->req, error);
+	if (unlikely(vbr->flags & VBLK_REQ_FUA)) {
+		/* Send out a flush before end the bio */
+		vbr->flags &= ~VBLK_REQ_DATA;
+		INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
+		queue_work(virtblk_wq, &vbr->work);
+	} else {
+		bio_endio(vbr->bio, virtblk_result(vbr));
 		mempool_free(vbr, vblk->pool);
 	}
+}
+
+static inline void virtblk_bio_done(struct virtblk_req *vbr)
+{
+	if (unlikely(vbr->flags & VBLK_IS_FLUSH))
+		virtblk_bio_flush_done(vbr);
+	else
+		virtblk_bio_data_done(vbr);
+}
+
+static void virtblk_done(struct virtqueue *vq)
+{
+	struct virtio_blk *vblk = vq->vdev->priv;
+	bool bio_done = false, req_done = false;
+	struct virtblk_req *vbr;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
+	do {
+		virtqueue_disable_cb(vq);
+		while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
+			if (vbr->bio) {
+				virtblk_bio_done(vbr);
+				bio_done = true;
+			} else {
+				virtblk_request_done(vbr);
+				req_done = true;
+			}
+		}
+	} while (!virtqueue_enable_cb(vq));
 	/* In case queue is stopped waiting for more buffers. */
-	blk_start_queue(vblk->disk->queue);
+	if (req_done)
+		blk_start_queue(vblk->disk->queue);
 	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
+
+	if (bio_done)
+		wake_up(&vblk->queue_wait);
 }
 
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
@@ -106,13 +301,13 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	unsigned long num, out = 0, in = 0;
 	struct virtblk_req *vbr;
 
-	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
+	vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
 	if (!vbr)
 		/* When another request finishes we'll try again. */
 		return false;
 
 	vbr->req = req;
-
+	vbr->bio = NULL;
 	if (req->cmd_flags & REQ_FLUSH) {
 		vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 		vbr->out_hdr.sector = 0;
@@ -172,7 +367,8 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) {
+	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
+			      GFP_ATOMIC) < 0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
@@ -180,7 +376,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	return true;
 }
 
-static void do_virtblk_request(struct request_queue *q)
+static void virtblk_request(struct request_queue *q)
 {
 	struct virtio_blk *vblk = q->queuedata;
 	struct request *req;
@@ -203,6 +399,34 @@ static void do_virtblk_request(struct request_queue *q)
 		virtqueue_kick(vblk->vq);
 }
 
+static void virtblk_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct virtio_blk *vblk = q->queuedata;
+	struct virtblk_req *vbr;
+
+	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
+
+	vbr = virtblk_alloc_req(vblk, GFP_NOIO);
+	if (!vbr) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	vbr->bio = bio;
+	vbr->flags = 0;
+	if (bio->bi_rw & REQ_FLUSH)
+		vbr->flags |= VBLK_REQ_FLUSH;
+	if (bio->bi_rw & REQ_FUA)
+		vbr->flags |= VBLK_REQ_FUA;
+	if (bio->bi_size)
+		vbr->flags |= VBLK_REQ_DATA;
+
+	if (unlikely(vbr->flags & VBLK_REQ_FLUSH))
+		virtblk_bio_send_flush(vbr);
+	else
+		virtblk_bio_send_data(vbr);
+}
+
 /* return id (s/n) string for *disk to *id_str
  */
 static int virtblk_get_id(struct gendisk *disk, char *id_str)
@@ -360,7 +584,7 @@ static int init_vq(struct virtio_blk *vblk)
 	int err = 0;
 
 	/* We expect one virtqueue, for output. */
-	vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests");
+	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
 	if (IS_ERR(vblk->vq))
 		err = PTR_ERR(vblk->vq);
 
@@ -477,6 +701,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	struct virtio_blk *vblk;
 	struct request_queue *q;
 	int err, index;
+	int pool_size;
+
 	u64 cap;
 	u32 v, blk_size, sg_elems, opt_io_size;
 	u16 min_io_size;
@@ -506,10 +732,12 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		goto out_free_index;
 	}
 
+	init_waitqueue_head(&vblk->queue_wait);
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
 	sg_init_table(vblk->sg, vblk->sg_elems);
 	mutex_init(&vblk->config_lock);
+
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 	vblk->config_enable = true;
 
@@ -517,7 +745,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	if (err)
 		goto out_free_vblk;
 
-	vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
+	pool_size = sizeof(struct virtblk_req);
+	if (use_bio)
+		pool_size += sizeof(struct scatterlist) * sg_elems;
+	vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
 	if (!vblk->pool) {
 		err = -ENOMEM;
 		goto out_free_vq;
@@ -530,12 +761,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		goto out_mempool;
 	}
 
-	q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL);
+	q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
 	if (!q) {
 		err = -ENOMEM;
 		goto out_put_disk;
 	}
 
+	if (use_bio)
+		blk_queue_make_request(q, virtblk_make_request);
 	q->queuedata = vblk;
 
 	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -620,7 +853,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	if (!err && opt_io_size)
 		blk_queue_io_opt(q, blk_size * opt_io_size);
 
-
 	add_disk(vblk->disk);
 	err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
 	if (err)
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 060a672..8ab9c3d 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -24,6 +24,8 @@
 #include <linux/err.h>
 #include <linux/freezer.h>
 #include <linux/fs.h>
+#include <linux/splice.h>
+#include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/poll.h>
@@ -474,26 +476,53 @@ static ssize_t send_control_msg(struct port *port, unsigned int event,
 	return 0;
 }
 
+struct buffer_token {
+	union {
+		void *buf;
+		struct scatterlist *sg;
+	} u;
+	/* If sgpages == 0 then buf is used, else sg is used */
+	unsigned int sgpages;
+};
+
+static void reclaim_sg_pages(struct scatterlist *sg, unsigned int nrpages)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < nrpages; i++) {
+		page = sg_page(&sg[i]);
+		if (!page)
+			break;
+		put_page(page);
+	}
+	kfree(sg);
+}
+
 /* Callers must take the port->outvq_lock */
 static void reclaim_consumed_buffers(struct port *port)
 {
-	void *buf;
+	struct buffer_token *tok;
 	unsigned int len;
 
 	if (!port->portdev) {
 		/* Device has been unplugged.  vqs are already gone. */
 		return;
 	}
-	while ((buf = virtqueue_get_buf(port->out_vq, &len))) {
-		kfree(buf);
+	while ((tok = virtqueue_get_buf(port->out_vq, &len))) {
+		if (tok->sgpages)
+			reclaim_sg_pages(tok->u.sg, tok->sgpages);
+		else
+			kfree(tok->u.buf);
+		kfree(tok);
 		port->outvq_full = false;
 	}
 }
 
-static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
-			bool nonblock)
+static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
+			      int nents, size_t in_count,
+			      struct buffer_token *tok, bool nonblock)
 {
-	struct scatterlist sg[1];
 	struct virtqueue *out_vq;
 	ssize_t ret;
 	unsigned long flags;
@@ -505,8 +534,7 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
 
 	reclaim_consumed_buffers(port);
 
-	sg_init_one(sg, in_buf, in_count);
-	ret = virtqueue_add_buf(out_vq, sg, 1, 0, in_buf, GFP_ATOMIC);
+	ret = virtqueue_add_buf(out_vq, sg, nents, 0, tok, GFP_ATOMIC);
 
 	/* Tell Host to go! */
 	virtqueue_kick(out_vq);
@@ -544,6 +572,37 @@ done:
 	return in_count;
 }
 
+static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
+			bool nonblock)
+{
+	struct scatterlist sg[1];
+	struct buffer_token *tok;
+
+	tok = kmalloc(sizeof(*tok), GFP_ATOMIC);
+	if (!tok)
+		return -ENOMEM;
+	tok->sgpages = 0;
+	tok->u.buf = in_buf;
+
+	sg_init_one(sg, in_buf, in_count);
+
+	return __send_to_port(port, sg, 1, in_count, tok, nonblock);
+}
+
+static ssize_t send_pages(struct port *port, struct scatterlist *sg, int nents,
+			  size_t in_count, bool nonblock)
+{
+	struct buffer_token *tok;
+
+	tok = kmalloc(sizeof(*tok), GFP_ATOMIC);
+	if (!tok)
+		return -ENOMEM;
+	tok->sgpages = nents;
+	tok->u.sg = sg;
+
+	return __send_to_port(port, sg, nents, in_count, tok, nonblock);
+}
+
 /*
  * Give out the data that's requested from the buffer that we have
  * queued up.
@@ -665,6 +724,26 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
 	return fill_readbuf(port, ubuf, count, true);
 }
 
+static int wait_port_writable(struct port *port, bool nonblock)
+{
+	int ret;
+
+	if (will_write_block(port)) {
+		if (nonblock)
+			return -EAGAIN;
+
+		ret = wait_event_freezable(port->waitqueue,
+					   !will_write_block(port));
+		if (ret < 0)
+			return ret;
+	}
+	/* Port got hot-unplugged. */
+	if (!port->guest_connected)
+		return -ENODEV;
+
+	return 0;
+}
+
 static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
 			       size_t count, loff_t *offp)
 {
@@ -681,18 +760,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
 
 	nonblock = filp->f_flags & O_NONBLOCK;
 
-	if (will_write_block(port)) {
-		if (nonblock)
-			return -EAGAIN;
-
-		ret = wait_event_freezable(port->waitqueue,
-					   !will_write_block(port));
-		if (ret < 0)
-			return ret;
-	}
-	/* Port got hot-unplugged. */
-	if (!port->guest_connected)
-		return -ENODEV;
+	ret = wait_port_writable(port, nonblock);
+	if (ret < 0)
+		return ret;
 
 	count = min((size_t)(32 * 1024), count);
 
@@ -725,6 +795,93 @@ out:
 	return ret;
 }
 
+struct sg_list {
+	unsigned int n;
+	unsigned int size;
+	size_t len;
+	struct scatterlist *sg;
+};
+
+static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			struct splice_desc *sd)
+{
+	struct sg_list *sgl = sd->u.data;
+	unsigned int offset, len;
+
+	if (sgl->n == sgl->size)
+		return 0;
+
+	/* Try lock this page */
+	if (buf->ops->steal(pipe, buf) == 0) {
+		/* Get reference and unlock page for moving */
+		get_page(buf->page);
+		unlock_page(buf->page);
+
+		len = min(buf->len, sd->len);
+		sg_set_page(&(sgl->sg[sgl->n]), buf->page, len, buf->offset);
+	} else {
+		/* Failback to copying a page */
+		struct page *page = alloc_page(GFP_KERNEL);
+		char *src = buf->ops->map(pipe, buf, 1);
+		char *dst;
+
+		if (!page)
+			return -ENOMEM;
+		dst = kmap(page);
+
+		offset = sd->pos & ~PAGE_MASK;
+
+		len = sd->len;
+		if (len + offset > PAGE_SIZE)
+			len = PAGE_SIZE - offset;
+
+		memcpy(dst + offset, src + buf->offset, len);
+
+		kunmap(page);
+		buf->ops->unmap(pipe, buf, src);
+
+		sg_set_page(&(sgl->sg[sgl->n]), page, len, offset);
+	}
+	sgl->n++;
+	sgl->len += len;
+
+	return len;
+}
+
+/* Faster zero-copy write by splicing */
+static ssize_t port_fops_splice_write(struct pipe_inode_info *pipe,
+				      struct file *filp, loff_t *ppos,
+				      size_t len, unsigned int flags)
+{
+	struct port *port = filp->private_data;
+	struct sg_list sgl;
+	ssize_t ret;
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+		.u.data = &sgl,
+	};
+
+	ret = wait_port_writable(port, filp->f_flags & O_NONBLOCK);
+	if (ret < 0)
+		return ret;
+
+	sgl.n = 0;
+	sgl.len = 0;
+	sgl.size = pipe->nrbufs;
+	sgl.sg = kmalloc(sizeof(struct scatterlist) * sgl.size, GFP_KERNEL);
+	if (unlikely(!sgl.sg))
+		return -ENOMEM;
+
+	sg_init_table(sgl.sg, sgl.size);
+	ret = __splice_from_pipe(pipe, &sd, pipe_to_sg);
+	if (likely(ret > 0))
+		ret = send_pages(port, sgl.sg, sgl.n, sgl.len, true);
+
+	return ret;
+}
+
 static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
 {
 	struct port *port;
@@ -856,6 +1013,7 @@ static const struct file_operations port_fops = {
 	.open  = port_fops_open,
 	.read  = port_fops_read,
 	.write = port_fops_write,
+	.splice_write = port_fops_splice_write,
 	.poll  = port_fops_poll,
 	.release = port_fops_release,
 	.fasync = port_fops_fasync,
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 9e8388e..fc92ccb 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -263,6 +263,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	struct virtqueue *vq;
 	int err;
 
+	if (!name)
+		return NULL;
+
 	/* We must have this many virtqueues. */
 	if (index >= ldev->desc->num_vq)
 		return ERR_PTR(-ENOENT);
@@ -296,7 +299,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	 * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
 	 * barriers.
 	 */
-	vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev,
+	vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
 				 true, lvq->pages, lg_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 3541b44..e7a4780 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -84,6 +84,9 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 	if (id >= ARRAY_SIZE(rvdev->vring))
 		return ERR_PTR(-EINVAL);
 
+	if (!name)
+		return NULL;
+
 	ret = rproc_alloc_vring(rvdev, id);
 	if (ret)
 		return ERR_PTR(ret);
@@ -103,7 +106,7 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 	 * Create the new vq, and tell virtio we're not interested in
 	 * the 'weak' smp barriers, since we're talking with a real device.
 	 */
-	vq = vring_new_virtqueue(len, rvring->align, vdev, false, addr,
+	vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, addr,
 					rproc_virtio_notify, callback, name);
 	if (!vq) {
 		dev_err(dev, "vring_new_virtqueue %s failed\n", name);
diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig
index 32aead6..2bd911f 100644
--- a/drivers/rpmsg/Kconfig
+++ b/drivers/rpmsg/Kconfig
@@ -4,7 +4,6 @@ menu "Rpmsg drivers (EXPERIMENTAL)"
 config RPMSG
 	tristate
 	select VIRTIO
-	select VIRTIO_RING
 	depends on EXPERIMENTAL
 
 endmenu
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 47cccd5..7dabef6 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -190,6 +190,9 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
 	if (index >= kdev->desc->num_vq)
 		return ERR_PTR(-ENOENT);
 
+	if (!name)
+		return NULL;
+
 	config = kvm_vq_config(kdev->desc)+index;
 
 	err = vmem_add_mapping(config->address,
@@ -198,7 +201,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
 	if (err)
 		goto out;
 
-	vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN,
+	vq = vring_new_virtqueue(index, config->num, KVM_S390_VIRTIO_RING_ALIGN,
 				 vdev, true, (void *) config->address,
 				 kvm_notify, callback, name);
 	if (!vq) {
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index f38b17a..8d5bddb 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -1,11 +1,9 @@
-# Virtio always gets selected by whoever wants it.
 config VIRTIO
 	tristate
-
-# Similarly the virtio ring implementation.
-config VIRTIO_RING
-	tristate
-	depends on VIRTIO
+	---help---
+	  This option is selected by any driver which implements the virtio
+	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
+	  CONFIG_RPMSG or CONFIG_S390_GUEST.
 
 menu "Virtio drivers"
 
@@ -13,7 +11,6 @@ config VIRTIO_PCI
 	tristate "PCI driver for virtio devices (EXPERIMENTAL)"
 	depends on PCI && EXPERIMENTAL
 	select VIRTIO
-	select VIRTIO_RING
 	---help---
 	  This drivers provides support for virtio based paravirtual device
 	  drivers over PCI.  This requires that your VMM has appropriate PCI
@@ -26,9 +23,8 @@ config VIRTIO_PCI
 	  If unsure, say M.
 
 config VIRTIO_BALLOON
-	tristate "Virtio balloon driver (EXPERIMENTAL)"
-	select VIRTIO
-	select VIRTIO_RING
+	tristate "Virtio balloon driver"
+	depends on VIRTIO
 	---help---
 	 This driver supports increasing and decreasing the amount
 	 of memory within a KVM guest.
@@ -39,7 +35,6 @@ config VIRTIO_BALLOON
  	tristate "Platform bus driver for memory mapped virtio devices (EXPERIMENTAL)"
  	depends on HAS_IOMEM && EXPERIMENTAL
  	select VIRTIO
- 	select VIRTIO_RING
  	---help---
  	 This drivers provides support for memory mapped virtio
 	 platform device driver.
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 5a4c63c..9076635 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,4 @@
-obj-$(CONFIG_VIRTIO) += virtio.o
-obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
+obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index c3b3f7f..1e8659c 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -159,7 +159,7 @@ static int virtio_dev_remove(struct device *_d)
 	drv->remove(dev);
 
 	/* Driver should have reset device. */
-	BUG_ON(dev->config->get_status(dev));
+	WARN_ON_ONCE(dev->config->get_status(dev));
 
 	/* Acknowledge the device's existence again. */
 	add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 453db0c..6b1b7e1 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -131,9 +131,6 @@ struct virtio_mmio_vq_info {
 	/* the number of entries in the queue */
 	unsigned int num;
 
-	/* the index of the queue */
-	int queue_index;
-
 	/* the virtual address of the ring queue */
 	void *queue;
 
@@ -225,11 +222,10 @@ static void vm_reset(struct virtio_device *vdev)
 static void vm_notify(struct virtqueue *vq)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
-	struct virtio_mmio_vq_info *info = vq->priv;
 
 	/* We write the queue's selector into the notification register to
 	 * signal the other end */
-	writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
+	writel(virtqueue_get_queue_index(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
 }
 
 /* Notify all virtqueues on an interrupt. */
@@ -270,6 +266,7 @@ static void vm_del_vq(struct virtqueue *vq)
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
 	struct virtio_mmio_vq_info *info = vq->priv;
 	unsigned long flags, size;
+	unsigned int index = virtqueue_get_queue_index(vq);
 
 	spin_lock_irqsave(&vm_dev->lock, flags);
 	list_del(&info->node);
@@ -278,7 +275,7 @@ static void vm_del_vq(struct virtqueue *vq)
 	vring_del_virtqueue(vq);
 
 	/* Select and deactivate the queue */
-	writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
+	writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
 	writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
 
 	size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN));
@@ -309,6 +306,9 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
 	unsigned long flags, size;
 	int err;
 
+	if (!name)
+		return NULL;
+
 	/* Select the queue we're interested in */
 	writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
 
@@ -324,7 +324,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
 		err = -ENOMEM;
 		goto error_kmalloc;
 	}
-	info->queue_index = index;
 
 	/* Allocate pages for the queue - start with a queue as big as
 	 * possible (limited by maximum size allowed by device), drop down
@@ -332,11 +331,21 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
 	 * and two rings (which makes it "alignment_size * 2")
 	 */
 	info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
+
+	/* If the device reports a 0 entry queue, we won't be able to
+	 * use it to perform I/O, and vring_new_virtqueue() can't create
+	 * empty queues anyway, so don't bother to set up the device.
+	 */
+	if (info->num == 0) {
+		err = -ENOENT;
+		goto error_alloc_pages;
+	}
+
 	while (1) {
 		size = PAGE_ALIGN(vring_size(info->num,
 				VIRTIO_MMIO_VRING_ALIGN));
-		/* Already smallest possible allocation? */
-		if (size <= VIRTIO_MMIO_VRING_ALIGN * 2) {
+		/* Did the last iter shrink the queue below minimum size? */
+		if (size < VIRTIO_MMIO_VRING_ALIGN * 2) {
 			err = -ENOMEM;
 			goto error_alloc_pages;
 		}
@@ -356,7 +365,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
 			vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
 
 	/* Create the vring */
-	vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
+	vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
 				 true, info->queue, vm_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 2e03d41..c33aea3 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -48,6 +48,7 @@ struct virtio_pci_device
 	int msix_enabled;
 	int intx_enabled;
 	struct msix_entry *msix_entries;
+	cpumask_var_t *msix_affinity_masks;
 	/* Name strings for interrupts. This size should be enough,
 	 * and I'm too lazy to allocate each name separately. */
 	char (*msix_names)[256];
@@ -79,9 +80,6 @@ struct virtio_pci_vq_info
 	/* the number of entries in the queue */
 	int num;
 
-	/* the index of the queue */
-	int queue_index;
-
 	/* the virtual address of the ring queue */
 	void *queue;
 
@@ -202,11 +200,11 @@ static void vp_reset(struct virtio_device *vdev)
 static void vp_notify(struct virtqueue *vq)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-	struct virtio_pci_vq_info *info = vq->priv;
 
 	/* we write the queue's selector into the notification register to
 	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+	iowrite16(virtqueue_get_queue_index(vq),
+		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -279,6 +277,10 @@ static void vp_free_vectors(struct virtio_device *vdev)
 	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
 		free_irq(vp_dev->msix_entries[i].vector, vp_dev);
 
+	for (i = 0; i < vp_dev->msix_vectors; i++)
+		if (vp_dev->msix_affinity_masks[i])
+			free_cpumask_var(vp_dev->msix_affinity_masks[i]);
+
 	if (vp_dev->msix_enabled) {
 		/* Disable the vector used for configuration */
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
@@ -296,6 +298,8 @@ static void vp_free_vectors(struct virtio_device *vdev)
 	vp_dev->msix_names = NULL;
 	kfree(vp_dev->msix_entries);
 	vp_dev->msix_entries = NULL;
+	kfree(vp_dev->msix_affinity_masks);
+	vp_dev->msix_affinity_masks = NULL;
 }
 
 static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
@@ -314,6 +318,15 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 				     GFP_KERNEL);
 	if (!vp_dev->msix_names)
 		goto error;
+	vp_dev->msix_affinity_masks
+		= kzalloc(nvectors * sizeof *vp_dev->msix_affinity_masks,
+			  GFP_KERNEL);
+	if (!vp_dev->msix_affinity_masks)
+		goto error;
+	for (i = 0; i < nvectors; ++i)
+		if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
+					GFP_KERNEL))
+			goto error;
 
 	for (i = 0; i < nvectors; ++i)
 		vp_dev->msix_entries[i].entry = i;
@@ -402,7 +415,6 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
 	if (!info)
 		return ERR_PTR(-ENOMEM);
 
-	info->queue_index = index;
 	info->num = num;
 	info->msix_vector = msix_vec;
 
@@ -418,7 +430,7 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
 		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
 	/* create the vring */
-	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
+	vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
 				 true, info->queue, vp_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
@@ -467,7 +479,8 @@ static void vp_del_vq(struct virtqueue *vq)
 	list_del(&info->node);
 	spin_unlock_irqrestore(&vp_dev->lock, flags);
 
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	iowrite16(virtqueue_get_queue_index(vq),
+		vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
 	if (vp_dev->msix_enabled) {
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
@@ -542,7 +555,10 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	vp_dev->per_vq_vectors = per_vq_vectors;
 	allocated_vectors = vp_dev->msix_used_vectors;
 	for (i = 0; i < nvqs; ++i) {
-		if (!callbacks[i] || !vp_dev->msix_enabled)
+		if (!names[i]) {
+			vqs[i] = NULL;
+			continue;
+		} else if (!callbacks[i] || !vp_dev->msix_enabled)
 			msix_vec = VIRTIO_MSI_NO_VECTOR;
 		else if (vp_dev->per_vq_vectors)
 			msix_vec = allocated_vectors++;
@@ -609,6 +625,35 @@ static const char *vp_bus_name(struct virtio_device *vdev)
 	return pci_name(vp_dev->pci_dev);
 }
 
+/* Setup the affinity for a virtqueue:
+ * - force the affinity for per vq vector
+ * - OR over all affinities for shared MSI
+ * - ignore the affinity request if we're using INTX
+ */
+static int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
+{
+	struct virtio_device *vdev = vq->vdev;
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_vq_info *info = vq->priv;
+	struct cpumask *mask;
+	unsigned int irq;
+
+	if (!vq->callback)
+		return -EINVAL;
+
+	if (vp_dev->msix_enabled) {
+		mask = vp_dev->msix_affinity_masks[info->msix_vector];
+		irq = vp_dev->msix_entries[info->msix_vector].vector;
+		if (cpu == -1)
+			irq_set_affinity_hint(irq, NULL);
+		else {
+			cpumask_set_cpu(cpu, mask);
+			irq_set_affinity_hint(irq, mask);
+		}
+	}
+	return 0;
+}
+
 static struct virtio_config_ops virtio_pci_config_ops = {
 	.get		= vp_get,
 	.set		= vp_set,
@@ -620,6 +665,7 @@ static struct virtio_config_ops virtio_pci_config_ops = {
 	.get_features	= vp_get_features,
 	.finalize_features = vp_finalize_features,
 	.bus_name	= vp_bus_name,
+	.set_vq_affinity = vp_set_vq_affinity,
 };
 
 static void virtio_pci_release_dev(struct device *_d)
@@ -673,8 +719,10 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 		goto out_enable_device;
 
 	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
-	if (vp_dev->ioaddr == NULL)
+	if (vp_dev->ioaddr == NULL) {
+		err = -ENOMEM;
 		goto out_req_regions;
+	}
 
 	pci_set_drvdata(pci_dev, vp_dev);
 	pci_set_master(pci_dev);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5aa43c3..e639584 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -106,6 +106,9 @@ struct vring_virtqueue
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	void (*notify)(struct virtqueue *vq);
 
+	/* Index of the queue */
+	int queue_index;
+
 #ifdef DEBUG
 	/* They're supposed to lock for us. */
 	unsigned int in_use;
@@ -171,6 +174,13 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
 	return head;
 }
 
+int virtqueue_get_queue_index(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	return vq->queue_index;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_queue_index);
+
 /**
  * virtqueue_add_buf - expose buffer to other end
  * @vq: the struct virtqueue we're talking about.
@@ -616,7 +626,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
 }
 EXPORT_SYMBOL_GPL(vring_interrupt);
 
-struct virtqueue *vring_new_virtqueue(unsigned int num,
+struct virtqueue *vring_new_virtqueue(unsigned int index,
+				      unsigned int num,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      bool weak_barriers,
@@ -647,6 +658,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 	vq->broken = false;
 	vq->last_used_idx = 0;
 	vq->num_added = 0;
+	vq->queue_index = index;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index a1ba8bb..533b115 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -50,6 +50,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq);
 
 unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
 
+int virtqueue_get_queue_index(struct virtqueue *vq);
+
 /**
  * virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index fc457f4..e2850a7 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -84,7 +84,9 @@
  *	nvqs: the number of virtqueues to find
  *	vqs: on success, includes new virtqueues
  *	callbacks: array of callbacks, for each virtqueue
+ *		include a NULL entry for vqs that do not need a callback
  *	names: array of virtqueue names (mainly for debugging)
+ *		include a NULL entry for vqs unused by driver
  *	Returns 0 on success or error status
  * @del_vqs: free virtqueues found by find_vqs().
  * @get_features: get the array of feature bits for this device.
@@ -98,6 +100,7 @@
  *	vdev: the virtio_device
  *      This returns a pointer to the bus name a la pci_name from which
  *      the caller can then copy.
+ * @set_vq_affinity: set the affinity for a virtqueue.
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
@@ -116,6 +119,7 @@ struct virtio_config_ops {
 	u32 (*get_features)(struct virtio_device *vdev);
 	void (*finalize_features)(struct virtio_device *vdev);
 	const char *(*bus_name)(struct virtio_device *vdev);
+	int (*set_vq_affinity)(struct virtqueue *vq, int cpu);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
@@ -190,5 +194,24 @@ const char *virtio_bus_name(struct virtio_device *vdev)
 	return vdev->config->bus_name(vdev);
 }
 
+/**
+ * virtqueue_set_affinity - setting affinity for a virtqueue
+ * @vq: the virtqueue
+ * @cpu: the cpu no.
+ *
+ * Pay attention the function are best-effort: the affinity hint may not be set
+ * due to config support, irq type and sharing.
+ *
+ */
+static inline
+int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
+{
+	struct virtio_device *vdev = vq->vdev;
+	if (vdev->config->set_vq_affinity)
+		return vdev->config->set_vq_affinity(vq, cpu);
+	return 0;
+}
+
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_VIRTIO_CONFIG_H */
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index e338730..c2d793a 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -165,7 +165,8 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
 struct virtio_device;
 struct virtqueue;
 
-struct virtqueue *vring_new_virtqueue(unsigned int num,
+struct virtqueue *vring_new_virtqueue(unsigned int index,
+				      unsigned int num,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      bool weak_barriers,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cdcb594..31e4f55 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4200,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
 	buf->private = 0;
 }
 
-static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
-				 struct pipe_buffer *buf)
-{
-	return 1;
-}
-
 static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
 				struct pipe_buffer *buf)
 {
@@ -4221,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
 	.unmap			= generic_pipe_buf_unmap,
 	.confirm		= generic_pipe_buf_confirm,
 	.release		= buffer_pipe_buf_release,
-	.steal			= buffer_pipe_buf_steal,
+	.steal			= generic_pipe_buf_steal,
 	.get			= buffer_pipe_buf_get,
 };
 
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index f759f4f..fd2f922 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -1299,6 +1299,7 @@ static struct device *new_device(const char *name, u16 type)
 	dev->feature_len = 0;
 	dev->num_vq = 0;
 	dev->running = false;
+	dev->next = NULL;
 
 	/*
 	 * Append to device list.  Prepending to a single-linked list is
diff --git a/tools/virtio/virtio-trace/Makefile b/tools/virtio/virtio-trace/Makefile
new file mode 100644
index 0000000..0d23816
--- /dev/null
+++ b/tools/virtio/virtio-trace/Makefile
@@ -0,0 +1,13 @@
+CC = gcc
+CFLAGS = -O2 -Wall -pthread
+
+all: trace-agent
+
+.c.o:
+	$(CC) $(CFLAGS) -c $^ -o $@
+
+trace-agent: trace-agent.o trace-agent-ctl.o trace-agent-rw.o
+	$(CC) $(CFLAGS) -o $@ $^
+
+clean:
+	rm -f *.o trace-agent
diff --git a/tools/virtio/virtio-trace/README b/tools/virtio/virtio-trace/README
new file mode 100644
index 0000000..b64845b
--- /dev/null
+++ b/tools/virtio/virtio-trace/README
@@ -0,0 +1,118 @@
+Trace Agent for virtio-trace
+============================
+
+Trace agent is a user tool for sending trace data of a guest to a Host in low
+overhead. Trace agent has the following functions:
+ - splice a page of ring-buffer to read_pipe without memory copying
+ - splice the page from write_pipe to virtio-console without memory copying
+ - write trace data to stdout by using -o option
+ - controlled by start/stop orders from a Host
+
+The trace agent operates as follows:
+ 1) Initialize all structures.
+ 2) Create a read/write thread per CPU. Each thread is bound to a CPU.
+    The read/write threads hold it.
+ 3) A controller thread does poll() for a start order of a host.
+ 4) After the controller of the trace agent receives a start order from a host,
+    the controller wake read/write threads.
+ 5) The read/write threads start to read trace data from ring-buffers and
+    write the data to virtio-serial.
+ 6) If the controller receives a stop order from a host, the read/write threads
+    stop to read trace data.
+
+
+Files
+=====
+
+README: this file
+Makefile: Makefile of trace agent for virtio-trace
+trace-agent.c: includes main function, sets up for operating trace agent
+trace-agent.h: includes all structures and some macros
+trace-agent-ctl.c: includes controller function for read/write threads
+trace-agent-rw.c: includes read/write threads function
+
+
+Setup
+=====
+
+To use this trace agent for virtio-trace, we need to prepare some virtio-serial
+I/Fs.
+
+1) Make FIFO in a host
+ virtio-trace uses virtio-serial pipe as trace data paths as to the number
+of CPUs and a control path, so FIFO (named pipe) should be created as follows:
+	# mkdir /tmp/virtio-trace/
+	# mkfifo /tmp/virtio-trace/trace-path-cpu{0,1,2,...,X}.{in,out}
+	# mkfifo /tmp/virtio-trace/agent-ctl-path.{in,out}
+
+For example, if a guest use three CPUs, the names are
+	trace-path-cpu{0,1,2}.{in.out}
+and
+	agent-ctl-path.{in,out}.
+
+2) Set up of virtio-serial pipe in a host
+ Add qemu option to use virtio-serial pipe.
+
+ ##virtio-serial device##
+     -device virtio-serial-pci,id=virtio-serial0\
+ ##control path##
+     -chardev pipe,id=charchannel0,path=/tmp/virtio-trace/agent-ctl-path\
+     -device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,\
+      id=channel0,name=agent-ctl-path\
+ ##data path##
+     -chardev pipe,id=charchannel1,path=/tmp/virtio-trace/trace-path-cpu0\
+     -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel0,\
+      id=channel1,name=trace-path-cpu0\
+      ...
+
+If you manage guests with libvirt, add the following tags to domain XML files.
+Then, libvirt passes the same command option to qemu.
+
+	<channel type='pipe'>
+	   <source path='/tmp/virtio-trace/agent-ctl-path'/>
+	   <target type='virtio' name='agent-ctl-path'/>
+	   <address type='virtio-serial' controller='0' bus='0' port='0'/>
+	</channel>
+	<channel type='pipe'>
+	   <source path='/tmp/virtio-trace/trace-path-cpu0'/>
+	   <target type='virtio' name='trace-path-cpu0'/>
+	   <address type='virtio-serial' controller='0' bus='0' port='1'/>
+	</channel>
+	...
+Here, chardev names are restricted to trace-path-cpuX and agent-ctl-path. For
+example, if a guest use three CPUs, chardev names should be trace-path-cpu0,
+trace-path-cpu1, trace-path-cpu2, and agent-ctl-path.
+
+3) Boot the guest
+ You can find some chardev in /dev/virtio-ports/ in the guest.
+
+
+Run
+===
+
+0) Build trace agent in a guest
+	$ make
+
+1) Enable ftrace in the guest
+ <Example>
+	# echo 1 > /sys/kernel/debug/tracing/events/sched/enable
+
+2) Run trace agent in the guest
+ This agent must be operated as root.
+	# ./trace-agent
+read/write threads in the agent wait for start order from host. If you add -o
+option, trace data are output via stdout in the guest.
+
+3) Open FIFO in a host
+	# cat /tmp/virtio-trace/trace-path-cpu0.out
+If a host does not open these, trace data get stuck in buffers of virtio. Then,
+the guest will stop by specification of chardev in QEMU. This blocking mode may
+be solved in the future.
+
+4) Start to read trace data by ordering from a host
+ A host injects read start order to the guest via virtio-serial.
+	# echo 1 > /tmp/virtio-trace/agent-ctl-path.in
+
+5) Stop to read trace data by ordering from a host
+ A host injects read stop order to the guest via virtio-serial.
+	# echo 0 > /tmp/virtio-trace/agent-ctl-path.in
diff --git a/tools/virtio/virtio-trace/trace-agent-ctl.c b/tools/virtio/virtio-trace/trace-agent-ctl.c
new file mode 100644
index 0000000..a2d0403
--- /dev/null
+++ b/tools/virtio/virtio-trace/trace-agent-ctl.c
@@ -0,0 +1,137 @@
+/*
+ * Controller of read/write threads for virtio-trace
+ *
+ * Copyright (C) 2012 Hitachi, Ltd.
+ * Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com>
+ *            Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Licensed under GPL version 2 only.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "trace-agent.h"
+
+#define HOST_MSG_SIZE		256
+#define EVENT_WAIT_MSEC		100
+
+static volatile sig_atomic_t global_signal_val;
+bool global_sig_receive;	/* default false */
+bool global_run_operation;	/* default false*/
+
+/* Handle SIGTERM/SIGINT/SIGQUIT to exit */
+static void signal_handler(int sig)
+{
+	global_signal_val = sig;
+}
+
+int rw_ctl_init(const char *ctl_path)
+{
+	int ctl_fd;
+
+	ctl_fd = open(ctl_path, O_RDONLY);
+	if (ctl_fd == -1) {
+		pr_err("Cannot open ctl_fd\n");
+		goto error;
+	}
+
+	return ctl_fd;
+
+error:
+	exit(EXIT_FAILURE);
+}
+
+static int wait_order(int ctl_fd)
+{
+	struct pollfd poll_fd;
+	int ret = 0;
+
+	while (!global_sig_receive) {
+		poll_fd.fd = ctl_fd;
+		poll_fd.events = POLLIN;
+
+		ret = poll(&poll_fd, 1, EVENT_WAIT_MSEC);
+
+		if (global_signal_val) {
+			global_sig_receive = true;
+			pr_info("Receive interrupt %d\n", global_signal_val);
+
+			/* Wakes rw-threads when they are sleeping */
+			if (!global_run_operation)
+				pthread_cond_broadcast(&cond_wakeup);
+
+			ret = -1;
+			break;
+		}
+
+		if (ret < 0) {
+			pr_err("Polling error\n");
+			goto error;
+		}
+
+		if (ret)
+			break;
+	};
+
+	return ret;
+
+error:
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * contol read/write threads by handling global_run_operation
+ */
+void *rw_ctl_loop(int ctl_fd)
+{
+	ssize_t rlen;
+	char buf[HOST_MSG_SIZE];
+	int ret;
+
+	/* Setup signal handlers */
+	signal(SIGTERM, signal_handler);
+	signal(SIGINT, signal_handler);
+	signal(SIGQUIT, signal_handler);
+
+	while (!global_sig_receive) {
+
+		ret = wait_order(ctl_fd);
+		if (ret < 0)
+			break;
+
+		rlen = read(ctl_fd, buf, sizeof(buf));
+		if (rlen < 0) {
+			pr_err("read data error in ctl thread\n");
+			goto error;
+		}
+
+		if (rlen == 2 && buf[0] == '1') {
+			/*
+			 * If host writes '1' to a control path,
+			 * this controller wakes all read/write threads.
+			 */
+			global_run_operation = true;
+			pthread_cond_broadcast(&cond_wakeup);
+			pr_debug("Wake up all read/write threads\n");
+		} else if (rlen == 2 && buf[0] == '0') {
+			/*
+			 * If host writes '0' to a control path, read/write
+			 * threads will wait for notification from Host.
+			 */
+			global_run_operation = false;
+			pr_debug("Stop all read/write threads\n");
+		} else
+			pr_info("Invalid host notification: %s\n", buf);
+	}
+
+	return NULL;
+
+error:
+	exit(EXIT_FAILURE);
+}
diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
new file mode 100644
index 0000000..3aace5e
--- /dev/null
+++ b/tools/virtio/virtio-trace/trace-agent-rw.c
@@ -0,0 +1,192 @@
+/*
+ * Read/write thread of a guest agent for virtio-trace
+ *
+ * Copyright (C) 2012 Hitachi, Ltd.
+ * Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com>
+ *            Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Licensed under GPL version 2 only.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include "trace-agent.h"
+
+#define READ_WAIT_USEC	100000
+
+void *rw_thread_info_new(void)
+{
+	struct rw_thread_info *rw_ti;
+
+	rw_ti = zalloc(sizeof(struct rw_thread_info));
+	if (rw_ti == NULL) {
+		pr_err("rw_thread_info zalloc error\n");
+		exit(EXIT_FAILURE);
+	}
+
+	rw_ti->cpu_num = -1;
+	rw_ti->in_fd = -1;
+	rw_ti->out_fd = -1;
+	rw_ti->read_pipe = -1;
+	rw_ti->write_pipe = -1;
+	rw_ti->pipe_size = PIPE_INIT;
+
+	return rw_ti;
+}
+
+void *rw_thread_init(int cpu, const char *in_path, const char *out_path,
+				bool stdout_flag, unsigned long pipe_size,
+				struct rw_thread_info *rw_ti)
+{
+	int data_pipe[2];
+
+	rw_ti->cpu_num = cpu;
+
+	/* set read(input) fd */
+	rw_ti->in_fd = open(in_path, O_RDONLY);
+	if (rw_ti->in_fd == -1) {
+		pr_err("Could not open in_fd (CPU:%d)\n", cpu);
+		goto error;
+	}
+
+	/* set write(output) fd */
+	if (!stdout_flag) {
+		/* virtio-serial output mode */
+		rw_ti->out_fd = open(out_path, O_WRONLY);
+		if (rw_ti->out_fd == -1) {
+			pr_err("Could not open out_fd (CPU:%d)\n", cpu);
+			goto error;
+		}
+	} else
+		/* stdout mode */
+		rw_ti->out_fd = STDOUT_FILENO;
+
+	if (pipe2(data_pipe, O_NONBLOCK) < 0) {
+		pr_err("Could not create pipe in rw-thread(%d)\n", cpu);
+		goto error;
+	}
+
+	/*
+	 * Size of pipe is 64kB in default based on fs/pipe.c.
+	 * To read/write trace data speedy, pipe size is changed.
+	 */
+	if (fcntl(*data_pipe, F_SETPIPE_SZ, pipe_size) < 0) {
+		pr_err("Could not change pipe size in rw-thread(%d)\n", cpu);
+		goto error;
+	}
+
+	rw_ti->read_pipe = data_pipe[1];
+	rw_ti->write_pipe = data_pipe[0];
+	rw_ti->pipe_size = pipe_size;
+
+	return NULL;
+
+error:
+	exit(EXIT_FAILURE);
+}
+
+/* Bind a thread to a cpu */
+static void bind_cpu(int cpu_num)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu_num, &mask);
+
+	/* bind my thread to cpu_num by assigning zero to the first argument */
+	if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
+		pr_err("Could not set CPU#%d affinity\n", (int)cpu_num);
+}
+
+static void *rw_thread_main(void *thread_info)
+{
+	ssize_t rlen, wlen;
+	ssize_t ret;
+	struct rw_thread_info *ts = (struct rw_thread_info *)thread_info;
+
+	bind_cpu(ts->cpu_num);
+
+	while (1) {
+		/* Wait for a read order of trace data by Host OS */
+		if (!global_run_operation) {
+			pthread_mutex_lock(&mutex_notify);
+			pthread_cond_wait(&cond_wakeup, &mutex_notify);
+			pthread_mutex_unlock(&mutex_notify);
+		}
+
+		if (global_sig_receive)
+			break;
+
+		/*
+		 * Each thread read trace_pipe_raw of each cpu bounding the
+		 * thread, so contention of multi-threads does not occur.
+		 */
+		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
+				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
+
+		if (rlen < 0) {
+			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
+			goto error;
+		} else if (rlen == 0) {
+			/*
+			 * If trace data do not exist or are unreadable not
+			 * for exceeding the page size, splice_read returns
+			 * NULL. Then, this waits for being filled the data in a
+			 * ring-buffer.
+			 */
+			usleep(READ_WAIT_USEC);
+			pr_debug("Read retry(cpu:%d)\n", ts->cpu_num);
+			continue;
+		}
+
+		wlen = 0;
+
+		do {
+			ret = splice(ts->write_pipe, NULL, ts->out_fd, NULL,
+					rlen - wlen,
+					SPLICE_F_MOVE | SPLICE_F_MORE);
+
+			if (ret < 0) {
+				pr_err("Splice_write in rw-thread(%d)\n",
+								ts->cpu_num);
+				goto error;
+			} else if (ret == 0)
+				/*
+				 * When host reader is not in time for reading
+				 * trace data, guest will be stopped. This is
+				 * because char dev in QEMU is not supported
+				 * non-blocking mode. Then, writer might be
+				 * sleep in that case.
+				 * This sleep will be removed by supporting
+				 * non-blocking mode.
+				 */
+				sleep(1);
+			wlen += ret;
+		} while (wlen < rlen);
+	}
+
+	return NULL;
+
+error:
+	exit(EXIT_FAILURE);
+}
+
+
+pthread_t rw_thread_run(struct rw_thread_info *rw_ti)
+{
+	int ret;
+	pthread_t rw_thread_per_cpu;
+
+	ret = pthread_create(&rw_thread_per_cpu, NULL, rw_thread_main, rw_ti);
+	if (ret != 0) {
+		pr_err("Could not create a rw thread(%d)\n", rw_ti->cpu_num);
+		exit(EXIT_FAILURE);
+	}
+
+	return rw_thread_per_cpu;
+}
diff --git a/tools/virtio/virtio-trace/trace-agent.c b/tools/virtio/virtio-trace/trace-agent.c
new file mode 100644
index 0000000..0a0a7dd
--- /dev/null
+++ b/tools/virtio/virtio-trace/trace-agent.c
@@ -0,0 +1,270 @@
+/*
+ * Guest agent for virtio-trace
+ *
+ * Copyright (C) 2012 Hitachi, Ltd.
+ * Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com>
+ *            Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Licensed under GPL version 2 only.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "trace-agent.h"
+
+#define PAGE_SIZE		(sysconf(_SC_PAGE_SIZE))
+#define PIPE_DEF_BUFS		16
+#define PIPE_MIN_SIZE		(PAGE_SIZE*PIPE_DEF_BUFS)
+#define PIPE_MAX_SIZE		(1024*1024)
+#define READ_PATH_FMT	\
+		"/sys/kernel/debug/tracing/per_cpu/cpu%d/trace_pipe_raw"
+#define WRITE_PATH_FMT		"/dev/virtio-ports/trace-path-cpu%d"
+#define CTL_PATH		"/dev/virtio-ports/agent-ctl-path"
+
+pthread_mutex_t mutex_notify = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t cond_wakeup = PTHREAD_COND_INITIALIZER;
+
+static int get_total_cpus(void)
+{
+	int nr_cpus = (int)sysconf(_SC_NPROCESSORS_CONF);
+
+	if (nr_cpus <= 0) {
+		pr_err("Could not read cpus\n");
+		goto error;
+	} else if (nr_cpus > MAX_CPUS) {
+		pr_err("Exceed max cpus(%d)\n", (int)MAX_CPUS);
+		goto error;
+	}
+
+	return nr_cpus;
+
+error:
+	exit(EXIT_FAILURE);
+}
+
+static void *agent_info_new(void)
+{
+	struct agent_info *s;
+	int i;
+
+	s = zalloc(sizeof(struct agent_info));
+	if (s == NULL) {
+		pr_err("agent_info zalloc error\n");
+		exit(EXIT_FAILURE);
+	}
+
+	s->pipe_size = PIPE_INIT;
+	s->use_stdout = false;
+	s->cpus = get_total_cpus();
+	s->ctl_fd = -1;
+
+	/* read/write threads init */
+	for (i = 0; i < s->cpus; i++)
+		s->rw_ti[i] = rw_thread_info_new();
+
+	return s;
+}
+
+static unsigned long parse_size(const char *arg)
+{
+	unsigned long value, round;
+	char *ptr;
+
+	value = strtoul(arg, &ptr, 10);
+	switch (*ptr) {
+	case 'K': case 'k':
+		value <<= 10;
+		break;
+	case 'M': case 'm':
+		value <<= 20;
+		break;
+	default:
+		break;
+	}
+
+	if (value > PIPE_MAX_SIZE) {
+		pr_err("Pipe size must be less than 1MB\n");
+		goto error;
+	} else if (value < PIPE_MIN_SIZE) {
+		pr_err("Pipe size must be over 64KB\n");
+		goto error;
+	}
+
+	/* Align buffer size with page unit */
+	round = value & (PAGE_SIZE - 1);
+	value = value - round;
+
+	return value;
+error:
+	return 0;
+}
+
+static void usage(char const *prg)
+{
+	pr_err("usage: %s [-h] [-o] [-s <size of pipe>]\n", prg);
+}
+
+static const char *make_path(int cpu_num, bool this_is_write_path)
+{
+	int ret;
+	char *buf;
+
+	buf = zalloc(PATH_MAX);
+	if (buf == NULL) {
+		pr_err("Could not allocate buffer\n");
+		goto error;
+	}
+
+	if (this_is_write_path)
+		/* write(output) path */
+		ret = snprintf(buf, PATH_MAX, WRITE_PATH_FMT, cpu_num);
+	else
+		/* read(input) path */
+		ret = snprintf(buf, PATH_MAX, READ_PATH_FMT, cpu_num);
+
+	if (ret <= 0) {
+		pr_err("Failed to generate %s path(CPU#%d):%d\n",
+			this_is_write_path ? "read" : "write", cpu_num, ret);
+		goto error;
+	}
+
+	return buf;
+
+error:
+	free(buf);
+	return NULL;
+}
+
+static const char *make_input_path(int cpu_num)
+{
+	return make_path(cpu_num, false);
+}
+
+static const char *make_output_path(int cpu_num)
+{
+	return make_path(cpu_num, true);
+}
+
+static void *agent_info_init(struct agent_info *s)
+{
+	int cpu;
+	const char *in_path = NULL;
+	const char *out_path = NULL;
+
+	/* init read/write threads */
+	for (cpu = 0; cpu < s->cpus; cpu++) {
+		/* set read(input) path per read/write thread */
+		in_path = make_input_path(cpu);
+		if (in_path == NULL)
+			goto error;
+
+		/* set write(output) path per read/write thread*/
+		if (!s->use_stdout) {
+			out_path = make_output_path(cpu);
+			if (out_path == NULL)
+				goto error;
+		} else
+			/* stdout mode */
+			pr_debug("stdout mode\n");
+
+		rw_thread_init(cpu, in_path, out_path, s->use_stdout,
+						s->pipe_size, s->rw_ti[cpu]);
+	}
+
+	/* init controller of read/write threads */
+	s->ctl_fd = rw_ctl_init((const char *)CTL_PATH);
+
+	return NULL;
+
+error:
+	exit(EXIT_FAILURE);
+}
+
+static void *parse_args(int argc, char *argv[], struct agent_info *s)
+{
+	int cmd;
+	unsigned long size;
+
+	while ((cmd = getopt(argc, argv, "hos:")) != -1) {
+		switch (cmd) {
+		/* stdout mode */
+		case 'o':
+			s->use_stdout = true;
+			break;
+		/* size of pipe */
+		case 's':
+			size = parse_size(optarg);
+			if (size == 0)
+				goto error;
+			s->pipe_size = size;
+			break;
+		case 'h':
+		default:
+			usage(argv[0]);
+			goto error;
+		}
+	}
+
+	agent_info_init(s);
+
+	return NULL;
+
+error:
+	exit(EXIT_FAILURE);
+}
+
+static void agent_main_loop(struct agent_info *s)
+{
+	int cpu;
+	pthread_t rw_thread_per_cpu[MAX_CPUS];
+
+	/* Start all read/write threads */
+	for (cpu = 0; cpu < s->cpus; cpu++)
+		rw_thread_per_cpu[cpu] = rw_thread_run(s->rw_ti[cpu]);
+
+	rw_ctl_loop(s->ctl_fd);
+
+	/* Finish all read/write threads */
+	for (cpu = 0; cpu < s->cpus; cpu++) {
+		int ret;
+
+		ret = pthread_join(rw_thread_per_cpu[cpu], NULL);
+		if (ret != 0) {
+			pr_err("pthread_join() error:%d (cpu %d)\n", ret, cpu);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+static void agent_info_free(struct agent_info *s)
+{
+	int i;
+
+	close(s->ctl_fd);
+	for (i = 0; i < s->cpus; i++) {
+		close(s->rw_ti[i]->in_fd);
+		close(s->rw_ti[i]->out_fd);
+		close(s->rw_ti[i]->read_pipe);
+		close(s->rw_ti[i]->write_pipe);
+		free(s->rw_ti[i]);
+	}
+	free(s);
+}
+
+int main(int argc, char *argv[])
+{
+	struct agent_info *s = NULL;
+
+	s = agent_info_new();
+	parse_args(argc, argv, s);
+
+	agent_main_loop(s);
+
+	agent_info_free(s);
+
+	return 0;
+}
diff --git a/tools/virtio/virtio-trace/trace-agent.h b/tools/virtio/virtio-trace/trace-agent.h
new file mode 100644
index 0000000..8de79bf
--- /dev/null
+++ b/tools/virtio/virtio-trace/trace-agent.h
@@ -0,0 +1,75 @@
+#ifndef __TRACE_AGENT_H__
+#define __TRACE_AGENT_H__
+#include <pthread.h>
+#include <stdbool.h>
+
+#define MAX_CPUS	256
+#define PIPE_INIT       (1024*1024)
+
+/*
+ * agent_info - structure managing total information of guest agent
+ * @pipe_size:	size of pipe (default 1MB)
+ * @use_stdout:	set to true when o option is added (default false)
+ * @cpus:	total number of CPUs
+ * @ctl_fd:	fd of control path, /dev/virtio-ports/agent-ctl-path
+ * @rw_ti:	structure managing information of read/write threads
+ */
+struct agent_info {
+	unsigned long pipe_size;
+	bool use_stdout;
+	int cpus;
+	int ctl_fd;
+	struct rw_thread_info *rw_ti[MAX_CPUS];
+};
+
+/*
+ * rw_thread_info - structure managing a read/write thread a cpu
+ * @cpu_num:	cpu number operating this read/write thread
+ * @in_fd:	fd of reading trace data path in cpu_num
+ * @out_fd:	fd of writing trace data path in cpu_num
+ * @read_pipe:	fd of read pipe
+ * @write_pipe:	fd of write pipe
+ * @pipe_size:	size of pipe (default 1MB)
+ */
+struct rw_thread_info {
+	int cpu_num;
+	int in_fd;
+	int out_fd;
+	int read_pipe;
+	int write_pipe;
+	unsigned long pipe_size;
+};
+
+/* use for stopping rw threads */
+extern bool global_sig_receive;
+
+/* use for notification */
+extern bool global_run_operation;
+extern pthread_mutex_t mutex_notify;
+extern pthread_cond_t cond_wakeup;
+
+/* for controller of read/write threads */
+extern int rw_ctl_init(const char *ctl_path);
+extern void *rw_ctl_loop(int ctl_fd);
+
+/* for trace read/write thread */
+extern void *rw_thread_info_new(void);
+extern void *rw_thread_init(int cpu, const char *in_path, const char *out_path,
+			bool stdout_flag, unsigned long pipe_size,
+			struct rw_thread_info *rw_ti);
+extern pthread_t rw_thread_run(struct rw_thread_info *rw_ti);
+
+static inline void *zalloc(size_t size)
+{
+	return calloc(1, size);
+}
+
+#define pr_err(format, ...) fprintf(stderr, format, ## __VA_ARGS__)
+#define pr_info(format, ...) fprintf(stdout, format, ## __VA_ARGS__)
+#ifdef DEBUG
+#define pr_debug(format, ...) fprintf(stderr, format, ## __VA_ARGS__)
+#else
+#define pr_debug(format, ...) do {} while (0)
+#endif
+
+#endif /*__TRACE_AGENT_H__*/
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-07 12:04:56 (GMT)
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-07 12:04:56 (GMT)
commit	dc92b1f9ab1e1665dbbc56911782358e7f9a49f9 (patch)
tree	965ccb4a0f2c24a8b24adce415f6506246d07a90
parent	5e090ed7af10729a396a25df43d69a236e789736 (diff)
parent	ca16f580a5db7e60bfafe59a50bb133bd3347491 (diff)
download	linux-dc92b1f9ab1e1665dbbc56911782358e7f9a49f9.tar.xz