From 64d098886e0ec01f88349fe757161c2e2e89086b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Apr 2012 10:11:12 -0400 Subject: virtio/tools: add delayed interupt mode Signed-off-by: Michael S. Tsirkin diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 7579f19..81847dd 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -203,6 +203,7 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); +bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); struct virtqueue *vring_new_virtqueue(unsigned int num, diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 6bf95f9..e626fa5 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -144,7 +144,8 @@ static void wait_for_interrupt(struct vdev_info *dev) } } -static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) +static void run_test(struct vdev_info *dev, struct vq_info *vq, + bool delayed, int bufs) { struct scatterlist sl; long started = 0, completed = 0; @@ -183,8 +184,12 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) assert(started <= bufs); if (completed == bufs) break; - if (virtqueue_enable_cb(vq->vq)) { - wait_for_interrupt(dev); + if (delayed) { + if (virtqueue_enable_cb_delayed(vq->vq)) + wait_for_interrupt(dev); + } else { + if (virtqueue_enable_cb(vq->vq)) + wait_for_interrupt(dev); } } test = 0; @@ -216,6 +221,14 @@ const struct option longopts[] = { .val = 'i', }, { + .name = "delayed-interrupt", + .val = 'D', + }, + { + .name = "no-delayed-interrupt", + .val = 'd', + }, + { } }; @@ -224,6 +237,7 @@ static void help() fprintf(stderr, "Usage: virtio_test [--help]" " [--no-indirect]" " [--no-event-idx]" + " [--delayed-interrupt]" "\n"); } @@ -233,6 +247,7 @@ int main(int argc, char **argv) unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | (1ULL << VIRTIO_RING_F_EVENT_IDX); int o; + bool delayed = false; for (;;) { o = getopt_long(argc, argv, optstring, longopts, NULL); @@ -251,6 +266,9 @@ int main(int argc, char **argv) case 'i': features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); break; + case 'D': + delayed = true; + break; default: assert(0); break; @@ -260,6 +278,6 @@ int main(int argc, char **argv) done: vdev_info_init(&dev, features); vq_info_add(&dev, 256); - run_test(&dev, &dev.vqs[0], 0x100000); + run_test(&dev, &dev.vqs[0], delayed, 0x100000); return 0; } -- cgit v0.10.2 From 3afc9621f15701c557e60f61eba9242bac2771dd Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:41:30 +0800 Subject: macvtap: zerocopy: fix offset calculation when building skb This patch fixes the offset calculation when building skb: - offset1 were used as skb data offset not vector offset - reset offset to zero only when we advance to next vector Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0427c65..bd4a70d 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -505,10 +505,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, if (copy > size) { ++from; --count; - } + offset = 0; + } else + offset += size; copy -= size; offset1 += size; - offset = 0; } if (len == offset1) @@ -519,13 +520,13 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, int num_pages; unsigned long base; - len = from->iov_len - offset1; + len = from->iov_len - offset; if (!len) { - offset1 = 0; + offset = 0; ++from; continue; } - base = (unsigned long)from->iov_base + offset1; + base = (unsigned long)from->iov_base + offset; size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; num_pages = get_user_pages_fast(base, size, 0, &page[i]); if ((num_pages != size) || @@ -546,7 +547,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, len -= size; i++; } - offset1 = 0; + offset = 0; ++from; } return 0; -- cgit v0.10.2 From 4ef67ebedffa44ed9939b34708ac2fee06d2f65f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:41:44 +0800 Subject: macvtap: zerocopy: fix truesize underestimation As the skb fragment were pinned/built from user pages, we should account the page instead of length for truesize. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index bd4a70d..7cb2684 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -519,6 +519,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, struct page *page[MAX_SKB_FRAGS]; int num_pages; unsigned long base; + unsigned long truesize; len = from->iov_len - offset; if (!len) { @@ -533,10 +534,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) /* put_page is in skb free */ return -EFAULT; + truesize = size * PAGE_SIZE; skb->data_len += len; skb->len += len; - skb->truesize += len; - atomic_add(len, &skb->sk->sk_wmem_alloc); + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; int size = min_t(int, len, PAGE_SIZE - off); -- cgit v0.10.2 From 02ce04bb3d28c3333231f43bca677228dbc686fe Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:41:58 +0800 Subject: macvtap: zerocopy: put page when fail to get all requested user pages When get_user_pages_fast() fails to get all requested pages, we could not use kfree_skb() to free it as it has not been put in the skb fragments. So we need to call put_page() instead. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 7cb2684..9ab182a 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -531,9 +531,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; num_pages = get_user_pages_fast(base, size, 0, &page[i]); if ((num_pages != size) || - (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) - /* put_page is in skb free */ + (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) { + for (i = 0; i < num_pages; i++) + put_page(page[i]); return -EFAULT; + } truesize = size * PAGE_SIZE; skb->data_len += len; skb->len += len; -- cgit v0.10.2 From 01d6657b388438def19c8baaea28e742b6ed32ec Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:06 +0800 Subject: macvtap: zerocopy: set SKBTX_DEV_ZEROCOPY only when skb is built successfully Current the SKBTX_DEV_ZEROCOPY is set unconditionally after zerocopy_sg_from_iovec(), this would lead NULL pointer when macvtap fails to build zerocopy skb because destructor_arg was not initialized. Solve this by set this flag after the skb were built successfully. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 9ab182a..a4ff694 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -699,10 +699,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (!skb) goto err; - if (zerocopy) { + if (zerocopy) err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; - } else + else err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len); if (err) @@ -721,8 +720,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, rcu_read_lock_bh(); vlan = rcu_dereference_bh(q->vlan); /* copy skb_ubuf_info for callback when skb has no error */ - if (zerocopy) + if (zerocopy) { skb_shinfo(skb)->destructor_arg = m->msg_control; + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + } if (vlan) macvlan_start_xmit(skb, vlan->dev); else -- cgit v0.10.2 From b92946e2919134ebe2a4083e4302236295ea2a73 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:15 +0800 Subject: macvtap: zerocopy: validate vectors before building skb There're several reasons that the vectors need to be validated: - Return error when caller provides vectors whose num is greater than UIO_MAXIOV. - Linearize part of skb when userspace provides vectors grater than MAX_SKB_FRAGS. - Return error when userspace provides vectors whose total length may exceed - MAX_SKB_FRAGS * PAGE_SIZE. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a4ff694..163559c 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -529,9 +529,10 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, } base = (unsigned long)from->iov_base + offset; size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if ((num_pages != size) || - (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) { + if (num_pages != size) { for (i = 0; i < num_pages; i++) put_page(page[i]); return -EFAULT; @@ -651,7 +652,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; - int copylen; + int copylen = 0; bool zerocopy = false; if (q->flags & IFF_VNET_HDR) { @@ -680,15 +681,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (unlikely(len < ETH_HLEN)) goto err; + err = -EMSGSIZE; + if (unlikely(count > UIO_MAXIOV)) + goto err; + if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) zerocopy = true; if (zerocopy) { + /* Userspace may produce vectors with count greater than + * MAX_SKB_FRAGS, so we need to linearize parts of the skb + * to let the rest of data to be fit in the frags. + */ + if (count > MAX_SKB_FRAGS) { + copylen = iov_length(iv, count - MAX_SKB_FRAGS); + if (copylen < vnet_hdr_len) + copylen = 0; + else + copylen -= vnet_hdr_len; + } /* There are 256 bytes to be copied in skb, so there is enough * room for skb expand head in case it is used. * The rest buffer is mapped from userspace. */ - copylen = vnet_hdr.hdr_len; + if (copylen < vnet_hdr.hdr_len) + copylen = vnet_hdr.hdr_len; if (!copylen) copylen = GOODCOPY_LEN; } else -- cgit v0.10.2 From c460f0573941cb28dc7f35595679c3508f0ce41f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:23 +0800 Subject: vhost_net: zerocopy: fix possible NULL pointer dereference of vq->bufs When we want to disable vhost_net backend while there's a tx work, a possible NULL pointer defernece may happen we we try to deference the vq->bufs after vhost_net_set_backend() assign a NULL to it. As suggested by Michael, fix this by checking the vq->bufs instead of vhost_sock_zcopy(). Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 1f21d2a..35abe90 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -166,7 +166,7 @@ static void handle_tx(struct vhost_net *net) if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); hdr_size = vq->vhost_hlen; - zcopy = vhost_sock_zcopy(sock); + zcopy = vq->ubufs; for (;;) { /* Release DMAs done buffers first */ -- cgit v0.10.2 From dbf34207c62bdec16b49721d119647c470a3443c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:32 +0800 Subject: vhost_net: re-poll only on EAGAIN or ENOBUFS Currently, we restart tx polling unconditionally when sendmsg() fails. This would cause unnecessary wakeups of vhost wokers and waste cpu utlization when evil userspace(guest driver) is able to hit EFAULT or EINVAL. The polling is only needed when the socket send buffer were exceeded or not enough memory. So fix this by restarting polling only when sendmsg() returns EAGAIN/ENOBUFS. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 35abe90..f54b1d5 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -257,7 +257,8 @@ static void handle_tx(struct vhost_net *net) UIO_MAXIOV; } vhost_discard_vq_desc(vq, 1); - tx_poll_start(net, sock); + if (err == -EAGAIN || err == -ENOBUFS) + tx_poll_start(net, sock); break; } if (err != len) -- cgit v0.10.2 From c8fb217af57c6c232af3517d3115d2af4ce9900e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:41 +0800 Subject: vhost_net: zerocopy: adding and signalling immediately when fully copied When a packet were fully copied in zerocopy, we don't wait for the DMA done to mark the done flag, so after the packet were passed to lower device, we need to add used and signal guest immediately. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f54b1d5..853db7a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -266,6 +266,8 @@ static void handle_tx(struct vhost_net *net) " len %d != %zd\n", err, len); if (!zcopy) vhost_add_used_and_signal(&net->dev, vq, head, 0); + else + vhost_zerocopy_signal_used(vq); total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); -- cgit v0.10.2 From c70aa540c7a9f67add11ad3161096fb95233aa2e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:54 +0800 Subject: vhost: zerocopy: poll vq in zerocopy callback We add used and signal guest in worker thread but did not poll the virtqueue during the zero copy callback. This may lead the missing of adding and signalling during zerocopy. Solve this by polling the virtqueue and let it wakeup the worker during callback. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 51e4c1e..94dbd25 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1603,6 +1603,7 @@ void vhost_zerocopy_callback(struct ubuf_info *ubuf) struct vhost_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; + vhost_poll_queue(&vq->poll); /* set len = 1 to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; kref_put(&ubufs->kref, vhost_zerocopy_done_signal); -- cgit v0.10.2