From 3de88d622fd68bd4dbee0f80168218b23f798fd0 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Fri, 19 Jun 2015 16:15:57 +0100 Subject: xen/events/fifo: Consume unprocessed events when a CPU dies When a CPU is offlined, there may be unprocessed events on a port for that CPU. If the port is subsequently reused on a different CPU, it could be in an unexpected state with the link bit set, resulting in interrupts being missed. Fix this by consuming any unprocessed events for a particular CPU when that CPU dies. Signed-off-by: Ross Lagerwall Cc: # 3.14+ Signed-off-by: David Vrabel diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index e3e9e3d..96a1b8d 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -281,7 +281,8 @@ static void handle_irq_for_port(unsigned port) static void consume_one_event(unsigned cpu, struct evtchn_fifo_control_block *control_block, - unsigned priority, unsigned long *ready) + unsigned priority, unsigned long *ready, + bool drop) { struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); uint32_t head; @@ -313,13 +314,17 @@ static void consume_one_event(unsigned cpu, if (head == 0) clear_bit(priority, ready); - if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) - handle_irq_for_port(port); + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) { + if (unlikely(drop)) + pr_warn("Dropping pending event for port %u\n", port); + else + handle_irq_for_port(port); + } q->head[priority] = head; } -static void evtchn_fifo_handle_events(unsigned cpu) +static void __evtchn_fifo_handle_events(unsigned cpu, bool drop) { struct evtchn_fifo_control_block *control_block; unsigned long ready; @@ -331,11 +336,16 @@ static void evtchn_fifo_handle_events(unsigned cpu) while (ready) { q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES); - consume_one_event(cpu, control_block, q, &ready); + consume_one_event(cpu, control_block, q, &ready, drop); ready |= xchg(&control_block->ready, 0); } } +static void evtchn_fifo_handle_events(unsigned cpu) +{ + __evtchn_fifo_handle_events(cpu, false); +} + static void evtchn_fifo_resume(void) { unsigned cpu; @@ -420,6 +430,9 @@ static int evtchn_fifo_cpu_notification(struct notifier_block *self, if (!per_cpu(cpu_control_block, cpu)) ret = evtchn_fifo_alloc_control_block(cpu); break; + case CPU_DEAD: + __evtchn_fifo_handle_events(cpu, true); + break; default: break; } -- cgit v0.10.2 From de0afc9bdeeadaa998797d2333c754bf9f4d5dcf Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 2 Dec 2015 12:10:48 -0500 Subject: xen: Resume PMU from non-atomic context Resuming PMU currently triggers a warning from ___might_sleep() (assuming CONFIG_DEBUG_ATOMIC_SLEEP is set) when xen_pmu_init() allocates GFP_KERNEL page because we are in state resembling atomic context. Move resuming PMU to xen_arch_resume() which is called in regular context. For symmetry move suspending PMU to xen_arch_suspend() as well. 
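For illustration, the suspend/resume paths after this change look roughly as follows (a simplified sketch of the result of applying the suspend.c hunks below, not a literal copy of the file):

        void xen_arch_suspend(void)
        {
                int cpu;

                /* Moved here for symmetry with the resume path. */
                for_each_online_cpu(cpu)
                        xen_pmu_finish(cpu);

                on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
        }

        void xen_arch_resume(void)
        {
                int cpu;

                on_each_cpu(xen_vcpu_notify_restore, NULL, 1);

                /* xen_pmu_init() allocates a page with GFP_KERNEL, which is
                 * only safe here in regular context; in the atomic-like
                 * pre/post-suspend paths it triggered ___might_sleep(). */
                for_each_online_cpu(cpu)
                        xen_pmu_init(cpu);
        }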
Signed-off-by: Boris Ostrovsky Reported-by: Konrad Rzeszutek Wilk Cc: # 4.3 Signed-off-by: David Vrabel diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index feddabd..3705eab 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -68,26 +68,16 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - int cpu; - - for_each_online_cpu(cpu) - xen_pmu_finish(cpu); - if (xen_pv_domain()) xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - int cpu; - if (xen_pv_domain()) xen_pv_post_suspend(cancelled); else xen_hvm_post_suspend(cancelled); - - for_each_online_cpu(cpu) - xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) @@ -106,10 +96,20 @@ static void xen_vcpu_notify_suspend(void *data) void xen_arch_resume(void) { + int cpu; + on_each_cpu(xen_vcpu_notify_restore, NULL, 1); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } void xen_arch_suspend(void) { + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } -- cgit v0.10.2 From 20f36e0380a7e871a711d5e4e59d04d4948326b4 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Sat, 12 Dec 2015 19:25:55 -0500 Subject: xen/x86/pvh: Use HVM's flush_tlb_others op Using MMUEXT_TLB_FLUSH_MULTI doesn't buy us much since the hypervisor will likely perform the same IPIs as the guest would have. More importantly, using MMUEXT_INVLPG_MULTI may fail to invalidate the guest's address on a remote CPU (when, for example, a VCPU from another guest is running there). Signed-off-by: Boris Ostrovsky Suggested-by: Jan Beulich Signed-off-by: David Vrabel diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ac161db..cb5e266 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2495,14 +2495,9 @@ void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; - /* Optimization - we can use the HVM one but it has no idea which - * VCPUs are descheduled - which means that it will needlessly IPI - * them. Xen knows so let it do the job. - */ - if (xen_feature(XENFEAT_auto_translated_physmap)) { - pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + if (xen_feature(XENFEAT_auto_translated_physmap)) return; - } + pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); -- cgit v0.10.2 From 454d5d882c7e412b840e3c99010fe81a9862f6fb Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 30 Oct 2015 14:58:08 +0000 Subject: xen: Add RING_COPY_REQUEST() RING_GET_REQUEST() on a shared ring is easy to use incorrectly (i.e., by not considering that the other end may alter the data in the shared ring while it is being inspected). Safe usage of a request generally requires taking a local copy. Provide a RING_COPY_REQUEST() macro to use instead of RING_GET_REQUEST() and an open-coded memcpy(). This takes care of ensuring that the copy is done correctly regardless of any possible compiler optimizations. Use a volatile source to prevent the compiler from reordering or omitting the copy. This is part of XSA155. CC: stable@vger.kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 7d28aff..7dc685b 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -181,6 +181,20 @@ struct __name##_back_ring { \ #define RING_GET_REQUEST(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req)) +/* + * Get a local copy of a request.
+ * + * Use this in preference to RING_GET_REQUEST() so all processing is + * done on a local copy that cannot be modified by the other end. + * + * Note that https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 may cause this + * to be ineffective where _req is a struct which consists of only bitfields. + */ +#define RING_COPY_REQUEST(_r, _idx, _req) do { \ + /* Use volatile to force the copy into _req. */ \ + *(_req) = *(volatile typeof(_req))RING_GET_REQUEST(_r, _idx); \ +} while (0) + #define RING_GET_RESPONSE(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) -- cgit v0.10.2 From 0f589967a73f1f30ab4ac4dd9ce0bb399b4d6357 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 30 Oct 2015 15:16:01 +0000 Subject: xen-netback: don't use last request to determine minimum Tx credit The last from guest transmitted request gives no indication about the minimum amount of credit that the guest might need to send a packet since the last packet might have been a small one. Instead allow for the worst case 128 KiB packet. This is part of XSA155. CC: stable@vger.kernel.org Reviewed-by: Wei Liu Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index e481f37..b683581 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -679,9 +679,7 @@ static void tx_add_credit(struct xenvif_queue *queue) * Allow a burst big enough to transmit a jumbo packet of up to 128kB. * Otherwise the interface can seize up due to insufficient credit. */ - max_burst = RING_GET_REQUEST(&queue->tx, queue->tx.req_cons)->size; - max_burst = min(max_burst, 131072UL); - max_burst = max(max_burst, queue->credit_bytes); + max_burst = max(131072UL, queue->credit_bytes); /* Take care that adding a new chunk of credit doesn't wrap to zero. */ max_credit = queue->remaining_credit + queue->credit_bytes; -- cgit v0.10.2 From 68a33bfd8403e4e22847165d149823a2e0e67c9c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 30 Oct 2015 15:17:06 +0000 Subject: xen-netback: use RING_COPY_REQUEST() throughout Instead of open-coding memcpy()s and directly accessing Tx and Rx requests, use the new RING_COPY_REQUEST() that ensures the local copy is correct. This is more than is strictly necessary for guest Rx requests since only the id and gref fields are used and it is harmless if the frontend modifies these. This is part of XSA155. 
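As a usage note, the conversion in the backends follows one simple pattern: never inspect the shared slot directly, always copy it out first. A minimal sketch of the Tx-path change, condensed from the netback hunks that follow:

        struct xen_netif_tx_request txreq;
        RING_IDX idx = queue->tx.req_cons;

        rmb(); /* ensure we see the request before we copy it */

        /* Before: a pointer into the shared ring, which the frontend can
         * rewrite and the compiler can re-read while we look at it. */
        /* txp = RING_GET_REQUEST(&queue->tx, idx); */

        /* After: one well-defined copy into local memory. */
        RING_COPY_REQUEST(&queue->tx, idx, &txreq);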
CC: stable@vger.kernel.org Reviewed-by: Wei Liu Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index b683581..1049c34 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -258,18 +258,18 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif_queue *queue, struct netrx_pending_operations *npo) { struct xenvif_rx_meta *meta; - struct xen_netif_rx_request *req; + struct xen_netif_rx_request req; - req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++); + RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req); meta = npo->meta + npo->meta_prod++; meta->gso_type = XEN_NETIF_GSO_TYPE_NONE; meta->gso_size = 0; meta->size = 0; - meta->id = req->id; + meta->id = req.id; npo->copy_off = 0; - npo->copy_gref = req->gref; + npo->copy_gref = req.gref; return meta; } @@ -424,7 +424,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, struct xenvif *vif = netdev_priv(skb->dev); int nr_frags = skb_shinfo(skb)->nr_frags; int i; - struct xen_netif_rx_request *req; + struct xen_netif_rx_request req; struct xenvif_rx_meta *meta; unsigned char *data; int head = 1; @@ -443,15 +443,15 @@ static int xenvif_gop_skb(struct sk_buff *skb, /* Set up a GSO prefix descriptor, if necessary */ if ((1 << gso_type) & vif->gso_prefix_mask) { - req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++); + RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req); meta = npo->meta + npo->meta_prod++; meta->gso_type = gso_type; meta->gso_size = skb_shinfo(skb)->gso_size; meta->size = 0; - meta->id = req->id; + meta->id = req.id; } - req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++); + RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req); meta = npo->meta + npo->meta_prod++; if ((1 << gso_type) & vif->gso_mask) { @@ -463,9 +463,9 @@ static int xenvif_gop_skb(struct sk_buff *skb, } meta->size = 0; - meta->id = req->id; + meta->id = req.id; npo->copy_off = 0; - npo->copy_gref = req->gref; + npo->copy_gref = req.gref; data = skb->data; while (data < skb_tail_pointer(skb)) { @@ -709,7 +709,7 @@ static void xenvif_tx_err(struct xenvif_queue *queue, spin_unlock_irqrestore(&queue->response_lock, flags); if (cons == end) break; - txp = RING_GET_REQUEST(&queue->tx, cons++); + RING_COPY_REQUEST(&queue->tx, cons++, txp); } while (1); queue->tx.req_cons = cons; } @@ -776,8 +776,7 @@ static int xenvif_count_requests(struct xenvif_queue *queue, if (drop_err) txp = &dropped_tx; - memcpy(txp, RING_GET_REQUEST(&queue->tx, cons + slots), - sizeof(*txp)); + RING_COPY_REQUEST(&queue->tx, cons + slots, txp); /* If the guest submitted a frame >= 64 KiB then * first->size overflowed and following slots will @@ -1110,8 +1109,7 @@ static int xenvif_get_extras(struct xenvif_queue *queue, return -EBADR; } - memcpy(&extra, RING_GET_REQUEST(&queue->tx, cons), - sizeof(extra)); + RING_COPY_REQUEST(&queue->tx, cons, &extra); if (unlikely(!extra.type || extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) { queue->tx.req_cons = ++cons; @@ -1320,7 +1318,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, idx = queue->tx.req_cons; rmb(); /* Ensure that we see the request before we copy it. */ - memcpy(&txreq, RING_GET_REQUEST(&queue->tx, idx), sizeof(txreq)); + RING_COPY_REQUEST(&queue->tx, idx, &txreq); /* Credit-based scheduling. 
*/ if (txreq.size > queue->remaining_credit && -- cgit v0.10.2 From 1f13d75ccb806260079e0679d55d9253e370ec8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= Date: Tue, 3 Nov 2015 16:34:09 +0000 Subject: xen-blkback: only read request operation from shared ring once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A compiler may load a switch statement value multiple times, which could be bad when the value is in memory shared with the frontend. When converting a non-native request to a native one, ensure that src->operation is only loaded once by using READ_ONCE(). This is part of XSA155. CC: stable@vger.kernel.org Signed-off-by: Roger Pau Monné Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 68e87a0..c929ae2 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -408,8 +408,8 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; - dst->operation = src->operation; - switch (src->operation) { + dst->operation = READ_ONCE(src->operation); + switch (dst->operation) { case BLKIF_OP_READ: case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: @@ -456,8 +456,8 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; - dst->operation = src->operation; - switch (src->operation) { + dst->operation = READ_ONCE(src->operation); + switch (dst->operation) { case BLKIF_OP_READ: case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: -- cgit v0.10.2 From 18779149101c0dd43ded43669ae2a92d21b6f9cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= Date: Tue, 3 Nov 2015 16:40:43 +0000 Subject: xen-blkback: read from indirect descriptors only once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since indirect descriptors are in memory shared with the frontend, the frontend could alter the first_sect and last_sect values after they have been validated but before they are recorded in the request. This may result in I/O requests that overflow the foreign page, possibly overwriting local pages when the I/O request is executed. When parsing indirect descriptors, only read first_sect and last_sect once. This is part of XSA155. 
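The fix follows the usual rule for data that the other end can rewrite at any time: fetch each field exactly once, validate the local copy, and use only the local copy afterwards. Condensed from the hunk that follows (error handling abbreviated):

        uint8_t first_sect, last_sect;

        /* Single fetch of each field from the shared indirect page. */
        first_sect = READ_ONCE(segments[i].first_sect);
        last_sect = READ_ONCE(segments[i].last_sect);

        /* Validate the local copies... */
        if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect)
                return -EINVAL;

        /* ...and compute the segment from them, never from shared memory. */
        seg[n].nsec = last_sect - first_sect + 1;
        seg[n].offset = first_sect << 9;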
CC: stable@vger.kernel.org Signed-off-by: Roger Pau Monné Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index f909994..41fb1a9 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -950,6 +950,8 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, goto unmap; for (n = 0, i = 0; n < nseg; n++) { + uint8_t first_sect, last_sect; + if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { /* Map indirect segments */ if (segments) @@ -957,15 +959,18 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page); } i = n % SEGS_PER_INDIRECT_FRAME; + pending_req->segments[n]->gref = segments[i].gref; - seg[n].nsec = segments[i].last_sect - - segments[i].first_sect + 1; - seg[n].offset = (segments[i].first_sect << 9); - if ((segments[i].last_sect >= (XEN_PAGE_SIZE >> 9)) || - (segments[i].last_sect < segments[i].first_sect)) { + + first_sect = READ_ONCE(segments[i].first_sect); + last_sect = READ_ONCE(segments[i].last_sect); + if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) { rc = -EINVAL; goto unmap; } + + seg[n].nsec = last_sect - first_sect + 1; + seg[n].offset = first_sect << 9; preq->nr_sects += seg[n].nsec; } -- cgit v0.10.2 From be69746ec12f35b484707da505c6c76ff06f97dc Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 16 Nov 2015 18:02:32 +0000 Subject: xen-scsiback: safely copy requests The copy of the ring request was lacking a following barrier(), potentially allowing the compiler to optimize the copy away. Use RING_COPY_REQUEST() to ensure the request is copied to local memory. This is part of XSA155. CC: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 9eeefd7..2af9aa8 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -727,7 +727,7 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) if (!pending_req) return 1; - ring_req = *RING_GET_REQUEST(ring, rc); + RING_COPY_REQUEST(ring, rc, &ring_req); ring->req_cons = ++rc; err = prepare_pending_reqs(info, &ring_req, pending_req); -- cgit v0.10.2 From 8135cf8b092723dbfcc611fe6fdcb3a36c9951c5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 16 Nov 2015 12:40:48 -0500 Subject: xen/pciback: Save xen_pci_op commands before processing it Double fetch vulnerabilities happen when a variable is fetched twice from shared memory but a security check is only performed on the first fetch. The xen_pcibk_do_op function performs a switch statement on the op->cmd value which is stored in shared memory. Interestingly, this can result in a double fetch vulnerability depending on the compiler optimizations performed. This patch fixes it by saving the xen_pci_op command before processing it. We also use barrier() to make sure that the compiler does not perform any such optimization. This is part of XSA155.
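In other words, the handler switches from working on the shared structure to working on a private per-device copy, and a compiler barrier after the copy keeps the compiler from re-fetching fields from the shared page. Sketched from the hunks that follow:

        struct xen_pci_op *op = &pdev->op;      /* private copy, not visible to the guest */

        *op = pdev->sh_info->op;        /* one fetch of the whole request */
        barrier();                      /* no re-reads of sh_info->op after this point */

        /* All checks and the switch (op->cmd) now operate on the copy; only
         * the results (err, value, MSI-X vectors) are written back to the
         * shared page at the end. */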
CC: stable@vger.kernel.org Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Jan Beulich Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index 58e38d5..4d529f3 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -37,6 +37,7 @@ struct xen_pcibk_device { struct xen_pci_sharedinfo *sh_info; unsigned long flags; struct work_struct op_work; + struct xen_pci_op op; }; struct xen_pcibk_dev_data { diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index c4a0666..a0e0e3e 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -298,9 +298,11 @@ void xen_pcibk_do_op(struct work_struct *data) container_of(data, struct xen_pcibk_device, op_work); struct pci_dev *dev; struct xen_pcibk_dev_data *dev_data = NULL; - struct xen_pci_op *op = &pdev->sh_info->op; + struct xen_pci_op *op = &pdev->op; int test_intx = 0; + *op = pdev->sh_info->op; + barrier(); dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn); if (dev == NULL) @@ -342,6 +344,17 @@ void xen_pcibk_do_op(struct work_struct *data) if ((dev_data->enable_intx != test_intx)) xen_pcibk_control_isr(dev, 0 /* no reset */); } + pdev->sh_info->op.err = op->err; + pdev->sh_info->op.value = op->value; +#ifdef CONFIG_PCI_MSI + if (op->cmd == XEN_PCI_OP_enable_msix && op->err == 0) { unsigned int i; for (i = 0; i < op->value; i++) pdev->sh_info->op.msix_entries[i].vector = op->msix_entries[i].vector; } +#endif /* Tell the driver domain that we're done. */ wmb(); clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); -- cgit v0.10.2 From 56441f3c8e5bd45aab10dd9f8c505dd4bec03b0d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 3 Apr 2015 11:08:22 -0400 Subject: xen/pciback: Return error on XEN_PCI_OP_enable_msi when device has MSI or MSI-X enabled The guest sequence of: a) XEN_PCI_OP_enable_msi b) XEN_PCI_OP_enable_msi c) XEN_PCI_OP_disable_msi results in hitting a BUG_ON() condition in the msi.c code. The MSI code uses dev->msi_list, to which it adds MSI entries. Under the above conditions a BUG_ON() can be hit. The device passed through to the guest MUST have MSI capability. Step a) adds the entry to dev->msi_list and sets msi_enabled. Step b) adds a second entry, but adding it to SysFS fails (duplicate entry), so all of the entries are deleted from msi_list and the call returns (with msi_enabled still set). In step c), pci_disable_msi passes the msi_enabled checks and hits: BUG_ON(list_empty(dev_to_msi_list(&dev->dev))); and blows up. The patch adds a simple check in the XEN_PCI_OP_enable_msi handler to guard against that. The check for msix_enabled is not strictly necessary. This is part of XSA-157.
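The guard itself is small: a second enable is refused before it can reach the MSI core. Sketched from the hunk that follows:

        if (dev->msi_enabled)
                status = -EALREADY;     /* sequence a) then b): MSI is already enabled */
        else if (dev->msix_enabled)
                status = -ENXIO;        /* MSI-X is active; MSI and MSI-X cannot be enabled together */
        else
                status = pci_enable_msi(dev);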
CC: stable@vger.kernel.org Reviewed-by: David Vrabel Reviewed-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index a0e0e3e..8bfb87c 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -144,7 +144,12 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, if (unlikely(verbose_request)) printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev)); - status = pci_enable_msi(dev); + if (dev->msi_enabled) + status = -EALREADY; + else if (dev->msix_enabled) + status = -ENXIO; + else + status = pci_enable_msi(dev); if (status) { pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", -- cgit v0.10.2 From 5e0ce1455c09dd61d029b8ad45d82e1ac0b6c4c9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 2 Nov 2015 18:07:44 -0500 Subject: xen/pciback: Return error on XEN_PCI_OP_enable_msix when device has MSI or MSI-X enabled The guest sequence of: a) XEN_PCI_OP_enable_msix b) XEN_PCI_OP_enable_msix results in hitting a NULL pointer dereference due to using freed pointers. The device passed through to the guest MUST have MSI-X capability. Step a) constructs a SysFS representation of the MSI and MSI groups. Step b) adds a second set of them, but adding them to SysFS fails (duplicate entry). 'populate_msi_sysfs' frees the newly allocated msi_irq_groups (note that in a) pdev->msi_irq_groups is still set) and also frees ALL of the MSI-X entries of the device (the ones allocated in steps a) and b)). The unwind code 'free_msi_irqs' deletes all the entries and tries to delete pdev->msi_irq_groups (which hasn't been set to NULL). However, the pointers in SysFS have already been freed and we hit a NULL pointer dereference further on when 'strlen' is attempted on a freed pointer. The patch adds a simple check in the XEN_PCI_OP_enable_msix handler to guard against that. The check for msi_enabled is not strictly necessary. This is part of XSA-157. CC: stable@vger.kernel.org Reviewed-by: David Vrabel Reviewed-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 8bfb87c..029f33d 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -206,9 +206,16 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, if (unlikely(verbose_request)) printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n", pci_name(dev)); + if (op->value > SH_INFO_MAX_VEC) return -EINVAL; + if (dev->msix_enabled) return -EALREADY; + + if (dev->msi_enabled) return -ENXIO; + entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL); if (entries == NULL) return -ENOMEM; -- cgit v0.10.2 From a396f3a210c3a61e94d6b87ec05a75d0be2a60d0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 2 Nov 2015 17:24:08 -0500 Subject: xen/pciback: Do not install an IRQ handler for MSI interrupts. Otherwise a guest can subvert the generic MSI code to trigger a BUG_ON() condition during MSI interrupt freeing: for (i = 0; i < entry->nvec_used; i++) BUG_ON(irq_has_action(entry->irq + i)); The Xen PCI backend installs an IRQ handler (request_irq) for dev->irq whenever the guest writes PCI_COMMAND_MEMORY (or PCI_COMMAND_IO) to the PCI_COMMAND register. This is done in case the device has legacy interrupts and the GSI line is shared by the backend devices.
To subvert the backend the guest needs to make the backend change dev->irq from the GSI to the MSI interrupt line, make the backend allocate an interrupt handler, and then command the backend to free the MSI interrupt and hit the BUG_ON(). Since the backend only calls 'request_irq' when the guest writes to the PCI_COMMAND register, the guest needs to call XEN_PCI_OP_enable_msi before any other operation. This will cause the generic MSI code to set up an MSI entry and populate dev->irq with the new PIRQ value. Then the guest can write PCI_COMMAND_MEMORY to PCI_COMMAND and cause the backend to set up an IRQ handler for dev->irq (which instead of the GSI value has the MSI pirq). See 'xen_pcibk_control_isr'. Then the guest disables the MSI (XEN_PCI_OP_disable_msi), which ends up triggering the BUG_ON() condition in 'free_msi_irqs' as there is an IRQ handler for entry->irq (dev->irq). Note that this cannot be done using MSI-X as the generic code does not overwrite dev->irq with the MSI-X PIRQ values. The patch inhibits setting up the IRQ handler if the MSI or MSI-X (for symmetry reasons) code has been called successfully. P.S. When Xen PCIBack sets up the device for guest consumption it ends up writing 0 to PCI_COMMAND (see xen_pcibk_reset_device). The XSA-120 addendum patch removed that; however, when upstreaming said addendum we found that it caused issues with upstream QEMU. That has now been fixed in upstream QEMU. This is part of XSA-157. CC: stable@vger.kernel.org Reviewed-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 029f33d..d0696ce 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -70,6 +70,13 @@ static void xen_pcibk_control_isr(struct pci_dev *dev, int reset) enable ? "enable" : "disable"); if (enable) { + /* + * The MSI or MSI-X should not have an IRQ handler. Otherwise + * if the guest terminates we BUG_ON in free_msi_irqs. + */ + if (dev->msi_enabled || dev->msix_enabled) + goto out; + rc = request_irq(dev_data->irq, xen_pcibk_guest_interrupt, IRQF_SHARED, dev_data->irq_name, dev); -- cgit v0.10.2 From 7cfb905b9638982862f0331b36ccaaca5d383b49 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 1 Apr 2015 10:49:47 -0400 Subject: xen/pciback: For XEN_PCI_OP_disable_msi[|x] only disable if device has MSI(X) enabled. Otherwise just continue on, returning the same values as previously (return of 0, and op->result has the PIRQ value). This does not change the behavior of XEN_PCI_OP_disable_msi[|x]. pci_disable_msi and pci_disable_msix have checks for msi_enabled and msix_enabled, so they will error out immediately. However, the guest can still call these operations and cause us to disable 'ack_intr'. That means the backend IRQ handler for the legacy interrupt will not respond to interrupts anymore. If the device is causing an interrupt storm, this will lead to the generic Linux code disabling the interrupt line. Naturally this will only happen if the device in question is plugged into the motherboard on a shared level-triggered GSI.
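With the change, the disable path only tears down MSI state (and re-arms 'ack_intr') when MSI is actually enabled, so a spurious XEN_PCI_OP_disable_msi from the guest becomes a no-op apart from reporting the PIRQ value. Sketched from the MSI hunk that follows; the MSI-X path is analogous:

        if (dev->msi_enabled) {
                struct xen_pcibk_dev_data *dev_data;

                pci_disable_msi(dev);

                dev_data = pci_get_drvdata(dev);
                if (dev_data)
                        dev_data->ack_intr = 1; /* only re-enable legacy ack on a real disable */
        }
        op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;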
This is part of XSA-157. CC: stable@vger.kernel.org Reviewed-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index d0696ce..4ee5fc0 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -185,20 +185,23 @@ static int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { - struct xen_pcibk_dev_data *dev_data; - if (unlikely(verbose_request)) printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n", pci_name(dev)); - pci_disable_msi(dev); + if (dev->msi_enabled) { + struct xen_pcibk_dev_data *dev_data; + + pci_disable_msi(dev); + + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + } op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; if (unlikely(verbose_request)) printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), op->value); - dev_data = pci_get_drvdata(dev); - if (dev_data) - dev_data->ack_intr = 1; return 0; } @@ -264,23 +267,27 @@ static int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { - struct xen_pcibk_dev_data *dev_data; if (unlikely(verbose_request)) printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n", pci_name(dev)); - pci_disable_msix(dev); + if (dev->msix_enabled) { + struct xen_pcibk_dev_data *dev_data; + + pci_disable_msix(dev); + + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + } /* * SR-IOV devices (which don't have any legacy IRQ) have * an undefined IRQ value of zero. */ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev), - op->value); - dev_data = pci_get_drvdata(dev); - if (dev_data) - dev_data->ack_intr = 1; + printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", + pci_name(dev), op->value); return 0; } #endif -- cgit v0.10.2 From 408fb0e5aa7fda0059db282ff58c3b2a4278baa0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 2 Nov 2015 18:13:27 -0500 Subject: xen/pciback: Don't allow MSI-X ops if PCI_COMMAND_MEMORY is not set. commit f598282f51 ("PCI: Fix the NIU MSI-X problem in a better way") teaches us that dealing with MSI-X can be troublesome. Further checks in the MSI-X architecture show that if the PCI_COMMAND_MEMORY bit is turned off in PCI_COMMAND we may not be able to access the BARs (since they are memory regions). Since the MSI-X tables are located there, this can lead to us causing PCIe errors. Inhibit performing any MSI-X operation unless the MEMORY bit is set. Note that a Xen hypervisor with "x86/MSI-X: access MSI-X table only after having enabled MSI-X" will return: xen_pciback: 0000:0a:00.1: error -6 enabling MSI-X for guest 3! when the generic MSI code tries to set up the PIRQ without the MEMORY bit set, which means that with later versions of Xen (4.6) this patch is not necessary.
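The check boils down to reading the command register and refusing the MSI-X operation while memory decoding is disabled. Sketched from the hunk that follows:

        u16 cmd;

        /*
         * The MSI-X table lives in one of the device's memory BARs, so it is
         * only safe to touch when PCI_COMMAND_MEMORY is set.
         */
        pci_read_config_word(dev, PCI_COMMAND, &cmd);
        if (dev->msi_enabled || !(cmd & PCI_COMMAND_MEMORY))
                return -ENXIO;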
This is part of XSA-157. CC: stable@vger.kernel.org Reviewed-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 4ee5fc0..73dafdc 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -212,6 +212,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, struct xen_pcibk_dev_data *dev_data; int i, result; struct msix_entry *entries; + u16 cmd; if (unlikely(verbose_request)) printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n", @@ -223,7 +224,12 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, if (dev->msix_enabled) return -EALREADY; - if (dev->msi_enabled) + /* + * PCI_COMMAND_MEMORY must be enabled, otherwise we may not be able + * to access the BARs where the MSI-X entries reside. + */ + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (dev->msi_enabled || !(cmd & PCI_COMMAND_MEMORY)) return -ENXIO; entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL); -- cgit v0.10.2 From 584a561a6fee0d258f9ca644f58b73d9a41b8a46 Mon Sep 17 00:00:00 2001 From: Doug Goldstein Date: Thu, 26 Nov 2015 14:32:39 -0600 Subject: xen-pciback: fix up cleanup path when alloc fails When allocating a pciback device fails, clear the private field. Leaving a stale pointer there could lead to a use-after-free; however, 'really_probe' takes care of setting dev_set_drvdata(dev, NULL) in its failure path (which we would exercise if the ->probe function failed), so we are OK. However, let's be defensive as the code can change. Going forward we should clean up the pci_set_drvdata(dev, NULL) calls in the various code bases. That will be for another day. Reviewed-by: Boris Ostrovsky Reported-by: Jonathan Creekmore Signed-off-by: Doug Goldstein Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 98bc345..4843741 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -44,7 +44,6 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev); pdev->xdev = xdev; - dev_set_drvdata(&xdev->dev, pdev); mutex_init(&pdev->dev_lock); @@ -58,6 +57,9 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) kfree(pdev); pdev = NULL; } + + dev_set_drvdata(&xdev->dev, pdev); + out: return pdev; } -- cgit v0.10.2
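The resulting allocation path only publishes the pointer once its fate is known. A rough sketch of alloc_pdev() after this change (initialisation details elided; 'setup_failed' is a placeholder for the real error check, not an identifier from the driver):

        pdev->xdev = xdev;

        mutex_init(&pdev->dev_lock);

        /* ... remaining initialisation, which may fail ... */
        if (setup_failed) {             /* placeholder condition */
                kfree(pdev);
                pdev = NULL;
        }

        /* Publish pdev (or NULL) only after it is fully set up or freed,
         * so no stale drvdata pointer can be left behind. */
        dev_set_drvdata(&xdev->dev, pdev);
out:
        return pdev;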