From 3de88d622fd68bd4dbee0f80168218b23f798fd0 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Fri, 19 Jun 2015 16:15:57 +0100
Subject: xen/events/fifo: Consume unprocessed events when a CPU dies

When a CPU is offlined, there may be unprocessed events on a port for
that CPU.  If the port is subsequently reused on a different CPU, it
could be in an unexpected state with the link bit set, resulting in
interrupts being missed. Fix this by consuming any unprocessed events
for a particular CPU when that CPU dies.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Cc: <stable@vger.kernel.org> # 3.14+
Signed-off-by: David Vrabel <david.vrabel@citrix.com>

diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
index e3e9e3d..96a1b8d 100644
--- a/drivers/xen/events/events_fifo.c
+++ b/drivers/xen/events/events_fifo.c
@@ -281,7 +281,8 @@ static void handle_irq_for_port(unsigned port)
 
 static void consume_one_event(unsigned cpu,
 			      struct evtchn_fifo_control_block *control_block,
-			      unsigned priority, unsigned long *ready)
+			      unsigned priority, unsigned long *ready,
+			      bool drop)
 {
 	struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
 	uint32_t head;
@@ -313,13 +314,17 @@ static void consume_one_event(unsigned cpu,
 	if (head == 0)
 		clear_bit(priority, ready);
 
-	if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port))
-		handle_irq_for_port(port);
+	if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) {
+		if (unlikely(drop))
+			pr_warn("Dropping pending event for port %u\n", port);
+		else
+			handle_irq_for_port(port);
+	}
 
 	q->head[priority] = head;
 }
 
-static void evtchn_fifo_handle_events(unsigned cpu)
+static void __evtchn_fifo_handle_events(unsigned cpu, bool drop)
 {
 	struct evtchn_fifo_control_block *control_block;
 	unsigned long ready;
@@ -331,11 +336,16 @@ static void evtchn_fifo_handle_events(unsigned cpu)
 
 	while (ready) {
 		q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES);
-		consume_one_event(cpu, control_block, q, &ready);
+		consume_one_event(cpu, control_block, q, &ready, drop);
 		ready |= xchg(&control_block->ready, 0);
 	}
 }
 
+static void evtchn_fifo_handle_events(unsigned cpu)
+{
+	__evtchn_fifo_handle_events(cpu, false);
+}
+
 static void evtchn_fifo_resume(void)
 {
 	unsigned cpu;
@@ -420,6 +430,9 @@ static int evtchn_fifo_cpu_notification(struct notifier_block *self,
 		if (!per_cpu(cpu_control_block, cpu))
 			ret = evtchn_fifo_alloc_control_block(cpu);
 		break;
+	case CPU_DEAD:
+		__evtchn_fifo_handle_events(cpu, true);
+		break;
 	default:
 		break;
 	}
-- 
cgit v0.10.2


From de0afc9bdeeadaa998797d2333c754bf9f4d5dcf Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Wed, 2 Dec 2015 12:10:48 -0500
Subject: xen: Resume PMU from non-atomic context

Resuming PMU currently triggers a warning from ___might_sleep() (assuming
CONFIG_DEBUG_ATOMIC_SLEEP is set) when xen_pmu_init() allocates GFP_KERNEL
page because we are in state resembling atomic context.

Move resuming PMU to xen_arch_resume() which is called in regular context.
For symmetry move suspending PMU to xen_arch_suspend() as well.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reported-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: <stable@vger.kernel.org> # 4.3
Signed-off-by: David Vrabel <david.vrabel@citrix.com>

diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index feddabd..3705eab 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -68,26 +68,16 @@ static void xen_pv_post_suspend(int suspend_cancelled)
 
 void xen_arch_pre_suspend(void)
 {
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		xen_pmu_finish(cpu);
-
 	if (xen_pv_domain())
 		xen_pv_pre_suspend();
 }
 
 void xen_arch_post_suspend(int cancelled)
 {
-	int cpu;
-
 	if (xen_pv_domain())
 		xen_pv_post_suspend(cancelled);
 	else
 		xen_hvm_post_suspend(cancelled);
-
-	for_each_online_cpu(cpu)
-		xen_pmu_init(cpu);
 }
 
 static void xen_vcpu_notify_restore(void *data)
@@ -106,10 +96,20 @@ static void xen_vcpu_notify_suspend(void *data)
 
 void xen_arch_resume(void)
 {
+	int cpu;
+
 	on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
+
+	for_each_online_cpu(cpu)
+		xen_pmu_init(cpu);
 }
 
 void xen_arch_suspend(void)
 {
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		xen_pmu_finish(cpu);
+
 	on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
 }
-- 
cgit v0.10.2


From 20f36e0380a7e871a711d5e4e59d04d4948326b4 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Sat, 12 Dec 2015 19:25:55 -0500
Subject: xen/x86/pvh: Use HVM's flush_tlb_others op

Using MMUEXT_TLB_FLUSH_MULTI doesn't buy us much since the hypervisor
will likely perform same IPIs as would have the guest.

More importantly, using MMUEXT_INVLPG_MULTI may not to invalidate the
guest's address on remote CPU (when, for example, VCPU from another guest
is running there).

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ac161db..cb5e266 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2495,14 +2495,9 @@ void __init xen_init_mmu_ops(void)
 {
 	x86_init.paging.pagetable_init = xen_pagetable_init;
 
-	/* Optimization - we can use the HVM one but it has no idea which
-	 * VCPUs are descheduled - which means that it will needlessly IPI
-	 * them. Xen knows so let it do the job.
-	 */
-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
-		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return;
-	}
+
 	pv_mmu_ops = xen_mmu_ops;
 
 	memset(dummy_mapping, 0xff, PAGE_SIZE);
-- 
cgit v0.10.2


From 454d5d882c7e412b840e3c99010fe81a9862f6fb Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Fri, 30 Oct 2015 14:58:08 +0000
Subject: xen: Add RING_COPY_REQUEST()

Using RING_GET_REQUEST() on a shared ring is easy to use incorrectly
(i.e., by not considering that the other end may alter the data in the
shared ring while it is being inspected).  Safe usage of a request
generally requires taking a local copy.

Provide a RING_COPY_REQUEST() macro to use instead of
RING_GET_REQUEST() and an open-coded memcpy().  This takes care of
ensuring that the copy is done correctly regardless of any possible
compiler optimizations.

Use a volatile source to prevent the compiler from reordering or
omitting the copy.

This is part of XSA155.

CC: stable@vger.kernel.org
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
index 7d28aff..7dc685b 100644
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -181,6 +181,20 @@ struct __name##_back_ring {						\
 #define RING_GET_REQUEST(_r, _idx)					\
     (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
 
+/*
+ * Get a local copy of a request.
+ *
+ * Use this in preference to RING_GET_REQUEST() so all processing is
+ * done on a local copy that cannot be modified by the other end.
+ *
+ * Note that https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 may cause this
+ * to be ineffective where _req is a struct which consists of only bitfields.
+ */
+#define RING_COPY_REQUEST(_r, _idx, _req) do {				\
+	/* Use volatile to force the copy into _req. */			\
+	*(_req) = *(volatile typeof(_req))RING_GET_REQUEST(_r, _idx);	\
+} while (0)
+
 #define RING_GET_RESPONSE(_r, _idx)					\
     (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
 
-- 
cgit v0.10.2


From 0f589967a73f1f30ab4ac4dd9ce0bb399b4d6357 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Fri, 30 Oct 2015 15:16:01 +0000
Subject: xen-netback: don't use last request to determine minimum Tx credit

The last from guest transmitted request gives no indication about the
minimum amount of credit that the guest might need to send a packet
since the last packet might have been a small one.

Instead allow for the worst case 128 KiB packet.

This is part of XSA155.

CC: stable@vger.kernel.org
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index e481f37..b683581 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -679,9 +679,7 @@ static void tx_add_credit(struct xenvif_queue *queue)
 	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
 	 * Otherwise the interface can seize up due to insufficient credit.
 	 */
-	max_burst = RING_GET_REQUEST(&queue->tx, queue->tx.req_cons)->size;
-	max_burst = min(max_burst, 131072UL);
-	max_burst = max(max_burst, queue->credit_bytes);
+	max_burst = max(131072UL, queue->credit_bytes);
 
 	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
 	max_credit = queue->remaining_credit + queue->credit_bytes;
-- 
cgit v0.10.2


From 68a33bfd8403e4e22847165d149823a2e0e67c9c Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Fri, 30 Oct 2015 15:17:06 +0000
Subject: xen-netback: use RING_COPY_REQUEST() throughout

Instead of open-coding memcpy()s and directly accessing Tx and Rx
requests, use the new RING_COPY_REQUEST() that ensures the local copy
is correct.

This is more than is strictly necessary for guest Rx requests since
only the id and gref fields are used and it is harmless if the
frontend modifies these.

This is part of XSA155.

CC: stable@vger.kernel.org
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index b683581..1049c34 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -258,18 +258,18 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif_queue *queue,
 						 struct netrx_pending_operations *npo)
 {
 	struct xenvif_rx_meta *meta;
-	struct xen_netif_rx_request *req;
+	struct xen_netif_rx_request req;
 
-	req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
+	RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
 
 	meta = npo->meta + npo->meta_prod++;
 	meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
 	meta->gso_size = 0;
 	meta->size = 0;
-	meta->id = req->id;
+	meta->id = req.id;
 
 	npo->copy_off = 0;
-	npo->copy_gref = req->gref;
+	npo->copy_gref = req.gref;
 
 	return meta;
 }
@@ -424,7 +424,7 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	struct xenvif *vif = netdev_priv(skb->dev);
 	int nr_frags = skb_shinfo(skb)->nr_frags;
 	int i;
-	struct xen_netif_rx_request *req;
+	struct xen_netif_rx_request req;
 	struct xenvif_rx_meta *meta;
 	unsigned char *data;
 	int head = 1;
@@ -443,15 +443,15 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 
 	/* Set up a GSO prefix descriptor, if necessary */
 	if ((1 << gso_type) & vif->gso_prefix_mask) {
-		req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
+		RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
 		meta = npo->meta + npo->meta_prod++;
 		meta->gso_type = gso_type;
 		meta->gso_size = skb_shinfo(skb)->gso_size;
 		meta->size = 0;
-		meta->id = req->id;
+		meta->id = req.id;
 	}
 
-	req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
+	RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
 	meta = npo->meta + npo->meta_prod++;
 
 	if ((1 << gso_type) & vif->gso_mask) {
@@ -463,9 +463,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	}
 
 	meta->size = 0;
-	meta->id = req->id;
+	meta->id = req.id;
 	npo->copy_off = 0;
-	npo->copy_gref = req->gref;
+	npo->copy_gref = req.gref;
 
 	data = skb->data;
 	while (data < skb_tail_pointer(skb)) {
@@ -709,7 +709,7 @@ static void xenvif_tx_err(struct xenvif_queue *queue,
 		spin_unlock_irqrestore(&queue->response_lock, flags);
 		if (cons == end)
 			break;
-		txp = RING_GET_REQUEST(&queue->tx, cons++);
+		RING_COPY_REQUEST(&queue->tx, cons++, txp);
 	} while (1);
 	queue->tx.req_cons = cons;
 }
@@ -776,8 +776,7 @@ static int xenvif_count_requests(struct xenvif_queue *queue,
 		if (drop_err)
 			txp = &dropped_tx;
 
-		memcpy(txp, RING_GET_REQUEST(&queue->tx, cons + slots),
-		       sizeof(*txp));
+		RING_COPY_REQUEST(&queue->tx, cons + slots, txp);
 
 		/* If the guest submitted a frame >= 64 KiB then
 		 * first->size overflowed and following slots will
@@ -1110,8 +1109,7 @@ static int xenvif_get_extras(struct xenvif_queue *queue,
 			return -EBADR;
 		}
 
-		memcpy(&extra, RING_GET_REQUEST(&queue->tx, cons),
-		       sizeof(extra));
+		RING_COPY_REQUEST(&queue->tx, cons, &extra);
 		if (unlikely(!extra.type ||
 			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
 			queue->tx.req_cons = ++cons;
@@ -1320,7 +1318,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
 
 		idx = queue->tx.req_cons;
 		rmb(); /* Ensure that we see the request before we copy it. */
-		memcpy(&txreq, RING_GET_REQUEST(&queue->tx, idx), sizeof(txreq));
+		RING_COPY_REQUEST(&queue->tx, idx, &txreq);
 
 		/* Credit-based scheduling. */
 		if (txreq.size > queue->remaining_credit &&
-- 
cgit v0.10.2


From 1f13d75ccb806260079e0679d55d9253e370ec8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
Date: Tue, 3 Nov 2015 16:34:09 +0000
Subject: xen-blkback: only read request operation from shared ring once
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A compiler may load a switch statement value multiple times, which could
be bad when the value is in memory shared with the frontend.

When converting a non-native request to a native one, ensure that
src->operation is only loaded once by using READ_ONCE().

This is part of XSA155.

CC: stable@vger.kernel.org
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 68e87a0..c929ae2 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -408,8 +408,8 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
 					struct blkif_x86_32_request *src)
 {
 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
-	dst->operation = src->operation;
-	switch (src->operation) {
+	dst->operation = READ_ONCE(src->operation);
+	switch (dst->operation) {
 	case BLKIF_OP_READ:
 	case BLKIF_OP_WRITE:
 	case BLKIF_OP_WRITE_BARRIER:
@@ -456,8 +456,8 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
 					struct blkif_x86_64_request *src)
 {
 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
-	dst->operation = src->operation;
-	switch (src->operation) {
+	dst->operation = READ_ONCE(src->operation);
+	switch (dst->operation) {
 	case BLKIF_OP_READ:
 	case BLKIF_OP_WRITE:
 	case BLKIF_OP_WRITE_BARRIER:
-- 
cgit v0.10.2


From 18779149101c0dd43ded43669ae2a92d21b6f9cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
Date: Tue, 3 Nov 2015 16:40:43 +0000
Subject: xen-blkback: read from indirect descriptors only once
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since indirect descriptors are in memory shared with the frontend, the
frontend could alter the first_sect and last_sect values after they have
been validated but before they are recorded in the request.  This may
result in I/O requests that overflow the foreign page, possibly
overwriting local pages when the I/O request is executed.

When parsing indirect descriptors, only read first_sect and last_sect
once.

This is part of XSA155.

CC: stable@vger.kernel.org
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index f909994..41fb1a9 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -950,6 +950,8 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
 		goto unmap;
 
 	for (n = 0, i = 0; n < nseg; n++) {
+		uint8_t first_sect, last_sect;
+
 		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
 			/* Map indirect segments */
 			if (segments)
@@ -957,15 +959,18 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
 			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
 		}
 		i = n % SEGS_PER_INDIRECT_FRAME;
+
 		pending_req->segments[n]->gref = segments[i].gref;
-		seg[n].nsec = segments[i].last_sect -
-			segments[i].first_sect + 1;
-		seg[n].offset = (segments[i].first_sect << 9);
-		if ((segments[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
-		    (segments[i].last_sect < segments[i].first_sect)) {
+
+		first_sect = READ_ONCE(segments[i].first_sect);
+		last_sect = READ_ONCE(segments[i].last_sect);
+		if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
 			rc = -EINVAL;
 			goto unmap;
 		}
+
+		seg[n].nsec = last_sect - first_sect + 1;
+		seg[n].offset = first_sect << 9;
 		preq->nr_sects += seg[n].nsec;
 	}
 
-- 
cgit v0.10.2


From be69746ec12f35b484707da505c6c76ff06f97dc Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Mon, 16 Nov 2015 18:02:32 +0000
Subject: xen-scsiback: safely copy requests

The copy of the ring request was lacking a following barrier(),
potentially allowing the compiler to optimize the copy away.

Use RING_COPY_REQUEST() to ensure the request is copied to local
memory.

This is part of XSA155.

CC: stable@vger.kernel.org
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
index 9eeefd7..2af9aa8 100644
--- a/drivers/xen/xen-scsiback.c
+++ b/drivers/xen/xen-scsiback.c
@@ -727,7 +727,7 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info)
 		if (!pending_req)
 			return 1;
 
-		ring_req = *RING_GET_REQUEST(ring, rc);
+		RING_COPY_REQUEST(ring, rc, &ring_req);
 		ring->req_cons = ++rc;
 
 		err = prepare_pending_reqs(info, &ring_req, pending_req);
-- 
cgit v0.10.2


From 8135cf8b092723dbfcc611fe6fdcb3a36c9951c5 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 16 Nov 2015 12:40:48 -0500
Subject: xen/pciback: Save xen_pci_op commands before processing it

Double fetch vulnerabilities that happen when a variable is
fetched twice from shared memory but a security check is only
performed the first time.

The xen_pcibk_do_op function performs a switch statements on the op->cmd
value which is stored in shared memory. Interestingly this can result
in a double fetch vulnerability depending on the performed compiler
optimization.

This patch fixes it by saving the xen_pci_op command before
processing it. We also use 'barrier' to make sure that the
compiler does not perform any optimization.

This is part of XSA155.

CC: stable@vger.kernel.org
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jan Beulich <JBeulich@suse.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
index 58e38d5..4d529f3 100644
--- a/drivers/xen/xen-pciback/pciback.h
+++ b/drivers/xen/xen-pciback/pciback.h
@@ -37,6 +37,7 @@ struct xen_pcibk_device {
 	struct xen_pci_sharedinfo *sh_info;
 	unsigned long flags;
 	struct work_struct op_work;
+	struct xen_pci_op op;
 };
 
 struct xen_pcibk_dev_data {
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index c4a0666..a0e0e3e 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -298,9 +298,11 @@ void xen_pcibk_do_op(struct work_struct *data)
 		container_of(data, struct xen_pcibk_device, op_work);
 	struct pci_dev *dev;
 	struct xen_pcibk_dev_data *dev_data = NULL;
-	struct xen_pci_op *op = &pdev->sh_info->op;
+	struct xen_pci_op *op = &pdev->op;
 	int test_intx = 0;
 
+	*op = pdev->sh_info->op;
+	barrier();
 	dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
 
 	if (dev == NULL)
@@ -342,6 +344,17 @@ void xen_pcibk_do_op(struct work_struct *data)
 		if ((dev_data->enable_intx != test_intx))
 			xen_pcibk_control_isr(dev, 0 /* no reset */);
 	}
+	pdev->sh_info->op.err = op->err;
+	pdev->sh_info->op.value = op->value;
+#ifdef CONFIG_PCI_MSI
+	if (op->cmd == XEN_PCI_OP_enable_msix && op->err == 0) {
+		unsigned int i;
+
+		for (i = 0; i < op->value; i++)
+			pdev->sh_info->op.msix_entries[i].vector =
+				op->msix_entries[i].vector;
+	}
+#endif
 	/* Tell the driver domain that we're done. */
 	wmb();
 	clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
-- 
cgit v0.10.2


From 56441f3c8e5bd45aab10dd9f8c505dd4bec03b0d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 3 Apr 2015 11:08:22 -0400
Subject: xen/pciback: Return error on XEN_PCI_OP_enable_msi when device has
 MSI or MSI-X enabled

The guest sequence of:

 a) XEN_PCI_OP_enable_msi
 b) XEN_PCI_OP_enable_msi
 c) XEN_PCI_OP_disable_msi

results in hitting an BUG_ON condition in the msi.c code.

The MSI code uses an dev->msi_list to which it adds MSI entries.
Under the above conditions an BUG_ON() can be hit. The device
passed in the guest MUST have MSI capability.

The a) adds the entry to the dev->msi_list and sets msi_enabled.
The b) adds a second entry but adding in to SysFS fails (duplicate entry)
and deletes all of the entries from msi_list and returns (with msi_enabled
is still set).  c) pci_disable_msi passes the msi_enabled checks and hits:

BUG_ON(list_empty(dev_to_msi_list(&dev->dev)));

and blows up.

The patch adds a simple check in the XEN_PCI_OP_enable_msi to guard
against that. The check for msix_enabled is not stricly neccessary.

This is part of XSA-157.

CC: stable@vger.kernel.org
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index a0e0e3e..8bfb87c 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -144,7 +144,12 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev,
 	if (unlikely(verbose_request))
 		printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev));
 
-	status = pci_enable_msi(dev);
+	if (dev->msi_enabled)
+		status = -EALREADY;
+	else if (dev->msix_enabled)
+		status = -ENXIO;
+	else
+		status = pci_enable_msi(dev);
 
 	if (status) {
 		pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n",
-- 
cgit v0.10.2


From 5e0ce1455c09dd61d029b8ad45d82e1ac0b6c4c9 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 2 Nov 2015 18:07:44 -0500
Subject: xen/pciback: Return error on XEN_PCI_OP_enable_msix when device has
 MSI or MSI-X enabled

The guest sequence of:

  a) XEN_PCI_OP_enable_msix
  b) XEN_PCI_OP_enable_msix

results in hitting an NULL pointer due to using freed pointers.

The device passed in the guest MUST have MSI-X capability.

The a) constructs and SysFS representation of MSI and MSI groups.
The b) adds a second set of them but adding in to SysFS fails (duplicate entry).
'populate_msi_sysfs' frees the newly allocated msi_irq_groups (note that
in a) pdev->msi_irq_groups is still set) and also free's ALL of the
MSI-X entries of the device (the ones allocated in step a) and b)).

The unwind code: 'free_msi_irqs' deletes all the entries and tries to
delete the pdev->msi_irq_groups (which hasn't been set to NULL).
However the pointers in the SysFS are already freed and we hit an
NULL pointer further on when 'strlen' is attempted on a freed pointer.

The patch adds a simple check in the XEN_PCI_OP_enable_msix to guard
against that. The check for msi_enabled is not stricly neccessary.

This is part of XSA-157

CC: stable@vger.kernel.org
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index 8bfb87c..029f33d 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -206,9 +206,16 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
 	if (unlikely(verbose_request))
 		printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n",
 		       pci_name(dev));
+
 	if (op->value > SH_INFO_MAX_VEC)
 		return -EINVAL;
 
+	if (dev->msix_enabled)
+		return -EALREADY;
+
+	if (dev->msi_enabled)
+		return -ENXIO;
+
 	entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
 	if (entries == NULL)
 		return -ENOMEM;
-- 
cgit v0.10.2


From a396f3a210c3a61e94d6b87ec05a75d0be2a60d0 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 2 Nov 2015 17:24:08 -0500
Subject: xen/pciback: Do not install an IRQ handler for MSI interrupts.

Otherwise an guest can subvert the generic MSI code to trigger
an BUG_ON condition during MSI interrupt freeing:

 for (i = 0; i < entry->nvec_used; i++)
        BUG_ON(irq_has_action(entry->irq + i));

Xen PCI backed installs an IRQ handler (request_irq) for
the dev->irq whenever the guest writes PCI_COMMAND_MEMORY
(or PCI_COMMAND_IO) to the PCI_COMMAND register. This is
done in case the device has legacy interrupts the GSI line
is shared by the backend devices.

To subvert the backend the guest needs to make the backend
to change the dev->irq from the GSI to the MSI interrupt line,
make the backend allocate an interrupt handler, and then command
the backend to free the MSI interrupt and hit the BUG_ON.

Since the backend only calls 'request_irq' when the guest
writes to the PCI_COMMAND register the guest needs to call
XEN_PCI_OP_enable_msi before any other operation. This will
cause the generic MSI code to setup an MSI entry and
populate dev->irq with the new PIRQ value.

Then the guest can write to PCI_COMMAND PCI_COMMAND_MEMORY
and cause the backend to setup an IRQ handler for dev->irq
(which instead of the GSI value has the MSI pirq). See
'xen_pcibk_control_isr'.

Then the guest disables the MSI: XEN_PCI_OP_disable_msi
which ends up triggering the BUG_ON condition in 'free_msi_irqs'
as there is an IRQ handler for the entry->irq (dev->irq).

Note that this cannot be done using MSI-X as the generic
code does not over-write dev->irq with the MSI-X PIRQ values.

The patch inhibits setting up the IRQ handler if MSI or
MSI-X (for symmetry reasons) code had been called successfully.

P.S.
Xen PCIBack when it sets up the device for the guest consumption
ends up writting 0 to the PCI_COMMAND (see xen_pcibk_reset_device).
XSA-120 addendum patch removed that - however when upstreaming said
addendum we found that it caused issues with qemu upstream. That
has now been fixed in qemu upstream.

This is part of XSA-157

CC: stable@vger.kernel.org
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index 029f33d..d0696ce 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -70,6 +70,13 @@ static void xen_pcibk_control_isr(struct pci_dev *dev, int reset)
 		enable ? "enable" : "disable");
 
 	if (enable) {
+		/*
+		 * The MSI or MSI-X should not have an IRQ handler. Otherwise
+		 * if the guest terminates we BUG_ON in free_msi_irqs.
+		 */
+		if (dev->msi_enabled || dev->msix_enabled)
+			goto out;
+
 		rc = request_irq(dev_data->irq,
 				xen_pcibk_guest_interrupt, IRQF_SHARED,
 				dev_data->irq_name, dev);
-- 
cgit v0.10.2


From 7cfb905b9638982862f0331b36ccaaca5d383b49 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 1 Apr 2015 10:49:47 -0400
Subject: xen/pciback: For XEN_PCI_OP_disable_msi[|x] only disable if device
 has MSI(X) enabled.

Otherwise just continue on, returning the same values as
previously (return of 0, and op->result has the PIRQ value).

This does not change the behavior of XEN_PCI_OP_disable_msi[|x].

The pci_disable_msi or pci_disable_msix have the checks for
msi_enabled or msix_enabled so they will error out immediately.

However the guest can still call these operations and cause
us to disable the 'ack_intr'. That means the backend IRQ handler
for the legacy interrupt will not respond to interrupts anymore.

This will lead to (if the device is causing an interrupt storm)
for the Linux generic code to disable the interrupt line.

Naturally this will only happen if the device in question
is plugged in on the motherboard on shared level interrupt GSI.

This is part of XSA-157

CC: stable@vger.kernel.org
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index d0696ce..4ee5fc0 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -185,20 +185,23 @@ static
 int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev,
 			  struct pci_dev *dev, struct xen_pci_op *op)
 {
-	struct xen_pcibk_dev_data *dev_data;
-
 	if (unlikely(verbose_request))
 		printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n",
 		       pci_name(dev));
-	pci_disable_msi(dev);
 
+	if (dev->msi_enabled) {
+		struct xen_pcibk_dev_data *dev_data;
+
+		pci_disable_msi(dev);
+
+		dev_data = pci_get_drvdata(dev);
+		if (dev_data)
+			dev_data->ack_intr = 1;
+	}
 	op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
 	if (unlikely(verbose_request))
 		printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
 			op->value);
-	dev_data = pci_get_drvdata(dev);
-	if (dev_data)
-		dev_data->ack_intr = 1;
 	return 0;
 }
 
@@ -264,23 +267,27 @@ static
 int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
 			   struct pci_dev *dev, struct xen_pci_op *op)
 {
-	struct xen_pcibk_dev_data *dev_data;
 	if (unlikely(verbose_request))
 		printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n",
 			pci_name(dev));
-	pci_disable_msix(dev);
 
+	if (dev->msix_enabled) {
+		struct xen_pcibk_dev_data *dev_data;
+
+		pci_disable_msix(dev);
+
+		dev_data = pci_get_drvdata(dev);
+		if (dev_data)
+			dev_data->ack_intr = 1;
+	}
 	/*
 	 * SR-IOV devices (which don't have any legacy IRQ) have
 	 * an undefined IRQ value of zero.
 	 */
 	op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
 	if (unlikely(verbose_request))
-		printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev),
-			op->value);
-	dev_data = pci_get_drvdata(dev);
-	if (dev_data)
-		dev_data->ack_intr = 1;
+		printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n",
+		       pci_name(dev), op->value);
 	return 0;
 }
 #endif
-- 
cgit v0.10.2


From 408fb0e5aa7fda0059db282ff58c3b2a4278baa0 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 2 Nov 2015 18:13:27 -0500
Subject: xen/pciback: Don't allow MSI-X ops if PCI_COMMAND_MEMORY is not set.

commit f598282f51 ("PCI: Fix the NIU MSI-X problem in a better way")
teaches us that dealing with MSI-X can be troublesome.

Further checks in the MSI-X architecture shows that if the
PCI_COMMAND_MEMORY bit is turned of in the PCI_COMMAND we
may not be able to access the BAR (since they are memory regions).

Since the MSI-X tables are located in there.. that can lead
to us causing PCIe errors. Inhibit us performing any
operation on the MSI-X unless the MEMORY bit is set.

Note that Xen hypervisor with:
"x86/MSI-X: access MSI-X table only after having enabled MSI-X"
will return:
xen_pciback: 0000:0a:00.1: error -6 enabling MSI-X for guest 3!

When the generic MSI code tries to setup the PIRQ without
MEMORY bit set. Which means with later versions of Xen
(4.6) this patch is not neccessary.

This is part of XSA-157

CC: stable@vger.kernel.org
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index 4ee5fc0..73dafdc 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -212,6 +212,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
 	struct xen_pcibk_dev_data *dev_data;
 	int i, result;
 	struct msix_entry *entries;
+	u16 cmd;
 
 	if (unlikely(verbose_request))
 		printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n",
@@ -223,7 +224,12 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
 	if (dev->msix_enabled)
 		return -EALREADY;
 
-	if (dev->msi_enabled)
+	/*
+	 * PCI_COMMAND_MEMORY must be enabled, otherwise we may not be able
+	 * to access the BARs where the MSI-X entries reside.
+	 */
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	if (dev->msi_enabled || !(cmd & PCI_COMMAND_MEMORY))
 		return -ENXIO;
 
 	entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
-- 
cgit v0.10.2


From 584a561a6fee0d258f9ca644f58b73d9a41b8a46 Mon Sep 17 00:00:00 2001
From: Doug Goldstein <cardoe@cardoe.com>
Date: Thu, 26 Nov 2015 14:32:39 -0600
Subject: xen-pciback: fix up cleanup path when alloc fails

When allocating a pciback device fails, clear the private
field. This could lead to an use-after free, however
the 'really_probe' takes care of setting
dev_set_drvdata(dev, NULL) in its failure path (which we would
exercise if the ->probe function failed), so we we
are OK. However lets be defensive as the code can change.

Going forward we should clean up the pci_set_drvdata(dev, NULL)
in the various code-base. That will be for another day.

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reported-by: Jonathan Creekmore <jonathan.creekmore@gmail.com>
Signed-off-by: Doug Goldstein <cardoe@cardoe.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index 98bc345..4843741 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -44,7 +44,6 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
 	dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
 
 	pdev->xdev = xdev;
-	dev_set_drvdata(&xdev->dev, pdev);
 
 	mutex_init(&pdev->dev_lock);
 
@@ -58,6 +57,9 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
 		kfree(pdev);
 		pdev = NULL;
 	}
+
+	dev_set_drvdata(&xdev->dev, pdev);
+
 out:
 	return pdev;
 }
-- 
cgit v0.10.2