From 9c17d96500f78d7ecdb71ca6942830158bc75a2b Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 10 Nov 2015 15:10:33 -0500 Subject: xen/gntdev: Grant maps should not be subject to NUMA balancing Doing so will cause the grant to be unmapped and then, during fault handling, the fault to be mistakenly treated as NUMA hint fault. In addition, even if those maps could partcipate in NUMA balancing, it wouldn't provide any benefit since we are unable to determine physical page's node (even if/when VNUMA is implemented). Marking grant maps' VMAs as VM_IO will exclude them from being part of NUMA balancing. Signed-off-by: Boris Ostrovsky Cc: stable@vger.kernel.org Signed-off-by: David Vrabel diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 2ea0b3b..1be5dd0 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -804,7 +804,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_IO; if (use_ptemod) vma->vm_flags |= VM_DONTCOPY; -- cgit v0.10.2 From b4ff8389ed14b849354b59ce9b360bdefcdbf99c Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 20 Nov 2015 11:25:04 -0500 Subject: xen/events: Always allocate legacy interrupts on PV guests After commit 8c058b0b9c34 ("x86/irq: Probe for PIC presence before allocating descs for legacy IRQs") early_irq_init() will no longer preallocate descriptors for legacy interrupts if PIC does not exist, which is the case for Xen PV guests. Therefore we may need to allocate those descriptors ourselves. Signed-off-by: Boris Ostrovsky Suggested-by: Thomas Gleixner Signed-off-by: David Vrabel diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index be1d07d..1bd9510de 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -40,6 +40,11 @@ extern void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x) #endif +static inline int nr_legacy_irqs(void) +{ + return NR_IRQS_LEGACY; +} + #endif #endif diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index bbb251b..8b9bf54 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -21,4 +21,9 @@ static inline void acpi_irq_init(void) } #define acpi_irq_init acpi_irq_init +static inline int nr_legacy_irqs(void) +{ + return 0; +} + #endif diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 849500e..524c221 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #endif #include @@ -420,7 +421,7 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) return xen_allocate_irq_dynamic(); /* Legacy IRQ descriptors are already allocated by the arch. */ - if (gsi < NR_IRQS_LEGACY) + if (gsi < nr_legacy_irqs()) irq = gsi; else irq = irq_alloc_desc_at(gsi, -1); @@ -446,7 +447,7 @@ static void xen_free_irq(unsigned irq) kfree(info); /* Legacy IRQ descriptors are managed by the arch. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs()) return; irq_free_desc(irq); -- cgit v0.10.2 From 8620015499101090ae275bf11e9bc2f9febfdf08 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 26 Nov 2015 16:14:35 +0000 Subject: xen/evtchn: dynamically grow pending event channel ring If more than 1024 event channels are bound to a evtchn device then it possible (even with well behaved applications) for the ring to overflow and events to be lost (reported as an -EFBIG error). Dynamically increase the size of the ring so there is always enough space for all bound events. Well behaved applicables that only unmask events after draining them from the ring can thus no longer lose events. However, an application could unmask an event before draining it, allowing multiple entries per port to accumulate in the ring, and a overflow could still occur. So the overflow detection and reporting is retained. The ring size is initially only 64 entries so the common use case of an application only binding a few events will use less memory than before. The ring size may grow to 512 KiB (enough for all 2^17 possible channels). This order 7 kmalloc() may fail due to memory fragmentation, so we fall back to trying vmalloc(). Signed-off-by: David Vrabel Reviewed-by: Andrew Cooper diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 00f40f0..38272ad 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include #include @@ -58,10 +60,10 @@ struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ struct rb_root evtchns; + unsigned int nr_evtchns; /* Notification ring, accessed via /dev/xen/evtchn. */ -#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) -#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + unsigned int ring_size; evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct mutex ring_cons_mutex; /* protect against concurrent readers */ @@ -80,10 +82,41 @@ struct user_evtchn { bool enabled; }; +static evtchn_port_t *evtchn_alloc_ring(unsigned int size) +{ + evtchn_port_t *ring; + size_t s = size * sizeof(*ring); + + ring = kmalloc(s, GFP_KERNEL); + if (!ring) + ring = vmalloc(s); + + return ring; +} + +static void evtchn_free_ring(evtchn_port_t *ring) +{ + kvfree(ring); +} + +static unsigned int evtchn_ring_offset(struct per_user_data *u, + unsigned int idx) +{ + return idx & (u->ring_size - 1); +} + +static evtchn_port_t *evtchn_ring_entry(struct per_user_data *u, + unsigned int idx) +{ + return u->ring + evtchn_ring_offset(u, idx); +} + static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; + u->nr_evtchns++; + while (*new) { struct user_evtchn *this; @@ -107,6 +140,7 @@ static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { + u->nr_evtchns--; rb_erase(&evtchn->node, &u->evtchns); kfree(evtchn); } @@ -144,8 +178,8 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) spin_lock(&u->ring_prod_lock); - if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; + if ((u->ring_prod - u->ring_cons) < u->ring_size) { + *evtchn_ring_entry(u, u->ring_prod) = evtchn->port; wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wake_up_interruptible(&u->evtchn_wait); @@ -200,10 +234,10 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, } /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ - if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { - bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + if (((c ^ p) & u->ring_size) != 0) { + bytes1 = (u->ring_size - evtchn_ring_offset(u, c)) * sizeof(evtchn_port_t); - bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); + bytes2 = evtchn_ring_offset(u, p) * sizeof(evtchn_port_t); } else { bytes1 = (p - c) * sizeof(evtchn_port_t); bytes2 = 0; @@ -219,7 +253,7 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, rc = -EFAULT; rmb(); /* Ensure that we see the port before we copy it. */ - if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + if (copy_to_user(buf, evtchn_ring_entry(u, c), bytes1) || ((bytes2 != 0) && copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) goto unlock_out; @@ -278,6 +312,66 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, return rc; } +static int evtchn_resize_ring(struct per_user_data *u) +{ + unsigned int new_size; + evtchn_port_t *new_ring, *old_ring; + unsigned int p, c; + + /* + * Ensure the ring is large enough to capture all possible + * events. i.e., one free slot for each bound event. + */ + if (u->nr_evtchns <= u->ring_size) + return 0; + + if (u->ring_size == 0) + new_size = 64; + else + new_size = 2 * u->ring_size; + + new_ring = evtchn_alloc_ring(new_size); + if (!new_ring) + return -ENOMEM; + + old_ring = u->ring; + + /* + * Access to the ring contents is serialized by either the + * prod /or/ cons lock so take both when resizing. + */ + mutex_lock(&u->ring_cons_mutex); + spin_lock_irq(&u->ring_prod_lock); + + /* + * Copy the old ring contents to the new ring. + * + * If the ring contents crosses the end of the current ring, + * it needs to be copied in two chunks. + * + * +---------+ +------------------+ + * |34567 12| -> | 1234567 | + * +-----p-c-+ +------------------+ + */ + p = evtchn_ring_offset(u, u->ring_prod); + c = evtchn_ring_offset(u, u->ring_cons); + if (p < c) { + memcpy(new_ring + c, u->ring + c, (u->ring_size - c) * sizeof(*u->ring)); + memcpy(new_ring + u->ring_size, u->ring, p * sizeof(*u->ring)); + } else + memcpy(new_ring + c, u->ring + c, (p - c) * sizeof(*u->ring)); + + u->ring = new_ring; + u->ring_size = new_size; + + spin_unlock_irq(&u->ring_prod_lock); + mutex_unlock(&u->ring_cons_mutex); + + evtchn_free_ring(old_ring); + + return 0; +} + static int evtchn_bind_to_user(struct per_user_data *u, int port) { struct user_evtchn *evtchn; @@ -305,6 +399,10 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) if (rc < 0) goto err; + rc = evtchn_resize_ring(u); + if (rc < 0) + goto err; + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, u->name, evtchn); if (rc < 0) @@ -503,13 +601,6 @@ static int evtchn_open(struct inode *inode, struct file *filp) init_waitqueue_head(&u->evtchn_wait); - u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL); - if (u->ring == NULL) { - kfree(u->name); - kfree(u); - return -ENOMEM; - } - mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); spin_lock_init(&u->ring_prod_lock); @@ -532,7 +623,7 @@ static int evtchn_release(struct inode *inode, struct file *filp) evtchn_unbind_from_user(u, evtchn); } - free_page((unsigned long)u->ring); + evtchn_free_ring(u->ring); kfree(u->name); kfree(u); -- cgit v0.10.2