Merge commit 'v2.6.37-rc8' into x86/apic

Conflicts: arch/x86/include/asm/io_apic.h Merge reason: move to a fresh -rc, resolve the conflict. Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2011-01-04 08:43:42 (GMT)
committer: Ingo Molnar <mingo@elte.hu> 2011-01-04 08:43:42 (GMT)
commit: bc030d6cb9532877c1c5a3f5e7123344fa24a285 (patch)
tree: d223d410b868b80d4c0deec192d354a5d06b201a /arch/s390/mm
parent: d3bd058826aa8b79590cca6c8e6d1557bf576ada (diff)
parent: 387c31c7e5c9805b0aef8833d1731a5fe7bdea14 (diff)
download: linux-bc030d6cb9532877c1c5a3f5e7123344fa24a285.tar.xz
7 files changed, 480 insertions, 57 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index eec0544..6fbc6f3 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -3,6 +3,6 @@
 #
 
 obj-y	 := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \
-	    page-states.o
+	    page-states.o gup.o
 obj-$(CONFIG_CMM) += cmm.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index a9550dc..c66ffd8 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -23,7 +23,10 @@
 #include <asm/pgalloc.h>
 #include <asm/diag.h>
 
-static char *sender = "VMRMSVM";
+#ifdef CONFIG_CMM_IUCV
+static char *cmm_default_sender = "VMRMSVM";
+#endif
+static char *sender;
 module_param(sender, charp, 0400);
 MODULE_PARM_DESC(sender,
 		 "Guest name that may send SMSG messages (default VMRMSVM)");
@@ -440,6 +443,8 @@ static int __init cmm_init(void)
 		int len = strlen(sender);
 		while (len--)
 			sender[len] = toupper(sender[len]);
+	} else {
+		sender = cmm_default_sender;
 	}
 
 	rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 2505b2e..fe5701e 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -52,6 +52,14 @@
 #define VM_FAULT_BADMAP		0x020000
 #define VM_FAULT_BADACCESS	0x040000
 
+static unsigned long store_indication;
+
+void fault_init(void)
+{
+	if (test_facility(2) && test_facility(75))
+		store_indication = 0xc00;
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 	int ret = 0;
@@ -199,14 +207,21 @@ static noinline void do_sigbus(struct pt_regs *regs, long int_code,
 			       unsigned long trans_exc_code)
 {
 	struct task_struct *tsk = current;
+	unsigned long address;
+	struct siginfo si;
 
 	/*
 	 * Send a sigbus, regardless of whether we were in kernel
 	 * or user mode.
 	 */
-	tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK;
+	address = trans_exc_code & __FAIL_ADDR_MASK;
+	tsk->thread.prot_addr = address;
 	tsk->thread.trap_no = int_code;
-	force_sig(SIGBUS, tsk);
+	si.si_signo = SIGBUS;
+	si.si_errno = 0;
+	si.si_code = BUS_ADRERR;
+	si.si_addr = (void __user *) address;
+	force_sig_info(SIGBUS, &si, tsk);
 }
 
 #ifdef CONFIG_S390_EXEC_PROTECT
@@ -266,10 +281,11 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 		if (fault & VM_FAULT_OOM)
 			pagefault_out_of_memory();
 		else if (fault & VM_FAULT_SIGBUS) {
-			do_sigbus(regs, int_code, trans_exc_code);
 			/* Kernel mode? Handle exceptions or die */
 			if (!(regs->psw.mask & PSW_MASK_PSTATE))
 				do_no_context(regs, int_code, trans_exc_code);
+			else
+				do_sigbus(regs, int_code, trans_exc_code);
 		} else
 			BUG();
 		break;
@@ -294,7 +310,7 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	unsigned long address;
-	int fault;
+	int fault, write;
 
 	if (notify_page_fault(regs))
 		return 0;
@@ -312,12 +328,6 @@ static inline int do_exception(struct pt_regs *regs, int access,
 		goto out;
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
-	/*
-	 * When we get here, the fault happened in the current
-	 * task's user address space, so we can switch on the
-	 * interrupts again and then search the VMAs
-	 */
-	local_irq_enable();
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 	down_read(&mm->mmap_sem);
 
@@ -348,8 +358,10 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, address,
-				(access == VM_WRITE) ? FAULT_FLAG_WRITE : 0);
+	write = (access == VM_WRITE ||
+		 (trans_exc_code & store_indication) == 0x400) ?
+		FAULT_FLAG_WRITE : 0;
+	fault = handle_mm_fault(mm, vma, address, write);
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
@@ -374,20 +386,20 @@ out:
 	return fault;
 }
 
-void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_protection_exception(struct pt_regs *regs, long pgm_int_code,
+				       unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	int fault;
 
 	/* Protection exception is supressing, decrement psw address. */
-	regs->psw.addr -= (int_code >> 16);
+	regs->psw.addr -= (pgm_int_code >> 16);
 	/*
 	 * Check for low-address protection.  This needs to be treated
 	 * as a special case because the translation exception code
 	 * field is not guaranteed to contain valid data in this case.
 	 */
 	if (unlikely(!(trans_exc_code & 4))) {
-		do_low_address(regs, int_code, trans_exc_code);
+		do_low_address(regs, pgm_int_code, trans_exc_code);
 		return;
 	}
 	fault = do_exception(regs, VM_WRITE, trans_exc_code);
@@ -395,9 +407,9 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
 		do_fault_error(regs, 4, trans_exc_code, fault);
 }
 
-void __kprobes do_dat_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code,
+				unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	int access, fault;
 
 	access = VM_READ | VM_EXEC | VM_WRITE;
@@ -408,21 +420,19 @@ void __kprobes do_dat_exception(struct pt_regs *regs, long int_code)
 #endif
 	fault = do_exception(regs, access, trans_exc_code);
 	if (unlikely(fault))
-		do_fault_error(regs, int_code & 255, trans_exc_code, fault);
+		do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault);
 }
 
 #ifdef CONFIG_64BIT
-void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_asce_exception(struct pt_regs *regs, long pgm_int_code,
+				 unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 
 	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
 		goto no_context;
 
-	local_irq_enable();
-
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK);
 	up_read(&mm->mmap_sem);
@@ -434,16 +444,16 @@ void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
 
 	/* User mode accesses just cause a SIGSEGV */
 	if (regs->psw.mask & PSW_MASK_PSTATE) {
-		do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code);
+		do_sigsegv(regs, pgm_int_code, SEGV_MAPERR, trans_exc_code);
 		return;
 	}
 
 no_context:
-	do_no_context(regs, int_code, trans_exc_code);
+	do_no_context(regs, pgm_int_code, trans_exc_code);
 }
 #endif
 
-int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
+int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write)
 {
 	struct pt_regs regs;
 	int access, fault;
@@ -454,14 +464,14 @@ int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
 	regs.psw.addr = (unsigned long) __builtin_return_address(0);
 	regs.psw.addr |= PSW_ADDR_AMODE;
 	uaddr &= PAGE_MASK;
-	access = write_user ? VM_WRITE : VM_READ;
+	access = write ? VM_WRITE : VM_READ;
 	fault = do_exception(&regs, access, uaddr | 2);
 	if (unlikely(fault)) {
 		if (fault & VM_FAULT_OOM) {
 			pagefault_out_of_memory();
 			fault = 0;
 		} else if (fault & VM_FAULT_SIGBUS)
-			do_sigbus(&regs, int_code, uaddr);
+			do_sigbus(&regs, pgm_int_code, uaddr);
 	}
 	return fault ? -EFAULT : 0;
 }
@@ -527,7 +537,8 @@ void pfault_fini(void)
 		: : "a" (&refbk), "m" (refbk) : "cc");
 }
 
-static void pfault_interrupt(__u16 int_code)
+static void pfault_interrupt(unsigned int ext_int_code,
+			     unsigned int param32, unsigned long param64)
 {
 	struct task_struct *tsk;
 	__u16 subcode;
@@ -538,14 +549,18 @@ static void pfault_interrupt(__u16 int_code)
 	 * in the 'cpu address' field associated with the
          * external interrupt. 
 	 */
-	subcode = S390_lowcore.cpu_addr;
+	subcode = ext_int_code >> 16;
 	if ((subcode & 0xff00) != __SUBCODE_MASK)
 		return;
 
 	/*
 	 * Get the token (= address of the task structure of the affected task).
 	 */
-	tsk = *(struct task_struct **) __LC_PFAULT_INTPARM;
+#ifdef CONFIG_64BIT
+	tsk = *(struct task_struct **) param64;
+#else
+	tsk = *(struct task_struct **) param32;
+#endif
 
 	if (subcode & 0x0080) {
 		/* signal bit is set -> a page has been swapped in by VM */
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
new file mode 100644
index 0000000..45b405c
--- /dev/null
+++ b/arch/s390/mm/gup.c
@@ -0,0 +1,224 @@
+/*
+ *  Lockless get_user_pages_fast for s390
+ *
+ *  Copyright IBM Corp. 2010
+ *  Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t *ptep, pte;
+	struct page *page;
+
+	mask = (write ? _PAGE_RO : 0) | _PAGE_INVALID | _PAGE_SPECIAL;
+
+	ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr);
+	do {
+		pte = *ptep;
+		barrier();
+		if ((pte_val(pte) & mask) != 0)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		if (!page_cache_get_speculative(page))
+			return 0;
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			return 0;
+		}
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	struct page *head, *page;
+	int refs;
+
+	result = write ? 0 : _SEGMENT_ENTRY_RO;
+	mask = result | _SEGMENT_ENTRY_INV;
+	if ((pmd_val(pmd) & mask) != result)
+		return 0;
+	VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT));
+
+	refs = 0;
+	head = pmd_page(pmd);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+	}
+
+	return 1;
+}
+
+
+static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp, pmd;
+
+	pmdp = (pmd_t *) pudp;
+#ifdef CONFIG_64BIT
+	if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+		pmdp = (pmd_t *) pud_deref(pud);
+	pmdp += pmd_index(addr);
+#endif
+	do {
+		pmd = *pmdp;
+		barrier();
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_huge(pmd))) {
+			if (!gup_huge_pmd(pmdp, pmd, addr, next,
+					  write, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmdp, pmd, addr, next,
+					  write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+
+	pudp = (pud_t *) pgdp;
+#ifdef CONFIG_64BIT
+	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
+		pudp = (pud_t *) pgd_deref(pgd);
+	pudp += pud_index(addr);
+#endif
+	do {
+		pud = *pudp;
+		barrier();
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (end < start)
+		goto slow_irqon;
+
+	/*
+	 * local_irq_disable() doesn't prevent pagetable teardown, but does
+	 * prevent the pagetables from being freed on s390.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd = *pgdp;
+		barrier();
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index f28c43d..639cd21 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -68,7 +68,7 @@ void arch_release_hugepage(struct page *page)
 	ptep = (pte_t *) page[1].index;
 	if (!ptep)
 		return;
-	pte_free(&init_mm, ptep);
+	page_table_free(&init_mm, (unsigned long *) ptep);
 	page[1].index = 0;
 }
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 94b8ba2..bb40933 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -38,13 +38,54 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
 
-char  empty_zero_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
+unsigned long empty_zero_page, zero_page_mask;
 EXPORT_SYMBOL(empty_zero_page);
 
+static unsigned long setup_zero_pages(void)
+{
+	struct cpuid cpu_id;
+	unsigned int order;
+	unsigned long size;
+	struct page *page;
+	int i;
+
+	get_cpu_id(&cpu_id);
+	switch (cpu_id.machine) {
+	case 0x9672:	/* g5 */
+	case 0x2064:	/* z900 */
+	case 0x2066:	/* z900 */
+	case 0x2084:	/* z990 */
+	case 0x2086:	/* z990 */
+	case 0x2094:	/* z9-109 */
+	case 0x2096:	/* z9-109 */
+		order = 0;
+		break;
+	case 0x2097:	/* z10 */
+	case 0x2098:	/* z10 */
+	default:
+		order = 2;
+		break;
+	}
+
+	empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!empty_zero_page)
+		panic("Out of memory in setup_zero_pages");
+
+	page = virt_to_page((void *) empty_zero_page);
+	split_page(page, order);
+	for (i = 1 << order; i > 0; i--) {
+		SetPageReserved(page);
+		page++;
+	}
+
+	size = PAGE_SIZE << order;
+	zero_page_mask = (size - 1) & PAGE_MASK;
+
+	return 1UL << order;
+}
+
 /*
  * paging_init() sets up the page tables
  */
@@ -83,6 +124,7 @@ void __init paging_init(void)
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 	free_area_init_nodes(max_zone_pfns);
+	fault_init();
 }
 
 void __init mem_init(void)
@@ -92,14 +134,12 @@ void __init mem_init(void)
         max_mapnr = num_physpages = max_low_pfn;
         high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
 
-        /* clear the zero-page */
-        memset(empty_zero_page, 0, PAGE_SIZE);
-
 	/* Setup guest page hinting */
 	cmma_init();
 
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
+	totalram_pages -= setup_zero_pages();	/* Setup zeroed pages. */
 
 	reservedpages = 0;
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 8d99924..0c719c6 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/quicklist.h>
+#include <linux/rcupdate.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -23,6 +24,67 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
+struct rcu_table_freelist {
+	struct rcu_head rcu;
+	struct mm_struct *mm;
+	unsigned int pgt_index;
+	unsigned int crst_index;
+	unsigned long *table[0];
+};
+
+#define RCU_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
+	  / sizeof(unsigned long))
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);
+
+static void __page_table_free(struct mm_struct *mm, unsigned long *table);
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table);
+
+static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
+{
+	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
+	struct rcu_table_freelist *batch = *batchp;
+
+	if (batch)
+		return batch;
+	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
+	if (batch) {
+		batch->mm = mm;
+		batch->pgt_index = 0;
+		batch->crst_index = RCU_FREELIST_SIZE;
+		*batchp = batch;
+	}
+	return batch;
+}
+
+static void rcu_table_freelist_callback(struct rcu_head *head)
+{
+	struct rcu_table_freelist *batch =
+		container_of(head, struct rcu_table_freelist, rcu);
+
+	while (batch->pgt_index > 0)
+		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
+	while (batch->crst_index < RCU_FREELIST_SIZE)
+		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
+	free_page((unsigned long) batch);
+}
+
+void rcu_table_freelist_finish(void)
+{
+	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);
+
+	if (!batch)
+		return;
+	call_rcu(&batch->rcu, rcu_table_freelist_callback);
+	__get_cpu_var(rcu_table_freelist) = NULL;
+}
+
+static void smp_sync(void *arg)
+{
+}
+
 #ifndef CONFIG_64BIT
 #define ALLOC_ORDER	1
 #define TABLES_PER_PAGE	4
@@ -78,25 +140,55 @@ unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
 		}
 		page->index = page_to_phys(shadow);
 	}
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	list_add(&page->lru, &mm->context.crst_list);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	return (unsigned long *) page_to_phys(page);
 }
 
-void crst_table_free(struct mm_struct *mm, unsigned long *table)
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
 {
 	unsigned long *shadow = get_shadow_table(table);
-	struct page *page = virt_to_page(table);
 
-	spin_lock(&mm->context.list_lock);
-	list_del(&page->lru);
-	spin_unlock(&mm->context.list_lock);
 	if (shadow)
 		free_pages((unsigned long) shadow, ALLOC_ORDER);
 	free_pages((unsigned long) table, ALLOC_ORDER);
 }
 
+void crst_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	struct page *page = virt_to_page(table);
+
+	spin_lock_bh(&mm->context.list_lock);
+	list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	__crst_table_free(mm, table);
+}
+
+void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+	struct rcu_table_freelist *batch;
+	struct page *page = virt_to_page(table);
+
+	spin_lock_bh(&mm->context.list_lock);
+	list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	if (atomic_read(&mm->mm_users) < 2 &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+		__crst_table_free(mm, table);
+		return;
+	}
+	batch = rcu_table_freelist_get(mm);
+	if (!batch) {
+		smp_call_function(smp_sync, NULL, 1);
+		__crst_table_free(mm, table);
+		return;
+	}
+	batch->table[--batch->crst_index] = table;
+	if (batch->pgt_index >= batch->crst_index)
+		rcu_table_freelist_finish();
+}
+
 #ifdef CONFIG_64BIT
 int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
 {
@@ -108,7 +200,7 @@ repeat:
 	table = crst_table_alloc(mm, mm->context.noexec);
 	if (!table)
 		return -ENOMEM;
-	spin_lock(&mm->page_table_lock);
+	spin_lock_bh(&mm->page_table_lock);
 	if (mm->context.asce_limit < limit) {
 		pgd = (unsigned long *) mm->pgd;
 		if (mm->context.asce_limit <= (1UL << 31)) {
@@ -130,7 +222,7 @@ repeat:
 		mm->task_size = mm->context.asce_limit;
 		table = NULL;
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock_bh(&mm->page_table_lock);
 	if (table)
 		crst_table_free(mm, table);
 	if (mm->context.asce_limit < limit)
@@ -182,7 +274,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	unsigned long bits;
 
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	page = NULL;
 	if (!list_empty(&mm->context.pgtable_list)) {
 		page = list_first_entry(&mm->context.pgtable_list,
@@ -191,7 +283,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			page = NULL;
 	}
 	if (!page) {
-		spin_unlock(&mm->context.list_lock);
+		spin_unlock_bh(&mm->context.list_lock);
 		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
 		if (!page)
 			return NULL;
@@ -202,7 +294,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			clear_table_pgstes(table);
 		else
 			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
-		spin_lock(&mm->context.list_lock);
+		spin_lock_bh(&mm->context.list_lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
 	}
 	table = (unsigned long *) page_to_phys(page);
@@ -213,10 +305,25 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	page->flags |= bits;
 	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
 		list_move_tail(&page->lru, &mm->context.pgtable_list);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	return table;
 }
 
+static void __page_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	struct page *page;
+	unsigned long bits;
+
+	bits = ((unsigned long) table) & 15;
+	table = (unsigned long *)(((unsigned long) table) ^ bits);
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	page->flags ^= bits;
+	if (!(page->flags & FRAG_MASK)) {
+		pgtable_page_dtor(page);
+		__free_page(page);
+	}
+}
+
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
 	struct page *page;
@@ -225,7 +332,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
 	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	page->flags ^= bits;
 	if (page->flags & FRAG_MASK) {
 		/* Page now has some free pgtable fragments. */
@@ -234,18 +341,48 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	} else
 		/* All fragments of the 4K page have been freed. */
 		list_del(&page->lru);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	if (page) {
 		pgtable_page_dtor(page);
 		__free_page(page);
 	}
 }
 
+void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+	struct rcu_table_freelist *batch;
+	struct page *page;
+	unsigned long bits;
+
+	if (atomic_read(&mm->mm_users) < 2 &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+		page_table_free(mm, table);
+		return;
+	}
+	batch = rcu_table_freelist_get(mm);
+	if (!batch) {
+		smp_call_function(smp_sync, NULL, 1);
+		page_table_free(mm, table);
+		return;
+	}
+	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
+	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	spin_lock_bh(&mm->context.list_lock);
+	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
+	list_del_init(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	table = (unsigned long *)(((unsigned long) table) | bits);
+	batch->table[batch->pgt_index++] = table;
+	if (batch->pgt_index >= batch->crst_index)
+		rcu_table_freelist_finish();
+}
+
 void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 {
 	struct page *page;
 
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	/* Free shadow region and segment tables. */
 	list_for_each_entry(page, &mm->context.crst_list, lru)
 		if (page->index) {
@@ -255,7 +392,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 	/* "Free" second halves of page tables. */
 	list_for_each_entry(page, &mm->context.pgtable_list, lru)
 		page->flags &= ~SECOND_HALVES;
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	mm->context.noexec = 0;
 	update_mm(mm, tsk);
 }
@@ -312,6 +449,8 @@ int s390_enable_sie(void)
 	tsk->mm = tsk->active_mm = mm;
 	preempt_disable();
 	update_mm(mm, tsk);
+	atomic_inc(&mm->context.attach_count);
+	atomic_dec(&old_mm->context.attach_count);
 	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
 	preempt_enable();
 	task_unlock(tsk);
author	Ingo Molnar <mingo@elte.hu>	2011-01-04 08:43:42 (GMT)
committer	Ingo Molnar <mingo@elte.hu>	2011-01-04 08:43:42 (GMT)
commit	bc030d6cb9532877c1c5a3f5e7123344fa24a285 (patch)
tree	d223d410b868b80d4c0deec192d354a5d06b201a /arch/s390/mm
parent	d3bd058826aa8b79590cca6c8e6d1557bf576ada (diff)
parent	387c31c7e5c9805b0aef8833d1731a5fe7bdea14 (diff)
download	linux-bc030d6cb9532877c1c5a3f5e7123344fa24a285.tar.xz