From 7d7b4ae418728916456c6fd03a97ef35345ff30d Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Fri, 25 Mar 2016 17:30:07 +0800
Subject: arm64: cpufeature: append additional id_aa64mmfr2 fields to cpufeature

There are some new CPU features that can be identified from id_aa64mmfr2, so this patch adds definitions for all of its fields.

Signed-off-by: Kefeng Wang
Signed-off-by: Will Deacon

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 1287416..19182ef 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -145,7 +145,11 @@ #define ID_AA64MMFR1_VMIDBITS_16 2 /* id_aa64mmfr2 */ +#define ID_AA64MMFR2_LVA_SHIFT 16 +#define ID_AA64MMFR2_IESB_SHIFT 12 +#define ID_AA64MMFR2_LSM_SHIFT 8 #define ID_AA64MMFR2_UAO_SHIFT 4 +#define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ #define ID_AA64DFR0_CTX_CMPS_SHIFT 28
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 943f514..677f17c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -130,7 +130,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { }; static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_LVA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_LSM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_CNP_SHIFT, 4, 0), ARM64_FTR_END, }; -- cgit v0.10.2

From b5fda7ed5c63297d0f43e608b455d82ceabdebf0 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang
Date: Fri, 25 Mar 2016 11:08:55 +0800
Subject: arm64: cpuidle: make arm_cpuidle_suspend() a bit more efficient

Currently, we check two pointers on every idle state entry: cpu_ops and cpu_suspend. These pointer checks can be avoided: if cpu_ops has not been registered, arm_cpuidle_init() will return -EOPNOTSUPP, so arm_cpuidle_suspend() will never have a chance to run. In other words, the cpu_ops check can be avoided. Similarly, the cpu_suspend check can be avoided in this hot path by moving it into arm_cpuidle_init().

I measured the cumulative time of 4096 calls from the arm_cpuidle_suspend entry point to the cpu_psci_cpu_suspend entry point. The HW platform is a Marvell BG4CT STB board.

1. only one shell, no other processes, secondary CPUs hot-unplugged, executing the following cmd: while true; do sleep 0.2; done

before the patch: 1581220ns
after the patch: 1579630ns
reduced by 0.1%

2. only one shell, no other processes, secondary CPUs hot-unplugged, executing the following cmd: while true; do md5sum /tmp/testfile; sleep 0.2; done

NOTE: the testfile size should be larger than the L1+L2 cache size

before the patch: 1961960ns
after the patch: 1912500ns
reduced by 2.5%

So the more complex the system load, the bigger the improvement.
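A hedged sketch of the contract this relies on (illustrative C only, not the kernel sources; NR_CPUS, smp_processor_id() and the struct layout are taken as given): both pointer checks move to init time, so cpuidle only ever reaches the suspend hot path on a CPU for which arm_cpuidle_init() succeeded, and that path becomes a single indirect call:

    struct cpu_operations {
            int (*cpu_init_idle)(unsigned int cpu);
            int (*cpu_suspend)(int index);
    };

    static struct cpu_operations *cpu_ops[NR_CPUS];   /* registration table */

    int arm_cpuidle_init(unsigned int cpu)
    {
            /* refuse CPUs that lack either hook, once, at init time */
            if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_suspend ||
                !cpu_ops[cpu]->cpu_init_idle)
                    return -EOPNOTSUPP;
            return cpu_ops[cpu]->cpu_init_idle(cpu);
    }

    int arm_cpuidle_suspend(int index)
    {
            /* hot path: no NULL checks left, just the indirect call */
            return cpu_ops[smp_processor_id()]->cpu_suspend(index);
    }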
Signed-off-by: Jisheng Zhang Acked-by: Lorenzo Pieralisi Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index 9047cab6..e11857f 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -19,7 +19,8 @@ int __init arm_cpuidle_init(unsigned int cpu) { int ret = -EOPNOTSUPP; - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_init_idle) + if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_suspend && + cpu_ops[cpu]->cpu_init_idle) ret = cpu_ops[cpu]->cpu_init_idle(cpu); return ret; @@ -36,11 +37,5 @@ int arm_cpuidle_suspend(int index) { int cpu = smp_processor_id(); - /* - * If cpu_ops have not been registered or suspend - * has not been initialized, cpu_suspend call fails early. - */ - if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_suspend) - return -EOPNOTSUPP; return cpu_ops[cpu]->cpu_suspend(index); } -- cgit v0.10.2 From 2958987f5da2ebcf6a237c5f154d7e3340e60945 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:46 +0200 Subject: arm64/mm: ensure memstart_addr remains sufficiently aligned After choosing memstart_addr to be the highest multiple of ARM64_MEMSTART_ALIGN less than or equal to the first usable physical memory address, we clip the memblocks to the maximum size of the linear region. Since the kernel may be high up in memory, we take care not to clip the kernel itself, which means we have to clip some memory from the bottom if this occurs, to ensure that the distance between the first and the last usable physical memory address can be covered by the linear region. However, we fail to update memstart_addr if this clipping from the bottom occurs, which means that we may still end up with virtual addresses that wrap into the userland range. So increment memstart_addr as appropriate to prevent this from happening. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ea989d8..83ae8b5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -190,8 +190,12 @@ void __init arm64_memblock_init(void) */ memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); - if (memblock_end_of_DRAM() > linear_region_size) - memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) { + /* ensure that memstart_addr remains sufficiently aligned */ + memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size, + ARM64_MEMSTART_ALIGN); + memblock_remove(0, memstart_addr); + } /* * Apply the memory limit if it was set. Since the kernel may be loaded -- cgit v0.10.2 From 06e9bf2fd9b372bc1c757995b6ca1cfab0720acb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:47 +0200 Subject: arm64: choose memstart_addr based on minimum sparsemem section alignment This redefines ARM64_MEMSTART_ALIGN in terms of the minimal alignment required by sparsemem vmemmap. This comes down to using 1 GB for all translation granules if CONFIG_SPARSEMEM_VMEMMAP is enabled. 
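As a rough worked example of where these alignments land (assuming the usual arm64 value SECTION_SIZE_BITS = 30 and default translation levels; the hunk below is authoritative, and its stripped #include is presumably <asm/sparsemem.h>, which provides SECTION_SIZE_BITS):

     4 KB granule: ARM64_MEMSTART_SHIFT = PUD_SHIFT     = 30           -> 1 GB
    16 KB granule: ARM64_MEMSTART_SHIFT = PMD_SHIFT + 5 = 25 + 5 = 30  -> 1 GB
    64 KB granule: ARM64_MEMSTART_SHIFT = PMD_SHIFT     = 29           -> 512 MB,
                   raised to 1 << SECTION_SIZE_BITS = 1 GB when CONFIG_SPARSEMEM_VMEMMAP=y

which matches the "1 GB for all translation granules" statement above.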
Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 5c6375d..7e51d1b 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include /* * The linear mapping and the start of memory are both 2M aligned (per @@ -86,10 +87,24 @@ * (64k granule), or a multiple that can be mapped using contiguous bits * in the page tables: 32 * PMD_SIZE (16k granule) */ -#ifdef CONFIG_ARM64_64K_PAGES -#define ARM64_MEMSTART_ALIGN SZ_512M +#if defined(CONFIG_ARM64_4K_PAGES) +#define ARM64_MEMSTART_SHIFT PUD_SHIFT +#elif defined(CONFIG_ARM64_16K_PAGES) +#define ARM64_MEMSTART_SHIFT (PMD_SHIFT + 5) #else -#define ARM64_MEMSTART_ALIGN SZ_1G +#define ARM64_MEMSTART_SHIFT PMD_SHIFT +#endif + +/* + * sparsemem vmemmap imposes an additional requirement on the alignment of + * memstart_addr, due to the fact that the base of the vmemmap region + * has a direct correspondence, and needs to appear sufficiently aligned + * in the virtual address space. + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS +#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS) +#else +#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT) #endif #endif /* __ASM_KERNEL_PGTABLE_H */ -- cgit v0.10.2 From 3bab79edc67152cb6cadb12773b7c7d05228eb77 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:48 +0200 Subject: Revert "arm64: account for sparsemem section alignment when choosing vmemmap offset" This reverts commit 36e5cd6b897e17d03008f81e075625d8e43e52d0, since the section alignment is now guaranteed by construction when choosing the value of memstart_addr. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 989fef1..aa6106a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -32,14 +32,13 @@ * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define VMEMMAP_START (VMALLOC_END + SZ_64K) -#define vmemmap ((struct page *)VMEMMAP_START - \ - SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL -- cgit v0.10.2 From 177e15f0c1444cd392374ec7175c4787fd911369 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 15:18:42 +0200 Subject: arm64: add the initrd region to the linear mapping explicitly Instead of going out of our way to relocate the initrd if it turns out to occupy memory that is not covered by the linear mapping, just add the initrd to the linear mapping. This puts the burden on the bootloader to pass initrd= and mem= options that are mutually consistent. Note that, since the placement of the linear region in the PA space is also dependent on the placement of the kernel Image, which may reside anywhere in memory, we may still end up with a situation where the initrd and the kernel Image are simply too far apart to be covered by the linear region. 
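For a sense of where the 32 GB figure used below comes from (a back-of-the-envelope derivation, not part of the patch): the linear region covers half of the kernel VA space, and the smallest VA configuration supported at this point is 36 bits (16 KB pages, two translation levels), so

    linear region size = 2^(VA_BITS - 1), min VA_BITS = 36  =>  2^35 bytes = 32 GB

meaning the kernel Image and the initrd must land within a 32 GB window for every granule/#levels combination to be able to map both.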
Since we now leave it up to the bootloader to pass the initrd in memory that is guaranteed to be accessible by the kernel, add a mention of this to the arm64 boot protocol specification as well. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt index 56d6d8b..8d0df62 100644 --- a/Documentation/arm64/booting.txt +++ b/Documentation/arm64/booting.txt @@ -132,6 +132,10 @@ NOTE: versions prior to v4.6 cannot make use of memory below the physical offset of the Image so it is recommended that the Image be placed as close as possible to the start of system RAM. +If an initrd/initramfs is passed to the kernel at boot, it must reside +entirely within a 1 GB aligned physical memory window of up to 32 GB in +size that fully covers the kernel Image as well. + Any memory described to the kernel (even that below the start of the image) which is not marked as reserved from the kernel (e.g., with a memreserve region in the device tree) will be considered as available to diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 83ae8b5..82ced5f 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -207,6 +207,35 @@ void __init arm64_memblock_init(void) memblock_add(__pa(_text), (u64)(_end - _text)); } + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_start) { + /* + * Add back the memory we just removed if it results in the + * initrd to become inaccessible via the linear mapping. + * Otherwise, this is a no-op + */ + u64 base = initrd_start & PAGE_MASK; + u64 size = PAGE_ALIGN(initrd_end) - base; + + /* + * We can only add back the initrd memory if we don't end up + * with more memory than we can address via the linear mapping. + * It is up to the bootloader to position the kernel and the + * initrd reasonably close to each other (i.e., within 32 GB of + * each other) so that all granule/#levels combinations can + * always access both. + */ + if (WARN(base < memblock_start_of_DRAM() || + base + size > memblock_start_of_DRAM() + + linear_region_size, + "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) { + initrd_start = 0; + } else { + memblock_remove(base, size); /* clear MEMBLOCK_ flags */ + memblock_add(base, size); + memblock_reserve(base, size); + } + } + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern u16 memstart_offset_seed; u64 range = linear_region_size - -- cgit v0.10.2 From 8923a16686569375bfa25b877769f9a3093e9f22 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 15:18:43 +0200 Subject: arm64: remove the now unneeded relocate_initrd() This removes the relocate_initrd() implementation and invocation, which are no longer needed now that the placement of the initrd is guaranteed to be covered by the linear mapping. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 9dc6776..7b85b1d 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -224,69 +224,6 @@ static void __init request_standard_resources(void) } } -#ifdef CONFIG_BLK_DEV_INITRD -/* - * Relocate initrd if it is not completely within the linear mapping. - * This would be the case if mem= cuts out all or part of it. 
- */ -static void __init relocate_initrd(void) -{ - phys_addr_t orig_start = __virt_to_phys(initrd_start); - phys_addr_t orig_end = __virt_to_phys(initrd_end); - phys_addr_t ram_end = memblock_end_of_DRAM(); - phys_addr_t new_start; - unsigned long size, to_free = 0; - void *dest; - - if (orig_end <= ram_end) - return; - - /* - * Any of the original initrd which overlaps the linear map should - * be freed after relocating. - */ - if (orig_start < ram_end) - to_free = ram_end - orig_start; - - size = orig_end - orig_start; - if (!size) - return; - - /* initrd needs to be relocated completely inside linear mapping */ - new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn), - size, PAGE_SIZE); - if (!new_start) - panic("Cannot relocate initrd of size %ld\n", size); - memblock_reserve(new_start, size); - - initrd_start = __phys_to_virt(new_start); - initrd_end = initrd_start + size; - - pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n", - orig_start, orig_start + size - 1, - new_start, new_start + size - 1); - - dest = (void *)initrd_start; - - if (to_free) { - memcpy(dest, (void *)__phys_to_virt(orig_start), to_free); - dest += to_free; - } - - copy_from_early_mem(dest, orig_start + to_free, size - to_free); - - if (to_free) { - pr_info("Freeing original RAMDISK from [%llx-%llx]\n", - orig_start, orig_start + to_free - 1); - memblock_free(orig_start, to_free); - } -} -#else -static inline void __init relocate_initrd(void) -{ -} -#endif - u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; void __init setup_arch(char **cmdline_p) @@ -327,7 +264,6 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); paging_init(); - relocate_initrd(); kasan_init(); -- cgit v0.10.2 From 97bbb54e4f27f4c63645f603bf767183ee0dd549 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 16:45:56 +0200 Subject: arm64: vdso: avoid virt_to_page() translations on kernel symbols The translation performed by virt_to_page() is only valid for linear addresses, and kernel symbols are no longer in the linear mapping. So perform the __pa() translation explicitly, which does the right thing in either case, and only then translate to a struct page offset. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 97bc68f..64fc030 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -131,11 +131,11 @@ static int __init vdso_init(void) return -ENOMEM; /* Grab the vDSO data page. */ - vdso_pagelist[0] = virt_to_page(vdso_data); + vdso_pagelist[0] = pfn_to_page(PHYS_PFN(__pa(vdso_data))); /* Grab the vDSO code pages. */ for (i = 0; i < vdso_pages; i++) - vdso_pagelist[i + 1] = virt_to_page(&vdso_start + i * PAGE_SIZE); + vdso_pagelist[i + 1] = pfn_to_page(PHYS_PFN(__pa(&vdso_start)) + i); /* Populate the special mapping structures */ vdso_spec[0] = (struct vm_special_mapping) { -- cgit v0.10.2 From d386825c959efeaae3315c25dd2d2874c68829c7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 16:45:57 +0200 Subject: arm64: mm: free __init memory via the linear mapping The implementation of free_initmem_default() expects __init_begin and __init_end to be covered by the linear mapping, which is no longer the case. So open code it instead, using addresses that are explicitly translated from kernel virtual to linear virtual. 
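A minimal sketch of the translation idiom used in the hunk below (the helper name is hypothetical — the kernel open-codes the round trip here): __pa() accepts both kernel-image and linear addresses, while __va() always returns the linear-map alias of a physical address, so __va(__pa(x)) turns an image-symbol address into the address at which the linear mapping sees the same memory:

    /* hypothetical helper, illustrative only */
    static inline void *linear_alias(void *kimage_addr)
    {
            return __va(__pa(kimage_addr));   /* image VA -> PA -> linear VA */
    }

    /* equivalent to the call added below */
    free_reserved_area(linear_alias(__init_begin), linear_alias(__init_end),
                       0, "unused kernel");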
Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 82ced5f..89376f3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -452,7 +452,8 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_reserved_area(__va(__pa(__init_begin)), __va(__pa(__init_end)), + 0, "unused kernel"); fixup_init(); } -- cgit v0.10.2 From 22b6f3b0549be61a2d5fe8210ff7628e6d2d8185 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 16:45:58 +0200 Subject: arm64: mm: avoid virt_to_page() translation for the zero page The zero page is statically allocated, so grab its struct page pointer without using virt_to_page(), which will be restricted to the linear mapping later. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index aa6106a..377257a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -57,7 +57,7 @@ extern void __pgd_error(const char *file, int line, unsigned long val); * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) +#define ZERO_PAGE(vaddr) pfn_to_page(PHYS_PFN(__pa(empty_zero_page))) #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) -- cgit v0.10.2 From e44308e62e19b42810207780a2a32148af0cb5d9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 16:45:59 +0200 Subject: arm64: insn: avoid virt_to_page() translations on core kernel symbols Before restricting virt_to_page() to the linear mapping, ensure that the text patching code does not use it to resolve references into the core kernel text, which is mapped in the vmalloc area. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 7371455..368c082 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -96,7 +96,7 @@ static void __kprobes *patch_map(void *addr, int fixmap) if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX)) page = vmalloc_to_page(addr); else if (!module && IS_ENABLED(CONFIG_DEBUG_RODATA)) - page = virt_to_page(addr); + page = pfn_to_page(PHYS_PFN(__pa(addr))); else return addr; -- cgit v0.10.2 From 3e1907d5bf5a1e0b182ee599f92586f0165029e2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 16:46:00 +0200 Subject: arm64: mm: move vmemmap region right below the linear region This moves the vmemmap region right below PAGE_OFFSET, aka the start of the linear region, and redefines its size to be a power of two. Due to the placement of PAGE_OFFSET in the middle of the address space, whose size is a power of two as well, this guarantees that virt to page conversions and vice versa can be implemented efficiently, by masking and shifting rather than ordinary arithmetic. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 12f8a00..8a2ab19 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -40,6 +40,21 @@ #define PCI_IO_SIZE SZ_16M /* + * Log2 of the upper bound of the size of a struct page. Used for sizing + * the vmemmap region only, does not affect actual memory footprint. 
+ * We don't use sizeof(struct page) directly since taking its size here + * requires its definition to be available at this point in the inclusion + * chain, and it may not be a power of 2 in the first place. + */ +#define STRUCT_PAGE_MAX_SHIFT 6 + +/* + * VMEMMAP_SIZE - allows the whole linear region to be covered by + * a struct page array + */ +#define VMEMMAP_SIZE (UL(1) << (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)) + +/* * PAGE_OFFSET - the virtual address of the start of the kernel image (top * (VA_BITS - 1)) * VA_BITS - the maximum number of bits for virtual addresses. @@ -54,7 +69,8 @@ #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) #define MODULES_VSIZE (SZ_128M) -#define PCI_IO_END (PAGE_OFFSET - SZ_2M) +#define VMEMMAP_START (PAGE_OFFSET - VMEMMAP_SIZE) +#define PCI_IO_END (VMEMMAP_START - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) #define TASK_SIZE_64 (UL(1) << VA_BITS) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 377257a..2abfa4d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -24,20 +24,15 @@ #include /* - * VMALLOC and SPARSEMEM_VMEMMAP ranges. + * VMALLOC range. * - * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array - * (rounded up to PUD_SIZE). * VMALLOC_START: beginning of the kernel vmalloc space - * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, - * fixed mappings and modules + * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space + * and fixed mappings */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) - #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) -#define VMEMMAP_START (VMALLOC_END + SZ_64K) #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index f9271cb..a21f4742 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -37,14 +37,14 @@ enum address_markers_idx { MODULES_END_NR, VMALLOC_START_NR, VMALLOC_END_NR, -#ifdef CONFIG_SPARSEMEM_VMEMMAP - VMEMMAP_START_NR, - VMEMMAP_END_NR, -#endif FIXADDR_START_NR, FIXADDR_END_NR, PCI_START_NR, PCI_END_NR, +#ifdef CONFIG_SPARSEMEM_VMEMMAP + VMEMMAP_START_NR, + VMEMMAP_END_NR, +#endif KERNEL_SPACE_NR, }; @@ -53,14 +53,14 @@ static struct addr_marker address_markers[] = { { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() Area" }, { VMALLOC_END, "vmalloc() End" }, -#ifdef CONFIG_SPARSEMEM_VMEMMAP - { 0, "vmemmap start" }, - { 0, "vmemmap end" }, -#endif { FIXADDR_START, "Fixmap start" }, { FIXADDR_TOP, "Fixmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, +#ifdef CONFIG_SPARSEMEM_VMEMMAP + { 0, "vmemmap start" }, + { 0, "vmemmap end" }, +#endif { PAGE_OFFSET, "Linear Mapping" }, { -1, NULL }, }; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 89376f3..d55d720 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -412,6 +412,10 @@ void __init mem_init(void) MLK_ROUNDUP(__start_rodata, _etext), MLK_ROUNDUP(__init_begin, __init_end), MLK_ROUNDUP(_sdata, _edata)); + pr_cont(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n", + MLK(FIXADDR_START, FIXADDR_TOP)); + pr_cont(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n", + MLM(PCI_IO_START, PCI_IO_END)); #ifdef 
CONFIG_SPARSEMEM_VMEMMAP pr_cont(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" " 0x%16lx - 0x%16lx (%6ld MB actual)\n", @@ -420,10 +424,6 @@ void __init mem_init(void) MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), (unsigned long)virt_to_page(high_memory))); #endif - pr_cont(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n", - MLK(FIXADDR_START, FIXADDR_TOP)); - pr_cont(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n", - MLM(PCI_IO_START, PCI_IO_END)); pr_cont(" memory : 0x%16lx - 0x%16lx (%6ld MB)\n", MLM(__phys_to_virt(memblock_start_of_DRAM()), (unsigned long)high_memory)); @@ -440,6 +440,12 @@ void __init mem_init(void) BUILD_BUG_ON(TASK_SIZE_32 > TASK_SIZE_64); #endif + /* + * Make sure we chose the upper bound of sizeof(struct page) + * correctly. + */ + BUILD_BUG_ON(sizeof(struct page) > (1 << STRUCT_PAGE_MAX_SHIFT)); + if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) { extern int sysctl_overcommit_memory; /* -- cgit v0.10.2 From 9f2875912dac35d9272a82ea9eec9e5884b42cd2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 16:46:01 +0200 Subject: arm64: mm: restrict virt_to_page() to the linear mapping Now that the vmemmap region has been redefined to cover the linear region rather than the entire physical address space, we no longer need to perform a virtual-to-physical translation in the implementaion of virt_to_page(). This restricts virt_to_page() translations to the linear region, so redefine virt_addr_valid() as well. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 8a2ab19..f412f50 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -208,9 +208,19 @@ static inline void *phys_to_virt(phys_addr_t x) */ #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) +#ifndef CONFIG_SPARSEMEM_VMEMMAP #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#else +#define __virt_to_pgoff(kaddr) (((u64)(kaddr) & ~PAGE_OFFSET) / PAGE_SIZE * sizeof(struct page)) +#define __page_to_voff(kaddr) (((u64)(page) & ~VMEMMAP_START) * PAGE_SIZE / sizeof(struct page)) + +#define page_to_virt(page) ((void *)((__page_to_voff(page)) | PAGE_OFFSET)) +#define virt_to_page(vaddr) ((struct page *)((__virt_to_pgoff(vaddr)) | VMEMMAP_START)) +#define virt_addr_valid(kaddr) pfn_valid((((u64)(kaddr) & ~PAGE_OFFSET) \ + + PHYS_OFFSET) >> PAGE_SHIFT) +#endif #endif #include -- cgit v0.10.2 From 2c09ec06bc39fc85a2b3856524348c301def27af Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 17:43:06 +0200 Subject: arm64: use 'segment' rather than 'chunk' to describe mapped kernel regions Replace the poorly defined term chunk with segment, which is a term that is already used by the ELF spec to describe contiguous mappings with the same permission attributes of statically allocated ranges of an executable. 
Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f3e5c74..9be2065 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -471,8 +471,8 @@ void fixup_init(void) unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } -static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot, struct vm_struct *vma) +static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, + pgprot_t prot, struct vm_struct *vma) { phys_addr_t pa_start = __pa(va_start); unsigned long size = va_end - va_start; @@ -499,11 +499,11 @@ static void __init map_kernel(pgd_t *pgd) { static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; - map_kernel_chunk(pgd, _stext, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); - map_kernel_chunk(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata); - map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, - &vmlinux_init); - map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); + map_kernel_segment(pgd, _stext, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata); + map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, + &vmlinux_init); + map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { /* -- cgit v0.10.2 From 546c8c44f092b2f23291fe499c221efc8cabbb67 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 17:43:07 +0200 Subject: arm64: move early boot code to the .init segment Apart from the arm64/linux and EFI header data structures, there is nothing in the .head.text section that must reside at the beginning of the Image. So let's move it to the .init section where it belongs. Note that this involves some minor tweaking of the EFI header, primarily because the address of 'stext' no longer coincides with the start of the .text section. It also requires a couple of relocated symbol references to be slightly rewritten or their definition moved to the linker script. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index cae3112..e88c064 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -62,7 +62,7 @@ ENTRY(entry) */ mov x20, x0 // DTB address ldr x0, [sp, #16] // relocated _text address - movz x21, #:abs_g0:stext_offset + ldr w21, =stext_offset add x21, x0, x21 /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4203d5f..b434176 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -102,8 +102,6 @@ _head: #endif #ifdef CONFIG_EFI - .globl __efistub_stext_offset - .set __efistub_stext_offset, stext - _head .align 3 pe_header: .ascii "PE" @@ -123,11 +121,11 @@ optional_header: .short 0x20b // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long _end - stext // SizeOfCode + .long _end - efi_header_end // SizeOfCode .long 0 // SizeOfInitializedData .long 0 // SizeOfUninitializedData .long __efistub_entry - _head // AddressOfEntryPoint - .long __efistub_stext_offset // BaseOfCode + .long efi_header_end - _head // BaseOfCode extra_header_fields: .quad 0 // ImageBase @@ -144,7 +142,7 @@ extra_header_fields: .long _end - _head // SizeOfImage // Everything before the kernel image is considered part of the header - .long __efistub_stext_offset // SizeOfHeaders + .long efi_header_end - _head // SizeOfHeaders .long 0 // CheckSum .short 0xa // Subsystem (EFI application) .short 0 // DllCharacteristics @@ -188,10 +186,10 @@ section_table: .byte 0 .byte 0 .byte 0 // end of 0 padding of section name - .long _end - stext // VirtualSize - .long __efistub_stext_offset // VirtualAddress - .long _edata - stext // SizeOfRawData - .long __efistub_stext_offset // PointerToRawData + .long _end - efi_header_end // VirtualSize + .long efi_header_end - _head // VirtualAddress + .long _edata - efi_header_end // SizeOfRawData + .long efi_header_end - _head // PointerToRawData .long 0 // PointerToRelocations (0 for executables) .long 0 // PointerToLineNumbers (0 for executables) @@ -200,15 +198,18 @@ section_table: .long 0xe0500020 // Characteristics (section flags) /* - * EFI will load stext onwards at the 4k section alignment + * EFI will load .text onwards at the 4k section alignment * described in the PE/COFF header. To ensure that instruction * sequences using an adrp and a :lo12: immediate will function - * correctly at this alignment, we must ensure that stext is + * correctly at this alignment, we must ensure that .text is * placed at a 4k boundary in the Image to begin with. */ .align 12 +efi_header_end: #endif + __INIT + ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode @@ -223,12 +224,12 @@ ENTRY(stext) * the TCR will have been set. */ ldr x27, 0f // address to jump to after - // MMU has been enabled + neg x27, x27 // MMU has been enabled adr_l lr, __enable_mmu // return (PIC) address b __cpu_setup // initialise processor ENDPROC(stext) .align 3 -0: .quad __mmap_switched - (_head - TEXT_OFFSET) + KIMAGE_VADDR +0: .quad (_text - TEXT_OFFSET) - __mmap_switched - KIMAGE_VADDR /* * Preserve the arguments passed by the bootloader in x0 .. 
x3 @@ -397,7 +398,7 @@ __create_page_tables: ldr x5, =KIMAGE_VADDR add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 - ldr w6, kernel_img_size + ldr w6, =kernel_img_size add x6, x6, x5 mov x3, x24 // phys offset create_block_map x0, x7, x3, x5, x6 @@ -414,9 +415,6 @@ __create_page_tables: ret x28 ENDPROC(__create_page_tables) - -kernel_img_size: - .long _end - (_head - TEXT_OFFSET) .ltorg /* diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 5e360ce..4fd72da 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -71,8 +71,12 @@ DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); +kernel_img_size = _end - (_text - TEXT_OFFSET); + #ifdef CONFIG_EFI +__efistub_stext_offset = stext - _text; + /* * Prevent the symbol aliases below from being emitted into the kallsyms * table, by forcing them to be absolute symbols (which are conveniently -- cgit v0.10.2 From 7eb90f2ff7e3ee814ff12f3cd909b965cdd4a869 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 17:43:08 +0200 Subject: arm64: cover the .head.text section in the .text segment mapping Keeping .head.text out of the .text mapping buys us very little: its actual payload is only 4 KB, most of which is padding, but the page alignment may add up to 2 MB (in case of CONFIG_DEBUG_ALIGN_RODATA=y) of additional padding to the uncompressed kernel Image. Also, on 4 KB granule kernels, the 4 KB misalignment of .text forces us to map the adjacent 56 KB of code without the PTE_CONT attribute, and since this region contains things like the vector table and the GIC interrupt handling entry point, this region is likely to benefit from the reduced TLB pressure that results from PTE_CONT mappings. So remove the alignment between the .head.text and .text sections, and use the [_text, _etext) rather than the [_stext, _etext) interval for mapping the .text segment. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 5a1939a..61a1075 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -96,7 +96,6 @@ SECTIONS _text = .; HEAD_TEXT } - ALIGN_DEBUG_RO_MIN(PAGE_SIZE) .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9be2065..dbad533 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -385,7 +385,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { - unsigned long kernel_start = __pa(_stext); + unsigned long kernel_start = __pa(_text); unsigned long kernel_end = __pa(_etext); /* @@ -417,7 +417,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end early_pgtable_alloc); /* - * Map the linear alias of the [_stext, _etext) interval as + * Map the linear alias of the [_text, _etext) interval as * read-only/non-executable. This makes the contents of the * region accessible to subsystems such as hibernate, but * protects it from inadvertent modification or execution. 
@@ -449,8 +449,8 @@ void mark_rodata_ro(void) { unsigned long section_size; - section_size = (unsigned long)__start_rodata - (unsigned long)_stext; - create_mapping_late(__pa(_stext), (unsigned long)_stext, + section_size = (unsigned long)__start_rodata - (unsigned long)_text; + create_mapping_late(__pa(_text), (unsigned long)_text, section_size, PAGE_KERNEL_ROX); /* * mark .rodata as read only. Use _etext rather than __end_rodata to @@ -499,7 +499,7 @@ static void __init map_kernel(pgd_t *pgd) { static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; - map_kernel_segment(pgd, _stext, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata); map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init); -- cgit v0.10.2 From 97740051dd31d200a0efaa84544fe5e4713aac40 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 17:43:09 +0200 Subject: arm64: simplify kernel segment mapping granularity The mapping of the kernel consist of four segments, each of which is mapped with different permission attributes and/or lifetimes. To optimize the TLB and translation table footprint, we define various opaque constants in the linker script that resolve to different aligment values depending on the page size and whether CONFIG_DEBUG_ALIGN_RODATA is set. Considering that - a 4 KB granule kernel benefits from a 64 KB segment alignment (due to the fact that it allows the use of the contiguous bit), - the minimum alignment of the .data segment is THREAD_SIZE already, not PAGE_SIZE (i.e., we already have padding between _data and the start of the .data payload in many cases), - 2 MB is a suitable alignment value on all granule sizes, either for mapping directly (level 2 on 4 KB), or via the contiguous bit (level 3 on 16 KB and 64 KB), - anything beyond 2 MB exceeds the minimum alignment mandated by the boot protocol, and can only be mapped efficiently if the physical alignment happens to be the same, we can simplify this by standardizing on 64 KB (or 2 MB) explicitly, i.e., regardless of granule size, all segments are aligned either to 64 KB, or to 2 MB if CONFIG_DEBUG_ALIGN_RODATA=y. This also means we can drop the Kconfig dependency of CONFIG_DEBUG_ALIGN_RODATA on CONFIG_ARM64_4K_PAGES. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 7e76845..710fde4 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -59,7 +59,7 @@ config DEBUG_RODATA If in doubt, say Y config DEBUG_ALIGN_RODATA - depends on DEBUG_RODATA && ARM64_4K_PAGES + depends on DEBUG_RODATA bool "Align linker sections up to SECTION_SIZE" help If this option is enabled, sections that may potentially be marked as diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 61a1075..77d86c9 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -63,14 +63,19 @@ PECOFF_FILE_ALIGNMENT = 0x200; #endif #if defined(CONFIG_DEBUG_ALIGN_RODATA) -#define ALIGN_DEBUG_RO . = ALIGN(1< Date: Wed, 6 Apr 2016 10:42:28 +0200 Subject: arm64/debug: Remove superfluous SMP function call Since commit 1cf4f629d9d2 ("cpu/hotplug: Move online calls to hotplugged cpu") it is ensured that callbacks of CPU_ONLINE and CPU_DOWN_PREPARE are processed on the hotplugged CPU. 
Due to this SMP function calls are no longer required. Replace smp_call_function_single() with a direct call to clear_os_lock(). The function writes the OSLAR register to clear OS locking. This does not require to be called with interrupts disabled, therefore the smp_call_function_single() calling convention is not preserved. Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Anna-Maria Gleixner Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index c45f296..4fbf3c5 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -135,9 +135,8 @@ static void clear_os_lock(void *unused) static int os_lock_notify(struct notifier_block *self, unsigned long action, void *data) { - int cpu = (unsigned long)data; if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) - smp_call_function_single(cpu, clear_os_lock, NULL, 1); + clear_os_lock(NULL); return NOTIFY_OK; } -- cgit v0.10.2 From 4bc49274403395db41dc4ae7d2080346b2319d34 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Wed, 6 Apr 2016 10:42:49 +0200 Subject: arm64: hw-breakpoint: Remove superfluous SMP function call Since commit 1cf4f629d9d2 ("cpu/hotplug: Move online calls to hotplugged cpu") it is ensured that callbacks of CPU_ONLINE and CPU_DOWN_PREPARE are processed on the hotplugged CPU. Due to this SMP function calls are no longer required. Replace smp_call_function_single() with a direct call of hw_breakpoint_reset(). To keep the calling convention, interrupts are explicitly disabled around the call. Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Anna-Maria Gleixner Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index b45c95d..11dc50a 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -886,9 +886,11 @@ static int hw_breakpoint_reset_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - int cpu = (long)hcpu; - if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) - smp_call_function_single(cpu, hw_breakpoint_reset, NULL, 1); + if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) { + local_irq_disable(); + hw_breakpoint_reset(NULL); + local_irq_enable(); + } return NOTIFY_OK; } -- cgit v0.10.2 From 6afedcd23cfd7ac56c011069e4a8db37b46e4623 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 13 Apr 2016 13:40:00 +0100 Subject: arm64: mm: Add trace_irqflags annotations to do_debug_exception() With CONFIG_PROVE_LOCKING, CONFIG_DEBUG_LOCKDEP and CONFIG_TRACE_IRQFLAGS enabled, lockdep will compare current->hardirqs_enabled with the flags from local_irq_save(). When a debug exception occurs, interrupts are disabled in entry.S, but lockdep isn't told, resulting in: DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) ------------[ cut here ]------------ WARNING: at ../kernel/locking/lockdep.c:3523 Modules linked in: CPU: 3 PID: 1752 Comm: perf Not tainted 4.5.0-rc4+ #2204 Hardware name: ARM Juno development board (r1) (DT) task: ffffffc974868000 ti: ffffffc975f40000 task.ti: ffffffc975f40000 PC is at check_flags.part.35+0x17c/0x184 LR is at check_flags.part.35+0x17c/0x184 pc : [] lr : [] pstate: 600003c5 [...] 
---[ end trace 74631f9305ef5020 ]--- Call trace: [] check_flags.part.35+0x17c/0x184 [] lock_acquire+0xa8/0xc4 [] breakpoint_handler+0x118/0x288 [] do_debug_exception+0x3c/0xa8 [] el1_dbg+0x18/0x6c [] do_filp_open+0x64/0xdc [] do_sys_open+0x140/0x204 [] SyS_openat+0x10/0x18 [] el0_svc_naked+0x24/0x28 possible reason: unannotated irqs-off. irq event stamp: 65857 hardirqs last enabled at (65857): [] lookup_mnt+0xf4/0x1b4 hardirqs last disabled at (65856): [] lookup_mnt+0xbc/0x1b4 softirqs last enabled at (65790): [] __do_softirq+0x1f8/0x290 softirqs last disabled at (65757): [] irq_exit+0x9c/0xd0 This patch adds the annotations to do_debug_exception(), while trying not to call trace_hardirqs_off() if el1_dbg() interrupted a task that already had irqs disabled. Signed-off-by: James Morse Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 95df28b..c12e967 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -555,20 +555,33 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, { const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr); struct siginfo info; + int rv; - if (!inf->fn(addr, esr, regs)) - return 1; + /* + * Tell lockdep we disabled irqs in entry.S. Do nothing if they were + * already disabled to preserve the last enabled/disabled addresses. + */ + if (interrupts_enabled(regs)) + trace_hardirqs_off(); - pr_alert("Unhandled debug exception: %s (0x%08x) at 0x%016lx\n", - inf->name, esr, addr); + if (!inf->fn(addr, esr, regs)) { + rv = 1; + } else { + pr_alert("Unhandled debug exception: %s (0x%08x) at 0x%016lx\n", + inf->name, esr, addr); + + info.si_signo = inf->sig; + info.si_errno = 0; + info.si_code = inf->code; + info.si_addr = (void __user *)addr; + arm64_notify_die("", regs, &info, 0); + rv = 0; + } - info.si_signo = inf->sig; - info.si_errno = 0; - info.si_code = inf->code; - info.si_addr = (void __user *)addr; - arm64_notify_die("", regs, &info, 0); + if (interrupts_enabled(regs)) + trace_hardirqs_on(); - return 0; + return rv; } #ifdef CONFIG_ARM64_PAN -- cgit v0.10.2 From 17eebd1a435c8616c47b715e3447f4a9c15b741f Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 12 Apr 2016 15:46:00 +0100 Subject: arm64: Add cpu_panic_kernel helper During the activation of a secondary CPU, we could report serious configuration issues and hence request to crash the kernel. We do this for CPU ASID bit check now. We will need it also for handling mismatched exception levels for the CPUs with VHE. Hence, add a helper to do the same for reusability. Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 817a067..433e504 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -113,6 +113,17 @@ static inline void update_cpu_boot_status(int val) dsb(ishst); } +/* + * The calling secondary CPU has detected serious configuration mismatch, + * which calls for a kernel panic. Update the boot status and park the calling + * CPU. 
+ */ +static inline void cpu_panic_kernel(void) +{ + update_cpu_boot_status(CPU_PANIC_KERNEL); + cpu_park_loop(); +} + #endif /* ifndef __ASSEMBLY__ */ #endif /* ifndef __ASM_SMP_H */
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index c90c3c5..b7b3978 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -75,8 +75,7 @@ void verify_cpu_asid_bits(void) */ pr_crit("CPU%d: smaller ASID size(%u) than boot CPU (%u)\n", smp_processor_id(), asid, asid_bits); - update_cpu_boot_status(CPU_PANIC_KERNEL); - cpu_park_loop(); + cpu_panic_kernel(); } } -- cgit v0.10.2

From ac1ad20f9ed73a22b0a72eb83206302f5fbff55c Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose
Date: Wed, 13 Apr 2016 14:41:33 +0100
Subject: arm64: vhe: Verify CPU Exception Levels

With a VHE capable CPU, the kernel can run at EL2, and this is decided at early boot. If some of the CPUs didn't start at EL2 or don't have VHE, we could have CPUs running at different exception levels, all in the same kernel! This patch adds an early check for the secondary CPUs to detect such situations.

For each non-boot CPU, add a sanity check to make sure we don't have different run levels w.r.t. the boot CPU. We save the information on whether the boot CPU is running in hyp mode or not and ensure the remaining CPUs match it.

Cc: Marc Zyngier Cc: Will Deacon Cc: Mark Rutland Cc: Catalin Marinas Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose [will: made boot_cpu_hyp_mode static] Signed-off-by: Will Deacon

diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 9f22dd6..a5c2b48 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -60,6 +60,12 @@ static inline bool is_kernel_in_hyp_mode(void) return el == CurrentEL_EL2; } +#ifdef CONFIG_ARM64_VHE +extern void verify_cpu_run_el(void); +#else +static inline void verify_cpu_run_el(void) {} +#endif + /* The section containing the hypervisor text */ extern char __hyp_text_start[]; extern char __hyp_text_end[];
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 677f17c..f586343 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -912,6 +912,7 @@ static u64 __raw_read_system_reg(u32 sys_id) */ static void check_early_cpu_features(void) { + verify_cpu_run_el(); verify_cpu_asid_bits(); }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b2d5f4e..5fbab1c 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -75,6 +75,43 @@ enum ipi_msg_type { IPI_WAKEUP }; +#ifdef CONFIG_ARM64_VHE + +/* Whether the boot CPU is running in HYP mode or not*/ +static bool boot_cpu_hyp_mode; + +static inline void save_boot_cpu_run_el(void) +{ + boot_cpu_hyp_mode = is_kernel_in_hyp_mode(); +} + +static inline bool is_boot_cpu_in_hyp_mode(void) +{ + return boot_cpu_hyp_mode; +} + +/* + * Verify that a secondary CPU is running the kernel at the same + * EL as that of the boot CPU. + */ +void verify_cpu_run_el(void) +{ + bool in_el2 = is_kernel_in_hyp_mode(); + bool boot_cpu_el2 = is_boot_cpu_in_hyp_mode(); + + if (in_el2 ^ boot_cpu_el2) { + pr_crit("CPU%d: mismatched Exception Level(EL%d) with boot CPU(EL%d)\n", + smp_processor_id(), + in_el2 ? 2 : 1, + boot_cpu_el2 ?
2 : 1); + cpu_panic_kernel(); + } +} + +#else +static inline void save_boot_cpu_run_el(void) {} +#endif + #ifdef CONFIG_HOTPLUG_CPU static int op_cpu_kill(unsigned int cpu); #else @@ -401,6 +438,7 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { cpuinfo_store_boot_cpu(); + save_boot_cpu_run_el(); set_my_cpu_offset(per_cpu_offset(smp_processor_id())); } -- cgit v0.10.2 From 500899c2cc3e3f06140373b587a69d30650f2d9d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Apr 2016 15:50:23 -0700 Subject: efi: ARM/arm64: ignore DT memory nodes instead of removing them There are two problems with the UEFI stub DT memory node removal routine: - it deletes nodes as it traverses the tree, which happens to work but is not supported, as deletion invalidates the node iterator; - deleting memory nodes entirely may discard annotations in the form of additional properties on the nodes. Since the discovery of DT memory nodes occurs strictly before the UEFI init sequence, we can simply clear the memblock memory table before parsing the UEFI memory map. This way, it is no longer necessary to remove the nodes, so we can remove that logic from the stub as well. Reviewed-by: Matt Fleming Acked-by: Steve Capper Signed-off-by: Ard Biesheuvel Signed-off-by: David Daney Signed-off-by: Will Deacon diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index aa1f743..5d6945b 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -143,6 +143,14 @@ static __init void reserve_regions(void) if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); + /* + * Discard memblocks discovered so far: if there are any at this + * point, they originate from memory nodes in the DT, and UEFI + * uses its own memory map instead. + */ + memblock_dump_all(); + memblock_remove(0, ULLONG_MAX); + for_each_efi_memory_desc(&memmap, md) { paddr = md->phys_addr; npages = md->num_pages; diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 6dba78a..e58abfa 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -24,7 +24,7 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, unsigned long map_size, unsigned long desc_size, u32 desc_ver) { - int node, prev, num_rsv; + int node, num_rsv; int status; u32 fdt_val32; u64 fdt_val64; @@ -54,28 +54,6 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, goto fdt_set_fail; /* - * Delete any memory nodes present. We must delete nodes which - * early_init_dt_scan_memory may try to use. - */ - prev = 0; - for (;;) { - const char *type; - int len; - - node = fdt_next_node(fdt, prev, NULL); - if (node < 0) - break; - - type = fdt_getprop(fdt, node, "device_type", &len); - if (type && strncmp(type, "memory", len) == 0) { - fdt_del_node(fdt, node); - continue; - } - - prev = node; - } - - /* * Delete all memory reserve map entries. When booting via UEFI, * kernel will use the UEFI memory map to find reserved regions. */ -- cgit v0.10.2 From 2bc4da1d2b4d828cb4e3a5593967556b1bd78898 Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Fri, 8 Apr 2016 15:50:24 -0700 Subject: Documentation, dt, numa: dt bindings for NUMA. Add DT bindings for numa mapping of memory, CPUs and IOs. 
Reviewed-by: Robert Richter Signed-off-by: Ganapatrao Kulkarni Signed-off-by: David Daney Acked-by: Rob Herring Signed-off-by: Will Deacon diff --git a/Documentation/devicetree/bindings/numa.txt b/Documentation/devicetree/bindings/numa.txt new file mode 100644 index 0000000..21b3505 --- /dev/null +++ b/Documentation/devicetree/bindings/numa.txt @@ -0,0 +1,275 @@ +============================================================================== +NUMA binding description. +============================================================================== + +============================================================================== +1 - Introduction +============================================================================== + +Systems employing a Non Uniform Memory Access (NUMA) architecture contain +collections of hardware resources including processors, memory, and I/O buses, +that comprise what is commonly known as a NUMA node. +Processor accesses to memory within the local NUMA node is generally faster +than processor accesses to memory outside of the local NUMA node. +DT defines interfaces that allow the platform to convey NUMA node +topology information to OS. + +============================================================================== +2 - numa-node-id +============================================================================== + +For the purpose of identification, each NUMA node is associated with a unique +token known as a node id. For the purpose of this binding +a node id is a 32-bit integer. + +A device node is associated with a NUMA node by the presence of a +numa-node-id property which contains the node id of the device. + +Example: + /* numa node 0 */ + numa-node-id = <0>; + + /* numa node 1 */ + numa-node-id = <1>; + +============================================================================== +3 - distance-map +============================================================================== + +The optional device tree node distance-map describes the relative +distance (memory latency) between all numa nodes. + +- compatible : Should at least contain "numa-distance-map-v1". + +- distance-matrix + This property defines a matrix to describe the relative distances + between all numa nodes. + It is represented as a list of node pairs and their relative distance. + + Note: + 1. Each entry represents distance from first node to second node. + The distances are equal in either direction. + 2. The distance from a node to self (local distance) is represented + with value 10 and all internode distance should be represented with + a value greater than 10. + 3. distance-matrix should have entries in lexicographical ascending + order of nodes. + 4. There must be only one device node distance-map which must + reside in the root node. + 5. If the distance-map node is not present, a default + distance-matrix is used. 
+ +Example: + 4 nodes connected in mesh/ring topology as below, + + 0_______20______1 + | | + | | + 20 20 + | | + | | + |_______________| + 3 20 2 + + if relative distance for each hop is 20, + then internode distance would be, + 0 -> 1 = 20 + 1 -> 2 = 20 + 2 -> 3 = 20 + 3 -> 0 = 20 + 0 -> 2 = 40 + 1 -> 3 = 40 + + and dt presentation for this distance matrix is, + + distance-map { + compatible = "numa-distance-map-v1"; + distance-matrix = <0 0 10>, + <0 1 20>, + <0 2 40>, + <0 3 20>, + <1 0 20>, + <1 1 10>, + <1 2 20>, + <1 3 40>, + <2 0 40>, + <2 1 20>, + <2 2 10>, + <2 3 20>, + <3 0 20>, + <3 1 40>, + <3 2 20>, + <3 3 10>; + }; + +============================================================================== +4 - Example dts +============================================================================== + +Dual socket system consists of 2 boards connected through ccn bus and +each board having one socket/soc of 8 cpus, memory and pci bus. + + memory@c00000 { + device_type = "memory"; + reg = <0x0 0xc00000 0x0 0x80000000>; + /* node 0 */ + numa-node-id = <0>; + }; + + memory@10000000000 { + device_type = "memory"; + reg = <0x100 0x0 0x0 0x80000000>; + /* node 1 */ + numa-node-id = <1>; + }; + + cpus { + #address-cells = <2>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x0>; + enable-method = "psci"; + /* node 0 */ + numa-node-id = <0>; + }; + cpu@1 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x1>; + enable-method = "psci"; + numa-node-id = <0>; + }; + cpu@2 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x2>; + enable-method = "psci"; + numa-node-id = <0>; + }; + cpu@3 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x3>; + enable-method = "psci"; + numa-node-id = <0>; + }; + cpu@4 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x4>; + enable-method = "psci"; + numa-node-id = <0>; + }; + cpu@5 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x5>; + enable-method = "psci"; + numa-node-id = <0>; + }; + cpu@6 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x6>; + enable-method = "psci"; + numa-node-id = <0>; + }; + cpu@7 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x7>; + enable-method = "psci"; + numa-node-id = <0>; + }; + cpu@8 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x8>; + enable-method = "psci"; + /* node 1 */ + numa-node-id = <1>; + }; + cpu@9 { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0x9>; + enable-method = "psci"; + numa-node-id = <1>; + }; + cpu@a { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0xa>; + enable-method = "psci"; + numa-node-id = <1>; + }; + cpu@b { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0xb>; + enable-method = "psci"; + numa-node-id = <1>; + }; + cpu@c { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0xc>; + enable-method = "psci"; + numa-node-id = <1>; + }; + cpu@d { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0xd>; + enable-method = "psci"; + numa-node-id = <1>; + }; + cpu@e { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0xe>; + enable-method = "psci"; + numa-node-id = <1>; + }; + cpu@f { + device_type = "cpu"; + compatible = "arm,armv8"; + reg = <0x0 0xf>; + enable-method = "psci"; + numa-node-id = <1>; + }; + }; + + pcie0: pcie0@848000000000 { + compatible = "arm,armv8"; + device_type = "pci"; + bus-range = <0 255>; + #size-cells = 
<2>; + #address-cells = <3>; + reg = <0x8480 0x00000000 0 0x10000000>; /* Configuration space */ + ranges = <0x03000000 0x8010 0x00000000 0x8010 0x00000000 0x70 0x00000000>; + /* node 0 */ + numa-node-id = <0>; + }; + + pcie1: pcie1@948000000000 { + compatible = "arm,armv8"; + device_type = "pci"; + bus-range = <0 255>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0x9480 0x00000000 0 0x10000000>; /* Configuration space */ + ranges = <0x03000000 0x9010 0x00000000 0x9010 0x00000000 0x70 0x00000000>; + /* node 1 */ + numa-node-id = <1>; + }; + + distance-map { + compatible = "numa-distance-map-v1"; + distance-matrix = <0 0 10>, + <0 1 20>, + <1 1 10>; + }; -- cgit v0.10.2 From 298535c00a2cbcd59e38f8f1c0c9ae7b9911e946 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 8 Apr 2016 15:50:25 -0700 Subject: of, numa: Add NUMA of binding implementation. Add device tree parsing for NUMA topology using device "numa-node-id" property in distance-map and cpu nodes. This is a complete rewrite of a previous patch by: Ganapatrao Kulkarni Signed-off-by: David Daney Acked-by: Rob Herring Signed-off-by: Will Deacon diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index e2a4841..b3bec3a 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -112,4 +112,7 @@ config OF_OVERLAY While this option is selected automatically when needed, you can enable it manually to improve device tree unit test coverage. +config OF_NUMA + bool + endif # OF diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 156c072..bee3fa9 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -14,5 +14,6 @@ obj-$(CONFIG_OF_MTD) += of_mtd.o obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o obj-$(CONFIG_OF_RESOLVE) += resolver.o obj-$(CONFIG_OF_OVERLAY) += overlay.o +obj-$(CONFIG_OF_NUMA) += of_numa.o obj-$(CONFIG_OF_UNITTEST) += unittest-data/ diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c new file mode 100644 index 0000000..0f2784b --- /dev/null +++ b/drivers/of/of_numa.c @@ -0,0 +1,211 @@ +/* + * OF NUMA Parsing support. + * + * Copyright (C) 2015 - 2016 Cavium Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include + +/* define default numa node to 0 */ +#define DEFAULT_NODE 0 + +/* + * Even though we connect cpus to numa domains later in SMP + * init, we need to know the node ids now for all cpus. 
+*/ +static void __init of_numa_parse_cpu_nodes(void) +{ + u32 nid; + int r; + struct device_node *cpus; + struct device_node *np = NULL; + + cpus = of_find_node_by_path("/cpus"); + if (!cpus) + return; + + for_each_child_of_node(cpus, np) { + /* Skip things that are not CPUs */ + if (of_node_cmp(np->type, "cpu") != 0) + continue; + + r = of_property_read_u32(np, "numa-node-id", &nid); + if (r) + continue; + + pr_debug("NUMA: CPU on %u\n", nid); + if (nid >= MAX_NUMNODES) + pr_warn("NUMA: Node id %u exceeds maximum value\n", + nid); + else + node_set(nid, numa_nodes_parsed); + } +} + +static int __init of_numa_parse_memory_nodes(void) +{ + struct device_node *np = NULL; + struct resource rsrc; + u32 nid; + int r = 0; + + for (;;) { + np = of_find_node_by_type(np, "memory"); + if (!np) + break; + + r = of_property_read_u32(np, "numa-node-id", &nid); + if (r == -EINVAL) + /* + * property doesn't exist if -EINVAL, continue + * looking for more memory nodes with + * "numa-node-id" property + */ + continue; + else if (r) + /* some other error */ + break; + + r = of_address_to_resource(np, 0, &rsrc); + if (r) { + pr_err("NUMA: bad reg property in memory node\n"); + break; + } + + pr_debug("NUMA: base = %llx len = %llx, node = %u\n", + rsrc.start, rsrc.end - rsrc.start + 1, nid); + + r = numa_add_memblk(nid, rsrc.start, + rsrc.end - rsrc.start + 1); + if (r) + break; + } + of_node_put(np); + + return r; +} + +static int __init of_numa_parse_distance_map_v1(struct device_node *map) +{ + const __be32 *matrix; + int entry_count; + int i; + + pr_info("NUMA: parsing numa-distance-map-v1\n"); + + matrix = of_get_property(map, "distance-matrix", NULL); + if (!matrix) { + pr_err("NUMA: No distance-matrix property in distance-map\n"); + return -EINVAL; + } + + entry_count = of_property_count_u32_elems(map, "distance-matrix"); + if (entry_count <= 0) { + pr_err("NUMA: Invalid distance-matrix\n"); + return -EINVAL; + } + + for (i = 0; i + 2 < entry_count; i += 3) { + u32 nodea, nodeb, distance; + + nodea = of_read_number(matrix, 1); + matrix++; + nodeb = of_read_number(matrix, 1); + matrix++; + distance = of_read_number(matrix, 1); + matrix++; + + numa_set_distance(nodea, nodeb, distance); + pr_debug("NUMA: distance[node%d -> node%d] = %d\n", + nodea, nodeb, distance); + + /* Set default distance of node B->A same as A->B */ + if (nodeb > nodea) + numa_set_distance(nodeb, nodea, distance); + } + + return 0; +} + +static int __init of_numa_parse_distance_map(void) +{ + int ret = 0; + struct device_node *np; + + np = of_find_compatible_node(NULL, NULL, + "numa-distance-map-v1"); + if (np) + ret = of_numa_parse_distance_map_v1(np); + + of_node_put(np); + return ret; +} + +int of_node_to_nid(struct device_node *device) +{ + struct device_node *np; + u32 nid; + int r = -ENODATA; + + np = of_node_get(device); + + while (np) { + struct device_node *parent; + + r = of_property_read_u32(np, "numa-node-id", &nid); + /* + * -EINVAL indicates the property was not found, and + * we walk up the tree trying to find a parent with a + * "numa-node-id". Any other type of error indicates + * a bad device tree and we give up. 
+ */ + if (r != -EINVAL) + break; + + parent = of_get_parent(np); + of_node_put(np); + np = parent; + } + if (np && r) + pr_warn("NUMA: Invalid \"numa-node-id\" property in node %s\n", + np->name); + of_node_put(np); + + if (!r) { + if (nid >= MAX_NUMNODES) + pr_warn("NUMA: Node id %u exceeds maximum value\n", + nid); + else + return nid; + } + + return NUMA_NO_NODE; +} +EXPORT_SYMBOL(of_node_to_nid); + +int __init of_numa_init(void) +{ + int r; + + of_numa_parse_cpu_nodes(); + r = of_numa_parse_memory_nodes(); + if (r) + return r; + return of_numa_parse_distance_map(); +} diff --git a/include/linux/of.h b/include/linux/of.h index 7fcb681..76f07c8 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -685,6 +685,15 @@ static inline int of_node_to_nid(struct device_node *device) } #endif +#ifdef CONFIG_OF_NUMA +extern int of_numa_init(void); +#else +static inline int of_numa_init(void) +{ + return -ENOSYS; +} +#endif + static inline struct device_node *of_find_matching_node( struct device_node *from, const struct of_device_id *matches) -- cgit v0.10.2 From 3194ac6e66cc7a00c1fa9fecf33a7c376b489497 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 8 Apr 2016 15:50:26 -0700 Subject: arm64: Move unflatten_device_tree() call earlier. In order to extract NUMA information from the device tree, we need to have the tree in its unflattened form. Move the call to bootmem_init() in the tail of paging_init() into setup_arch, and adjust header files so that its declaration is visible. Move the unflatten_device_tree() call between the calls to paging_init() and bootmem_init(). Follow on patches add NUMA handling to bootmem_init(). Signed-off-by: David Daney Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 990124a..97b1d8f 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -29,6 +29,7 @@ typedef struct { #define ASID(mm) ((mm)->context.id.counter & 0xffff) extern void paging_init(void); +extern void bootmem_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void init_mem_pgprot(void); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7b85b1d..432bc7f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -265,18 +265,22 @@ void __init setup_arch(char **cmdline_p) paging_init(); + if (acpi_disabled) + unflatten_device_tree(); + + bootmem_init(); + kasan_init(); request_standard_resources(); early_ioremap_reset(); - if (acpi_disabled) { - unflatten_device_tree(); + if (acpi_disabled) psci_dt_init(); - } else { + else psci_acpi_init(); - } + xen_early_init(); cpu_read_bootcpu_ops(); diff --git a/arch/arm64/mm/mm.h b/arch/arm64/mm/mm.h index ef47d99..71fe989 100644 --- a/arch/arm64/mm/mm.h +++ b/arch/arm64/mm/mm.h @@ -1,3 +1,2 @@ -extern void __init bootmem_init(void); void fixup_init(void); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index dbad533..0f85a46 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -564,8 +564,6 @@ void __init paging_init(void) */ memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE, SWAPPER_DIR_SIZE - PAGE_SIZE); - - bootmem_init(); } /* -- cgit v0.10.2 From 1a2db300348b799479d2d22b84d51b27ad0458c7 Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Fri, 8 Apr 2016 15:50:27 -0700 Subject: arm64, numa: Add NUMA support for arm64 platforms. Attempt to get the memory and CPU NUMA node via of_numa. 
If that fails, default the dummy NUMA node and map all memory and CPUs to node 0. Tested-by: Shannon Zhao Reviewed-by: Robert Richter Signed-off-by: Ganapatrao Kulkarni Signed-off-by: David Daney Signed-off-by: Will Deacon diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4f43622..99f9b55 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -76,6 +76,7 @@ config ARM64 select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP if NUMA select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -98,6 +99,7 @@ config ARM64 select SYSCTL_EXCEPTION_TRACE select HAVE_CONTEXT_TRACKING select HAVE_ARM_SMCCC + select OF_NUMA if NUMA && OF help ARM 64-bit (AArch64) Linux support. @@ -546,6 +548,30 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +# Common NUMA Features +config NUMA + bool "Numa Memory Allocation and Scheduler Support" + depends on SMP + help + Enable NUMA (Non Uniform Memory Access) support. + + The kernel will try to allocate memory used by a CPU on the + local memory of the CPU and add some more + NUMA awareness to the kernel. + +config NODES_SHIFT + int "Maximum NUMA Nodes (as a power of 2)" + range 1 10 + default "2" + depends on NEED_MULTIPLE_NODES + help + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accommodate various tables. + +config USE_PERCPU_NUMA_NODE_ID + def_bool y + depends on NUMA + source kernel/Kconfig.preempt source kernel/Kconfig.hz diff --git a/arch/arm64/include/asm/mmzone.h b/arch/arm64/include/asm/mmzone.h new file mode 100644 index 0000000..a0de9e6 --- /dev/null +++ b/arch/arm64/include/asm/mmzone.h @@ -0,0 +1,12 @@ +#ifndef __ASM_MMZONE_H +#define __ASM_MMZONE_H + +#ifdef CONFIG_NUMA + +#include + +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[(nid)]) + +#endif /* CONFIG_NUMA */ +#endif /* __ASM_MMZONE_H */ diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h new file mode 100644 index 0000000..e9b4f29 --- /dev/null +++ b/arch/arm64/include/asm/numa.h @@ -0,0 +1,45 @@ +#ifndef __ASM_NUMA_H +#define __ASM_NUMA_H + +#include + +#ifdef CONFIG_NUMA + +/* currently, arm64 implements flat NUMA topology */ +#define parent_node(node) (node) + +int __node_distance(int from, int to); +#define node_distance(a, b) __node_distance(a, b) + +extern nodemask_t numa_nodes_parsed __initdata; + +/* Mappings between node number and cpus on that node. */ +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +void numa_clear_node(unsigned int cpu); + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +const struct cpumask *cpumask_of_node(int node); +#else +/* Returns a pointer to the cpumask of CPUs on Node 'node'. 
*/ +static inline const struct cpumask *cpumask_of_node(int node) +{ + return node_to_cpumask_map[node]; +} +#endif + +void __init arm64_numa_init(void); +int __init numa_add_memblk(int nodeid, u64 start, u64 end); +void __init numa_set_distance(int from, int to, int distance); +void __init numa_free_distance(void); +void __init early_map_cpu_to_node(unsigned int cpu, int nid); +void numa_store_cpu_info(unsigned int cpu); + +#else /* CONFIG_NUMA */ + +static inline void numa_store_cpu_info(unsigned int cpu) { } +static inline void arm64_numa_init(void) { } +static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { } + +#endif /* CONFIG_NUMA */ + +#endif /* __ASM_NUMA_H */ diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a3e9d6f..8b57339 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,6 +22,16 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_NUMA + +struct pci_bus; +int pcibus_to_node(struct pci_bus *bus); +#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ + cpu_all_mask : \ + cpumask_of_node(pcibus_to_node(bus))) + +#endif /* CONFIG_NUMA */ + #include #endif /* _ASM_ARM_TOPOLOGY_H */ diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index c72de66..3c4e308 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -74,6 +74,16 @@ int raw_pci_write(unsigned int domain, unsigned int bus, return -ENXIO; } +#ifdef CONFIG_NUMA + +int pcibus_to_node(struct pci_bus *bus) +{ + return dev_to_node(&bus->dev); +} +EXPORT_SYMBOL(pcibus_to_node); + +#endif + #ifdef CONFIG_ACPI /* Root bridge scanning */ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 432bc7f..65f5159 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -319,6 +320,9 @@ static int __init topology_init(void) { int i; + for_each_online_node(i) + register_one_node(i); + for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_data.cpu, i); cpu->hotpluggable = 1; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 5fbab1c..622bd6c 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -203,6 +204,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) static void smp_store_cpu_info(unsigned int cpuid) { store_cpu_topology(cpuid); + numa_store_cpu_info(cpuid); } /* @@ -633,6 +635,8 @@ static void __init of_parse_and_init_cpus(void) pr_debug("cpu logical map 0x%llx\n", hwid); cpu_logical_map(cpu_count) = hwid; + + early_map_cpu_to_node(cpu_count, of_node_to_nid(dn)); next: cpu_count++; } diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 57f57fd..54bb209 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -4,6 +4,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_ARM64_PTDUMP) += dump.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_KASAN) += kasan_init.o KASAN_SANITIZE_kasan_init.o := n diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d55d720..f64dbd4 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include 
#include @@ -86,6 +87,21 @@ static phys_addr_t __init max_zone_dma_phys(void) return min(offset + (1ULL << 32), memblock_end_of_DRAM()); } +#ifdef CONFIG_NUMA + +static void __init zone_sizes_init(unsigned long min, unsigned long max) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; + + if (IS_ENABLED(CONFIG_ZONE_DMA)) + max_zone_pfns[ZONE_DMA] = PFN_DOWN(max_zone_dma_phys()); + max_zone_pfns[ZONE_NORMAL] = max; + + free_area_init_nodes(max_zone_pfns); +} + +#else + static void __init zone_sizes_init(unsigned long min, unsigned long max) { struct memblock_region *reg; @@ -126,6 +142,8 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) free_area_init_node(0, zone_size, min, zhole_size); } +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_HAVE_ARCH_PFN_VALID int pfn_valid(unsigned long pfn) { @@ -142,10 +160,15 @@ static void __init arm64_memory_present(void) static void __init arm64_memory_present(void) { struct memblock_region *reg; + int nid = 0; - for_each_memblock(memory, reg) - memory_present(0, memblock_region_memory_base_pfn(reg), - memblock_region_memory_end_pfn(reg)); + for_each_memblock(memory, reg) { +#ifdef CONFIG_NUMA + nid = reg->nid; +#endif + memory_present(nid, memblock_region_memory_base_pfn(reg), + memblock_region_memory_end_pfn(reg)); + } } #endif @@ -278,7 +301,6 @@ void __init arm64_memblock_init(void) dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); - memblock_dump_all(); } void __init bootmem_init(void) @@ -290,6 +312,9 @@ void __init bootmem_init(void) early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT); + max_pfn = max_low_pfn = max; + + arm64_numa_init(); /* * Sparsemem tries to allocate bootmem in memory_present(), so must be * done after the fixed reservations. @@ -300,7 +325,7 @@ void __init bootmem_init(void) zone_sizes_init(min, max); high_memory = __va((max << PAGE_SHIFT) - 1) + 1; - max_pfn = max_low_pfn = max; + memblock_dump_all(); } #ifndef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c new file mode 100644 index 0000000..98dc104 --- /dev/null +++ b/arch/arm64/mm/numa.c @@ -0,0 +1,396 @@ +/* + * NUMA support, based on the x86 implementation. + * + * Copyright (C) 2015 Cavium Inc. + * Author: Ganapatrao Kulkarni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); +nodemask_t numa_nodes_parsed __initdata; +static int cpu_to_node_map[NR_CPUS] = { [0 ... 
NR_CPUS-1] = NUMA_NO_NODE }; + +static int numa_distance_cnt; +static u8 *numa_distance; +static int numa_off; + +static __init int numa_parse_early_param(char *opt) +{ + if (!opt) + return -EINVAL; + if (!strncmp(opt, "off", 3)) { + pr_info("%s\n", "NUMA turned off"); + numa_off = 1; + } + return 0; +} +early_param("numa", numa_parse_early_param); + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const struct cpumask *cpumask_of_node(int node) +{ + if (WARN_ON(node >= nr_node_ids)) + return cpu_none_mask; + + if (WARN_ON(node_to_cpumask_map[node] == NULL)) + return cpu_online_mask; + + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +#endif + +static void map_cpu_to_node(unsigned int cpu, int nid) +{ + set_cpu_numa_node(cpu, nid); + if (nid >= 0) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_clear_node(unsigned int cpu) +{ + int nid = cpu_to_node(cpu); + + if (nid >= 0) + cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]); + set_cpu_numa_node(cpu, NUMA_NO_NODE); +} + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int cpu; + int node; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); + + /* allocate and clear the mapping */ + for (node = 0; node < nr_node_ids; node++) { + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + cpumask_clear(node_to_cpumask_map[node]); + } + + for_each_possible_cpu(cpu) + set_cpu_numa_node(cpu, NUMA_NO_NODE); + + /* cpumask_of_node() will now work */ + pr_debug("NUMA: Node to cpumask map for %d nodes\n", nr_node_ids); +} + +/* + * Set the cpu to node and mem mapping + */ +void numa_store_cpu_info(unsigned int cpu) +{ + map_cpu_to_node(cpu, numa_off ? 0 : cpu_to_node_map[cpu]); +} + +void __init early_map_cpu_to_node(unsigned int cpu, int nid) +{ + /* fallback to node 0 */ + if (nid < 0 || nid >= MAX_NUMNODES) + nid = 0; + + cpu_to_node_map[cpu] = nid; +} + +/** + * numa_add_memblk - Set node id to memblk + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @size: Size of the new memblk + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int __init numa_add_memblk(int nid, u64 start, u64 size) +{ + int ret; + + ret = memblock_set_node(start, size, &memblock.memory, nid); + if (ret < 0) { + pr_err("NUMA: memblock [0x%llx - 0x%llx] failed to add on node %d\n", + start, (start + size - 1), nid); + return ret; + } + + node_set(nid, numa_nodes_parsed); + pr_info("NUMA: Adding memblock [0x%llx - 0x%llx] on node %d\n", + start, (start + size - 1), nid); + return ret; +} + +/** + * Initialize NODE_DATA for a node on the local memory + */ +static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); + u64 nd_pa; + void *nd; + int tnid; + + pr_info("NUMA: Initmem setup node %d [mem %#010Lx-%#010Lx]\n", + nid, start_pfn << PAGE_SHIFT, + (end_pfn << PAGE_SHIFT) - 1); + + nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + nd = __va(nd_pa); + + /* report and initialize */ + pr_info("NUMA: NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + pr_info("NUMA: NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; +} + +/** + * numa_free_distance + * + * The current table is freed. + */ +void __init numa_free_distance(void) +{ + size_t size; + + if (!numa_distance) + return; + + size = numa_distance_cnt * numa_distance_cnt * + sizeof(numa_distance[0]); + + memblock_free(__pa(numa_distance), size); + numa_distance_cnt = 0; + numa_distance = NULL; +} + +/** + * + * Create a new NUMA distance table. + * + */ +static int __init numa_alloc_distance(void) +{ + size_t size; + u64 phys; + int i, j; + + size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn), + size, PAGE_SIZE); + if (WARN_ON(!phys)) + return -ENOMEM; + + memblock_reserve(phys, size); + + numa_distance = __va(phys); + numa_distance_cnt = nr_node_ids; + + /* fill with the default distances */ + for (i = 0; i < numa_distance_cnt; i++) + for (j = 0; j < numa_distance_cnt; j++) + numa_distance[i * numa_distance_cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + + pr_debug("NUMA: Initialized distance table, cnt=%d\n", + numa_distance_cnt); + + return 0; +} + +/** + * numa_set_distance - Set inter node NUMA distance from node to node. + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. + * If distance table doesn't exist, a warning is printed. + * + * If @from or @to is higher than the highest known node or lower than zero + * or @distance doesn't make sense, the call is ignored. 
+ * + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance) { + pr_warn_once("NUMA: Warning: distance table not allocated yet\n"); + return; + } + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +/** + * Return NUMA distance @from to @to + */ +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +static int __init numa_register_nodes(void) +{ + int nid; + struct memblock_region *mblk; + + /* Check that valid nid is set to memblks */ + for_each_memblock(memory, mblk) + if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) { + pr_warn("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + mblk->nid, mblk->base, + mblk->base + mblk->size - 1); + return -EINVAL; + } + + /* Finally register nodes. */ + for_each_node_mask(nid, numa_nodes_parsed) { + unsigned long start_pfn, end_pfn; + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + setup_node_data(nid, start_pfn, end_pfn); + node_set_online(nid); + } + + /* Setup online nodes to actual nodes*/ + node_possible_map = numa_nodes_parsed; + + return 0; +} + +static int __init numa_init(int (*init_func)(void)) +{ + int ret; + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + numa_free_distance(); + + ret = numa_alloc_distance(); + if (ret < 0) + return ret; + + ret = init_func(); + if (ret < 0) + return ret; + + if (nodes_empty(numa_nodes_parsed)) + return -EINVAL; + + ret = numa_register_nodes(); + if (ret < 0) + return ret; + + setup_node_to_cpumask_map(); + + /* init boot processor */ + cpu_to_node_map[0] = 0; + map_cpu_to_node(0, 0); + + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node (node 0) and add memory blocks that cover all + * allowed memory. It is unlikely that this function fails. + */ +static int __init dummy_numa_init(void) +{ + int ret; + struct memblock_region *mblk; + + pr_info("%s\n", "No NUMA configuration found"); + pr_info("NUMA: Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); + + for_each_memblock(memory, mblk) { + ret = numa_add_memblk(0, mblk->base, mblk->size); + if (!ret) + continue; + + pr_err("NUMA init failed\n"); + return ret; + } + + numa_off = 1; + return 0; +} + +/** + * arm64_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory. 
+ */ +void __init arm64_numa_init(void) +{ + if (!numa_off) { + if (!numa_init(of_numa_init)) + return; + } + + numa_init(dummy_numa_init); +} -- cgit v0.10.2 From 561662301eb6a40d569453b3555fc8cfce094b93 Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Fri, 8 Apr 2016 15:50:28 -0700 Subject: arm64, mm, numa: Add NUMA balancing support for arm64. Enable NUMA balancing for arm64 platforms. Add pte, pmd protnone helpers for use by automatic NUMA balancing. Reviewed-by: Steve Capper Reviewed-by: Robert Richter Signed-off-by: Ganapatrao Kulkarni Signed-off-by: David Daney Signed-off-by: Will Deacon diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 99f9b55..a578080 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,7 @@ config ARM64 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2abfa4d..3c91b17 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -266,6 +266,21 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot) return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT); } +#ifdef CONFIG_NUMA_BALANCING +/* + * See the comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pte_protnone(pmd_pte(pmd)); +} +#endif + /* * THP definitions. */ -- cgit v0.10.2 From 66dbd6e61a526ae7d11a208238ae2c17e5cacb6b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 13 Apr 2016 16:01:22 +0100 Subject: arm64: Implement ptep_set_access_flags() for hardware AF/DBM When hardware updates of the access and dirty states are enabled, the default ptep_set_access_flags() implementation based on calling set_pte_at() directly is potentially racy. This triggers the "racy dirty state clearing" warning in set_pte_at() because an existing writable PTE is overridden with a clean entry. There are two main scenarios for this situation: 1. The CPU getting an access fault does not support hardware updates of the access/dirty flags. However, a different agent in the system (e.g. SMMU) can do this, therefore overriding a writable entry with a clean one could potentially lose the automatically updated dirty status 2. A more complex situation is possible when all CPUs support hardware AF/DBM: a) Initial state: shareable + writable vma and pte_none(pte) b) Read fault taken by two threads of the same process on different CPUs c) CPU0 takes the mmap_sem and proceeds to handling the fault. It eventually reaches do_set_pte() which sets a writable + clean pte. CPU0 releases the mmap_sem d) CPU1 acquires the mmap_sem and proceeds to handle_pte_fault(). The pte entry it reads is present, writable and clean and it continues to pte_mkyoung() e) CPU1 calls ptep_set_access_flags() If between (d) and (e) the hardware (another CPU) updates the dirty state (clears PTE_RDONLY), CPU1 will override the PTR_RDONLY bit marking the entry clean again. This patch implements an arm64-specific ptep_set_access_flags() function to perform an atomic update of the PTE flags. 
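For reference, the generic fallback that this arm64-specific version replaces performs a plain, non-atomic store. The following is a rough sketch of that generic helper (paraphrased from mm/pgtable-generic.c rather than quoted from this series), showing why a concurrent hardware clear of PTE_RDONLY between the pte_same() check and the store can be lost:

	/*
	 * Approximate shape of the generic ptep_set_access_flags() fallback.
	 * The whole pte is rewritten with set_pte_at(), so a hardware DBM
	 * update (clearing PTE_RDONLY to mark the entry dirty) that lands
	 * between the pte_same() check and the store is silently overwritten
	 * with a clean entry -- the race described in scenario 2 above.
	 */
	int ptep_set_access_flags(struct vm_area_struct *vma,
				  unsigned long address, pte_t *ptep,
				  pte_t entry, int dirty)
	{
		int changed = !pte_same(*ptep, entry);

		if (changed) {
			set_pte_at(vma->vm_mm, address, ptep, entry);
			flush_tlb_fix_spurious_fault(vma, address);
		}
		return changed;
	}

The arm64 implementation in the diff below instead folds the flag update into a ldxr/stxr loop, so the read-modify-write of the PTE is atomic with respect to the hardware walker.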
Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Signed-off-by: Catalin Marinas Reported-by: Ming Lei Tested-by: Julien Grall Cc: Will Deacon Cc: # 4.3+ [will: reworded comment] Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3c91b17..3eefc18 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -535,6 +535,11 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_ARM64_HW_AFDBM +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + /* * Atomic pte/pmd modifications. */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c12e967..d24a5e0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -81,6 +81,56 @@ void show_pte(struct mm_struct *mm, unsigned long addr) printk("\n"); } +#ifdef CONFIG_ARM64_HW_AFDBM +/* + * This function sets the access flags (dirty, accessed), as well as write + * permission, and only to a more permissive setting. + * + * It needs to cope with hardware update of the accessed/dirty state by other + * agents in the system and can safely skip the __sync_icache_dcache() call as, + * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * + * Returns whether or not the PTE actually changed. + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + pteval_t old_pteval; + unsigned int tmp; + + if (pte_same(*ptep, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * PTE_RDONLY is cleared by default in the asm below, so set it in + * back if necessary (read-only or clean PTE). + */ + if (!pte_write(entry) || !dirty) + pte_val(entry) |= PTE_RDONLY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. + */ + asm volatile("// ptep_set_access_flags\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_RDONLY\n" + " orr %0, %0, %4 // set flags\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) + : "L" (~PTE_RDONLY), "r" (pte_val(entry))); + + flush_tlb_fix_spurious_fault(vma, address); + return 1; +} +#endif + /* * The kernel tried to access some page that wasn't present. */ -- cgit v0.10.2 From 3a72db703cc6965662ff6063befa7d7cf2b49a2e Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 18 Apr 2016 09:49:56 +0800 Subject: arm64: mm: remove the redundant code We already re-enable interrupts where necessary in the entry code, so there is no need to do it again in do_page fault. This patch removes the redundant code. Acked-by: Catalin Marinas Signed-off-by: Huang Shijie Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d24a5e0..5954881 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -262,10 +262,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, tsk = current; mm = tsk->mm; - /* Enable interrupts if they were enabled in the parent context. */ - if (interrupts_enabled(regs)) - local_irq_enable(); - /* * If we're in an interrupt or have no user context, we must not take * the fault. 
-- cgit v0.10.2 From 82611c14c4e479715c071fb0f44ddd72dc632037 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 18 Apr 2016 09:43:33 +0200 Subject: arm64: Reduce verbosity on SMP CPU stop When CPUs are stopped during an abnormal operation like panic for each CPU a line is printed and the stack trace is dumped. This information is only interesting for the aborting CPU and on systems with many CPUs it only makes it harder to debug if after the aborting CPU the log is flooded with data about all other CPUs too. Therefore remove the stack dump and printk of other CPUs and only print a single line that the other CPUs are going to be stopped and, in case any CPUs remain online list them. Signed-off-by: Jan Glauber Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 622bd6c..fef2e73 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -805,21 +805,11 @@ void arch_irq_work_raise(void) } #endif -static DEFINE_RAW_SPINLOCK(stop_lock); - /* * ipi_cpu_stop - handle IPI from smp_send_stop() */ static void ipi_cpu_stop(unsigned int cpu) { - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { - raw_spin_lock(&stop_lock); - pr_crit("CPU%u: stopping\n", cpu); - dump_stack(); - raw_spin_unlock(&stop_lock); - } - set_cpu_online(cpu, false); local_irq_disable(); @@ -914,6 +904,9 @@ void smp_send_stop(void) cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); + if (system_state == SYSTEM_BOOTING || + system_state == SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); smp_cross_call(&mask, IPI_CPU_STOP); } @@ -923,7 +916,8 @@ void smp_send_stop(void) udelay(1); if (num_online_cpus() > 1) - pr_warning("SMP: failed to stop secondary CPUs\n"); + pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(cpu_online_mask)); } /* -- cgit v0.10.2 From 7464b6e3a5fb213e7826d2fde4a2daf05abb6822 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Apr 2016 10:34:19 +0200 Subject: efi: ARM: avoid warning about phys_addr_t cast memblock_remove() takes a phys_addr_t, which may be narrower than 64 bits, causing a harmless warning: drivers/firmware/efi/arm-init.c: In function 'reserve_regions': include/linux/kernel.h:29:20: error: large integer implicitly truncated to unsigned type [-Werror=overflow] #define ULLONG_MAX (~0ULL) ^ drivers/firmware/efi/arm-init.c:152:21: note: in expansion of macro 'ULLONG_MAX' memblock_remove(0, ULLONG_MAX); This adds an explicit typecast to avoid the warning Fixes: 500899c2cc3e ("efi: ARM/arm64: ignore DT memory nodes instead of removing them") Acked-by Ard Biesheuvel Reviewed-by: Matt Fleming Signed-off-by: Arnd Bergmann Signed-off-by: Will Deacon diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 5d6945b..ca708fb 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -149,7 +149,7 @@ static __init void reserve_regions(void) * uses its own memory map instead. 
*/ memblock_dump_all(); - memblock_remove(0, ULLONG_MAX); + memblock_remove(0, (phys_addr_t)ULLONG_MAX); for_each_efi_memory_desc(&memmap, md) { paddr = md->phys_addr; -- cgit v0.10.2 From d32351c8242b7067b3f3e3a6caf7a387ff43f978 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 18 Apr 2016 11:09:46 +0800 Subject: arm64: mm: make pr_cont() per line in Virtual kernel memory layout Each line with single pr_cont() in Virtual kernel memory layout, or the dump of the kernel memory layout in dmesg is not aligned when PRINTK_TIME enabled, due to the missing time stamps. Tested-by: James Morse Signed-off-by: Kefeng Wang Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f64dbd4..f6ff5b5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -429,23 +429,22 @@ void __init mem_init(void) MLM(MODULES_VADDR, MODULES_END)); pr_cont(" vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n", MLG(VMALLOC_START, VMALLOC_END)); - pr_cont(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(_text, __start_rodata), - MLK_ROUNDUP(__start_rodata, _etext), - MLK_ROUNDUP(__init_begin, __init_end), + pr_cont(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(_text, __start_rodata)); + pr_cont(" .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(__start_rodata, _etext)); + pr_cont(" .init : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(__init_begin, __init_end)); + pr_cont(" .data : 0x%p" " - 0x%p" " (%6ld KB)\n", MLK_ROUNDUP(_sdata, _edata)); pr_cont(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n", MLK(FIXADDR_START, FIXADDR_TOP)); pr_cont(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n", MLM(PCI_IO_START, PCI_IO_END)); #ifdef CONFIG_SPARSEMEM_VMEMMAP - pr_cont(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" - " 0x%16lx - 0x%16lx (%6ld MB actual)\n", - MLG(VMEMMAP_START, - VMEMMAP_START + VMEMMAP_SIZE), + pr_cont(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n", + MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE)); + pr_cont(" 0x%16lx - 0x%16lx (%6ld MB actual)\n", MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), (unsigned long)virt_to_page(high_memory))); #endif -- cgit v0.10.2 From 9974723e31d1fa99e3c0efb2ae6cbbf089c0080c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 18 Apr 2016 11:09:47 +0800 Subject: arm64: mm: Show bss segment in kernel memory layout Show the bss segment information as with text and data in Virtual memory kernel layout. Acked-by: James Morse Signed-off-by: Kefeng Wang Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f6ff5b5..d45f862 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -437,6 +437,8 @@ void __init mem_init(void) MLK_ROUNDUP(__init_begin, __init_end)); pr_cont(" .data : 0x%p" " - 0x%p" " (%6ld KB)\n", MLK_ROUNDUP(_sdata, _edata)); + pr_cont(" .bss : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(__bss_start, __bss_stop)); pr_cont(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n", MLK(FIXADDR_START, FIXADDR_TOP)); pr_cont(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n", -- cgit v0.10.2 From 8ee708792e1ccda683ec9cce9fab3b25ece0eb60 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 18 Apr 2016 11:16:14 -0700 Subject: arm64: Kconfig: remove redundant HAVE_ARCH_TRANSPARENT_HUGEPAGE definition HAVE_ARCH_TRANSPARENT_HUGEPAGE has been defined in arch/Kconfig already, the ARM64 version is identical with it and the default value is Y. 
So remove the redundant definition and just select it under CONFIG_ARM64. Signed-off-by: Yang Shi [will: sort into alphabetical order whilst I'm resolving conflicts] Signed-off-by: Will Deacon diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a578080..c8762f4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -59,11 +59,14 @@ config ARM64 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARM_SMCCC select HAVE_BPF_JIT select HAVE_C_RECORDMCOUNT select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL + select HAVE_CONTEXT_TRACKING select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG @@ -91,6 +94,7 @@ config ARM64 select NO_BOOTMEM select OF select OF_EARLY_FLATTREE + select OF_NUMA if NUMA && OF select OF_RESERVED_MEM select PERF_USE_VMALLOC select POWER_RESET @@ -98,9 +102,6 @@ config ARM64 select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE - select HAVE_CONTEXT_TRACKING - select HAVE_ARM_SMCCC - select OF_NUMA if NUMA && OF help ARM 64-bit (AArch64) Linux support. @@ -605,9 +606,6 @@ config SYS_SUPPORTS_HUGETLBFS config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) -config HAVE_ARCH_TRANSPARENT_HUGEPAGE - def_bool y - config ARCH_HAS_CACHE_LINE_SIZE def_bool y -- cgit v0.10.2 From 2ff4936c1d68060b080aac49ec622b047f9e6c45 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 19 Apr 2016 10:31:19 +0100 Subject: arm64: asm: remove unused push/pop macros We haven't used the push/pop macros for a while now, as it's typically better to use immediate offsets for batches of accesses to the stack, as we now do in the entry assembly for the kernel and hyp code. Remove the unused macros. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 70f7b9e..972fb55 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -27,18 +27,6 @@ #include /* - * Stack pushing/popping (register pairs only). Equivalent to store decrement - * before, load increment after. - */ - .macro push, xreg1, xreg2 - stp \xreg1, \xreg2, [sp, #-16]! - .endm - - .macro pop, xreg1, xreg2 - ldp \xreg1, \xreg2, [sp], #16 - .endm - -/* * Enable and disable interrupts. */ .macro disable_irq -- cgit v0.10.2 From f3efb67590e8f68918a7a63ab0d2debee04fba9e Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 Apr 2016 10:28:32 +0100 Subject: arm64: hwcaps: Cleanup naming We use hwcaps for referring to ELF hwcaps capability information. However this can be confusing with 'cpu_hwcaps' which stands for the CPU capability bit field. This patch cleans up the names to make it a bit more readable. 
Tested-by: Yury Norov Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f586343..e3576d9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -744,7 +744,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .hwcap = cap, \ } -static const struct arm64_cpu_capabilities arm64_hwcaps[] = { +static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1), @@ -765,7 +765,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = { {}, }; -static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) +static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: @@ -786,7 +786,7 @@ static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) } /* Check if we have a particular HWCAP enabled */ -static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities *cap) +static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) { bool rc; @@ -810,14 +810,14 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities * return rc; } -static void __init setup_cpu_hwcaps(void) +static void __init setup_elf_hwcaps(void) { int i; - const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; + const struct arm64_cpu_capabilities *hwcaps = arm64_elf_hwcaps; for (i = 0; hwcaps[i].matches; i++) if (hwcaps[i].matches(&hwcaps[i])) - cap_set_hwcap(&hwcaps[i]); + cap_set_elf_hwcap(&hwcaps[i]); } void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, @@ -955,8 +955,8 @@ void verify_local_cpu_capabilities(void) caps[i].enable(NULL); } - for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) { - if (!cpus_have_hwcap(&caps[i])) + for (i = 0, caps = arm64_elf_hwcaps; caps[i].matches; i++) { + if (!cpus_have_elf_hwcap(&caps[i])) continue; if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) { pr_crit("CPU%d: missing HWCAP: %s\n", @@ -979,7 +979,7 @@ void __init setup_cpu_features(void) /* Set the CPU feature capabilies */ setup_feature_capabilities(); - setup_cpu_hwcaps(); + setup_elf_hwcaps(); /* Advertise that we have computed the system capabilities */ set_sys_caps_initialised(); -- cgit v0.10.2 From 752835019c15f720a51433e6bc1c5c515c01d1a2 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 Apr 2016 10:28:33 +0100 Subject: arm64: HWCAP: Split COMPAT HWCAP table entries In order to handle systems which do not support 32bit at EL0, split the COMPAT HWCAP entries into a separate table which can be processed, only if the support is available. 
Tested-by: Yury Norov Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e3576d9..cac7c41 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -755,6 +755,10 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), + {}, +}; + +static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { #ifdef CONFIG_COMPAT HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), @@ -810,28 +814,23 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) return rc; } -static void __init setup_elf_hwcaps(void) +static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { - int i; - const struct arm64_cpu_capabilities *hwcaps = arm64_elf_hwcaps; - - for (i = 0; hwcaps[i].matches; i++) - if (hwcaps[i].matches(&hwcaps[i])) - cap_set_elf_hwcap(&hwcaps[i]); + for (; hwcaps->matches; hwcaps++) + if (hwcaps->matches(hwcaps)) + cap_set_elf_hwcap(hwcaps); } void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info) { - int i; - - for (i = 0; caps[i].matches; i++) { - if (!caps[i].matches(&caps[i])) + for (; caps->matches; caps++) { + if (!caps->matches(caps)) continue; - if (!cpus_have_cap(caps[i].capability) && caps[i].desc) - pr_info("%s %s\n", info, caps[i].desc); - cpus_set_cap(caps[i].capability); + if (!cpus_have_cap(caps->capability) && caps->desc) + pr_info("%s %s\n", info, caps->desc); + cpus_set_cap(caps->capability); } } @@ -842,11 +841,9 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, static void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { - int i; - - for (i = 0; caps[i].matches; i++) - if (caps[i].enable && cpus_have_cap(caps[i].capability)) - on_each_cpu(caps[i].enable, NULL, true); + for (; caps->matches; caps++) + if (caps->enable && cpus_have_cap(caps->capability)) + on_each_cpu(caps->enable, NULL, true); } /* @@ -916,6 +913,41 @@ static void check_early_cpu_features(void) verify_cpu_asid_bits(); } +static void +verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) +{ + + for (; caps->matches; caps++) { + if (!cpus_have_elf_hwcap(caps)) + continue; + if (!feature_matches(__raw_read_system_reg(caps->sys_reg), caps)) { + pr_crit("CPU%d: missing HWCAP: %s\n", + smp_processor_id(), caps->desc); + cpu_die_early(); + } + } +} + +static void +verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) +{ + for (; caps->matches; caps++) { + if (!cpus_have_cap(caps->capability) || !caps->sys_reg) + continue; + /* + * If the new CPU misses an advertised feature, we cannot proceed + * further, park the cpu. + */ + if (!feature_matches(__raw_read_system_reg(caps->sys_reg), caps)) { + pr_crit("CPU%d: missing feature: %s\n", + smp_processor_id(), caps->desc); + cpu_die_early(); + } + if (caps->enable) + caps->enable(NULL); + } +} + /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. 
@@ -926,8 +958,6 @@ static void check_early_cpu_features(void) */ void verify_local_cpu_capabilities(void) { - int i; - const struct arm64_cpu_capabilities *caps; check_early_cpu_features(); @@ -938,32 +968,9 @@ void verify_local_cpu_capabilities(void) if (!sys_caps_initialised) return; - caps = arm64_features; - for (i = 0; caps[i].matches; i++) { - if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg) - continue; - /* - * If the new CPU misses an advertised feature, we cannot proceed - * further, park the cpu. - */ - if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) { - pr_crit("CPU%d: missing feature: %s\n", - smp_processor_id(), caps[i].desc); - cpu_die_early(); - } - if (caps[i].enable) - caps[i].enable(NULL); - } - - for (i = 0, caps = arm64_elf_hwcaps; caps[i].matches; i++) { - if (!cpus_have_elf_hwcap(&caps[i])) - continue; - if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) { - pr_crit("CPU%d: missing HWCAP: %s\n", - smp_processor_id(), caps[i].desc); - cpu_die_early(); - } - } + verify_local_cpu_features(arm64_features); + verify_local_elf_hwcaps(arm64_elf_hwcaps); + verify_local_elf_hwcaps(compat_elf_hwcaps); } static void __init setup_feature_capabilities(void) @@ -979,7 +986,8 @@ void __init setup_cpu_features(void) /* Set the CPU feature capabilies */ setup_feature_capabilities(); - setup_elf_hwcaps(); + setup_elf_hwcaps(arm64_elf_hwcaps); + setup_elf_hwcaps(compat_elf_hwcaps); /* Advertise that we have computed the system capabilities */ set_sys_caps_initialised(); -- cgit v0.10.2 From c80aba803a9aa131f997f62a71ab453e456d08a8 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 Apr 2016 10:28:34 +0100 Subject: arm64: Add helpers for detecting AArch32 support at EL0 Adds a helper to extract the support for AArch32 at EL0 Tested-by: Yury Norov Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index b9b6494..7f64285 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -170,6 +170,13 @@ static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0) cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_BIGENDEL0_SHIFT) == 0x1; } +static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); + + return val == ID_AA64PFR0_EL0_32BIT_64BIT; +} + void __init setup_cpu_features(void); void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 19182ef..d39d434 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -115,6 +115,7 @@ #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 #define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 #define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 +#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 /* id_aa64mmfr0 */ #define ID_AA64MMFR0_TGRAN4_SHIFT 28 -- cgit v0.10.2 From a6dc3cd718ec2a19b8ca812087666899852af78f Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 Apr 2016 10:28:35 +0100 Subject: arm64: cpufeature: Check availability of AArch32 On ARMv8 support for AArch32 state is optional. Hence it is not safe to check the AArch32 ID registers for sanity, which could lead to false warnings. This patch makes sure that the AArch32 state is implemented before we keep track of the 32bit ID registers. 
As per ARM ARM (D.1.21.2 - Support for Exception Levels and Execution States, DDI0487A.h), checking the support for AArch32 at EL0 is good enough to check the support for AArch32 (i.e, AArch32 at EL1 => AArch32 at EL0, but not vice versa). Tested-by: Yury Norov Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index cac7c41..9097ce9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -439,22 +439,26 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); - init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); - init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); - init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); - init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); - init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); - init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); - init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); - init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); - init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); - init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); - init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); - init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); - init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); - init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); - init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); - init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); + init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); + init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); + init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); + init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); + init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); + init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); + init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); + init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); + init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); + init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); + init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); + init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); + } + } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) @@ -559,47 +563,51 @@ void update_cpu_features(int cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); /* - * If we have AArch32, we care about 32-bit features for compat. These - * registers should be RES0 otherwise. + * If we have AArch32, we care about 32-bit features for compat. + * If the system doesn't support AArch32, don't update them. 
*/ - taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, + if (id_aa64pfr0_32bit_el0(read_system_reg(SYS_ID_AA64PFR0_EL1)) && + id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + + taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, info->reg_id_dfr0, boot->reg_id_dfr0); - taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, info->reg_id_isar0, boot->reg_id_isar0); - taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, info->reg_id_isar1, boot->reg_id_isar1); - taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, info->reg_id_isar2, boot->reg_id_isar2); - taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, info->reg_id_isar3, boot->reg_id_isar3); - taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, info->reg_id_isar4, boot->reg_id_isar4); - taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, info->reg_id_isar5, boot->reg_id_isar5); - /* - * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and - * ACTLR formats could differ across CPUs and therefore would have to - * be trapped for virtualization anyway. - */ - taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, + /* + * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and + * ACTLR formats could differ across CPUs and therefore would have to + * be trapped for virtualization anyway. + */ + taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, info->reg_id_mmfr0, boot->reg_id_mmfr0); - taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, info->reg_id_mmfr1, boot->reg_id_mmfr1); - taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, info->reg_id_mmfr2, boot->reg_id_mmfr2); - taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, info->reg_id_mmfr3, boot->reg_id_mmfr3); - taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, info->reg_id_pfr0, boot->reg_id_pfr0); - taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, info->reg_id_pfr1, boot->reg_id_pfr1); - taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, + taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, info->reg_mvfr0, boot->reg_mvfr0); - taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, info->reg_mvfr1, boot->reg_mvfr1); - taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, + taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, info->reg_mvfr2, boot->reg_mvfr2); + } /* * Mismatched CPU features are a recipe for disaster. 
Don't even diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 84c8684..92189e2 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -216,23 +216,26 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); - - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + /* Update the 32bit ID registers only if AArch32 is implemented */ + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); + info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); + info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + + info->reg_mvfr0 = read_cpuid(MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + } cpuinfo_detect_icache_policy(info); -- cgit v0.10.2 From 042446a31e3803d81c7e618dd80928dc3dce70c5 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 Apr 2016 10:28:36 +0100 Subject: arm64: cpufeature: Track 32bit EL0 support Add cpu_hwcap bit for keeping track of the support for 32bit EL0. 
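With the new ARM64_HAS_32BIT_EL0 capability in place, any code that only makes sense when AArch32 is usable at EL0 can simply test the helper added below. A minimal sketch of such a caller follows; only the capability and system_supports_32bit_el0() come from this series, while the surrounding function, header choices and error value are assumptions for illustration:

#include <linux/errno.h>
#include <asm/cpufeature.h>

/* Hypothetical caller: bail out early on systems without 32-bit EL0 */
static int compat_only_setup_example(void)
{
	if (!system_supports_32bit_el0())
		return -EOPNOTSUPP;	/* no AArch32 state available at EL0 */

	/* ... compat-specific initialisation would go here ... */
	return 0;
}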
Tested-by: Yury Norov Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 7f64285..ca8fb4b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -35,8 +35,9 @@ #define ARM64_ALT_PAN_NOT_UAO 10 #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 +#define ARM64_HAS_32BIT_EL0 13 -#define ARM64_NCAPS 13 +#define ARM64_NCAPS 14 #ifndef __ASSEMBLY__ @@ -192,6 +193,11 @@ static inline bool cpu_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } +static inline bool system_supports_32bit_el0(void) +{ + return cpus_have_cap(ARM64_HAS_32BIT_EL0); +} + static inline bool system_supports_mixed_endian_el0(void) { return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9097ce9..40a23f2 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -737,6 +737,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_VIRT_HOST_EXTN, .matches = runs_at_el2, }, + { + .desc = "32-bit EL0 Support", + .capability = ARM64_HAS_32BIT_EL0, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL0_SHIFT, + .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + }, {}, }; -- cgit v0.10.2 From 643d703d2d2dbf8e2f16efa0a6a32b1eca101d02 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 Apr 2016 10:28:37 +0100 Subject: arm64: compat: Check for AArch32 state Make sure we have AArch32 state available for running COMPAT binaries and also for switching the personality to PER_LINUX32. Signed-off-by: Yury Norov [ Added cap bit, checks for HWCAP, personality ] Signed-off-by: Suzuki K Poulose Tested-by: Yury Norov Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 24ed037..7a09c48 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -177,7 +177,8 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; /* AArch32 EABI. 
*/ #define EF_ARM_EABI_MASK 0xff000000 -#define compat_elf_check_arch(x) (((x)->e_machine == EM_ARM) && \ +#define compat_elf_check_arch(x) (system_supports_32bit_el0() && \ + ((x)->e_machine == EM_ARM) && \ ((x)->e_flags & EF_ARM_EABI_MASK)) #define compat_start_thread compat_start_thread diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 40a23f2..8e62d61 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -987,7 +987,8 @@ void verify_local_cpu_capabilities(void) verify_local_cpu_features(arm64_features); verify_local_elf_hwcaps(arm64_elf_hwcaps); - verify_local_elf_hwcaps(compat_elf_hwcaps); + if (system_supports_32bit_el0()) + verify_local_elf_hwcaps(compat_elf_hwcaps); } static void __init setup_feature_capabilities(void) @@ -1004,7 +1005,9 @@ void __init setup_cpu_features(void) /* Set the CPU feature capabilies */ setup_feature_capabilities(); setup_elf_hwcaps(arm64_elf_hwcaps); - setup_elf_hwcaps(compat_elf_hwcaps); + + if (system_supports_32bit_el0()) + setup_elf_hwcaps(compat_elf_hwcaps); /* Advertise that we have computed the system capabilities */ set_sys_caps_initialised(); diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index 75151aa..26fe8ea 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -25,6 +25,7 @@ #include #include #include +#include asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, @@ -36,11 +37,20 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } +SYSCALL_DEFINE1(arm64_personality, unsigned int, personality) +{ + if (personality(personality) == PER_LINUX32 && + !system_supports_32bit_el0()) + return -EINVAL; + return sys_personality(personality); +} + /* * Wrappers to pass the pt_regs argument. */ asmlinkage long sys_rt_sigreturn_wrapper(void); #define sys_rt_sigreturn sys_rt_sigreturn_wrapper +#define sys_personality sys_arm64_personality #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = sym, -- cgit v0.10.2 From 226d89cbb242f3fbd3e93367dfd5138f524aae5c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 19 Apr 2016 17:01:31 +0100 Subject: arm64/dma-mapping: Extend DMA ops workaround to PCI devices PCI devices now suffer the same hiccup as platform devices, in that they get their DMA ops configured before they have been added to their bus, and thus before we know whether they have successfully registered with an IOMMU or not. Until the necessary driver core changes to reorder calls during device creation have been worked out, extend our delayed notifier trick onto the PCI bus so as to avoid broken DMA ops once IOMMUs get plugged into the PCI code. 
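For context, the "delayed notifier trick" referred to above is essentially a bus notifier that redoes the IOMMU DMA ops setup once a device has actually been added to its bus (and has therefore either joined an IOMMU group or not). A rough, simplified sketch of that pattern is shown below; the notifier callback signature and BUS_NOTIFY_ADD_DEVICE are the stock driver-core interfaces, while example_attach_iommu_dma_ops() stands in for the arch-specific attach path and is hypothetical:

#include <linux/device.h>
#include <linux/notifier.h>

static int iommu_dma_ops_notifier_example(struct notifier_block *nb,
					  unsigned long action, void *data)
{
	struct device *dev = data;

	/* Only act once the device is known to its bus and its IOMMU group */
	if (action == BUS_NOTIFY_ADD_DEVICE)
		example_attach_iommu_dma_ops(dev);	/* hypothetical helper */

	return NOTIFY_OK;
}

Hooking a notifier of this kind onto pci_bus_type, in addition to the platform and AMBA buses, is what the hunk below does.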
Acked-by: Catalin Marinas Signed-off-by: Robin Murphy Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index a6e757c..607e709 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -933,6 +933,10 @@ static int __init __iommu_dma_init(void) ret = register_iommu_dma_ops_notifier(&platform_bus_type); if (!ret) ret = register_iommu_dma_ops_notifier(&amba_bustype); +#ifdef CONFIG_PCI + if (!ret) + ret = register_iommu_dma_ops_notifier(&pci_bus_type); +#endif /* handle devices queued before this arch_initcall */ if (!ret) -- cgit v0.10.2 From 921b1f52c942ad2da971c38f54a2045ec7cfe9cb Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 19 Apr 2016 17:01:32 +0100 Subject: arm64/dma-mapping: Remove default domain workaround With the IOMMU core now taking care of default domains for groups regardless of bus type, we can gleefully rip out this stop-gap, as slight recompense for having to expand the other one. Acked-by: Catalin Marinas Signed-off-by: Robin Murphy Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 607e709..fd8b942 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -804,57 +804,24 @@ struct iommu_dma_notifier_data { static LIST_HEAD(iommu_dma_masters); static DEFINE_MUTEX(iommu_dma_notifier_lock); -/* - * Temporarily "borrow" a domain feature flag to to tell if we had to resort - * to creating our own domain here, in case we need to clean it up again. - */ -#define __IOMMU_DOMAIN_FAKE_DEFAULT (1U << 31) - static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops, u64 dma_base, u64 size) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); /* - * Best case: The device is either part of a group which was - * already attached to a domain in a previous call, or it's - * been put in a default DMA domain by the IOMMU core. + * If the IOMMU driver has the DMA domain support that we require, + * then the IOMMU core will have already configured a group for this + * device, and allocated the default domain for that group. */ - if (!domain) { - /* - * Urgh. The IOMMU core isn't going to do default domains - * for non-PCI devices anyway, until it has some means of - * abstracting the entirely implementation-specific - * sideband data/SoC topology/unicorn dust that may or - * may not differentiate upstream masters. - * So until then, HORRIBLE HACKS! 
- */ - domain = ops->domain_alloc(IOMMU_DOMAIN_DMA); - if (!domain) - goto out_no_domain; - - domain->ops = ops; - domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT; - - if (iommu_attach_device(domain, dev)) - goto out_put_domain; + if (!domain || iommu_dma_init_domain(domain, dma_base, size)) { + pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n", + dev_name(dev)); + return false; } - if (iommu_dma_init_domain(domain, dma_base, size)) - goto out_detach; - dev->archdata.dma_ops = &iommu_dma_ops; return true; - -out_detach: - iommu_detach_device(domain, dev); -out_put_domain: - if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT) - iommu_domain_free(domain); -out_no_domain: - pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n", - dev_name(dev)); - return false; } static void queue_iommu_attach(struct device *dev, const struct iommu_ops *ops, @@ -971,11 +938,8 @@ void arch_teardown_dma_ops(struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - if (domain) { + if (WARN_ON(domain)) iommu_detach_device(domain, dev); - if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT) - iommu_domain_free(domain); - } dev->archdata.dma_ops = NULL; } -- cgit v0.10.2 From 046982c760bcbb0d92981a7be577dbc081323a78 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 18:04:55 +0200 Subject: nios2: use correct void* return type for page_to_virt() To align with other architectures, the expression produced by expanding the macro page_to_virt() should be of type void*, since it returns a virtual address. Fix that, and also fix up an instance where page_to_virt was expected to return 'unsigned long', and drop another instance that was entirely unused (page_to_bus) Acked-by: Andrew Morton Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/nios2/include/asm/io.h b/arch/nios2/include/asm/io.h index c5a62da..ce072ba 100644 --- a/arch/nios2/include/asm/io.h +++ b/arch/nios2/include/asm/io.h @@ -50,7 +50,6 @@ static inline void iounmap(void __iomem *addr) /* Pages to physical address... */ #define page_to_phys(page) virt_to_phys(page_to_virt(page)) -#define page_to_bus(page) page_to_virt(page) /* Macros used for converting between virtual and physical mappings. 
*/ #define phys_to_virt(vaddr) \ diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 4b32d6f..c1683f5 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -84,7 +84,7 @@ extern struct page *mem_map; ((void *)((unsigned long)(x) + PAGE_OFFSET - PHYS_OFFSET)) #define page_to_virt(page) \ - ((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) + ((void *)(((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) # define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) # define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && \ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index a213e8c..298393c 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -209,7 +209,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - unsigned long paddr = page_to_virt(pte_page(pteval)); + unsigned long paddr = (unsigned long)page_to_virt(pte_page(pteval)); flush_dcache_range(paddr, paddr + PAGE_SIZE); set_pte(ptep, pteval); -- cgit v0.10.2 From 86d618cd1d9fdf4b46b68eb014d4fbde08c4ae62 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 18:04:56 +0200 Subject: openrisc: drop wrongly typed definition of page_to_virt() To align with generic code and other architectures that expect the macro page_to_virt to produce an expression whose type is 'void*', drop the arch specific definition, which is never referenced anyway. Acked-by: Andrew Morton Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index e613d36..35bcb7c 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -81,8 +81,6 @@ typedef struct page *pgtable_t; #define virt_to_page(addr) \ (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) -#define page_to_virt(page) \ - ((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -- cgit v0.10.2 From 1dff8083a024650c75a9c961c38082473ceae8cf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 18:04:57 +0200 Subject: mm: replace open coded page to virt conversion with page_to_virt() The open coded conversion from struct page address to virtual address in lowmem_page_address() involves an intermediate conversion step to pfn number/physical address. Since the placement of the struct page array relative to the linear mapping may be completely independent from the placement of physical RAM (as is that case for arm64 after commit dfd55ad85e 'arm64: vmemmap: use virtual projection of linear region'), the conversion to physical address and back again should factor out of the equation, but unfortunately, the shifting and pointer arithmetic involved prevent this from happening, and the resulting calculation essentially subtracts the address of the start of physical memory and adds it back again, in a way that prevents the compiler from optimizing it away. Since the start of physical memory is not a build time constant on arm64, the resulting conversion involves an unnecessary memory access, which we would like to get rid of. So replace the open coded conversion with a call to page_to_virt(), and use the open coded conversion as its default definition, to be overriden by the architecture, if desired. 
The existing arch specific definitions of page_to_virt are all equivalent to this default definition, so by itself this patch is a no-op. Acked-by: Andrew Morton Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/include/linux/mm.h b/include/linux/mm.h index ffcff53..1c1e921 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -72,6 +72,10 @@ extern int mmap_rnd_compat_bits __read_mostly; #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif +#ifndef page_to_virt +#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) +#endif + /* * To prevent common memory management code establishing * a zero page mapping on a read fault. @@ -948,7 +952,7 @@ static inline struct mem_cgroup *page_memcg(struct page *page) static __always_inline void *lowmem_page_address(const struct page *page) { - return __va(PFN_PHYS(page_to_pfn(page))); + return page_to_virt(page); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) -- cgit v0.10.2 From c8f8cca483aa3ec1beb63ee8633de9c76bb3fe6f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 22 Apr 2016 18:48:03 +0200 Subject: arm64: ptdump: use static initializers for vmemmap region boundaries There is no need to initialize the vmemmap region boundaries dynamically, since they are compile time constants. So just add these constants to the global struct initializer, and drop the dynamic assignment and related code. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index a21f4742..4934611 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -32,37 +32,21 @@ struct addr_marker { const char *name; }; -enum address_markers_idx { - MODULES_START_NR = 0, - MODULES_END_NR, - VMALLOC_START_NR, - VMALLOC_END_NR, - FIXADDR_START_NR, - FIXADDR_END_NR, - PCI_START_NR, - PCI_END_NR, +static const struct addr_marker address_markers[] = { + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, + { VMALLOC_START, "vmalloc() Area" }, + { VMALLOC_END, "vmalloc() End" }, + { FIXADDR_START, "Fixmap start" }, + { FIXADDR_TOP, "Fixmap end" }, + { PCI_IO_START, "PCI I/O start" }, + { PCI_IO_END, "PCI I/O end" }, #ifdef CONFIG_SPARSEMEM_VMEMMAP - VMEMMAP_START_NR, - VMEMMAP_END_NR, + { VMEMMAP_START, "vmemmap start" }, + { VMEMMAP_START + VMEMMAP_SIZE, "vmemmap end" }, #endif - KERNEL_SPACE_NR, -}; - -static struct addr_marker address_markers[] = { - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { VMALLOC_START, "vmalloc() Area" }, - { VMALLOC_END, "vmalloc() End" }, - { FIXADDR_START, "Fixmap start" }, - { FIXADDR_TOP, "Fixmap end" }, - { PCI_IO_START, "PCI I/O start" }, - { PCI_IO_END, "PCI I/O end" }, -#ifdef CONFIG_SPARSEMEM_VMEMMAP - { 0, "vmemmap start" }, - { 0, "vmemmap end" }, -#endif - { PAGE_OFFSET, "Linear Mapping" }, - { -1, NULL }, + { PAGE_OFFSET, "Linear Mapping" }, + { -1, NULL }, }; /* @@ -347,13 +331,6 @@ static int ptdump_init(void) for (j = 0; j < pg_level[i].num; j++) pg_level[i].mask |= pg_level[i].bits[j].mask; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - address_markers[VMEMMAP_START_NR].start_address = - (unsigned long)virt_to_page(PAGE_OFFSET); - address_markers[VMEMMAP_END_NR].start_address = - (unsigned long)virt_to_page(high_memory); -#endif - pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); return pe ? 
0 : -ENOMEM; -- cgit v0.10.2 From d8fc68a04d2ffa7327a5fd89d4cd9f2fe24659d3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 22 Apr 2016 18:48:04 +0200 Subject: arm64: ptdump: add region marker for kasan shadow region Annotate the KASAN shadow region with boundary markers, so that its mappings stand out in the page table dumper output. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 4934611..8404190 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,10 @@ struct addr_marker { }; static const struct addr_marker address_markers[] = { +#ifdef CONFIG_KASAN + { KASAN_SHADOW_START, "Kasan shadow start" }, + { KASAN_SHADOW_END, "Kasan shadow end" }, +#endif { MODULES_VADDR, "Modules start" }, { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() Area" }, -- cgit v0.10.2 From 2366c7fdb59812bbd1382afed69e250582c64187 Mon Sep 17 00:00:00 2001 From: Shannon Zhao Date: Thu, 7 Apr 2016 20:03:29 +0800 Subject: ARM64: ACPI: Check if it runs on Xen to enable or disable ACPI When it's a Xen domain0 booting with ACPI, it will supply a /chosen and a /hypervisor node in DT. So check if it needs to enable ACPI. Signed-off-by: Shannon Zhao Reviewed-by: Stefano Stabellini Acked-by: Hanjun Guo Tested-by: Julien Grall Acked-by: Mark Rutland Acked-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index d1ce8e2..57ee317 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -67,10 +67,15 @@ static int __init dt_scan_depth1_nodes(unsigned long node, { /* * Return 1 as soon as we encounter a node at depth 1 that is - * not the /chosen node. + * not the /chosen node, or /hypervisor node with compatible + * string "xen,xen". */ - if (depth == 1 && (strcmp(uname, "chosen") != 0)) - return 1; + if (depth == 1 && (strcmp(uname, "chosen") != 0)) { + if (strcmp(uname, "hypervisor") != 0 || + !of_flat_dt_is_compatible(node, "xen,xen")) + return 1; + } + return 0; } @@ -184,7 +189,8 @@ void __init acpi_boot_table_init(void) /* * Enable ACPI instead of device tree unless * - ACPI has been disabled explicitly (acpi=off), or - * - the device tree is not empty (it has more than just a /chosen node) + * - the device tree is not empty (it has more than just a /chosen node, + * and a /hypervisor node when running on Xen) * and ACPI has not been force enabled (acpi=force) */ if (param_acpi_off || -- cgit v0.10.2 From 9981293fb0f02e6127d6ab00218bf091f9f6c89b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Apr 2016 11:25:11 +0100 Subject: arm64: make dt_scan_depth1_nodes more readable Improve the readability of dt_scan_depth1_nodes by removing the nested conditionals. Signed-off-by: Mark Rutland Signed-off-by: Stefano Stabellini Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 57ee317..6884c76 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -66,17 +66,24 @@ static int __init dt_scan_depth1_nodes(unsigned long node, void *data) { /* - * Return 1 as soon as we encounter a node at depth 1 that is - * not the /chosen node, or /hypervisor node with compatible - * string "xen,xen". + * Ignore anything not directly under the root node; we'll + * catch its parent instead. 
*/ - if (depth == 1 && (strcmp(uname, "chosen") != 0)) { - if (strcmp(uname, "hypervisor") != 0 || - !of_flat_dt_is_compatible(node, "xen,xen")) - return 1; - } + if (depth != 1) + return 0; - return 0; + if (strcmp(uname, "chosen") == 0) + return 0; + + if (strcmp(uname, "hypervisor") == 0 && + of_flat_dt_is_compatible(node, "xen,xen")) + return 0; + + /* + * This node at depth 1 is neither a chosen node nor a xen node, + * which we do not expect. + */ + return 1; } /* -- cgit v0.10.2 From 92406f0cc9e3d5cc77bf3de6d68c9c2373dcd701 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Fri, 22 Apr 2016 12:25:31 +0100 Subject: arm64: cpufeature: Add scope for capability check Add scope parameter to the arm64_cpu_capabilities::matches(), so that this can be reused for checking the capability on a given CPU vs the system wide. The system uses the default scope associated with the capability for initialising the CPU_HWCAPs and ELF_HWCAPs. Cc: James Morse Cc: Marc Zyngier Cc: Andre Przywara Cc: Will Deacon Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ca8fb4b..e501e4a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -78,10 +78,17 @@ struct arm64_ftr_reg { struct arm64_ftr_bits *ftr_bits; }; +/* scope of capability check */ +enum { + SCOPE_SYSTEM, + SCOPE_LOCAL_CPU, +}; + struct arm64_cpu_capabilities { const char *desc; u16 capability; - bool (*matches)(const struct arm64_cpu_capabilities *); + int def_scope; /* default scope */ + bool (*matches)(const struct arm64_cpu_capabilities *caps, int scope); void (*enable)(void *); /* Called on all active CPUs */ union { struct { /* To be used for erratum handling only */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 06afd04..2fdecd7 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -22,14 +22,16 @@ #include static bool __maybe_unused -is_affected_midr_range(const struct arm64_cpu_capabilities *entry) +is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) { + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model, entry->midr_range_min, entry->midr_range_max); } #define MIDR_RANGE(model, min, max) \ + .def_scope = SCOPE_LOCAL_CPU, \ .matches = is_affected_midr_range, \ .midr_model = model, \ .midr_range_min = min, \ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8e62d61..54abd9b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -71,7 +71,8 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); /* meta feature for alternatives */ static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry); +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); + static struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0), @@ -626,6 +627,49 @@ u64 read_system_reg(u32 id) return regp->sys_val; } +/* + * __raw_read_system_reg() - Used by a STARTING cpu before cpuinfo is populated. 
+ * Read the system register on the current CPU + */ +static u64 __raw_read_system_reg(u32 sys_id) +{ + switch (sys_id) { + case SYS_ID_PFR0_EL1: return read_cpuid(ID_PFR0_EL1); + case SYS_ID_PFR1_EL1: return read_cpuid(ID_PFR1_EL1); + case SYS_ID_DFR0_EL1: return read_cpuid(ID_DFR0_EL1); + case SYS_ID_MMFR0_EL1: return read_cpuid(ID_MMFR0_EL1); + case SYS_ID_MMFR1_EL1: return read_cpuid(ID_MMFR1_EL1); + case SYS_ID_MMFR2_EL1: return read_cpuid(ID_MMFR2_EL1); + case SYS_ID_MMFR3_EL1: return read_cpuid(ID_MMFR3_EL1); + case SYS_ID_ISAR0_EL1: return read_cpuid(ID_ISAR0_EL1); + case SYS_ID_ISAR1_EL1: return read_cpuid(ID_ISAR1_EL1); + case SYS_ID_ISAR2_EL1: return read_cpuid(ID_ISAR2_EL1); + case SYS_ID_ISAR3_EL1: return read_cpuid(ID_ISAR3_EL1); + case SYS_ID_ISAR4_EL1: return read_cpuid(ID_ISAR4_EL1); + case SYS_ID_ISAR5_EL1: return read_cpuid(ID_ISAR4_EL1); + case SYS_MVFR0_EL1: return read_cpuid(MVFR0_EL1); + case SYS_MVFR1_EL1: return read_cpuid(MVFR1_EL1); + case SYS_MVFR2_EL1: return read_cpuid(MVFR2_EL1); + + case SYS_ID_AA64PFR0_EL1: return read_cpuid(ID_AA64PFR0_EL1); + case SYS_ID_AA64PFR1_EL1: return read_cpuid(ID_AA64PFR0_EL1); + case SYS_ID_AA64DFR0_EL1: return read_cpuid(ID_AA64DFR0_EL1); + case SYS_ID_AA64DFR1_EL1: return read_cpuid(ID_AA64DFR0_EL1); + case SYS_ID_AA64MMFR0_EL1: return read_cpuid(ID_AA64MMFR0_EL1); + case SYS_ID_AA64MMFR1_EL1: return read_cpuid(ID_AA64MMFR1_EL1); + case SYS_ID_AA64MMFR2_EL1: return read_cpuid(ID_AA64MMFR2_EL1); + case SYS_ID_AA64ISAR0_EL1: return read_cpuid(ID_AA64ISAR0_EL1); + case SYS_ID_AA64ISAR1_EL1: return read_cpuid(ID_AA64ISAR1_EL1); + + case SYS_CNTFRQ_EL0: return read_cpuid(CNTFRQ_EL0); + case SYS_CTR_EL0: return read_cpuid(CTR_EL0); + case SYS_DCZID_EL0: return read_cpuid(DCZID_EL0); + default: + BUG(); + return 0; + } +} + #include static bool @@ -637,19 +681,24 @@ feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) } static bool -has_cpuid_feature(const struct arm64_cpu_capabilities *entry) +has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) { u64 val; - val = read_system_reg(entry->sys_reg); + WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); + if (scope == SCOPE_SYSTEM) + val = read_system_reg(entry->sys_reg); + else + val = __raw_read_system_reg(entry->sys_reg); + return feature_matches(val, entry); } -static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) +static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope) { bool has_sre; - if (!has_cpuid_feature(entry)) + if (!has_cpuid_feature(entry, scope)) return false; has_sre = gic_enable_sre(); @@ -660,7 +709,7 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) return has_sre; } -static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) +static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int __unused) { u32 midr = read_cpuid_id(); u32 rv_min, rv_max; @@ -672,7 +721,7 @@ static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max); } -static bool runs_at_el2(const struct arm64_cpu_capabilities *entry) +static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused) { return is_kernel_in_hyp_mode(); } @@ -681,6 +730,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", .capability = ARM64_HAS_SYSREG_GIC_CPUIF, + .def_scope = SCOPE_SYSTEM, .matches = 
has_useable_gicv3_cpuif, .sys_reg = SYS_ID_AA64PFR0_EL1, .field_pos = ID_AA64PFR0_GIC_SHIFT, @@ -691,6 +741,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Privileged Access Never", .capability = ARM64_HAS_PAN, + .def_scope = SCOPE_SYSTEM, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64MMFR1_EL1, .field_pos = ID_AA64MMFR1_PAN_SHIFT, @@ -703,6 +754,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "LSE atomic instructions", .capability = ARM64_HAS_LSE_ATOMICS, + .def_scope = SCOPE_SYSTEM, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, .field_pos = ID_AA64ISAR0_ATOMICS_SHIFT, @@ -713,12 +765,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Software prefetching using PRFM", .capability = ARM64_HAS_NO_HW_PREFETCH, + .def_scope = SCOPE_SYSTEM, .matches = has_no_hw_prefetch, }, #ifdef CONFIG_ARM64_UAO { .desc = "User Access Override", .capability = ARM64_HAS_UAO, + .def_scope = SCOPE_SYSTEM, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64MMFR2_EL1, .field_pos = ID_AA64MMFR2_UAO_SHIFT, @@ -729,17 +783,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = { #ifdef CONFIG_ARM64_PAN { .capability = ARM64_ALT_PAN_NOT_UAO, + .def_scope = SCOPE_SYSTEM, .matches = cpufeature_pan_not_uao, }, #endif /* CONFIG_ARM64_PAN */ { .desc = "Virtualization Host Extensions", .capability = ARM64_HAS_VIRT_HOST_EXTN, + .def_scope = SCOPE_SYSTEM, .matches = runs_at_el2, }, { .desc = "32-bit EL0 Support", .capability = ARM64_HAS_32BIT_EL0, + .def_scope = SCOPE_SYSTEM, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, @@ -752,6 +809,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { #define HWCAP_CAP(reg, field, s, min_value, type, cap) \ { \ .desc = #cap, \ + .def_scope = SCOPE_SYSTEM, \ .matches = has_cpuid_feature, \ .sys_reg = reg, \ .field_pos = field, \ @@ -834,7 +892,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { for (; hwcaps->matches; hwcaps++) - if (hwcaps->matches(hwcaps)) + if (hwcaps->matches(hwcaps, hwcaps->def_scope)) cap_set_elf_hwcap(hwcaps); } @@ -842,7 +900,7 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info) { for (; caps->matches; caps++) { - if (!caps->matches(caps)) + if (!caps->matches(caps, caps->def_scope)) continue; if (!cpus_have_cap(caps->capability) && caps->desc) @@ -879,48 +937,6 @@ static inline void set_sys_caps_initialised(void) } /* - * __raw_read_system_reg() - Used by a STARTING cpu before cpuinfo is populated. 
- */ -static u64 __raw_read_system_reg(u32 sys_id) -{ - switch (sys_id) { - case SYS_ID_PFR0_EL1: return read_cpuid(ID_PFR0_EL1); - case SYS_ID_PFR1_EL1: return read_cpuid(ID_PFR1_EL1); - case SYS_ID_DFR0_EL1: return read_cpuid(ID_DFR0_EL1); - case SYS_ID_MMFR0_EL1: return read_cpuid(ID_MMFR0_EL1); - case SYS_ID_MMFR1_EL1: return read_cpuid(ID_MMFR1_EL1); - case SYS_ID_MMFR2_EL1: return read_cpuid(ID_MMFR2_EL1); - case SYS_ID_MMFR3_EL1: return read_cpuid(ID_MMFR3_EL1); - case SYS_ID_ISAR0_EL1: return read_cpuid(ID_ISAR0_EL1); - case SYS_ID_ISAR1_EL1: return read_cpuid(ID_ISAR1_EL1); - case SYS_ID_ISAR2_EL1: return read_cpuid(ID_ISAR2_EL1); - case SYS_ID_ISAR3_EL1: return read_cpuid(ID_ISAR3_EL1); - case SYS_ID_ISAR4_EL1: return read_cpuid(ID_ISAR4_EL1); - case SYS_ID_ISAR5_EL1: return read_cpuid(ID_ISAR4_EL1); - case SYS_MVFR0_EL1: return read_cpuid(MVFR0_EL1); - case SYS_MVFR1_EL1: return read_cpuid(MVFR1_EL1); - case SYS_MVFR2_EL1: return read_cpuid(MVFR2_EL1); - - case SYS_ID_AA64PFR0_EL1: return read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64PFR1_EL1: return read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64DFR0_EL1: return read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64DFR1_EL1: return read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64MMFR0_EL1: return read_cpuid(ID_AA64MMFR0_EL1); - case SYS_ID_AA64MMFR1_EL1: return read_cpuid(ID_AA64MMFR1_EL1); - case SYS_ID_AA64MMFR2_EL1: return read_cpuid(ID_AA64MMFR2_EL1); - case SYS_ID_AA64ISAR0_EL1: return read_cpuid(ID_AA64ISAR0_EL1); - case SYS_ID_AA64ISAR1_EL1: return read_cpuid(ID_AA64ISAR1_EL1); - - case SYS_CNTFRQ_EL0: return read_cpuid(CNTFRQ_EL0); - case SYS_CTR_EL0: return read_cpuid(CTR_EL0); - case SYS_DCZID_EL0: return read_cpuid(DCZID_EL0); - default: - BUG(); - return 0; - } -} - -/* * Check for CPU features that are used in early boot * based on the Boot CPU value. */ @@ -934,28 +950,25 @@ static void verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) { - for (; caps->matches; caps++) { - if (!cpus_have_elf_hwcap(caps)) - continue; - if (!feature_matches(__raw_read_system_reg(caps->sys_reg), caps)) { + for (; caps->matches; caps++) + if (cpus_have_elf_hwcap(caps) && !caps->matches(caps, SCOPE_LOCAL_CPU)) { pr_crit("CPU%d: missing HWCAP: %s\n", smp_processor_id(), caps->desc); cpu_die_early(); } - } } static void verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) { - if (!cpus_have_cap(caps->capability) || !caps->sys_reg) + if (!cpus_have_cap(caps->capability)) continue; /* * If the new CPU misses an advertised feature, we cannot proceed * further, park the cpu. */ - if (!feature_matches(__raw_read_system_reg(caps->sys_reg), caps)) { + if (!caps->matches(caps, SCOPE_LOCAL_CPU)) { pr_crit("CPU%d: missing feature: %s\n", smp_processor_id(), caps->desc); cpu_die_early(); @@ -1026,7 +1039,7 @@ void __init setup_cpu_features(void) } static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry) +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) { return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); } -- cgit v0.10.2 From e3661b128e53ee281e1e7c589a5b647890bd6d7c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Apr 2016 12:25:32 +0100 Subject: arm64: Allow a capability to be checked on a single CPU Now that the capabilities are only available once all the CPUs have booted, we're unable to check for a particular feature in any subsystem that gets initialized before then. 
In order to support this, introduce a local_cpu_has_cap() function that tests for the presence of a given capability independently of the whole framework. Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier [ Added preemptible() check ] Signed-off-by: Suzuki K Poulose [will: remove duplicate initialisation of caps in this_cpu_has_cap] Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index e501e4a..d39db63 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -109,6 +109,8 @@ struct arm64_cpu_capabilities { extern DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +bool this_cpu_has_cap(unsigned int cap); + static inline bool cpu_have_feature(unsigned int num) { return elf_hwcap & (1UL << num); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 54abd9b..2b09dc4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1010,6 +1010,24 @@ static void __init setup_feature_capabilities(void) enable_cpu_capabilities(arm64_features); } +/* + * Check if the current CPU has a given feature capability. + * Should be called from non-preemptible context. + */ +bool this_cpu_has_cap(unsigned int cap) +{ + const struct arm64_cpu_capabilities *caps; + + if (WARN_ON(preemptible())) + return false; + + for (caps = arm64_features; caps->desc; caps++) + if (caps->capability == cap && caps->matches) + return caps->matches(caps, SCOPE_LOCAL_CPU); + + return false; +} + void __init setup_cpu_features(void) { u32 cwg; -- cgit v0.10.2 From 25fc11aead380501d70b701e136e89d321277177 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Apr 2016 12:25:33 +0100 Subject: irqchip/gic: Restore CPU interface checking When introducing the whole CPU feature detection framework, we lost the capability to detect a mismatched GIC configuration (using the GICv2 MMIO interface, but having the system register interface enabled). In order to solve this, use the new this_cpu_has_cap() helper. Also move the check to the CPU interface path in order to catch systems where the first CPU has been correctly configured, but the secondaries are not. Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 282344b..095bb5b 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -55,7 +55,7 @@ static void gic_check_cpu_features(void) { - WARN_TAINT_ONCE(cpus_have_cap(ARM64_HAS_SYSREG_GIC_CPUIF), + WARN_TAINT_ONCE(this_cpu_has_cap(ARM64_HAS_SYSREG_GIC_CPUIF), TAINT_CPU_OUT_OF_SPEC, "GICv3 system registers enabled, broken firmware!\n"); } @@ -490,6 +490,7 @@ static void gic_cpu_init(struct gic_chip_data *gic) * Get what the GIC says our CPU mask is. */ BUG_ON(cpu >= NR_GIC_CPU_IF); + gic_check_cpu_features(); cpu_mask = gic_get_cpumask(gic); gic_cpu_map[cpu] = cpu_mask; @@ -1021,8 +1022,6 @@ static void __init __gic_init_bases(unsigned int gic_nr, int irq_start, BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR); - gic_check_cpu_features(); - gic = &gic_data[gic_nr]; /* Initialize irq_chip */ -- cgit v0.10.2 From 6a6efbb45b7d95c84840010095367eb06a64f342 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Fri, 22 Apr 2016 12:25:34 +0100 Subject: arm64: Verify CPU errata work arounds on hotplugged CPU CPU Errata work arounds are detected and applied to the kernel code at boot time and the data is then freed up. 
If a new hotplugged CPU requires a work around which was not applied at boot time, there is nothing we can do but simply fail the booting. Cc: Will Deacon Cc: Mark Rutland Cc: Andre Przywara Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index d39db63..224efe7 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -193,6 +193,7 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info); void check_local_cpu_errata(void); +void verify_local_cpu_errata(void); void verify_local_cpu_capabilities(void); u64 read_system_reg(u32 id); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 2fdecd7..d427894 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -103,6 +103,26 @@ const struct arm64_cpu_capabilities arm64_errata[] = { } }; +/* + * The CPU Errata work arounds are detected and applied at boot time + * and the related information is freed soon after. If the new CPU requires + * an errata not detected at boot, fail this CPU. + */ +void verify_local_cpu_errata(void) +{ + const struct arm64_cpu_capabilities *caps = arm64_errata; + + for (; caps->matches; caps++) + if (!cpus_have_cap(caps->capability) && + caps->matches(caps, SCOPE_LOCAL_CPU)) { + pr_crit("CPU%d: Requires work around for %s, not detected" + " at boot time\n", + smp_processor_id(), + caps->desc ? : "an erratum"); + cpu_die_early(); + } +} + void check_local_cpu_errata(void) { update_cpu_capabilities(arm64_errata, "enabling workaround for"); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 2b09dc4..811773d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -998,6 +998,7 @@ void verify_local_cpu_capabilities(void) if (!sys_caps_initialised) return; + verify_local_cpu_errata(); verify_local_cpu_features(arm64_features); verify_local_elf_hwcaps(arm64_elf_hwcaps); if (system_supports_32bit_el0()) -- cgit v0.10.2 From 44dbcc93ab67145bf8ebe3e91123303443c4a3bf Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Fri, 22 Apr 2016 12:25:35 +0100 Subject: arm64: Fix behavior of maxcpus=N maxcpu=n sets the number of CPUs activated at boot time to a max of n, but allowing the remaining CPUs to be brought up later if the user decides to do so. However, on arm64 due to various reasons, we disallowed hotplugging CPUs beyond n, by marking them not present. Now that we have checks in place to make sure the hotplugged CPUs have compatible features with system and requires no new errata, relax the restriction. Cc: Will Deacon Cc: Mark Rutland Cc: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index fef2e73..dc96475 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -689,33 +689,18 @@ void __init smp_init_cpus(void) void __init smp_prepare_cpus(unsigned int max_cpus) { int err; - unsigned int cpu, ncores = num_possible_cpus(); + unsigned int cpu; init_cpu_topology(); smp_store_cpu_info(smp_processor_id()); /* - * are we trying to boot more cores than exist? 
- */ - if (max_cpus > ncores) - max_cpus = ncores; - - /* Don't bother if we're effectively UP */ - if (max_cpus <= 1) - return; - - /* * Initialise the present map (which describes the set of CPUs * actually populated at the present time) and release the * secondaries from the bootloader. - * - * Make sure we online at most (max_cpus - 1) additional CPUs. */ - max_cpus--; for_each_possible_cpu(cpu) { - if (max_cpus == 0) - break; if (cpu == smp_processor_id()) continue; @@ -728,7 +713,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) continue; set_cpu_present(cpu, true); - max_cpus--; } } -- cgit v0.10.2 From 190c056fc32a528979807cb5d5a0d68285933073 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:41 +0200 Subject: arm64: kernel: don't export local symbols from head.S This unexports some symbols from head.S that are only used locally. Acked-by: Catalin Marinas Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b434176..ac27d8d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -638,7 +638,7 @@ ENDPROC(el2_setup) * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed * in x20. See arch/arm64/include/asm/virt.h for more info. */ -ENTRY(set_cpu_boot_mode_flag) +set_cpu_boot_mode_flag: adr_l x1, __boot_cpu_mode cmp w20, #BOOT_CPU_MODE_EL2 b.ne 1f @@ -691,7 +691,7 @@ ENTRY(secondary_entry) b secondary_startup ENDPROC(secondary_entry) -ENTRY(secondary_startup) +secondary_startup: /* * Common entry point for secondary CPUs. */ @@ -706,7 +706,7 @@ ENTRY(secondary_startup) ENDPROC(secondary_startup) 0: .long (_text - TEXT_OFFSET) - __secondary_switched -ENTRY(__secondary_switched) +__secondary_switched: adr_l x5, vectors msr vbar_el1, x5 isb -- cgit v0.10.2 From e5ebeec879b726c755af0c1c15f3699b53268cd5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:42 +0200 Subject: arm64: kernel: use literal for relocated address of __secondary_switched We can simply use a relocated 64-bit literal to store the address of __secondary_switched(), and the relocation code will ensure that it holds the correct value at secondary entry time, as long as we make sure that the literal is not dereferenced until after we have enabled the MMU. So jump via a small __secondary_switch() function covered by the ID map that performs the literal load and branch-to-register. 
Acked-by: Catalin Marinas Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ac27d8d..f13276d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -468,9 +468,7 @@ __mmap_switched: str x15, [x11, x23] b 0b -2: adr_l x8, kimage_vaddr // make relocated kimage_vaddr - dc cvac, x8 // value visible to secondaries - dsb sy // with MMU off +2: #endif adr_l sp, initial_sp, x4 @@ -699,12 +697,9 @@ secondary_startup: adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x8, kimage_vaddr - ldr w9, 0f - sub x27, x8, w9, sxtw // address to jump to after enabling the MMU + adr_l x27, __secondary_switch // address to jump to after enabling the MMU b __enable_mmu ENDPROC(secondary_startup) -0: .long (_text - TEXT_OFFSET) - __secondary_switched __secondary_switched: adr_l x5, vectors @@ -806,3 +801,8 @@ __no_granule_support: wfi b 1b ENDPROC(__no_granule_support) + +__secondary_switch: + ldr x8, =__secondary_switched + br x8 +ENDPROC(__secondary_switch) -- cgit v0.10.2 From 0cd3defe0af4153ffc5fe39bcfa4abfc301984e9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:43 +0200 Subject: arm64: kernel: perform relocation processing from ID map Refactor the relocation processing so that the code executes from the ID map while accessing the relocation tables via the virtual mapping. This way, we can use literals containing virtual addresses as before, instead of having to use convoluted absolute expressions. For symmetry with the secondary code path, the relocation code and the subsequent jump to the virtual entry point are implemented in a function called __primary_switch(), and __mmap_switched() is renamed to __primary_switched(). Also, the call sequence in stext() is aligned with the one in secondary_startup(), by replacing the awkward 'adr_l lr' and 'b cpu_setup' sequence with a simple branch and link. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index f13276d..0d487d9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -223,13 +223,11 @@ ENTRY(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - ldr x27, 0f // address to jump to after - neg x27, x27 // MMU has been enabled - adr_l lr, __enable_mmu // return (PIC) address - b __cpu_setup // initialise processor + bl __cpu_setup // initialise processor + adr_l x27, __primary_switch // address to jump to after + // MMU has been enabled + b __enable_mmu ENDPROC(stext) - .align 3 -0: .quad (_text - TEXT_OFFSET) - __mmap_switched - KIMAGE_VADDR /* * Preserve the arguments passed by the bootloader in x0 .. x3 @@ -421,7 +419,7 @@ ENDPROC(__create_page_tables) * The following fragment of code is executed with the MMU enabled. */ .set initial_sp, init_thread_union + THREAD_START_SP -__mmap_switched: +__primary_switched: mov x28, lr // preserve LR adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address @@ -435,42 +433,6 @@ __mmap_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW -#ifdef CONFIG_RELOCATABLE - - /* - * Iterate over each entry in the relocation table, and apply the - * relocations in place. 
- */ - adr_l x8, __dynsym_start // start of symbol table - adr_l x9, __reloc_start // start of reloc table - adr_l x10, __reloc_end // end of reloc table - -0: cmp x9, x10 - b.hs 2f - ldp x11, x12, [x9], #24 - ldr x13, [x9, #-8] - cmp w12, #R_AARCH64_RELATIVE - b.ne 1f - add x13, x13, x23 // relocate - str x13, [x11, x23] - b 0b - -1: cmp w12, #R_AARCH64_ABS64 - b.ne 0b - add x12, x12, x12, lsl #1 // symtab offset: 24x top word - add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word - ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx - ldr x15, [x12, #8] // Elf64_Sym::st_value - cmp w14, #-0xf // SHN_ABS (0xfff1) ? - add x14, x15, x23 // relocate - csel x15, x14, x15, ne - add x15, x13, x15 - str x15, [x11, x23] - b 0b - -2: -#endif - adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) @@ -496,7 +458,7 @@ __mmap_switched: 0: #endif b start_kernel -ENDPROC(__mmap_switched) +ENDPROC(__primary_switched) /* * end early head section, begin head code that is also used for @@ -788,7 +750,6 @@ __enable_mmu: ic iallu // flush instructions fetched dsb nsh // via old mapping isb - add x27, x27, x23 // relocated __mmap_switched #endif br x27 ENDPROC(__enable_mmu) @@ -802,6 +763,51 @@ __no_granule_support: b 1b ENDPROC(__no_granule_support) +__primary_switch: +#ifdef CONFIG_RELOCATABLE + /* + * Iterate over each entry in the relocation table, and apply the + * relocations in place. + */ + ldr w8, =__dynsym_offset // offset to symbol table + ldr w9, =__rela_offset // offset to reloc table + ldr w10, =__rela_size // size of reloc table + + ldr x11, =KIMAGE_VADDR // default virtual offset + add x11, x11, x23 // actual virtual offset + add x8, x8, x11 // __va(.dynsym) + add x9, x9, x11 // __va(.rela) + add x10, x9, x10 // __va(.rela) + sizeof(.rela) + +0: cmp x9, x10 + b.hs 2f + ldp x11, x12, [x9], #24 + ldr x13, [x9, #-8] + cmp w12, #R_AARCH64_RELATIVE + b.ne 1f + add x13, x13, x23 // relocate + str x13, [x11, x23] + b 0b + +1: cmp w12, #R_AARCH64_ABS64 + b.ne 0b + add x12, x12, x12, lsl #1 // symtab offset: 24x top word + add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word + ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx + ldr x15, [x12, #8] // Elf64_Sym::st_value + cmp w14, #-0xf // SHN_ABS (0xfff1) ? + add x14, x15, x23 // relocate + csel x15, x14, x15, ne + add x15, x13, x15 + str x15, [x11, x23] + b 0b + +2: +#endif + ldr x8, =__primary_switched + br x8 +ENDPROC(__primary_switch) + __secondary_switch: ldr x8, =__secondary_switched br x8 diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 77d86c9..8918b30 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -158,12 +158,9 @@ SECTIONS *(.altinstr_replacement) } .rela : ALIGN(8) { - __reloc_start = .; *(.rela .rela*) - __reloc_end = .; } .dynsym : ALIGN(8) { - __dynsym_start = .; *(.dynsym) } .dynstr : { @@ -173,6 +170,10 @@ SECTIONS *(.hash) } + __rela_offset = ADDR(.rela) - KIMAGE_VADDR; + __rela_size = SIZEOF(.rela); + __dynsym_offset = ADDR(.dynsym) - KIMAGE_VADDR; + . 
= ALIGN(SEGMENT_ALIGN); __init_end = .; -- cgit v0.10.2 From 30b5ba5cf333cc650e474eaf2cc1ae91bc7cf89f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:44 +0200 Subject: arm64: introduce mov_q macro to move a constant into a 64-bit register Implement a macro mov_q that can be used to move an immediate constant into a 64-bit register, using between 2 and 4 movz/movk instructions (depending on the operand) Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 972fb55..f9b6f7a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -221,4 +221,24 @@ lr .req x30 // link register .long \sym\()_hi32 .endm + /* + * mov_q - move an immediate constant into a 64-bit register using + * between 2 and 4 movz/movk instructions (depending on the + * magnitude and sign of the operand) + */ + .macro mov_q, reg, val + .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff) + movz \reg, :abs_g1_s:\val + .else + .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff) + movz \reg, :abs_g2_s:\val + .else + movz \reg, :abs_g3:\val + movk \reg, :abs_g2_nc:\val + .endif + movk \reg, :abs_g1_nc:\val + .endif + movk \reg, :abs_g0_nc:\val + .endm + #endif /* __ASM_ASSEMBLER_H */ -- cgit v0.10.2 From b03cc885328e3c0de61843737d42eb0a0f112aab Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:45 +0200 Subject: arm64: kernel: replace early 64-bit literal loads with move-immediates When building a relocatable kernel, we currently rely on the fact that early 64-bit literal loads need to be deferred to after the relocation has been performed only if they involve symbol references, and not if they involve assemble time constants. While this is not an unreasonable assumption to make, it is better to switch to movk/movz sequences, since these are guaranteed to be resolved at link time, simply because there are no dynamic relocation types to describe them. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 0d487d9..dae9cab 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -337,7 +337,7 @@ __create_page_tables: cmp x0, x6 b.lo 1b - ldr x7, =SWAPPER_MM_MMUFLAGS + mov x7, SWAPPER_MM_MMUFLAGS /* * Create the identity mapping. @@ -393,7 +393,7 @@ __create_page_tables: * Map the kernel image (starting with PHYS_OFFSET). 
*/ mov x0, x26 // swapper_pg_dir - ldr x5, =KIMAGE_VADDR + mov_q x5, KIMAGE_VADDR add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 ldr w6, =kernel_img_size @@ -631,7 +631,7 @@ ENTRY(secondary_holding_pen) bl el2_setup // Drop to EL1, w20=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 - ldr x1, =MPIDR_HWID_BITMASK + mov_q x1, MPIDR_HWID_BITMASK and x0, x0, x1 adr_l x3, secondary_holding_pen_release pen: ldr x4, [x3] @@ -773,7 +773,7 @@ __primary_switch: ldr w9, =__rela_offset // offset to reloc table ldr w10, =__rela_size // size of reloc table - ldr x11, =KIMAGE_VADDR // default virtual offset + mov_q x11, KIMAGE_VADDR // default virtual offset add x11, x11, x23 // actual virtual offset add x8, x8, x11 // __va(.dynsym) add x9, x9, x11 // __va(.rela) -- cgit v0.10.2 From 18b9c0d641938242d8bcdba3c14a8f2beec2a97e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:46 +0200 Subject: arm64: don't map TEXT_OFFSET bytes below the kernel if we can avoid it For historical reasons, the kernel Image must be loaded into physical memory at a 512 KB offset above a 2 MB aligned base address. The region between the base address and the start of the kernel Image has no significance to the kernel itself, but it is currently mapped explicitly into the early kernel VMA range for all translation granules. In some cases (i.e., 4 KB granule), this is unavoidable, due to the 2 MB granularity of the early kernel mappings. However, in other cases, e.g., when running with larger page sizes, or in the future, with more granular KASLR, there is no reason to map it explicitly like we do currently. So update the logic so that the region is mapped only if that happens as a side effect of rounding the start address of the kernel to swapper block size, and leave it unmapped otherwise. Since the symbol kernel_img_size now simply resolves to the memory footprint of the kernel Image, we can drop its definition from image.h and opencode its calculation. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index dae9cab..c5e5edc 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -393,12 +393,13 @@ __create_page_tables: * Map the kernel image (starting with PHYS_OFFSET). 
*/ mov x0, x26 // swapper_pg_dir - mov_q x5, KIMAGE_VADDR + mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 - ldr w6, =kernel_img_size - add x6, x6, x5 - mov x3, x24 // phys offset + adrp x6, _end // runtime __pa(_end) + adrp x3, _text // runtime __pa(_text) + sub x6, x6, x3 // _end - _text + add x6, x6, x5 // runtime __va(_end) create_block_map x0, x7, x3, x5, x6 /* diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 4fd72da..86d444f 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -71,8 +71,6 @@ DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); -kernel_img_size = _end - (_text - TEXT_OFFSET); - #ifdef CONFIG_EFI __efistub_stext_offset = stext - _text; -- cgit v0.10.2 From 08cdac619c81b3fa8cd73aeed2330ffe0a0b73ca Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:47 +0200 Subject: arm64: relocatable: deal with physically misaligned kernel images When booting a relocatable kernel image, there is no practical reason to refuse an image whose load address is not exactly TEXT_OFFSET bytes above a 2 MB aligned base address, as long as the physical and virtual misalignment with respect to the swapper block size are equal, and are both aligned to THREAD_SIZE. Since the virtual misalignment is under our control when we first enter the kernel proper, we can simply choose its value to be equal to the physical misalignment. So treat the misalignment of the physical load address as the initial KASLR offset, and fix up the remaining code to deal with that. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index c5e5edc..00a3210 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -213,8 +214,8 @@ efi_header_end: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode - mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET + and x23, x24, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 /* @@ -449,11 +450,13 @@ __primary_switched: bl kasan_early_init #endif #ifdef CONFIG_RANDOMIZE_BASE - cbnz x23, 0f // already running randomized? + tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? + b.ne 0f mov x0, x21 // pass FDT address in x0 + mov x1, x23 // pass modulo offset in x1 bl kaslr_early_init // parse FDT for KASLR options cbz x0, 0f // KASLR disabled? just proceed - mov x23, x0 // record KASLR offset + orr x23, x23, x0 // record KASLR offset ret x28 // we must enable KASLR, return // to __enable_mmu() 0: diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 5829839..b054691 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -74,7 +74,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, * containing function pointers) to be reinitialized, and zero-initialized * .bss variables will be reset to 0. */ -u64 __init kaslr_early_init(u64 dt_phys) +u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset) { void *fdt; u64 seed, offset, mask, module_range; @@ -132,8 +132,8 @@ u64 __init kaslr_early_init(u64 dt_phys) * boundary (for 4KB/16KB/64KB granule kernels, respectively). 
If this * happens, increase the KASLR offset by the size of the kernel image. */ - if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != - (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) offset = (offset + (u64)(_end - _text)) & mask; if (IS_ENABLED(CONFIG_KASAN)) -- cgit v0.10.2 From 6a1f5471144754f165427a93f35c897f85680594 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 12 Apr 2016 16:09:11 +0200 Subject: arm64: acpi: add acpi=on cmdline option to prefer ACPI boot over DT If both ACPI and DT platform descriptions are available, and the kernel was configured at build time to support both flavours, the default policy is to prefer DT over ACPI, and preferring ACPI over DT while still allowing DT as a fallback is not possible. Since some enterprise features (such as RAS) depend on ACPI, it may be desirable for, e.g., distro installers to prefer ACPI boot but fall back to DT rather than failing completely if no ACPI tables are available. So introduce the 'acpi=on' kernel command line parameter for arm64, which signifies that ACPI should be used if available, and DT should only be used as a fallback. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ecc74fa..748129c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -167,16 +167,18 @@ bytes respectively. Such letter suffixes can also be entirely omitted. acpi= [HW,ACPI,X86,ARM64] Advanced Configuration and Power Interface - Format: { force | off | strict | noirq | rsdt | + Format: { force | on | off | strict | noirq | rsdt | copy_dsdt } force -- enable ACPI if default was off + on -- enable ACPI but allow fallback to DT [arm64] off -- disable ACPI if default was on noirq -- do not use ACPI for IRQ routing strict -- Be less tolerant of platforms that are not strictly ACPI specification compliant. 
rsdt -- prefer RSDT over (default) XSDT copy_dsdt -- copy DSDT to memory - For ARM64, ONLY "acpi=off" or "acpi=force" are available + For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force" + are available See also Documentation/power/runtime_pm.txt, pci=noacpi diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 6884c76..3e4f1a4 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -42,6 +42,7 @@ int acpi_pci_disabled = 1; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); static bool param_acpi_off __initdata; +static bool param_acpi_on __initdata; static bool param_acpi_force __initdata; static int __init parse_acpi(char *arg) @@ -52,6 +53,8 @@ static int __init parse_acpi(char *arg) /* "acpi=off" disables both ACPI table parsing and interpreter */ if (strcmp(arg, "off") == 0) param_acpi_off = true; + else if (strcmp(arg, "on") == 0) /* prefer ACPI over DT */ + param_acpi_on = true; else if (strcmp(arg, "force") == 0) /* force ACPI to be enabled */ param_acpi_force = true; else @@ -198,10 +201,11 @@ void __init acpi_boot_table_init(void) * - ACPI has been disabled explicitly (acpi=off), or * - the device tree is not empty (it has more than just a /chosen node, * and a /hypervisor node when running on Xen) - * and ACPI has not been force enabled (acpi=force) + * and ACPI has not been [force] enabled (acpi=on|force) */ if (param_acpi_off || - (!param_acpi_force && of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) + (!param_acpi_on && !param_acpi_force && + of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) return; /* -- cgit v0.10.2 From 7b7293ae3dbd0a1965bf310b77fed5f9bb37bb93 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:00 +0100 Subject: arm64: Fold proc-macros.S into assembler.h To allow the assembler macros defined in arch/arm64/mm/proc-macros.S to be used outside the mm code move the contents of proc-macros.S to asm/assembler.h. Also, delete proc-macros.S, and fix up all references to proc-macros.S. Signed-off-by: Geoff Levand Acked-by: Pavel Machek [rebased, included dcache_by_line_op] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index f9b6f7a..3b3812a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -1,5 +1,5 @@ /* - * Based on arch/arm/include/asm/assembler.h + * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S * * Copyright (C) 1996-2000 Russell King * Copyright (C) 2012 ARM Ltd. @@ -23,6 +23,8 @@ #ifndef __ASM_ASSEMBLER_H #define __ASM_ASSEMBLER_H +#include +#include #include #include @@ -200,6 +202,84 @@ lr .req x30 // link register .endm /* + * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) + */ + .macro vma_vm_mm, rd, rn + ldr \rd, [\rn, #VMA_VM_MM] + .endm + +/* + * mmid - get context id from mm pointer (mm->context.id) + */ + .macro mmid, rd, rn + ldr \rd, [\rn, #MM_CONTEXT_ID] + .endm + +/* + * dcache_line_size - get the minimum D-cache line size from the CTR register. + */ + .macro dcache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + ubfm \tmp, \tmp, #16, #19 // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * icache_line_size - get the minimum I-cache line size from the CTR register. 
+ */ + .macro icache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + and \tmp, \tmp, #0xf // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map + */ + .macro tcr_set_idmap_t0sz, valreg, tmpreg +#ifndef CONFIG_ARM64_VA_BITS_48 + ldr_l \tmpreg, idmap_t0sz + bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH +#endif + .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm + +/* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. */ diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 6df0706..50ff9ba 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -24,8 +24,6 @@ #include #include -#include "proc-macros.S" - /* * flush_icache_range(start,end) * diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S deleted file mode 100644 index e6a30e1..0000000 --- a/arch/arm64/mm/proc-macros.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Based on arch/arm/mm/proc-macros.S - * - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include - -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - -/* - * mmid - get context id from mm pointer (mm->context.id) - */ - .macro mmid, rd, rn - ldr \rd, [\rn, #MM_CONTEXT_ID] - .endm - -/* - * dcache_line_size - get the minimum D-cache line size from the CTR register. - */ - .macro dcache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - ubfm \tmp, \tmp, #16, #19 // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * icache_line_size - get the minimum I-cache line size from the CTR register. 
- */ - .macro icache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - and \tmp, \tmp, #0xf // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map - */ - .macro tcr_set_idmap_t0sz, valreg, tmpreg -#ifndef CONFIG_ARM64_VA_BITS_48 - ldr_l \tmpreg, idmap_t0sz - bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH -#endif - .endm - -/* - * Macro to perform a data cache maintenance for the interval - * [kaddr, kaddr + size) - * - * op: operation passed to dc instruction - * domain: domain used in dsb instruciton - * kaddr: starting virtual address of the region - * size: size of the region - * Corrupts: kaddr, size, tmp1, tmp2 - */ - .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 - dcache_line_size \tmp1, \tmp2 - add \size, \kaddr, \size - sub \tmp2, \tmp1, #1 - bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr - add \kaddr, \kaddr, \tmp1 - cmp \kaddr, \size - b.lo 9998b - dsb \domain - .endm - -/* - * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present - */ - .macro reset_pmuserenr_el0, tmpreg - mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx \tmpreg, \tmpreg, #8, #4 - cmp \tmpreg, #1 // Skip if no PMU present - b.lt 9000f - msr pmuserenr_el0, xzr // Disable PMU access from EL0 -9000: - .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 543f519..9f6deac 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -23,13 +23,10 @@ #include #include #include -#include #include #include #include -#include "proc-macros.S" - #ifdef CONFIG_ARM64_64K_PAGES #define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K #elif defined(CONFIG_ARM64_16K_PAGES) -- cgit v0.10.2 From e7227d0e528f9a96d4a866f43e20dd9b33f0e782 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:01 +0100 Subject: arm64: Cleanup SCTLR flags We currently have macros defining flags for the arm64 sctlr registers in both kvm_arm.h and sysreg.h. To clean things up and simplify move the definitions of the SCTLR_EL2 flags from kvm_arm.h to sysreg.h, rename any SCTLR_EL1 or SCTLR_EL2 flags that are common to both registers to be SCTLR_ELx, with 'x' indicating a common flag, and fixup all files to include the proper header or to use the new macro names. 
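As a quick sanity check of the new naming scheme, here is a stand-alone C snippet (not kernel code; the values are copied from the sysreg.h hunk below) showing that the shared SCTLR_ELx_FLAGS mask expands to the same 0x100f value the old SCTLR_EL2_FLAGS produced:

#include <stdio.h>

/* Values copied from the sysreg.h hunk below. */
#define SCTLR_ELx_I	(1 << 12)
#define SCTLR_ELx_SA	(1 << 3)
#define SCTLR_ELx_C	(1 << 2)
#define SCTLR_ELx_A	(1 << 1)
#define SCTLR_ELx_M	1

#define SCTLR_ELx_FLAGS	(SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
			 SCTLR_ELx_SA | SCTLR_ELx_I)

int main(void)
{
	/* Prints 0x100f, matching what the old SCTLR_EL2_FLAGS expanded to. */
	printf("SCTLR_ELx_FLAGS = %#x\n", SCTLR_ELx_FLAGS);
	return 0;
}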
Signed-off-by: Geoff Levand [Restored pgtable-hwdef.h include] Signed-off-by: James Morse Acked-by: Marc Zyngier Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 4150fd8..763762e 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -84,17 +84,6 @@ #define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) -/* Hyp System Control Register (SCTLR_EL2) bits */ -#define SCTLR_EL2_EE (1 << 25) -#define SCTLR_EL2_WXN (1 << 19) -#define SCTLR_EL2_I (1 << 12) -#define SCTLR_EL2_SA (1 << 3) -#define SCTLR_EL2_C (1 << 2) -#define SCTLR_EL2_A (1 << 1) -#define SCTLR_EL2_M 1 -#define SCTLR_EL2_FLAGS (SCTLR_EL2_M | SCTLR_EL2_A | SCTLR_EL2_C | \ - SCTLR_EL2_SA | SCTLR_EL2_I) - /* TCR_EL2 Registers bits */ #define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) #define TCR_EL2_TBI (1 << 20) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d39d434..751e901 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -86,10 +86,21 @@ #define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ (!!x)<<8 | 0x1f) -/* SCTLR_EL1 */ -#define SCTLR_EL1_CP15BEN (0x1 << 5) -#define SCTLR_EL1_SED (0x1 << 8) -#define SCTLR_EL1_SPAN (0x1 << 23) +/* Common SCTLR_ELx flags. */ +#define SCTLR_ELx_EE (1 << 25) +#define SCTLR_ELx_I (1 << 12) +#define SCTLR_ELx_SA (1 << 3) +#define SCTLR_ELx_C (1 << 2) +#define SCTLR_ELx_A (1 << 1) +#define SCTLR_ELx_M 1 + +#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ + SCTLR_ELx_SA | SCTLR_ELx_I) + +/* SCTLR_EL1 specific flags. */ +#define SCTLR_EL1_SPAN (1 << 23) +#define SCTLR_EL1_SED (1 << 8) +#define SCTLR_EL1_CP15BEN (1 << 5) /* id_aa64isar0 */ diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 7d8747c..5ce1b47 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -21,6 +21,7 @@ #include #include #include +#include .text .pushsection .hyp.idmap.text, "ax" @@ -103,8 +104,8 @@ __do_hyp_init: dsb sy mrs x4, sctlr_el2 - and x4, x4, #SCTLR_EL2_EE // preserve endianness of EL2 - ldr x5, =SCTLR_EL2_FLAGS + and x4, x4, #SCTLR_ELx_EE // preserve endianness of EL2 + ldr x5, =SCTLR_ELx_FLAGS orr x4, x4, x5 msr sctlr_el2, x4 isb -- cgit v0.10.2 From 00a44cdaba0900c63a003e0c431f506f49376a90 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:02 +0100 Subject: arm64: kvm: Move lr save/restore from do_el2_call into EL1 Today the 'hvc' calling KVM or the hyp-stub is expected to preserve all registers. KVM saves/restores the registers it needs on the EL2 stack using do_el2_call(). The hyp-stub has no stack; later patches need to be able to clobber the link register. Move the link register save/restore to the call sites. Signed-off-by: James Morse Acked-by: Marc Zyngier Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index a272f33..7eab8ac 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -101,10 +101,16 @@ ENDPROC(\label) */ ENTRY(__hyp_get_vectors) + str lr, [sp, #-16]! mov x0, xzr - // fall through -ENTRY(__hyp_set_vectors) hvc #0 + ldr lr, [sp], #16 ret ENDPROC(__hyp_get_vectors) + +ENTRY(__hyp_set_vectors) + str lr, [sp, #-16]!
+ hvc #0 + ldr lr, [sp], #16 + ret ENDPROC(__hyp_set_vectors) diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 48f19a3..4ee5612 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -38,13 +38,18 @@ * A function pointer with a value of 0 has a special meaning, and is * used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. + * HVC behaves as a 'bl' call and will clobber lr. */ ENTRY(__kvm_call_hyp) -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + str lr, [sp, #-16]! hvc #0 + ldr lr, [sp], #16 ret alternative_else b __vhe_hyp_call nop + nop + nop alternative_endif ENDPROC(__kvm_call_hyp) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 3488894..43b4b28 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -42,19 +42,17 @@ * Shuffle the parameters before calling the function * pointed to in x0. Assumes parameters in x[1,2,3]. */ - sub sp, sp, #16 - str lr, [sp] mov lr, x0 mov x0, x1 mov x1, x2 mov x2, x3 blr lr - ldr lr, [sp] - add sp, sp, #16 .endm ENTRY(__vhe_hyp_call) + str lr, [sp, #-16]! do_el2_call + ldr lr, [sp], #16 /* * We used to rely on having an exception return to get * an implicit isb. In the E2H case, we don't have it anymore. -- cgit v0.10.2 From ad72e59ff2bad55f6b9e7ac1fe5d824831ea2550 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:03 +0100 Subject: arm64: hyp/kvm: Make hyp-stub extensible The existing arm64 hcall implementations are limited in that they only allow for two distinct hcalls; with the x0 register either zero or not zero. Also, the API of the hyp-stub exception vector routines and the KVM exception vector routines differ; hyp-stub uses a non-zero value in x0 to implement __hyp_set_vectors, whereas KVM uses it to implement kvm_call_hyp. To allow for additional hcalls to be defined and to make the arm64 hcall API more consistent across exception vector routines, change the hcall implementations to reserve all x0 values below 0xfff for hcalls such as {s,g}et_vectors(). Define two new preprocessor macros HVC_GET_VECTORS, and HVC_SET_VECTORS to be used as hcall type specifiers and convert the existing __hyp_get_vectors() and __hyp_set_vectors() routines to use these new macros when executing an HVC call. Also, change the corresponding hyp-stub and KVM el1_sync exception vector routines to use these new macros. Signed-off-by: Geoff Levand [Merged two hcall patches, moved immediate value from esr to x0, use lr as a scratch register, changed limit to 0xfff] Signed-off-by: James Morse Acked-by: Marc Zyngier Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index a5c2b48..dcbcf8d 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -18,6 +18,22 @@ #ifndef __ASM__VIRT_H #define __ASM__VIRT_H +/* + * The arm64 hcall implementation uses x0 to specify the hcall type. A value + * less than 0xfff indicates a special hcall, such as get/set vector. + * Any other value is used as a pointer to the function to call. + */ + +/* HVC_GET_VECTORS - Return the value of the vbar_el2 register. */ +#define HVC_GET_VECTORS 0 + +/* + * HVC_SET_VECTORS - Set the value of the vbar_el2 register. + * + * @x1: Physical address of the new vector table. 
+ */ +#define HVC_SET_VECTORS 1 + #define BOOT_CPU_MODE_EL1 (0xe11) #define BOOT_CPU_MODE_EL2 (0xe12) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 7eab8ac..894fb40 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -53,15 +54,26 @@ ENDPROC(__hyp_stub_vectors) .align 11 el1_sync: - mrs x1, esr_el2 - lsr x1, x1, #26 - cmp x1, #0x16 - b.ne 2f // Not an HVC trap - cbz x0, 1f - msr vbar_el2, x0 // Set vbar_el2 - b 2f -1: mrs x0, vbar_el2 // Return vbar_el2 -2: eret + mrs x30, esr_el2 + lsr x30, x30, #ESR_ELx_EC_SHIFT + + cmp x30, #ESR_ELx_EC_HVC64 + b.ne 9f // Not an HVC trap + + cmp x0, #HVC_GET_VECTORS + b.ne 1f + mrs x0, vbar_el2 + b 9f + +1: cmp x0, #HVC_SET_VECTORS + b.ne 2f + msr vbar_el2, x1 + b 9f + + /* Unrecognised call type */ +2: mov x0, xzr + +9: eret ENDPROC(el1_sync) .macro invalid_vector label @@ -102,7 +114,7 @@ ENDPROC(\label) ENTRY(__hyp_get_vectors) str lr, [sp, #-16]! - mov x0, xzr + mov x0, #HVC_GET_VECTORS hvc #0 ldr lr, [sp], #16 ret @@ -110,6 +122,8 @@ ENDPROC(__hyp_get_vectors) ENTRY(__hyp_set_vectors) str lr, [sp, #-16]! + mov x1, x0 + mov x0, #HVC_SET_VECTORS hvc #0 ldr lr, [sp], #16 ret diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 4ee5612..7ce9315 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -35,8 +35,8 @@ * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are * passed in x0. * - * A function pointer with a value of 0 has a special meaning, and is - * used to implement __hyp_get_vectors in the same way as in + * A function pointer with a value less than 0xfff has a special meaning, + * and is used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. * HVC behaves as a 'bl' call and will clobber lr. */ diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 43b4b28..2d87f36 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -82,8 +82,8 @@ alternative_endif /* Here, we're pretty sure the host called HVC. */ restore_x0_to_x3 - /* Check for __hyp_get_vectors */ - cbnz x0, 1f + cmp x0, #HVC_GET_VECTORS + b.ne 1f mrs x0, vbar_el2 b 2f -- cgit v0.10.2 From c94b0cf28281d483c8b43b4874fcb7ab14ade1b1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:04 +0100 Subject: arm64: hyp/kvm: Make hyp-stub reject kvm_call_hyp() A later patch implements kvm_arch_hardware_disable(), to remove kvm from el2, and re-instate the hyp-stub. This can happen while guests are running, particularly when kvm_reboot() calls kvm_arch_hardware_disable() on each cpu. This can interrupt a guest, remove kvm, then allow the guest to be scheduled again. This causes kvm_call_hyp() to be run against the hyp-stub. Change the hyp-stub to return a new exception type when this happens, and add code to kvm's handle_exit() to tell userspace we failed to enter the guest. 
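From user space, this new failure mode surfaces as an ordinary KVM_EXIT_FAIL_ENTRY exit from KVM_RUN. A hypothetical VMM fragment (the vcpu fd and the mmap'ed struct kvm_run are assumed to be set up elsewhere) might handle it roughly like this:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: run the vcpu once and classify the exit. */
static int run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -1;

	if (run->exit_reason == KVM_EXIT_FAIL_ENTRY) {
		/* EL2 was reset to the hyp-stub underneath us (e.g. reboot). */
		fprintf(stderr, "KVM: failed to enter guest\n");
		return -1;
	}

	/* ... dispatch the other exit reasons as usual ... */
	return 0;
}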
Signed-off-by: James Morse Acked-by: Marc Zyngier Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index eb7490d..dbfbc5f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -22,6 +22,8 @@ #define ARM_EXCEPTION_IRQ 0 #define ARM_EXCEPTION_TRAP 1 +/* The hyp-stub will return this for any kvm_call_hyp() call */ +#define ARM_EXCEPTION_HYP_GONE 2 #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 894fb40..8727f44 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -70,8 +71,8 @@ el1_sync: msr vbar_el2, x1 b 9f - /* Unrecognised call type */ -2: mov x0, xzr + /* Someone called kvm_call_hyp() against the hyp-stub... */ +2: mov x0, #ARM_EXCEPTION_HYP_GONE 9: eret ENDPROC(el1_sync) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index eba89e4..3246c4a 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -186,6 +186,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, exit_handler = kvm_get_exit_handler(vcpu); return exit_handler(vcpu, run); + case ARM_EXCEPTION_HYP_GONE: + /* + * EL2 has been reset to the hyp-stub. This happens when a guest + * is pre-empted by kvm_reboot()'s shutdown call. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return 0; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); -- cgit v0.10.2 From 67f6919766620e7ea7aab11a6a3470dc7b451359 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Wed, 27 Apr 2016 17:47:05 +0100 Subject: arm64: kvm: allows kvm cpu hotplug The current kvm implementation on arm64 does cpu-specific initialization at system boot, and has no way to gracefully shutdown a core in terms of kvm. This prevents kexec from rebooting the system at EL2. This patch adds a cpu tear-down function and also puts an existing cpu-init code into a separate function, kvm_arch_hardware_disable() and kvm_arch_hardware_enable() respectively. We don't need the arm64 specific cpu hotplug hook any more. Since this patch modifies common code between arm and arm64, one stub definition, __cpu_reset_hyp_mode(), is added on arm side to avoid compilation errors. 
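The core of the change is a per-cpu 'hardware enabled' flag that makes the enable/disable callbacks idempotent, so the hotplug, CPU PM and subsystem-init paths can all share them. A rough user-space model of that guard (names here are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static bool hw_enabled[NR_CPUS];	/* models the per-cpu flag */

static void cpu_hyp_reinit(int cpu) { printf("cpu%d: set up EL2\n", cpu); }
static void cpu_hyp_reset(int cpu)  { printf("cpu%d: tear down EL2\n", cpu); }

static void hardware_enable(int cpu)
{
	if (!hw_enabled[cpu]) {		/* no-op if already enabled */
		cpu_hyp_reinit(cpu);
		hw_enabled[cpu] = true;
	}
}

static void hardware_disable(int cpu)
{
	if (hw_enabled[cpu]) {		/* no-op if already disabled */
		cpu_hyp_reset(cpu);
		hw_enabled[cpu] = false;
	}
}

int main(void)
{
	hardware_enable(0);
	hardware_enable(0);	/* second call does nothing */
	hardware_disable(0);
	return 0;
}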
Signed-off-by: AKASHI Takahiro [Rebase, added separate VHE init/exit path, changed resets use of kvm_call_hyp() to the __version, en/disabled hardware in init_subsystems(), added icache maintenance to __kvm_hyp_reset() and removed lr restore, removed guest-enter after teardown handling] Signed-off-by: James Morse Acked-by: Marc Zyngier Signed-off-by: Will Deacon diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 3850701..738d5ee 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -265,6 +265,15 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } +static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, + phys_addr_t phys_idmap_start) +{ + /* + * TODO + * kvm_call_reset(boot_pgd_ptr, phys_idmap_start); + */ +} + static inline int kvm_arch_dev_ioctl_check_extension(long ext) { return 0; @@ -277,7 +286,6 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index da44be9..f17a8d4 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -66,6 +66,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); +phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index b538431..1687e14 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -16,7 +16,6 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#include #include #include #include @@ -66,6 +65,8 @@ static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; +static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); + static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) { BUG_ON(preemptible()); @@ -90,11 +91,6 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) return &kvm_arm_running_vcpu; } -int kvm_arch_hardware_enable(void) -{ - return 0; -} - int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -1033,11 +1029,6 @@ long kvm_arch_vm_ioctl(struct file *filp, } } -static void cpu_init_stage2(void *dummy) -{ - __cpu_init_stage2(); -} - static void cpu_init_hyp_mode(void *dummy) { phys_addr_t boot_pgd_ptr; @@ -1065,43 +1056,87 @@ static void cpu_hyp_reinit(void) { if (is_kernel_in_hyp_mode()) { /* - * cpu_init_stage2() is safe to call even if the PM + * __cpu_init_stage2() is safe to call even if the PM * event was cancelled before the CPU was reset. 
*/ - cpu_init_stage2(NULL); + __cpu_init_stage2(); } else { if (__hyp_get_vectors() == hyp_default_vectors) cpu_init_hyp_mode(NULL); } } -static int hyp_init_cpu_notify(struct notifier_block *self, - unsigned long action, void *cpu) +static void cpu_hyp_reset(void) +{ + phys_addr_t boot_pgd_ptr; + phys_addr_t phys_idmap_start; + + if (!is_kernel_in_hyp_mode()) { + boot_pgd_ptr = kvm_mmu_get_boot_httbr(); + phys_idmap_start = kvm_get_idmap_start(); + + __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start); + } +} + +static void _kvm_arch_hardware_enable(void *discard) { - switch (action) { - case CPU_STARTING: - case CPU_STARTING_FROZEN: + if (!__this_cpu_read(kvm_arm_hardware_enabled)) { cpu_hyp_reinit(); + __this_cpu_write(kvm_arm_hardware_enabled, 1); } +} + +int kvm_arch_hardware_enable(void) +{ + _kvm_arch_hardware_enable(NULL); + return 0; +} - return NOTIFY_OK; +static void _kvm_arch_hardware_disable(void *discard) +{ + if (__this_cpu_read(kvm_arm_hardware_enabled)) { + cpu_hyp_reset(); + __this_cpu_write(kvm_arm_hardware_enabled, 0); + } } -static struct notifier_block hyp_init_cpu_nb = { - .notifier_call = hyp_init_cpu_notify, -}; +void kvm_arch_hardware_disable(void) +{ + _kvm_arch_hardware_disable(NULL); +} #ifdef CONFIG_CPU_PM static int hyp_init_cpu_pm_notifier(struct notifier_block *self, unsigned long cmd, void *v) { - if (cmd == CPU_PM_EXIT) { - cpu_hyp_reinit(); + /* + * kvm_arm_hardware_enabled is left with its old value over + * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should + * re-enable hyp. + */ + switch (cmd) { + case CPU_PM_ENTER: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* + * don't update kvm_arm_hardware_enabled here + * so that the hardware will be re-enabled + * when we resume. See below. + */ + cpu_hyp_reset(); + + return NOTIFY_OK; + case CPU_PM_EXIT: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* The hardware was enabled before suspend. */ + cpu_hyp_reinit(); + return NOTIFY_OK; - } - return NOTIFY_DONE; + default: + return NOTIFY_DONE; + } } static struct notifier_block hyp_init_cpu_pm_nb = { @@ -1136,18 +1171,12 @@ static int init_common_resources(void) static int init_subsystems(void) { - int err; + int err = 0; /* - * Register CPU Hotplug notifier + * Enable hardware so that subsystem initialisation can access EL2. */ - cpu_notifier_register_begin(); - err = __register_cpu_notifier(&hyp_init_cpu_nb); - cpu_notifier_register_done(); - if (err) { - kvm_err("Cannot register KVM init CPU notifier (%d)\n", err); - return err; - } + on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); /* * Register CPU lower-power notifier @@ -1165,9 +1194,10 @@ static int init_subsystems(void) case -ENODEV: case -ENXIO: vgic_present = false; + err = 0; break; default: - return err; + goto out; } /* @@ -1175,12 +1205,15 @@ static int init_subsystems(void) */ err = kvm_timer_hyp_init(); if (err) - return err; + goto out; kvm_perf_init(); kvm_coproc_table_init(); - return 0; +out: + on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); + + return err; } static void teardown_hyp_mode(void) @@ -1197,11 +1230,6 @@ static void teardown_hyp_mode(void) static int init_vhe_mode(void) { - /* - * Execute the init code on each CPU. - */ - on_each_cpu(cpu_init_stage2, NULL, 1); - /* set size of VMID supported by CPU */ kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); @@ -1288,11 +1316,6 @@ static int init_hyp_mode(void) } } - /* - * Execute the init code on each CPU. 
- */ - on_each_cpu(cpu_init_hyp_mode, NULL, 1); - #ifndef CONFIG_HOTPLUG_CPU free_boot_hyp_pgd(); #endif diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 58dbd5c..7d86e15 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1666,6 +1666,11 @@ phys_addr_t kvm_get_idmap_vector(void) return hyp_idmap_vector; } +phys_addr_t kvm_get_idmap_start(void) +{ + return hyp_idmap_start; +} + int kvm_mmu_init(void) { int err; diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index dbfbc5f..a88da13 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -42,6 +42,7 @@ struct kvm_vcpu; extern char __kvm_hyp_init[]; extern char __kvm_hyp_init_end[]; +extern char __kvm_hyp_reset[]; extern char __kvm_hyp_vector[]; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b7e82a7..b6f194d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -46,6 +46,7 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_dev_ioctl_check_extension(long ext); +phys_addr_t kvm_hyp_reset_entry(void); struct kvm_arch { /* The VMID generation used for the virt. memory system */ @@ -352,7 +353,17 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, hyp_stack_ptr, vector_ptr); } -static inline void kvm_arch_hardware_disable(void) {} +static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, + phys_addr_t phys_idmap_start) +{ + /* + * Call reset code, and switch back to stub hyp vectors. + * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation. + */ + __kvm_call_hyp((void *)kvm_hyp_reset_entry(), + boot_pgd_ptr, phys_idmap_start); +} + static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 22732a5..e8d39d4 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -109,6 +109,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); +phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 5ce1b47..44ec4cb 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -139,6 +139,44 @@ merged: eret ENDPROC(__kvm_hyp_init) + /* + * x0: HYP boot pgd + * x1: HYP phys_idmap_start + */ +ENTRY(__kvm_hyp_reset) + /* We're in trampoline code in VA, switch back to boot page tables */ + msr ttbr0_el2, x0 + isb + + /* Ensure the PA branch doesn't find a stale tlb entry or stale code. 
*/ + ic iallu + tlbi alle2 + dsb sy + isb + + /* Branch into PA space */ + adr x0, 1f + bfi x1, x0, #0, #PAGE_SHIFT + br x1 + + /* We're now in idmap, disable MMU */ +1: mrs x0, sctlr_el2 + ldr x1, =SCTLR_ELx_FLAGS + bic x0, x0, x1 // Clear SCTL_M and etc + msr sctlr_el2, x0 + isb + + /* Invalidate the old TLBs */ + tlbi alle2 + dsb sy + + /* Install stub vectors */ + adr_l x0, __hyp_stub_vectors + msr vbar_el2, x0 + + eret +ENDPROC(__kvm_hyp_reset) + .ltorg .popsection diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 9677bf0..4062e6d 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include /* * ARMv8 Reset Values @@ -130,3 +132,15 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); } + +extern char __hyp_idmap_text_start[]; + +phys_addr_t kvm_hyp_reset_entry(void) +{ + unsigned long offset; + + offset = (unsigned long)__kvm_hyp_reset + - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK); + + return TRAMPOLINE_VA + offset; +} -- cgit v0.10.2 From adc9b2dfd00924e9e9b98613f36a6cb8c51f0dc6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:06 +0100 Subject: arm64: kernel: Rework finisher callback out of __cpu_suspend_enter() Hibernate could make use of the cpu_suspend() code to save/restore cpu state, however it needs to be able to return '0' from the 'finisher'. Rework cpu_suspend() so that the finisher is called from C code, independently from the save/restore of cpu state. Space to save the context in is allocated in the caller's stack frame, and passed into __cpu_suspend_enter(). Hibernate's use of this API will look like a copy of the cpu_suspend() function. Signed-off-by: James Morse Acked-by: Lorenzo Pieralisi Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 59a5b0f1..365d8cd 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -2,6 +2,7 @@ #define __ASM_SUSPEND_H #define NR_CTX_REGS 11 +#define NR_CALLEE_SAVED_REGS 12 /* * struct cpu_suspend_ctx must be 16-byte aligned since it is allocated on @@ -21,6 +22,25 @@ struct sleep_save_sp { phys_addr_t save_ptr_stash_phys; }; +/* + * Memory to save the cpu state is allocated on the stack by + * __cpu_suspend_enter()'s caller, and populated by __cpu_suspend_enter(). + * This data must survive until cpu_resume() is called. + * + * This struct desribes the size and the layout of the saved cpu state. + * The layout of the callee_saved_regs is defined by the implementation + * of __cpu_suspend_enter(), and cpu_resume(). This struct must be passed + * in by the caller as __cpu_suspend_enter()'s stack-frame is gone once it + * returns, and the data would be subsequently corrupted by the call to the + * finisher. 
+ */ +struct sleep_stack_data { + struct cpu_suspend_ctx system_regs; + unsigned long callee_saved_regs[NR_CALLEE_SAVED_REGS]; +}; + extern int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)); extern void cpu_resume(void); +int __cpu_suspend_enter(struct sleep_stack_data *state); +void __cpu_suspend_exit(void); #endif diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 3ae6b31..0902acb 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -122,6 +122,8 @@ int main(void) DEFINE(SLEEP_SAVE_SP_SZ, sizeof(struct sleep_save_sp)); DEFINE(SLEEP_SAVE_SP_PHYS, offsetof(struct sleep_save_sp, save_ptr_stash_phys)); DEFINE(SLEEP_SAVE_SP_VIRT, offsetof(struct sleep_save_sp, save_ptr_stash)); + DEFINE(SLEEP_STACK_DATA_SYSTEM_REGS, offsetof(struct sleep_stack_data, system_regs)); + DEFINE(SLEEP_STACK_DATA_CALLEE_REGS, offsetof(struct sleep_stack_data, callee_saved_regs)); #endif DEFINE(ARM_SMCCC_RES_X0_OFFS, offsetof(struct arm_smccc_res, a0)); DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index fd10eb6..b7c4b68 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -49,37 +49,30 @@ orr \dst, \dst, \mask // dst|=(aff3>>rs3) .endm /* - * Save CPU state for a suspend and execute the suspend finisher. - * On success it will return 0 through cpu_resume - ie through a CPU - * soft/hard reboot from the reset vector. - * On failure it returns the suspend finisher return value or force - * -EOPNOTSUPP if the finisher erroneously returns 0 (the suspend finisher - * is not allowed to return, if it does this must be considered failure). - * It saves callee registers, and allocates space on the kernel stack - * to save the CPU specific registers + some other data for resume. + * Save CPU state in the provided sleep_stack_data area, and publish its + * location for cpu_resume()'s use in sleep_save_stash. * - * x0 = suspend finisher argument - * x1 = suspend finisher function pointer + * cpu_resume() will restore this saved state, and return. Because the + * link-register is saved and restored, it will appear to return from this + * function. So that the caller can tell the suspend/resume paths apart, + * __cpu_suspend_enter() will always return a non-zero value, whereas the + * path through cpu_resume() will return 0. + * + * x0 = struct sleep_stack_data area */ ENTRY(__cpu_suspend_enter) - stp x29, lr, [sp, #-96]! 
- stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - /* - * Stash suspend finisher and its argument in x20 and x19 - */ - mov x19, x0 - mov x20, x1 + stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS] + stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16] + stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32] + stp x23, x24, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+48] + stp x25, x26, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+64] + stp x27, x28, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+80] + + /* save the sp in cpu_suspend_ctx */ mov x2, sp - sub sp, sp, #CPU_SUSPEND_SZ // allocate cpu_suspend_ctx - mov x0, sp - /* - * x0 now points to struct cpu_suspend_ctx allocated on the stack - */ - str x2, [x0, #CPU_CTX_SP] + str x2, [x0, #SLEEP_STACK_DATA_SYSTEM_REGS + CPU_CTX_SP] + + /* find the mpidr_hash */ ldr x1, =sleep_save_sp ldr x1, [x1, #SLEEP_SAVE_SP_VIRT] mrs x7, mpidr_el1 @@ -93,34 +86,11 @@ ENTRY(__cpu_suspend_enter) ldp w5, w6, [x9, #(MPIDR_HASH_SHIFTS + 8)] compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10 add x1, x1, x8, lsl #3 + + stp x29, lr, [sp, #-16]! bl __cpu_suspend_save - /* - * Grab suspend finisher in x20 and its argument in x19 - */ - mov x0, x19 - mov x1, x20 - /* - * We are ready for power down, fire off the suspend finisher - * in x1, with argument in x0 - */ - blr x1 - /* - * Never gets here, unless suspend finisher fails. - * Successful cpu_suspend should return from cpu_resume, returning - * through this code path is considered an error - * If the return value is set to 0 force x0 = -EOPNOTSUPP - * to make sure a proper error condition is propagated - */ - cmp x0, #0 - mov x3, #-EOPNOTSUPP - csel x0, x3, x0, eq - add sp, sp, #CPU_SUSPEND_SZ // rewind stack pointer - ldp x19, x20, [sp, #16] - ldp x21, x22, [sp, #32] - ldp x23, x24, [sp, #48] - ldp x25, x26, [sp, #64] - ldp x27, x28, [sp, #80] - ldp x29, lr, [sp], #96 + ldp x29, lr, [sp], #16 + mov x0, #1 ret ENDPROC(__cpu_suspend_enter) .ltorg @@ -150,12 +120,6 @@ cpu_resume_after_mmu: bl kasan_unpoison_remaining_stack #endif mov x0, #0 // return zero on success - ldp x19, x20, [sp, #16] - ldp x21, x22, [sp, #32] - ldp x23, x24, [sp, #48] - ldp x25, x26, [sp, #64] - ldp x27, x28, [sp, #80] - ldp x29, lr, [sp], #96 ret ENDPROC(cpu_resume_after_mmu) @@ -172,6 +136,8 @@ ENTRY(cpu_resume) /* x7 contains hash index, let's use it to grab context pointer */ ldr_l x0, sleep_save_sp + SLEEP_SAVE_SP_PHYS ldr x0, [x0, x7, lsl #3] + add x29, x0, #SLEEP_STACK_DATA_CALLEE_REGS + add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS /* load sp from context */ ldr x2, [x0, #CPU_CTX_SP] /* load physical address of identity map page table in x1 */ @@ -185,5 +151,12 @@ ENTRY(cpu_resume) * pointer and x1 to contain physical address of 1:1 page tables */ bl cpu_do_resume // PC relative jump, MMU off + /* Can't access these by physical address once the MMU is on */ + ldp x19, x20, [x29, #16] + ldp x21, x22, [x29, #32] + ldp x23, x24, [x29, #48] + ldp x25, x26, [x29, #64] + ldp x27, x28, [x29, #80] + ldp x29, lr, [x29] b cpu_resume_mmu // Resume MMU, never returns ENDPROC(cpu_resume) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 6605539..0d52ec9 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -10,22 +10,22 @@ #include #include -extern int __cpu_suspend_enter(unsigned long arg, int (*fn)(unsigned long)); + /* * This is called by __cpu_suspend_enter() to save the state, and do whatever * flushing is required to ensure that when the CPU goes to sleep 
we have * the necessary data available when the caches are not searched. * - * ptr: CPU context virtual address + * ptr: sleep_stack_data containing cpu state virtual address. * save_ptr: address of the location where the context physical address * must be saved */ -void notrace __cpu_suspend_save(struct cpu_suspend_ctx *ptr, +void notrace __cpu_suspend_save(struct sleep_stack_data *ptr, phys_addr_t *save_ptr) { *save_ptr = virt_to_phys(ptr); - cpu_do_suspend(ptr); + cpu_do_suspend(&ptr->system_regs); /* * Only flush the context that must be retrieved with the MMU * off. VA primitives ensure the flush is applied to all @@ -51,6 +51,30 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) hw_breakpoint_restore = hw_bp_restore; } +void notrace __cpu_suspend_exit(void) +{ + /* + * We are resuming from reset with the idmap active in TTBR0_EL1. + * We must uninstall the idmap and restore the expected MMU + * state before we can possibly return to userspace. + */ + cpu_uninstall_idmap(); + + /* + * Restore per-cpu offset before any kernel + * subsystem relying on it has a chance to run. + */ + set_my_cpu_offset(per_cpu_offset(smp_processor_id())); + + /* + * Restore HW breakpoint registers to sane values + * before debug exceptions are possibly reenabled + * through local_dbg_restore. + */ + if (hw_breakpoint_restore) + hw_breakpoint_restore(NULL); +} + /* * cpu_suspend * @@ -60,8 +84,9 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) */ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { - int ret; + int ret = 0; unsigned long flags; + struct sleep_stack_data state; /* * From this point debug exceptions are disabled to prevent @@ -77,34 +102,21 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ pause_graph_tracing(); - /* - * mm context saved on the stack, it will be restored when - * the cpu comes out of reset through the identity mapped - * page tables, so that the thread address space is properly - * set-up on function return. - */ - ret = __cpu_suspend_enter(arg, fn); - if (ret == 0) { - /* - * We are resuming from reset with the idmap active in TTBR0_EL1. - * We must uninstall the idmap and restore the expected MMU - * state before we can possibly return to userspace. - */ - cpu_uninstall_idmap(); - - /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. - */ - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); + if (__cpu_suspend_enter(&state)) { + /* Call the suspend finisher */ + ret = fn(arg); /* - * Restore HW breakpoint registers to sane values - * before debug exceptions are possibly reenabled - * through local_dbg_restore. + * Never gets here, unless the suspend finisher fails. + * Successful cpu_suspend() should return from cpu_resume(), + * returning through this code path is considered an error + * If the return value is set to 0 force ret = -EOPNOTSUPP + * to make sure a proper error condition is propagated */ - if (hw_breakpoint_restore) - hw_breakpoint_restore(NULL); + if (!ret) + ret = -EOPNOTSUPP; + } else { + __cpu_suspend_exit(); } unpause_graph_tracing(); -- cgit v0.10.2 From cabe1c81ea5be983425d117912d7883e252a3b09 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:07 +0100 Subject: arm64: Change cpu_resume() to enable mmu early then access sleep_sp by va By enabling the MMU early in cpu_resume(), the sleep_save_sp and stack can be accessed by VA, which avoids the need to convert-addresses and clean to PoC on the suspend path. 
MMU setup is shared with the boot path, meaning the swapper_pg_dir is restored directly: ttbr1_el1 is no longer saved/restored. struct sleep_save_sp is removed, replacing it with a single array of pointers. cpu_do_{suspend,resume} could be further reduced to not restore: cpacr_el1, mdscr_el1, tcr_el1, vbar_el1 and sctlr_el1, all of which are set by __cpu_setup(). However these values all contain res0 bits that may be used to enable future features. Signed-off-by: James Morse Reviewed-by: Lorenzo Pieralisi Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 365d8cd..29d3c71 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -1,7 +1,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 11 +#define NR_CTX_REGS 10 #define NR_CALLEE_SAVED_REGS 12 /* @@ -17,11 +17,6 @@ struct cpu_suspend_ctx { u64 sp; } __aligned(16); -struct sleep_save_sp { - phys_addr_t *save_ptr_stash; - phys_addr_t save_ptr_stash_phys; -}; - /* * Memory to save the cpu state is allocated on the stack by * __cpu_suspend_enter()'s caller, and populated by __cpu_suspend_enter(). @@ -39,6 +34,8 @@ struct sleep_stack_data { unsigned long callee_saved_regs[NR_CALLEE_SAVED_REGS]; }; +extern unsigned long *sleep_save_stash; + extern int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)); extern void cpu_resume(void); int __cpu_suspend_enter(struct sleep_stack_data *state); diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 0902acb..ac742ef 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -119,9 +119,6 @@ int main(void) DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); DEFINE(MPIDR_HASH_MASK, offsetof(struct mpidr_hash, mask)); DEFINE(MPIDR_HASH_SHIFTS, offsetof(struct mpidr_hash, shift_aff)); - DEFINE(SLEEP_SAVE_SP_SZ, sizeof(struct sleep_save_sp)); - DEFINE(SLEEP_SAVE_SP_PHYS, offsetof(struct sleep_save_sp, save_ptr_stash_phys)); - DEFINE(SLEEP_SAVE_SP_VIRT, offsetof(struct sleep_save_sp, save_ptr_stash)); DEFINE(SLEEP_STACK_DATA_SYSTEM_REGS, offsetof(struct sleep_stack_data, system_regs)); DEFINE(SLEEP_STACK_DATA_CALLEE_REGS, offsetof(struct sleep_stack_data, callee_saved_regs)); #endif diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 00a3210..0674384 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -716,7 +716,7 @@ ENTRY(__early_cpu_boot_status) * If it isn't, park the CPU */ .section ".idmap.text", "ax" -__enable_mmu: +ENTRY(__enable_mmu) mrs x22, sctlr_el1 // preserve old SCTLR_EL1 value mrs x1, ID_AA64MMFR0_EL1 ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 65f5159..3279def 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -176,7 +176,6 @@ static void __init smp_build_mpidr_hash(void) */ if (mpidr_hash_size() > 4 * num_possible_cpus()) pr_warn("Large number of MPIDR hash buckets detected\n"); - __flush_dcache_area(&mpidr_hash, sizeof(struct mpidr_hash)); } static void __init setup_machine_fdt(phys_addr_t dt_phys) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index b7c4b68..9a3aec9 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -73,8 +73,8 @@ ENTRY(__cpu_suspend_enter) str x2, [x0, #SLEEP_STACK_DATA_SYSTEM_REGS + CPU_CTX_SP] /* find the mpidr_hash */ - ldr x1, =sleep_save_sp - ldr x1, [x1, #SLEEP_SAVE_SP_VIRT] + ldr 
x1, =sleep_save_stash + ldr x1, [x1] mrs x7, mpidr_el1 ldr x9, =mpidr_hash ldr x10, [x9, #MPIDR_HASH_MASK] @@ -87,44 +87,27 @@ ENTRY(__cpu_suspend_enter) compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10 add x1, x1, x8, lsl #3 + str x0, [x1] + add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS stp x29, lr, [sp, #-16]! - bl __cpu_suspend_save + bl cpu_do_suspend ldp x29, lr, [sp], #16 mov x0, #1 ret ENDPROC(__cpu_suspend_enter) .ltorg -/* - * x0 must contain the sctlr value retrieved from restored context - */ - .pushsection ".idmap.text", "ax" -ENTRY(cpu_resume_mmu) - ldr x3, =cpu_resume_after_mmu - msr sctlr_el1, x0 // restore sctlr_el1 - isb - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. - */ - ic iallu - dsb nsh - isb - br x3 // global jump to virtual address -ENDPROC(cpu_resume_mmu) - .popsection -cpu_resume_after_mmu: -#ifdef CONFIG_KASAN - mov x0, sp - bl kasan_unpoison_remaining_stack -#endif - mov x0, #0 // return zero on success - ret -ENDPROC(cpu_resume_after_mmu) - ENTRY(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly + /* enable the MMU early - so we can access sleep_save_stash by va */ + adr_l lr, __enable_mmu /* __cpu_setup will return here */ + ldr x27, =_cpu_resume /* __enable_mmu will branch here */ + adrp x25, idmap_pg_dir + adrp x26, swapper_pg_dir + b __cpu_setup +ENDPROC(cpu_resume) + +ENTRY(_cpu_resume) mrs x1, mpidr_el1 adrp x8, mpidr_hash add x8, x8, #:lo12:mpidr_hash // x8 = struct mpidr_hash phys address @@ -134,29 +117,32 @@ ENTRY(cpu_resume) ldp w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)] compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2 /* x7 contains hash index, let's use it to grab context pointer */ - ldr_l x0, sleep_save_sp + SLEEP_SAVE_SP_PHYS + ldr_l x0, sleep_save_stash ldr x0, [x0, x7, lsl #3] add x29, x0, #SLEEP_STACK_DATA_CALLEE_REGS add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS /* load sp from context */ ldr x2, [x0, #CPU_CTX_SP] - /* load physical address of identity map page table in x1 */ - adrp x1, idmap_pg_dir mov sp, x2 /* save thread_info */ and x2, x2, #~(THREAD_SIZE - 1) msr sp_el0, x2 /* - * cpu_do_resume expects x0 to contain context physical address - * pointer and x1 to contain physical address of 1:1 page tables + * cpu_do_resume expects x0 to contain context address pointer */ - bl cpu_do_resume // PC relative jump, MMU off - /* Can't access these by physical address once the MMU is on */ + bl cpu_do_resume + +#ifdef CONFIG_KASAN + mov x0, sp + bl kasan_unpoison_remaining_stack +#endif + ldp x19, x20, [x29, #16] ldp x21, x22, [x29, #32] ldp x23, x24, [x29, #48] ldp x25, x26, [x29, #64] ldp x27, x28, [x29, #80] ldp x29, lr, [x29] - b cpu_resume_mmu // Resume MMU, never returns -ENDPROC(cpu_resume) + mov x0, #0 + ret +ENDPROC(_cpu_resume) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 0d52ec9..b616e36 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -10,30 +10,11 @@ #include #include - /* - * This is called by __cpu_suspend_enter() to save the state, and do whatever - * flushing is required to ensure that when the CPU goes to sleep we have - * the necessary data available when the caches are not searched. - * - * ptr: sleep_stack_data containing cpu state virtual address. 
- * save_ptr: address of the location where the context physical address - * must be saved + * This is allocated by cpu_suspend_init(), and used to store a pointer to + * the 'struct sleep_stack_data' the contains a particular CPUs state. */ -void notrace __cpu_suspend_save(struct sleep_stack_data *ptr, - phys_addr_t *save_ptr) -{ - *save_ptr = virt_to_phys(ptr); - - cpu_do_suspend(&ptr->system_regs); - /* - * Only flush the context that must be retrieved with the MMU - * off. VA primitives ensure the flush is applied to all - * cache levels so context is pushed to DRAM. - */ - __flush_dcache_area(ptr, sizeof(*ptr)); - __flush_dcache_area(save_ptr, sizeof(*save_ptr)); -} +unsigned long *sleep_save_stash; /* * This hook is provided so that cpu_suspend code can restore HW @@ -131,22 +112,15 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) return ret; } -struct sleep_save_sp sleep_save_sp; - static int __init cpu_suspend_init(void) { - void *ctx_ptr; - /* ctx_ptr is an array of physical addresses */ - ctx_ptr = kcalloc(mpidr_hash_size(), sizeof(phys_addr_t), GFP_KERNEL); + sleep_save_stash = kcalloc(mpidr_hash_size(), sizeof(*sleep_save_stash), + GFP_KERNEL); - if (WARN_ON(!ctx_ptr)) + if (WARN_ON(!sleep_save_stash)) return -ENOMEM; - sleep_save_sp.save_ptr_stash = ctx_ptr; - sleep_save_sp.save_ptr_stash_phys = virt_to_phys(ctx_ptr); - __flush_dcache_area(&sleep_save_sp, sizeof(struct sleep_save_sp)); - return 0; } early_initcall(cpu_suspend_init); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9f6deac..c431787 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -63,62 +64,50 @@ ENTRY(cpu_do_suspend) mrs x2, tpidr_el0 mrs x3, tpidrro_el0 mrs x4, contextidr_el1 - mrs x5, mair_el1 - mrs x6, cpacr_el1 - mrs x7, ttbr1_el1 - mrs x8, tcr_el1 - mrs x9, vbar_el1 - mrs x10, mdscr_el1 - mrs x11, oslsr_el1 - mrs x12, sctlr_el1 + mrs x5, cpacr_el1 + mrs x6, tcr_el1 + mrs x7, vbar_el1 + mrs x8, mdscr_el1 + mrs x9, oslsr_el1 + mrs x10, sctlr_el1 stp x2, x3, [x0] - stp x4, x5, [x0, #16] - stp x6, x7, [x0, #32] - stp x8, x9, [x0, #48] - stp x10, x11, [x0, #64] - str x12, [x0, #80] + stp x4, xzr, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] ret ENDPROC(cpu_do_suspend) /** * cpu_do_resume - restore CPU register context * - * x0: Physical address of context pointer - * x1: ttbr0_el1 to be restored - * - * Returns: - * sctlr_el1 value in x0 + * x0: Address of context pointer */ ENTRY(cpu_do_resume) - /* - * Invalidate local tlb entries before turning on MMU - */ - tlbi vmalle1 ldp x2, x3, [x0] ldp x4, x5, [x0, #16] - ldp x6, x7, [x0, #32] - ldp x8, x9, [x0, #48] - ldp x10, x11, [x0, #64] - ldr x12, [x0, #80] + ldp x6, x8, [x0, #32] + ldp x9, x10, [x0, #48] + ldp x11, x12, [x0, #64] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 - msr mair_el1, x5 msr cpacr_el1, x6 - msr ttbr0_el1, x1 - msr ttbr1_el1, x7 - tcr_set_idmap_t0sz x8, x7 + + /* Don't change t0sz here, mask those bits when restoring */ + mrs x5, tcr_el1 + bfi x8, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + msr tcr_el1, x8 msr vbar_el1, x9 msr mdscr_el1, x10 + msr sctlr_el1, x12 /* * Restore oslsr_el1 by writing oslar_el1 */ ubfx x11, x11, #1, #1 msr oslar_el1, x11 reset_pmuserenr_el0 x0 // Disable PMU access from EL0 - mov x0, x12 - dsb nsh // Make sure local tlb invalidation completed isb ret ENDPROC(cpu_do_resume) -- cgit v0.10.2 From 812264550dcba6cdbe84bfac2f27e7d23b5b8733 Mon Sep 17 
00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:08 +0100 Subject: arm64: kernel: Include _AC definition in page.h page.h uses '_AC' in the definition of PAGE_SIZE, but doesn't include linux/const.h where this is defined. This produces build warnings when only asm/page.h is included by asm code. Signed-off-by: James Morse Acked-by: Mark Rutland Acked-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index ae615b9..17b45f7 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -19,6 +19,8 @@ #ifndef __ASM_PAGE_H #define __ASM_PAGE_H +#include + /* PAGE_SHIFT determines the page size */ /* CONT_SHIFT determines the number of pages which can be tracked together */ #ifdef CONFIG_ARM64_64K_PAGES -- cgit v0.10.2 From 28c7258330ee4ce701a4da7af96d6605d1a0b3bd Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:09 +0100 Subject: arm64: Promote KERNEL_START/KERNEL_END definitions to a header file KERNEL_START and KERNEL_END are useful outside head.S, move them to a header file. Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index f412f50..72a3025 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -87,6 +87,9 @@ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) +#define KERNEL_START _text +#define KERNEL_END _end + /* * The size of the KASAN shadow region. This should be 1/8th of the * size of the entire kernel virtual address space. diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 0674384..564c340 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -52,9 +52,6 @@ #error TEXT_OFFSET must be less than 2MB #endif -#define KERNEL_START _text -#define KERNEL_END _end - /* * Kernel startup entry point. * --------------------------- -- cgit v0.10.2 From 5003dbde45961dd7ab3d8a09ab9ad8bcb604db40 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:10 +0100 Subject: arm64: Add new asm macro copy_page Kexec and hibernate need to copy pages of memory, but may not have all of the kernel mapped, and are unable to call copy_page(). Add a simplistic copy_page() macro, that can be inlined in these situations. lib/copy_page.S provides a bigger better version, but uses more registers. Signed-off-by: Geoff Levand [Changed asm label to 9998, added commit message] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 3b3812a..10b017c 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -24,6 +24,7 @@ #define __ASM_ASSEMBLER_H #include +#include #include #include #include @@ -280,6 +281,24 @@ lr .req x30 // link register .endm /* + * copy_page - copy src to dest using temp registers t1-t8 + */ + .macro copy_page dest:req src:req t1:req t2:req t3:req t4:req t5:req t6:req t7:req t8:req +9998: ldp \t1, \t2, [\src] + ldp \t3, \t4, [\src, #16] + ldp \t5, \t6, [\src, #32] + ldp \t7, \t8, [\src, #48] + add \src, \src, #64 + stnp \t1, \t2, [\dest] + stnp \t3, \t4, [\dest, #16] + stnp \t5, \t6, [\dest, #32] + stnp \t7, \t8, [\dest, #48] + add \dest, \dest, #64 + tst \src, #(PAGE_SIZE - 1) + b.ne 9998b + .endm + +/* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. 
*/ -- cgit v0.10.2 From f6cf0545ec697ddc278b7457b7d0c0d86a2ea88e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:11 +0100 Subject: PM / Hibernate: Call flush_icache_range() on pages restored in-place Some architectures require code written to memory as if it were data to be 'cleaned' from any data caches before the processor can fetch them as new instructions. During resume from hibernate, the snapshot code copies some pages directly, meaning these architectures do not get a chance to perform their cache maintenance. Modify the read and decompress code to call flush_icache_range() on all pages that are restored, so that the restored in-place pages are guaranteed to be executable on these architectures. Signed-off-by: James Morse Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Acked-by: Catalin Marinas [will: make clean_pages_on_* static and remove initialisers] Signed-off-by: Will Deacon diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 12cd989..160e100 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -37,6 +37,14 @@ #define HIBERNATE_SIG "S1SUSPEND" /* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + +/* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. @@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio) if (bio_data_dir(bio) == WRITE) put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_error && !hb->error) hb->error = bio->bi_error; @@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); + clean_pages_on_read = true; printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data) d->unc_len = LZO_UNC_SIZE; d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, d->unc, &d->unc_len); + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + atomic_set(&d->stop, 1); wake_up(&d->done); } @@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle, } memset(crc, 0, offsetof(struct crc_data, go)); + clean_pages_on_decompress = true; + /* * Start the decompression threads. */ -- cgit v0.10.2 From 82869ac57b5d3b550446932c918dbf2caf020c9e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:12 +0100 Subject: arm64: kernel: Add support for hibernate/suspend-to-disk Add support for hibernate/suspend-to-disk. Suspend borrows code from cpu_suspend() to write cpu state onto the stack, before calling swsusp_save() to save the memory image. Restore creates a set of temporary page tables, covering only the linear map, copies the restore code to a 'safe' page, then uses the copy to restore the memory image. The copied code executes in the lower half of the address space, and once complete, restores the original kernel's page tables. It then calls into cpu_resume(), and follows the normal cpu_suspend() path back into the suspend code. 
To restore a kernel using KASLR, the address of the page tables, and cpu_resume() are stored in the hibernate arch-header and the el2 vectors are pivotted via the 'safe' page in low memory. Reviewed-by: Catalin Marinas Tested-by: Kevin Hilman # Tested on Juno R2 Signed-off-by: James Morse Signed-off-by: Will Deacon diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c8762f4..b87f303 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -978,6 +978,14 @@ menu "Power management options" source "kernel/power/Kconfig" +config ARCH_HIBERNATION_POSSIBLE + def_bool y + depends on CPU_PM + +config ARCH_HIBERNATION_HEADER + def_bool y + depends on HIBERNATION + config ARCH_SUSPEND_POSSIBLE def_bool y diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 29d3c71..024d623 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -40,4 +40,11 @@ extern int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)); extern void cpu_resume(void); int __cpu_suspend_enter(struct sleep_stack_data *state); void __cpu_suspend_exit(void); +void _cpu_resume(void); + +int swsusp_arch_suspend(void); +int swsusp_arch_resume(void); +int arch_hibernation_header_save(void *addr, unsigned int max_size); +int arch_hibernation_header_restore(void *addr); + #endif diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 3793003..2173149 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -45,6 +45,7 @@ arm64-obj-$(CONFIG_ACPI) += acpi.o arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o arm64-obj-$(CONFIG_PARAVIRT) += paravirt.o arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index ac742ef..f8e5d47 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -124,5 +125,9 @@ int main(void) #endif DEFINE(ARM_SMCCC_RES_X0_OFFS, offsetof(struct arm_smccc_res, a0)); DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); + BLANK(); + DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); + DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); + DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); return 0; } diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S new file mode 100644 index 0000000..46f29b6 --- /dev/null +++ b/arch/arm64/kernel/hibernate-asm.S @@ -0,0 +1,176 @@ +/* + * Hibernate low-level support + * + * Copyright (C) 2016 ARM Ltd. + * Author: James Morse + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * To prevent the possibility of old and new partial table walks being visible + * in the tlb, switch the ttbr to a zero page when we invalidate the old + * records. D4.7.1 'General TLB maintenance requirements' in ARM DDI 0487A.i + * Even switching to our copied tables will cause a changed output address at + * each stage of the walk. + */ +.macro break_before_make_ttbr_switch zero_page, page_table + msr ttbr1_el1, \zero_page + isb + tlbi vmalle1is + dsb ish + msr ttbr1_el1, \page_table + isb +.endm + + +/* + * Resume from hibernate + * + * Loads temporary page tables then restores the memory image. + * Finally branches to cpu_resume() to restore the state saved by + * swsusp_arch_suspend(). + * + * Because this code has to be copied to a 'safe' page, it can't call out to + * other functions by PC-relative address. Also remember that it may be + * mid-way through over-writing other functions. For this reason it contains + * code from flush_icache_range() and uses the copy_page() macro. + * + * This 'safe' page is mapped via ttbr0, and executed from there. This function + * switches to a copy of the linear map in ttbr1, performs the restore, then + * switches ttbr1 to the original kernel's swapper_pg_dir. + * + * All of memory gets written to, including code. We need to clean the kernel + * text to the Point of Coherence (PoC) before secondary cores can be booted. + * Because the kernel modules and executable pages mapped to user space are + * also written as data, we clean all pages we touch to the Point of + * Unification (PoU). + * + * x0: physical address of temporary page tables + * x1: physical address of swapper page tables + * x2: address of cpu_resume + * x3: linear map address of restore_pblist in the current kernel + * x4: physical address of __hyp_stub_vectors, or 0 + * x5: physical address of a zero page that remains zero after resume + */ +.pushsection ".hibernate_exit.text", "ax" +ENTRY(swsusp_arch_suspend_exit) + /* + * We execute from ttbr0, change ttbr1 to our copied linear map tables + * with a break-before-make via the zero page + */ + break_before_make_ttbr_switch x5, x0 + + mov x21, x1 + mov x30, x2 + mov x24, x4 + mov x25, x5 + + /* walk the restore_pblist and use copy_page() to over-write memory */ + mov x19, x3 + +1: ldr x10, [x19, #HIBERN_PBE_ORIG] + mov x0, x10 + ldr x1, [x19, #HIBERN_PBE_ADDR] + + copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9 + + add x1, x10, #PAGE_SIZE + /* Clean the copied page to PoU - based on flush_icache_range() */ + dcache_line_size x2, x3 + sub x3, x2, #1 + bic x4, x10, x3 +2: dc cvau, x4 /* clean D line / unified line */ + add x4, x4, x2 + cmp x4, x1 + b.lo 2b + + ldr x19, [x19, #HIBERN_PBE_NEXT] + cbnz x19, 1b + dsb ish /* wait for PoU cleaning to finish */ + + /* switch to the restored kernels page tables */ + break_before_make_ttbr_switch x25, x21 + + ic ialluis + dsb ish + isb + + cbz x24, 3f /* Do we need to re-initialise EL2? */ + hvc #0 +3: ret + + .ltorg +ENDPROC(swsusp_arch_suspend_exit) + +/* + * Restore the hyp stub. + * This must be done before the hibernate page is unmapped by _cpu_resume(), + * but happens before any of the hyp-stub's code is cleaned to PoC. 
+ * + * x24: The physical address of __hyp_stub_vectors + */ +el1_sync: + msr vbar_el2, x24 + eret +ENDPROC(el1_sync) + +.macro invalid_vector label +\label: + b \label +ENDPROC(\label) +.endm + + invalid_vector el2_sync_invalid + invalid_vector el2_irq_invalid + invalid_vector el2_fiq_invalid + invalid_vector el2_error_invalid + invalid_vector el1_sync_invalid + invalid_vector el1_irq_invalid + invalid_vector el1_fiq_invalid + invalid_vector el1_error_invalid + +/* el2 vectors - switch el2 here while we restore the memory image. */ + .align 11 +ENTRY(hibernate_el2_vectors) + ventry el2_sync_invalid // Synchronous EL2t + ventry el2_irq_invalid // IRQ EL2t + ventry el2_fiq_invalid // FIQ EL2t + ventry el2_error_invalid // Error EL2t + + ventry el2_sync_invalid // Synchronous EL2h + ventry el2_irq_invalid // IRQ EL2h + ventry el2_fiq_invalid // FIQ EL2h + ventry el2_error_invalid // Error EL2h + + ventry el1_sync // Synchronous 64-bit EL1 + ventry el1_irq_invalid // IRQ 64-bit EL1 + ventry el1_fiq_invalid // FIQ 64-bit EL1 + ventry el1_error_invalid // Error 64-bit EL1 + + ventry el1_sync_invalid // Synchronous 32-bit EL1 + ventry el1_irq_invalid // IRQ 32-bit EL1 + ventry el1_fiq_invalid // FIQ 32-bit EL1 + ventry el1_error_invalid // Error 32-bit EL1 +END(hibernate_el2_vectors) + +.popsection diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c new file mode 100644 index 0000000..7e16fb3 --- /dev/null +++ b/arch/arm64/kernel/hibernate.c @@ -0,0 +1,461 @@ +/*: + * Hibernate support specific for ARM64 + * + * Derived from work on ARM hibernation support by: + * + * Ubuntu project, hibernation support for mach-dove + * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu) + * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.) + * https://lkml.org/lkml/2010/6/18/4 + * https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html + * https://patchwork.kernel.org/patch/96442/ + * + * Copyright (C) 2006 Rafael J. Wysocki + * + * License terms: GNU General Public License (GPL) version 2 + */ +#define pr_fmt(x) "hibernate: " x +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Hibernate core relies on this value being 0 on resume, and marks it + * __nosavedata assuming it will keep the resume kernel's '0' value. This + * doesn't happen with either KASLR. + * + * defined as "__visible int in_suspend __nosavedata" in + * kernel/power/hibernate.c + */ +extern int in_suspend; + +/* Find a symbols alias in the linear map */ +#define LMADDR(x) phys_to_virt(virt_to_phys(x)) + +/* Do we need to reset el2? */ +#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) + +/* + * Start/end of the hibernate exit code, this must be copied to a 'safe' + * location in memory, and executed from there. + */ +extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; + +/* temporary el2 vectors in the __hibernate_exit_text section. */ +extern char hibernate_el2_vectors[]; + +/* hyp-stub vectors, used to restore el2 during resume from hibernate. */ +extern char __hyp_stub_vectors[]; + +/* + * Values that may not change over hibernate/resume. We put the build number + * and date in here so that we guarantee not to resume with a different + * kernel. + */ +struct arch_hibernate_hdr_invariants { + char uts_version[__NEW_UTS_LEN + 1]; +}; + +/* These values need to be know across a hibernate/restore. 
*/ +static struct arch_hibernate_hdr { + struct arch_hibernate_hdr_invariants invariants; + + /* These are needed to find the relocated kernel if built with kaslr */ + phys_addr_t ttbr1_el1; + void (*reenter_kernel)(void); + + /* + * We need to know where the __hyp_stub_vectors are after restore to + * re-configure el2. + */ + phys_addr_t __hyp_stub_vectors; +} resume_hdr; + +static inline void arch_hdr_invariants(struct arch_hibernate_hdr_invariants *i) +{ + memset(i, 0, sizeof(*i)); + memcpy(i->uts_version, init_utsname()->version, sizeof(i->uts_version)); +} + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = virt_to_pfn(&__nosave_begin); + unsigned long nosave_end_pfn = virt_to_pfn(&__nosave_end - 1); + + return (pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn); +} + +void notrace save_processor_state(void) +{ + WARN_ON(num_online_cpus() != 1); +} + +void notrace restore_processor_state(void) +{ +} + +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct arch_hibernate_hdr *hdr = addr; + + if (max_size < sizeof(*hdr)) + return -EOVERFLOW; + + arch_hdr_invariants(&hdr->invariants); + hdr->ttbr1_el1 = virt_to_phys(swapper_pg_dir); + hdr->reenter_kernel = _cpu_resume; + + /* We can't use __hyp_get_vectors() because kvm may still be loaded */ + if (el2_reset_needed()) + hdr->__hyp_stub_vectors = virt_to_phys(__hyp_stub_vectors); + else + hdr->__hyp_stub_vectors = 0; + + return 0; +} +EXPORT_SYMBOL(arch_hibernation_header_save); + +int arch_hibernation_header_restore(void *addr) +{ + struct arch_hibernate_hdr_invariants invariants; + struct arch_hibernate_hdr *hdr = addr; + + arch_hdr_invariants(&invariants); + if (memcmp(&hdr->invariants, &invariants, sizeof(invariants))) { + pr_crit("Hibernate image not generated by this kernel!\n"); + return -EINVAL; + } + + resume_hdr = *hdr; + + return 0; +} +EXPORT_SYMBOL(arch_hibernation_header_restore); + +/* + * Copies length bytes, starting at src_start into an new page, + * perform cache maintentance, then maps it at the specified address low + * address as executable. + * + * This is used by hibernate to copy the code it needs to execute when + * overwriting the kernel text. This function generates a new set of page + * tables, which it loads into ttbr0. + * + * Length is provided as we probably only want 4K of data, even on a 64K + * page system. 
+ */ +static int create_safe_exec_page(void *src_start, size_t length, + unsigned long dst_addr, + phys_addr_t *phys_dst_addr, + void *(*allocator)(gfp_t mask), + gfp_t mask) +{ + int rc = 0; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long dst = (unsigned long)allocator(mask); + + if (!dst) { + rc = -ENOMEM; + goto out; + } + + memcpy((void *)dst, src_start, length); + flush_icache_range(dst, dst + length); + + pgd = pgd_offset_raw(allocator(mask), dst_addr); + if (pgd_none(*pgd)) { + pud = allocator(mask); + if (!pud) { + rc = -ENOMEM; + goto out; + } + pgd_populate(&init_mm, pgd, pud); + } + + pud = pud_offset(pgd, dst_addr); + if (pud_none(*pud)) { + pmd = allocator(mask); + if (!pmd) { + rc = -ENOMEM; + goto out; + } + pud_populate(&init_mm, pud, pmd); + } + + pmd = pmd_offset(pud, dst_addr); + if (pmd_none(*pmd)) { + pte = allocator(mask); + if (!pte) { + rc = -ENOMEM; + goto out; + } + pmd_populate_kernel(&init_mm, pmd, pte); + } + + pte = pte_offset_kernel(pmd, dst_addr); + set_pte(pte, __pte(virt_to_phys((void *)dst) | + pgprot_val(PAGE_KERNEL_EXEC))); + + /* Load our new page tables */ + asm volatile("msr ttbr0_el1, %0;" + "isb;" + "tlbi vmalle1is;" + "dsb ish;" + "isb" : : "r"(virt_to_phys(pgd))); + + *phys_dst_addr = virt_to_phys((void *)dst); + +out: + return rc; +} + + +int swsusp_arch_suspend(void) +{ + int ret = 0; + unsigned long flags; + struct sleep_stack_data state; + + local_dbg_save(flags); + + if (__cpu_suspend_enter(&state)) { + ret = swsusp_save(); + } else { + /* Clean kernel to PoC for secondary core startup */ + __flush_dcache_area(LMADDR(KERNEL_START), KERNEL_END - KERNEL_START); + + /* + * Tell the hibernation core that we've just restored + * the memory + */ + in_suspend = 0; + + __cpu_suspend_exit(); + } + + local_dbg_restore(flags); + + return ret; +} + +static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start, + unsigned long end) +{ + pte_t *src_pte; + pte_t *dst_pte; + unsigned long addr = start; + + dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pte) + return -ENOMEM; + pmd_populate_kernel(&init_mm, dst_pmd, dst_pte); + dst_pte = pte_offset_kernel(dst_pmd, start); + + src_pte = pte_offset_kernel(src_pmd, start); + do { + if (!pte_none(*src_pte)) + /* + * Resume will overwrite areas that may be marked + * read only (code, rodata). Clear the RDONLY bit from + * the temporary mappings we use during restore. 
+ */ + set_pte(dst_pte, __pte(pte_val(*src_pte) & ~PTE_RDONLY)); + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + return 0; +} + +static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start, + unsigned long end) +{ + pmd_t *src_pmd; + pmd_t *dst_pmd; + unsigned long next; + unsigned long addr = start; + + if (pud_none(*dst_pud)) { + dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pmd) + return -ENOMEM; + pud_populate(&init_mm, dst_pud, dst_pmd); + } + dst_pmd = pmd_offset(dst_pud, start); + + src_pmd = pmd_offset(src_pud, start); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*src_pmd)) + continue; + if (pmd_table(*src_pmd)) { + if (copy_pte(dst_pmd, src_pmd, addr, next)) + return -ENOMEM; + } else { + set_pmd(dst_pmd, + __pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY)); + } + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + + return 0; +} + +static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start, + unsigned long end) +{ + pud_t *dst_pud; + pud_t *src_pud; + unsigned long next; + unsigned long addr = start; + + if (pgd_none(*dst_pgd)) { + dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pud) + return -ENOMEM; + pgd_populate(&init_mm, dst_pgd, dst_pud); + } + dst_pud = pud_offset(dst_pgd, start); + + src_pud = pud_offset(src_pgd, start); + do { + next = pud_addr_end(addr, end); + if (pud_none(*src_pud)) + continue; + if (pud_table(*(src_pud))) { + if (copy_pmd(dst_pud, src_pud, addr, next)) + return -ENOMEM; + } else { + set_pud(dst_pud, + __pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY)); + } + } while (dst_pud++, src_pud++, addr = next, addr != end); + + return 0; +} + +static int copy_page_tables(pgd_t *dst_pgd, unsigned long start, + unsigned long end) +{ + unsigned long next; + unsigned long addr = start; + pgd_t *src_pgd = pgd_offset_k(start); + + dst_pgd = pgd_offset_raw(dst_pgd, start); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(*src_pgd)) + continue; + if (copy_pud(dst_pgd, src_pgd, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + return 0; +} + +/* + * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit(). + * + * Memory allocated by get_safe_page() will be dealt with by the hibernate code, + * we don't need to free it here. + */ +int swsusp_arch_resume(void) +{ + int rc = 0; + void *zero_page; + size_t exit_size; + pgd_t *tmp_pg_dir; + void *lm_restore_pblist; + phys_addr_t phys_hibernate_exit; + void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, + void *, phys_addr_t, phys_addr_t); + + /* + * Locate the exit code in the bottom-but-one page, so that *NULL + * still has disastrous affects. + */ + hibernate_exit = (void *)PAGE_SIZE; + exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; + /* + * Copy swsusp_arch_suspend_exit() to a safe page. This will generate + * a new set of ttbr0 page tables and load them. + */ + rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, + (unsigned long)hibernate_exit, + &phys_hibernate_exit, + (void *)get_safe_page, GFP_ATOMIC); + if (rc) { + pr_err("Failed to create safe executable page for hibernate_exit code."); + goto out; + } + + /* + * The hibernate exit text contains a set of el2 vectors, that will + * be executed at el2 with the mmu off in order to reload hyp-stub. + */ + __flush_dcache_area(hibernate_exit, exit_size); + + /* + * Restoring the memory image will overwrite the ttbr1 page tables. 
+ * Create a second copy of just the linear map, and use this when + * restoring. + */ + tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!tmp_pg_dir) { + pr_err("Failed to allocate memory for temporary page tables."); + rc = -ENOMEM; + goto out; + } + rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0); + if (rc) + goto out; + + /* + * Since we only copied the linear map, we need to find restore_pblist's + * linear map address. + */ + lm_restore_pblist = LMADDR(restore_pblist); + + /* + * KASLR will cause the el2 vectors to be in a different location in + * the resumed kernel. Load hibernate's temporary copy into el2. + * + * We can skip this step if we booted at EL1, or are running with VHE. + */ + if (el2_reset_needed()) { + phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ + el2_vectors += hibernate_el2_vectors - + __hibernate_exit_text_start; /* offset */ + + __hyp_set_vectors(el2_vectors); + } + + /* + * We need a zero page that is zero before & after resume in order to + * to break before make on the ttbr1 page tables. + */ + zero_page = (void *)get_safe_page(GFP_ATOMIC); + + hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1, + resume_hdr.reenter_kernel, lm_restore_pblist, + resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page)); + +out: + return rc; +} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 8918b30..435e820 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -46,6 +46,16 @@ jiffies = jiffies_64; *(.idmap.text) \ VMLINUX_SYMBOL(__idmap_text_end) = .; +#ifdef CONFIG_HIBERNATION +#define HIBERNATE_TEXT \ + . = ALIGN(SZ_4K); \ + VMLINUX_SYMBOL(__hibernate_exit_text_start) = .;\ + *(.hibernate_exit.text) \ + VMLINUX_SYMBOL(__hibernate_exit_text_end) = .; +#else +#define HIBERNATE_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which * runs from stext to _edata, must be a round multiple of the PE/COFF @@ -113,6 +123,7 @@ SECTIONS LOCK_TEXT HYPERVISOR_TEXT IDMAP_TEXT + HIBERNATE_TEXT *(.fixup) *(.gnu.warning) . = ALIGN(16); @@ -206,6 +217,10 @@ ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "HYP init code too big or misaligned") ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "ID map text too big or misaligned") +#ifdef CONFIG_HIBERNATION +ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) + <= SZ_4K, "Hibernate exit text too big or misaligned") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. -- cgit v0.10.2 From 1fe492ce6482b77807b25d29690a48c46456beee Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:13 +0100 Subject: arm64: hibernate: Refuse to hibernate if the boot cpu is offline Hibernation represents a system state save/restore through a system reboot; this implies that the logical cpus carrying out hibernation/thawing must be the same, so that the context saved in the snapshot image on hibernation is consistent with the state of the system on resume. If resume from hibernation is driven through kernel command line parameter, the cpu responsible for thawing the system will be whatever CPU firmware boots the system on upon cold-boot (ie logical cpu 0); this means that in order to keep system context consistent between the hibernate snapshot image and system state on kernel resume from hibernate, logical cpu 0 must be online on hibernation and must be the logical cpu that creates the snapshot image. 
This patch adds a PM notifier that enforces logical cpu 0 is online when the hibernation is started (and prevents hibernation if it is not), which is sufficient to guarantee it will be the one creating the snapshot image therefore providing the resume cpu a consistent snapshot of the system to resume to. Signed-off-by: James Morse Acked-by: Lorenzo Pieralisi Acked-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 7e16fb3..f8df75d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -17,6 +17,7 @@ #define pr_fmt(x) "hibernate: " x #include #include +#include #include #include #include @@ -459,3 +460,28 @@ int swsusp_arch_resume(void) out: return rc; } + +static int check_boot_cpu_online_pm_callback(struct notifier_block *nb, + unsigned long action, void *ptr) +{ + if (action == PM_HIBERNATION_PREPARE && + cpumask_first(cpu_online_mask) != 0) { + pr_warn("CPU0 is offline.\n"); + return notifier_from_errno(-ENODEV); + } + + return NOTIFY_OK; +} + +static int __init check_boot_cpu_online_init(void) +{ + /* + * Set this pm_notifier callback with a lower priority than + * cpu_hotplug_pm_callback, so that cpu_hotplug_pm_callback will be + * called earlier to disable cpu hotplug before the cpu online check. + */ + pm_notifier(check_boot_cpu_online_pm_callback, -INT_MAX); + + return 0; +} +core_initcall(check_boot_cpu_online_init); -- cgit v0.10.2 From da24eb1f3f9e2c7b75c5f8c40d8e48e2c4789596 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Apr 2016 19:38:16 +0100 Subject: arm64: make ARCH_SUPPORTS_DEBUG_PAGEALLOC depend on !HIBERNATION Selecting both DEBUG_PAGEALLOC and HIBERNATION results in a build failure: | kernel/built-in.o: In function `saveable_page': | memremap.c:(.text+0x100f90): undefined reference to `kernel_page_present' | kernel/built-in.o: In function `swsusp_save': | memremap.c:(.text+0x1026f0): undefined reference to `kernel_page_present' | make: *** [vmlinux] Error 1 James sayeth: "This is caused by DEBUG_PAGEALLOC, which clears the PTE_VALID bit from 'free' pages. Hibernate uses it as a hint that it shouldn't save/access that page. This function is used to test whether the PTE_VALID bit has been cleared by kernel_map_pages(), hibernate is the only user. Fixing this exposes a bigger problem with that configuration though: if the resume kernel has cut free pages out of the linear map, we copy this swiss-cheese view of memory, and try to use it to restore... We can fixup the copy of the linear map, but it then explodes in my lazy 'clean the whole kernel to PoC' after resume, as now both the kernel and linear map have holes in them." On closer inspection, the whole Kconfig machinery around DEBUG_PAGEALLOC, HIBERNATION, ARCH_SUPPORTS_DEBUG_PAGEALLOC and PAGE_POISONING looks like it might need some affection. In particular, DEBUG_ALLOC has: > depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC which looks pretty fishy. For the moment, require ARCH_SUPPORTS_DEBUG_PAGEALLOC to depend on !HIBERNATION on arm64 and get allmodconfig building again. 
Signed-off-by: James Morse Signed-off-by: Will Deacon diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b87f303..e945b82 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -578,6 +578,7 @@ source kernel/Kconfig.preempt source kernel/Kconfig.hz config ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION def_bool y config ARCH_HAS_HOLES_MEMORYMODEL -- cgit v0.10.2 From 99a507771fa57238dc7ffe674ae06090333d02c9 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 27 Apr 2016 13:55:28 -0300 Subject: arm64: kconfig: drop CONFIG_RTC_LIB dependency The rtc-lib dependency is not required, and seems it was just copy-pasted from ARM's Kconfig. If platform requires rtc-lib, they should select it individually. Reviewed-by: Arnd Bergmann Signed-off-by: Ezequiel Garcia Signed-off-by: Will Deacon diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e945b82..8845c0d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -99,7 +99,6 @@ config ARM64 select PERF_USE_VMALLOC select POWER_RESET select POWER_SUPPLY - select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE help -- cgit v0.10.2 From 6f26b3671184c36d07eb5d61ba9a6d0aeb583c5d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:48 +0200 Subject: arm64: kaslr: increase randomization granularity Currently, our KASLR implementation randomizes the placement of the core kernel at 2 MB granularity. This is based on the arm64 kernel boot protocol, which mandates that the kernel is loaded TEXT_OFFSET bytes above a 2 MB aligned base address. This requirement is a result of the fact that the block size used by the early mapping code may be 2 MB at the most (for a 4 KB granule kernel) But we can do better than that: since a KASLR kernel needs to be relocated in any case, we can tolerate a physical misalignment as long as the virtual misalignment relative to this 2 MB block size is equal in size, and code to deal with this is already in place. Since we align the kernel segments to 64 KB, let's randomize the physical offset at 64 KB granularity as well (unless CONFIG_DEBUG_ALIGN_RODATA is enabled). This way, the page table and TLB footprint is not affected. The higher granularity allows for 5 bits of additional entropy to be used. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index a90f645..eae693e 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -81,15 +81,24 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table_arg, if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) { /* + * If CONFIG_DEBUG_ALIGN_RODATA is not set, produce a + * displacement in the interval [0, MIN_KIMG_ALIGN) that + * is a multiple of the minimal segment alignment (SZ_64K) + */ + u32 mask = (MIN_KIMG_ALIGN - 1) & ~(SZ_64K - 1); + u32 offset = !IS_ENABLED(CONFIG_DEBUG_ALIGN_RODATA) ? + (phys_seed >> 32) & mask : TEXT_OFFSET; + + /* * If KASLR is enabled, and we have some randomness available, * locate the kernel at a randomized offset in physical memory. */ - *reserve_size = kernel_memsize + TEXT_OFFSET; + *reserve_size = kernel_memsize + offset; status = efi_random_alloc(sys_table_arg, *reserve_size, MIN_KIMG_ALIGN, reserve_addr, - phys_seed); + (u32)phys_seed); - *image_addr = *reserve_addr + TEXT_OFFSET; + *image_addr = *reserve_addr + offset; } else { /* * Else, try a straight allocation at the preferred offset. 
-- cgit v0.10.2 From c612505f860c6d4fac03924879982adcd042e239 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 29 Apr 2016 18:27:03 +0100 Subject: arm64: kvm: Fix kvm teardown for systems using the extended idmap If memory is located above 1<] dump_backtrace+0x0/0x240 [ 2.571818] [] show_stack+0x14/0x20 [ 2.576858] [] dump_stack+0x94/0xb8 [ 2.581899] [] panic+0x10c/0x250 [ 2.586677] [] panic+0x0/0x250 [ 2.591281] SMP: stopping secondary CPUs [ 3.649692] SMP: failed to stop secondary CPUs 0-2,4-7 [ 3.654818] Kernel Offset: disabled [ 3.658293] Memory Limit: none [ 3.661337] ---[ end Kernel panic - not syncing: HYP panic: [ 3.661337] PS:200003c9 PC:0000007ffffff820 ESR:86000005 [ 3.661337] FAR:0000007ffffff820 HPFAR:00000000003ffff0 PAR:0000000000000000 [ 3.661337] VCPU: (null) [ 3.661337] Reported-by: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b6f194d..88a3467 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -46,7 +46,8 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_dev_ioctl_check_extension(long ext); -phys_addr_t kvm_hyp_reset_entry(void); +unsigned long kvm_hyp_reset_entry(void); +void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { /* The VMID generation used for the virt. memory system */ diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 44ec4cb..a873a6d 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -140,6 +140,11 @@ merged: ENDPROC(__kvm_hyp_init) /* + * Reset kvm back to the hyp stub. This is the trampoline dance in + * reverse. If kvm used an extended idmap, __extended_idmap_trampoline + * calls this code directly in the idmap. In this case switching to the + * boot tables is a no-op. + * * x0: HYP boot pgd * x1: HYP phys_idmap_start */ diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index ce9e5e5..70254a6 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -164,3 +164,22 @@ alternative_endif eret ENDPROC(__fpsimd_guest_restore) + +/* + * When using the extended idmap, we don't have a trampoline page we can use + * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap + * directly would be ideal, but if we're using the extended idmap then the + * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by + * kvm_call_hyp using kern_hyp_va. + * + * x0: HYP boot pgd + * x1: HYP phys_idmap_start + */ +ENTRY(__extended_idmap_trampoline) + mov x4, x1 + adr_l x3, __kvm_hyp_reset + + /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ + bfi x4, x3, #0, #PAGE_SHIFT + br x4 +ENDPROC(__extended_idmap_trampoline) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 4062e6d..b1ad730 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -135,12 +135,28 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) extern char __hyp_idmap_text_start[]; -phys_addr_t kvm_hyp_reset_entry(void) +unsigned long kvm_hyp_reset_entry(void) { - unsigned long offset; - - offset = (unsigned long)__kvm_hyp_reset - - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK); - - return TRAMPOLINE_VA + offset; + if (!__kvm_cpu_uses_extended_idmap()) { + unsigned long offset; + + /* + * Find the address of __kvm_hyp_reset() in the trampoline page. 
+ * This is present in the running page tables, and the boot page + * tables, so we call the code here to start the trampoline + * dance in reverse. + */ + offset = (unsigned long)__kvm_hyp_reset + - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK); + + return TRAMPOLINE_VA + offset; + } else { + /* + * KVM is running with merged page tables, which don't have the + * trampoline page mapped. We know the idmap is still mapped, + * but can't be called into directly. Use + * __extended_idmap_trampoline to do the call. + */ + return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline); + } } -- cgit v0.10.2 From 2326df551b137dd88652b54016905e13579b7a25 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 29 Apr 2016 14:07:54 -0700 Subject: arm64: always use STRICT_MM_TYPECHECKS Inspired by the counterpart of powerpc [1], which shows there is no negative effect on code generation from enabling STRICT_MM_TYPECHECKS with a modern compiler. And, Arnd's comment [2] about that patch says STRICT_MM_TYPECHECKS could be default as long as the architecture can pass structures in registers as function arguments. ARM64 can do it as long as the size of structure <= 16 bytes. All the page table value types are u64 on ARM64. The below disassembly demonstrates it, entry is pte_t type: entry = arch_make_huge_pte(entry, vma, page, writable); 0xffff00000826fc38 <+80>: and x0, x0, #0xfffffffffffffffd 0xffff00000826fc3c <+84>: mov w3, w21 0xffff00000826fc40 <+88>: mov x2, x20 0xffff00000826fc44 <+92>: mov x1, x19 0xffff00000826fc48 <+96>: orr x0, x0, #0x400 0xffff00000826fc4c <+100>: bl 0xffff00000809bcc0 [1] http://www.spinics.net/lists/linux-mm/msg105951.html [2] http://www.spinics.net/lists/linux-mm/msg105969.html Cc: Ard Biesheuvel Acked-by: Arnd Bergmann Acked-by: Catalin Marinas Signed-off-by: Yang Shi Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index 2b1bd7e..69b2fd4 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -27,10 +27,6 @@ typedef u64 pmdval_t; typedef u64 pudval_t; typedef u64 pgdval_t; -#undef STRICT_MM_TYPECHECKS - -#ifdef STRICT_MM_TYPECHECKS - /* * These are used to make use of C type-checking.. */ @@ -58,34 +54,6 @@ typedef struct { pteval_t pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) -#else /* !STRICT_MM_TYPECHECKS */ - -typedef pteval_t pte_t; -#define pte_val(x) (x) -#define __pte(x) (x) - -#if CONFIG_PGTABLE_LEVELS > 2 -typedef pmdval_t pmd_t; -#define pmd_val(x) (x) -#define __pmd(x) (x) -#endif - -#if CONFIG_PGTABLE_LEVELS > 3 -typedef pudval_t pud_t; -#define pud_val(x) (x) -#define __pud(x) (x) -#endif - -typedef pgdval_t pgd_t; -#define pgd_val(x) (x) -#define __pgd(x) (x) - -typedef pteval_t pgprot_t; -#define pgprot_val(x) (x) -#define __pgprot(x) (x) - -#endif /* STRICT_MM_TYPECHECKS */ - #if CONFIG_PGTABLE_LEVELS == 2 #include #elif CONFIG_PGTABLE_LEVELS == 3 -- cgit v0.10.2 From 394bf2f24834352f461cb139eb8e00af26d5f51a Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 4 May 2016 11:14:27 -0700 Subject: arm64: mm: remove unnecessary EXPORT_SYMBOL_GPL arch_pick_mmap_layout is only called by fs/exec.c which is always built into kernel, it looks the EXPORT_SYMBOL_GPL is pointless and no architectures export it other than ARM64. 
Signed-off-by: Yang Shi Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 232f787..01c1717 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -95,8 +95,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } -EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); - /* * You really shouldn't be using read() or write() on /dev/mem. This might go -- cgit v0.10.2 From 911f56eeb87ee378f5e215469268a7a2f68a5a8a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:43:59 +0100 Subject: arm64: Fix typo in the pmdp_huge_get_and_clear() definition With hardware AF/DBM support, pmd modifications (transparent huge pages) should be performed atomically using load/store exclusive. The initial patches defined the get-and-clear function and __HAVE_ARCH_* macro without the "huge" word, leaving the pmdp_huge_get_and_clear() to the default, non-atomic implementation. Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Cc: # 4.3+ Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3eefc18..b0ad16f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -592,9 +592,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) { return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp)); } -- cgit v0.10.2 From 282aa7051b0169991b34716f0f22d9c2f59c46c4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:44:00 +0100 Subject: arm64: Implement pmdp_set_access_flags() for hardware AF/DBM The update to the accessed or dirty states for block mappings must be done atomically on hardware with support for automatic AF/DBM. The ptep_set_access_flags() function has been fixed as part of commit 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM"). This patch brings pmdp_set_access_flags() in line with the pte counterpart. Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Cc: # 4.4.x: 66dbd6e61a52: arm64: Implement ptep_set_access_flags() for hardware AF/DBM Cc: # 4.3+ Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b0ad16f..072c335 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -540,6 +540,16 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +static inline int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); +} +#endif + /* * Atomic pte/pmd modifications. 
*/ -- cgit v0.10.2 From ab4db1f2244dfc5ecd7adf9927c29cd654cc14c6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:44:01 +0100 Subject: arm64: Replace hard-coded values in the pmd/pud_bad() macros This patch replaces the hard-coded value 2 with PMD_TABLE_BIT in the pmd/pud_bad() macros. Note that using these macros on pmd_trans_huge() entries is giving incorrect results (pmd_none_or_trans_huge_or_clear_bad() correctly checks for pmd_trans_huge before pmd_bad). Additionally, white-space clean-up for pmd_mkclean(). Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 072c335..b90e4ad 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -294,7 +294,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) -#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) @@ -338,7 +338,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_present(pmd) (pmd_val(pmd)) -#define pmd_bad(pmd) (!(pmd_val(pmd) & 2)) +#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) @@ -403,7 +403,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!(pud_val(pud) & 2)) +#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) #define pud_present(pud) (pud_val(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) -- cgit v0.10.2 From 5bb1cc0ff9a6b68871970737e6c4c16919928d8b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:44:02 +0100 Subject: arm64: Ensure pmd_present() returns false after pmd_mknotpresent() Currently, pmd_present() only checks for a non-zero value, returning true even after pmd_mknotpresent() (which only clears the type bits). This patch converts pmd_present() to using pte_present(), similar to the other pmd_*() checks. As a side effect, it will return true for PROT_NONE mappings, though they are not yet used by the kernel with transparent huge pages. For consistency, also change pmd_mknotpresent() to only clear the PMD_SECT_VALID bit, even though the PMD_TABLE_BIT is already 0 for block mappings (no functional change). The unused PMD_SECT_PROT_NONE definition is removed as transparent huge pages use the pte page prot values. 
Fixes: 9c7e535fcc17 ("arm64: mm: Route pmd thp functions through pte equivalents") Cc: # 3.15+ Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5c25b83..9786f77 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -133,7 +133,6 @@ * Section */ #define PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) -#define PMD_SECT_PROT_NONE (_AT(pmdval_t, 1) << 58) #define PMD_SECT_USER (_AT(pmdval_t, 1) << 6) /* AP[1] */ #define PMD_SECT_RDONLY (_AT(pmdval_t, 1) << 7) /* AP[2] */ #define PMD_SECT_S (_AT(pmdval_t, 3) << 8) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b90e4ad..2da46ae 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -289,6 +289,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define pmd_present(pmd) pte_present(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) @@ -297,7 +298,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) -#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) +#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) @@ -336,7 +337,6 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); #define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_present(pmd) (pmd_val(pmd)) #define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) -- cgit v0.10.2 From 99aa036241ed4a08a71b627bb903b5d7c75d78c1 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 10 May 2016 11:14:41 +0100 Subject: arm64: secondary_start_kernel: Remove unnecessary barrier Remove the unnecessary smp_wmb(), which was added to make sure that the update_cpu_boot_status() completes before we mark the CPU online. But update_cpu_boot_status() already has dsb() (required for the failing CPUs) to ensure the correct behavior. Cc: Catalin Marinas Acked-by: Mark Rutland Reported-by: Dennis Chen Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dc96475..678e084 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -264,8 +264,6 @@ asmlinkage void secondary_start_kernel(void) pr_info("CPU%u: Booted secondary processor [%08x]\n", cpu, read_cpuid_id()); update_cpu_boot_status(CPU_BOOT_SUCCESS); - /* Make sure the status update is visible before we complete */ - smp_wmb(); set_cpu_online(cpu, true); complete(&cpu_running); -- cgit v0.10.2 From f228b494e56d949be8d8ea09d4f973d1979201bf Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 10 May 2016 15:40:31 +0100 Subject: arm64: cpuinfo: Missing NULL terminator in compat_hwcap_str The loop that browses the array compat_hwcap_str will stop when a NULL is encountered, however NULL is missing at the end of array. This will lead to overrun until a NULL is found somewhere in the following memory. 
In reality, this works out because the compat_hwcap2_str array tends to follow immediately in memory, and that *is* terminated correctly. Furthermore, the unsigned int compat_elf_hwcap is checked before printing each capability, so we end up doing the right thing because the size of the two arrays is less than 32. Still, this is an obvious mistake and should be fixed. Note for backporting: commit 12d11817eaafa414 ("arm64: Move /proc/cpuinfo handling code") moved this code in v4.4. Prior to that commit, the same change should be made in arch/arm64/kernel/setup.c. Fixes: 44b82b7700d0 "arm64: Fix up /proc/cpuinfo" Cc: # v3.19+ (but see note above prior to v4.4) Signed-off-by: Julien Grall Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 92189e2..3808470 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -87,7 +87,8 @@ static const char *const compat_hwcap_str[] = { "idivt", "vfpd32", "lpae", - "evtstrm" + "evtstrm", + NULL }; static const char *const compat_hwcap2_str[] = { -- cgit v0.10.2 From 61462c8a6b140fe2f93cb911684837e05950e680 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 May 2016 10:55:49 -0700 Subject: arm64: kernel: Fix incorrect brk randomization This fixes two issues with the arm64 brk randomziation. First, the STACK_RND_MASK was being used incorrectly. The original code was: unsigned long range_end = base + (STACK_RND_MASK << PAGE_SHIFT) + 1; STACK_RND_MASK is 0x7ff (32-bit) or 0x3ffff (64-bit), with 4K pages where PAGE_SHIFT is 12: #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ 0x7ff >> (PAGE_SHIFT - 12) : \ 0x3ffff >> (PAGE_SHIFT - 12)) This means the resulting offset from base would be 0x7ff0001 or 0x3ffff0001, which is wrong since it creates an unaligned end address. It was likely intended to be: unsigned long range_end = base + ((STACK_RND_MASK + 1) << PAGE_SHIFT) Which would result in offsets of 0x800000 (32-bit) and 0x40000000 (64-bit). However, even this corrected 32-bit compat offset (0x00800000) is much smaller than native ARM's brk randomization value (0x02000000): unsigned long arch_randomize_brk(struct mm_struct *mm) { unsigned long range_end = mm->brk + 0x02000000; return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } So, instead of basing arm64's brk randomization on mistaken STACK_RND_MASK calculations, just use specific corrected values for compat (0x2000000) and native arm64 (0x40000000). Reviewed-by: Jon Medhurst Signed-off-by: Kees Cook [will: use is_compat_task() as suggested by tixy] Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 8062482..ad4a7e1 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -382,13 +382,14 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ~0xf; } -static unsigned long randomize_base(unsigned long base) -{ - unsigned long range_end = base + (STACK_RND_MASK << PAGE_SHIFT) + 1; - return randomize_range(base, range_end, 0) ? : base; -} - unsigned long arch_randomize_brk(struct mm_struct *mm) { - return randomize_base(mm->brk); + unsigned long range_end = mm->brk; + + if (is_compat_task()) + range_end += 0x02000000; + else + range_end += 0x40000000; + + return randomize_range(mm->brk, range_end, 0) ? 
: mm->brk; } -- cgit v0.10.2 From e6d9a52543338603e25e71e0e4942f05dae0dd8a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 May 2016 17:56:54 +0100 Subject: arm64: do not enforce strict 16 byte alignment to stack pointer copy_thread should not be enforcing 16-byte alignment and returning -EINVAL. Other architectures trap misaligned stack accesses with SIGBUS, so arm64 should follow this convention; remove the strict enforcement check. For example, currently clone(2) fails with -EINVAL when passing a misaligned stack and this gives little clue as to what is wrong. Instead, it is arguable that a SIGBUS on the first access to a misaligned stack allows one to figure out that it is a misaligned stack issue rather than trying to figure out why an unconventional (and undocumented) -EINVAL is being returned. Acked-by: Catalin Marinas Signed-off-by: Colin Ian King Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index ad4a7e1..48eea68 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -265,9 +265,6 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; - /* 16-byte aligned stack mandatory on AArch64 */ - else if (stack_start & 15) - return -EINVAL; else childregs->sp = stack_start; } -- cgit v0.10.2
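As an aside to the brk-randomization fix a couple of commits above: the range arithmetic it corrects is easy to check in isolation. The following user-space sketch is not part of the series; it only reuses the values quoted in that commit message for 4K pages (PAGE_SHIFT == 12) to show why (STACK_RND_MASK << PAGE_SHIFT) + 1 produces an unaligned range end while (STACK_RND_MASK + 1) << PAGE_SHIFT stays page-aligned.

#include <stdio.h>

#define PAGE_SHIFT        12          /* 4K pages, as in the commit message */
#define STACK_RND_MASK_32 0x7ffUL     /* compat value quoted in the commit */
#define STACK_RND_MASK_64 0x3ffffUL   /* native value quoted in the commit */

int main(void)
{
	/* Old calculation: mask shifted, plus one -- end offset is not page aligned */
	unsigned long old32 = (STACK_RND_MASK_32 << PAGE_SHIFT) + 1;
	unsigned long old64 = (STACK_RND_MASK_64 << PAGE_SHIFT) + 1;

	/* Likely intended calculation: (mask + 1) << PAGE_SHIFT -- page aligned */
	unsigned long new32 = (STACK_RND_MASK_32 + 1) << PAGE_SHIFT;
	unsigned long new64 = (STACK_RND_MASK_64 + 1) << PAGE_SHIFT;

	printf("old offsets:      %#lx / %#lx (page aligned? %d / %d)\n",
	       old32, old64, !(old32 & 0xfff), !(old64 & 0xfff));
	printf("intended offsets: %#lx / %#lx\n", new32, new64);   /* 0x800000 / 0x40000000 */
	printf("patch hard-codes: %#x / %#x\n", 0x02000000, 0x40000000);
	return 0;
}

The intended offsets come out to 0x800000 and 0x40000000; the patch sidesteps STACK_RND_MASK entirely and hard-codes 0x02000000 for compat tasks (matching 32-bit ARM's brk randomization range) and 0x40000000 for native tasks.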