diff options
Diffstat (limited to 'arch')
-rw-r--r-- | arch/arm/include/asm/kvm_mmu.h | 43 | ||||
-rw-r--r-- | arch/arm/include/asm/stage2_pgtable.h | 61 | ||||
-rw-r--r-- | arch/arm/kvm/arm.c | 2 | ||||
-rw-r--r-- | arch/arm/kvm/mmu.c | 408 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_arm.h | 85 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_mmu.h | 111 | ||||
-rw-r--r-- | arch/arm64/include/asm/pgtable-hwdef.h | 80 | ||||
-rw-r--r-- | arch/arm64/include/asm/pgtable.h | 15 | ||||
-rw-r--r-- | arch/arm64/include/asm/stage2_pgtable-nopmd.h | 42 | ||||
-rw-r--r-- | arch/arm64/include/asm/stage2_pgtable-nopud.h | 39 | ||||
-rw-r--r-- | arch/arm64/include/asm/stage2_pgtable.h | 142 | ||||
-rw-r--r-- | arch/arm64/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/s2-setup.c | 8 |
13 files changed, 676 insertions, 361 deletions
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index da44be9..ef0b276 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -47,6 +47,7 @@ #include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/pgalloc.h> +#include <asm/stage2_pgtable.h> int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); @@ -105,14 +106,16 @@ static inline void kvm_clean_pte(pte_t *pte) clean_pte_table(pte); } -static inline void kvm_set_s2pte_writable(pte_t *pte) +static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { - pte_val(*pte) |= L_PTE_S2_RDWR; + pte_val(pte) |= L_PTE_S2_RDWR; + return pte; } -static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) { - pmd_val(*pmd) |= L_PMD_S2_RDWR; + pmd_val(pmd) |= L_PMD_S2_RDWR; + return pmd; } static inline void kvm_set_s2pte_readonly(pte_t *pte) @@ -135,22 +138,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; } - -/* Open coded p*d_addr_end that can deal with 64bit addresses */ -#define kvm_pgd_addr_end(addr, end) \ -({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ - (__boundary - 1 < (end) - 1)? __boundary: (end); \ -}) - -#define kvm_pud_addr_end(addr,end) (end) - -#define kvm_pmd_addr_end(addr, end) \ -({ u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ - (__boundary - 1 < (end) - 1)? __boundary: (end); \ -}) - -#define kvm_pgd_index(addr) pgd_index(addr) - static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); @@ -159,19 +146,11 @@ static inline bool kvm_page_empty(void *ptr) #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) -#define kvm_pud_table_empty(kvm, pudp) (0) - -#define KVM_PREALLOC_LEVEL 0 +#define kvm_pud_table_empty(kvm, pudp) false -static inline void *kvm_get_hwpgd(struct kvm *kvm) -{ - return kvm->arch.pgd; -} - -static inline unsigned int kvm_get_hwpgd_size(void) -{ - return PTRS_PER_S2_PGD * sizeof(pgd_t); -} +#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) +#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#define hyp_pud_table_empty(pudp) false struct kvm; diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h new file mode 100644 index 0000000..460d616 --- /dev/null +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * stage2 page table helpers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM_S2_PGTABLE_H_ +#define __ARM_S2_PGTABLE_H_ + +#define stage2_pgd_none(pgd) pgd_none(pgd) +#define stage2_pgd_clear(pgd) pgd_clear(pgd) +#define stage2_pgd_present(pgd) pgd_present(pgd) +#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(pud) pud_free(NULL, pud) + +#define stage2_pud_none(pud) pud_none(pud) +#define stage2_pud_clear(pud) pud_clear(pud) +#define stage2_pud_present(pud) pud_present(pud) +#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(pud) pud_huge(pud) + +/* Open coded p*d_addr_end that can deal with 64bit addresses */ +static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#define stage2_pud_addr_end(addr, end) (end) + +static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#define stage2_pgd_index(addr) pgd_index(addr) + +#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(pudp) false + +#endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dded1b7..be4b639 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -448,7 +448,7 @@ static void update_vttbr(struct kvm *kvm) kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm)); + pgd_phys = virt_to_phys(kvm->arch.pgd); BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = pgd_phys | vmid; diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 58dbd5c..783e5ff 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -43,11 +43,9 @@ static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; +#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) -#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) -#define kvm_pud_huge(_x) pud_huge(_x) - #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) @@ -69,14 +67,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { - /* - * This function also gets called when dealing with HYP page - * tables. As HYP doesn't have an associated struct kvm (and - * the HYP page tables are fairly static), we don't do - * anything there. - */ - if (kvm) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } /* @@ -115,7 +106,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) */ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) { - if (!kvm_pmd_huge(*pmd)) + if (!pmd_thp_or_huge(*pmd)) return; pmd_clear(pmd); @@ -155,29 +146,29 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) return p; } -static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) +static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0); - pgd_clear(pgd); + pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); + stage2_pgd_clear(pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - pud_free(NULL, pud_table); + stage2_pud_free(pud_table); put_page(virt_to_page(pgd)); } -static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) +static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); + pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); + VM_BUG_ON(stage2_pud_huge(*pud)); + stage2_pud_clear(pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd_free(NULL, pmd_table); + stage2_pmd_free(pmd_table); put_page(virt_to_page(pud)); } -static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) +static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(kvm_pmd_huge(*pmd)); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); pte_free_kernel(NULL, pte_table); @@ -204,7 +195,7 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure * the IO subsystem will never hit in the cache. */ -static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, +static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr, phys_addr_t end) { phys_addr_t start_addr = addr; @@ -226,21 +217,21 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); - if (kvm_pte_table_empty(kvm, start_pte)) - clear_pmd_entry(kvm, pmd, start_addr); + if (stage2_pte_table_empty(start_pte)) + clear_stage2_pmd_entry(kvm, pmd, start_addr); } -static void unmap_pmds(struct kvm *kvm, pud_t *pud, +static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pmd_t *pmd, *start_pmd; - start_pmd = pmd = pmd_offset(pud, addr); + start_pmd = pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { pmd_t old_pmd = *pmd; pmd_clear(pmd); @@ -250,57 +241,64 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud, put_page(virt_to_page(pmd)); } else { - unmap_ptes(kvm, pmd, addr, next); + unmap_stage2_ptes(kvm, pmd, addr, next); } } } while (pmd++, addr = next, addr != end); - if (kvm_pmd_table_empty(kvm, start_pmd)) - clear_pud_entry(kvm, pud, start_addr); + if (stage2_pmd_table_empty(start_pmd)) + clear_stage2_pud_entry(kvm, pud, start_addr); } -static void unmap_puds(struct kvm *kvm, pgd_t *pgd, +static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = pud_offset(pgd, addr); + start_pud = pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { - if (pud_huge(*pud)) { + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { + if (stage2_pud_huge(*pud)) { pud_t old_pud = *pud; - pud_clear(pud); + stage2_pud_clear(pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); } else { - unmap_pmds(kvm, pud, addr, next); + unmap_stage2_pmds(kvm, pud, addr, next); } } } while (pud++, addr = next, addr != end); - if (kvm_pud_table_empty(kvm, start_pud)) - clear_pgd_entry(kvm, pgd, start_addr); + if (stage2_pud_table_empty(start_pud)) + clear_stage2_pgd_entry(kvm, pgd, start_addr); } - -static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - phys_addr_t start, u64 size) +/** + * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * @kvm: The VM pointer + * @start: The intermediate physical base address of the range to unmap + * @size: The size of the area to unmap + * + * Clear a range of stage-2 mappings, lowering the various ref-counts. Must + * be called while holding mmu_lock (unless for freeing the stage2 pgd before + * destroying the VM), otherwise another faulting VCPU may come in and mess + * with things behind our backs. + */ +static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) { pgd_t *pgd; phys_addr_t addr = start, end = start + size; phys_addr_t next; - pgd = pgdp + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { - next = kvm_pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_puds(kvm, pgd, addr, next); + next = stage2_pgd_addr_end(addr, end); + if (!stage2_pgd_none(*pgd)) + unmap_stage2_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -322,11 +320,11 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, pmd_t *pmd; phys_addr_t next; - pmd = pmd_offset(pud, addr); + pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) + if (pmd_thp_or_huge(*pmd)) kvm_flush_dcache_pmd(*pmd); else stage2_flush_ptes(kvm, pmd, addr, next); @@ -340,11 +338,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, pud_t *pud; phys_addr_t next; - pud = pud_offset(pgd, addr); + pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { - if (pud_huge(*pud)) + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { + if (stage2_pud_huge(*pud)) kvm_flush_dcache_pud(*pud); else stage2_flush_pmds(kvm, pud, addr, next); @@ -360,9 +358,9 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { - next = kvm_pgd_addr_end(addr, end); + next = stage2_pgd_addr_end(addr, end); stage2_flush_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -391,6 +389,100 @@ static void stage2_flush_vm(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } +static void clear_hyp_pgd_entry(pgd_t *pgd) +{ + pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + pgd_clear(pgd); + pud_free(NULL, pud_table); + put_page(virt_to_page(pgd)); +} + +static void clear_hyp_pud_entry(pud_t *pud) +{ + pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); + VM_BUG_ON(pud_huge(*pud)); + pud_clear(pud); + pmd_free(NULL, pmd_table); + put_page(virt_to_page(pud)); +} + +static void clear_hyp_pmd_entry(pmd_t *pmd) +{ + pte_t *pte_table = pte_offset_kernel(pmd, 0); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); + pmd_clear(pmd); + pte_free_kernel(NULL, pte_table); + put_page(virt_to_page(pmd)); +} + +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte, *start_pte; + + start_pte = pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + kvm_set_pte(pte, __pte(0)); + put_page(virt_to_page(pte)); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (hyp_pte_table_empty(start_pte)) + clear_hyp_pmd_entry(pmd); +} + +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pmd_t *pmd, *start_pmd; + + start_pmd = pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + /* Hyp doesn't use huge pmds */ + if (!pmd_none(*pmd)) + unmap_hyp_ptes(pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + if (hyp_pmd_table_empty(start_pmd)) + clear_hyp_pud_entry(pud); +} + +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pud_t *pud, *start_pud; + + start_pud = pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + /* Hyp doesn't use huge puds */ + if (!pud_none(*pud)) + unmap_hyp_pmds(pud, addr, next); + } while (pud++, addr = next, addr != end); + + if (hyp_pud_table_empty(start_pud)) + clear_hyp_pgd_entry(pgd); +} + +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + /* + * We don't unmap anything from HYP, except at the hyp tear down. + * Hence, we don't have to invalidate the TLBs here. + */ + pgd = pgdp + pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + if (!pgd_none(*pgd)) + unmap_hyp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + /** * free_boot_hyp_pgd - free HYP boot page tables * @@ -401,14 +493,14 @@ void free_boot_hyp_pgd(void) mutex_lock(&kvm_hyp_pgd_mutex); if (boot_hyp_pgd) { - unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); - unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); + unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); boot_hyp_pgd = NULL; } if (hyp_pgd) - unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -433,9 +525,9 @@ void free_hyp_pgds(void) if (hyp_pgd) { for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; @@ -645,20 +737,6 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } -/* Free the HW pgd, one page at a time */ -static void kvm_free_hwpgd(void *hwpgd) -{ - free_pages_exact(hwpgd, kvm_get_hwpgd_size()); -} - -/* Allocate the HW PGD, making sure that each page gets its own refcount */ -static void *kvm_alloc_hwpgd(void) -{ - unsigned int size = kvm_get_hwpgd_size(); - - return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); -} - /** * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. @@ -673,81 +751,22 @@ static void *kvm_alloc_hwpgd(void) int kvm_alloc_stage2_pgd(struct kvm *kvm) { pgd_t *pgd; - void *hwpgd; if (kvm->arch.pgd != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - hwpgd = kvm_alloc_hwpgd(); - if (!hwpgd) + /* Allocate the HW PGD, making sure that each page gets its own refcount */ + pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); + if (!pgd) return -ENOMEM; - /* When the kernel uses more levels of page tables than the - * guest, we allocate a fake PGD and pre-populate it to point - * to the next-level page table, which will be the real - * initial page table pointed to by the VTTBR. - * - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for - * the PMD and the kernel will use folded pud. - * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD - * pages. - */ - if (KVM_PREALLOC_LEVEL > 0) { - int i; - - /* - * Allocate fake pgd for the page table manipulation macros to - * work. This is not used by the hardware and we have no - * alignment requirement for this allocation. - */ - pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), - GFP_KERNEL | __GFP_ZERO); - - if (!pgd) { - kvm_free_hwpgd(hwpgd); - return -ENOMEM; - } - - /* Plug the HW PGD into the fake one. */ - for (i = 0; i < PTRS_PER_S2_PGD; i++) { - if (KVM_PREALLOC_LEVEL == 1) - pgd_populate(NULL, pgd + i, - (pud_t *)hwpgd + i * PTRS_PER_PUD); - else if (KVM_PREALLOC_LEVEL == 2) - pud_populate(NULL, pud_offset(pgd, 0) + i, - (pmd_t *)hwpgd + i * PTRS_PER_PMD); - } - } else { - /* - * Allocate actual first-level Stage-2 page table used by the - * hardware for Stage-2 page table walks. - */ - pgd = (pgd_t *)hwpgd; - } - kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; return 0; } -/** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer - * @start: The intermediate physical base address of the range to unmap - * @size: The size of the area to unmap - * - * Clear a range of stage-2 mappings, lowering the various ref-counts. Must - * be called while holding mmu_lock (unless for freeing the stage2 pgd before - * destroying the VM), otherwise another faulting VCPU may come in and mess - * with things behind our backs. - */ -static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) -{ - unmap_range(kvm, kvm->arch.pgd, start, size); -} - static void stage2_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -830,10 +849,8 @@ void kvm_free_stage2_pgd(struct kvm *kvm) return; unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - kvm_free_hwpgd(kvm_get_hwpgd(kvm)); - if (KVM_PREALLOC_LEVEL > 0) - kfree(kvm->arch.pgd); - + /* Free the HW pgd, one page at a time */ + free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE); kvm->arch.pgd = NULL; } @@ -843,16 +860,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); - if (WARN_ON(pgd_none(*pgd))) { + pgd = kvm->arch.pgd + stage2_pgd_index(addr); + if (WARN_ON(stage2_pgd_none(*pgd))) { if (!cache) return NULL; pud = mmu_memory_cache_alloc(cache); - pgd_populate(NULL, pgd, pud); + stage2_pgd_populate(pgd, pud); get_page(virt_to_page(pgd)); } - return pud_offset(pgd, addr); + return stage2_pud_offset(pgd, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -862,15 +879,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); - if (pud_none(*pud)) { + if (stage2_pud_none(*pud)) { if (!cache) return NULL; pmd = mmu_memory_cache_alloc(cache); - pud_populate(NULL, pud, pmd); + stage2_pud_populate(pud, pmd); get_page(virt_to_page(pud)); } - return pmd_offset(pud, addr); + return stage2_pmd_offset(pud, addr); } static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache @@ -893,11 +910,14 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); old_pmd = *pmd; - kvm_set_pmd(pmd, *new_pmd); - if (pmd_present(old_pmd)) + if (pmd_present(old_pmd)) { + pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); - else + } else { get_page(virt_to_page(pmd)); + } + + kvm_set_pmd(pmd, *new_pmd); return 0; } @@ -946,15 +966,38 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, /* Create 2nd stage page table mapping - Level 3 */ old_pte = *pte; - kvm_set_pte(pte, *new_pte); - if (pte_present(old_pte)) + if (pte_present(old_pte)) { + kvm_set_pte(pte, __pte(0)); kvm_tlb_flush_vmid_ipa(kvm, addr); - else + } else { get_page(virt_to_page(pte)); + } + kvm_set_pte(pte, *new_pte); return 0; } +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + if (pte_young(*pte)) { + *pte = pte_mkold(*pte); + return 1; + } + return 0; +} +#else +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + return __ptep_test_and_clear_young(pte); +} +#endif + +static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pmd); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -978,7 +1021,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); if (writable) - kvm_set_s2pte_writable(&pte); + pte = kvm_s2pte_mkwrite(pte); ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, KVM_NR_MEM_OBJS); @@ -1078,12 +1121,12 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) pmd_t *pmd; phys_addr_t next; - pmd = pmd_offset(pud, addr); + pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { if (!kvm_s2pmd_readonly(pmd)) kvm_set_s2pmd_readonly(pmd); } else { @@ -1106,12 +1149,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) pud_t *pud; phys_addr_t next; - pud = pud_offset(pgd, addr); + pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(kvm_pud_huge(*pud)); + BUG_ON(stage2_pud_huge(*pud)); stage2_wp_pmds(pud, addr, next); } } while (pud++, addr = next, addr != end); @@ -1128,7 +1171,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { /* * Release kvm_mmu_lock periodically if the memory region is @@ -1140,8 +1183,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); - next = kvm_pgd_addr_end(addr, end); - if (pgd_present(*pgd)) + next = stage2_pgd_addr_end(addr, end); + if (stage2_pgd_present(*pgd)) stage2_wp_puds(pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -1320,7 +1363,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); if (writable) { - kvm_set_s2pmd_writable(&new_pmd); + new_pmd = kvm_s2pmd_mkwrite(new_pmd); kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached); @@ -1329,7 +1372,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pte_t new_pte = pfn_pte(pfn, mem_type); if (writable) { - kvm_set_s2pte_writable(&new_pte); + new_pte = kvm_s2pte_mkwrite(new_pte); kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } @@ -1348,6 +1391,8 @@ out_unlock: * Resolve the access fault by making the page young again. * Note that because the faulting entry is guaranteed not to be * cached in the TLB, we don't need to invalidate anything. + * Only the HW Access Flag updates are supported for Stage 2 (no DBM), + * so there is no need for atomic (pte|pmd)_mkyoung operations. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { @@ -1364,7 +1409,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) if (!pmd || pmd_none(*pmd)) /* Nothing there */ goto out; - if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ *pmd = pmd_mkyoung(*pmd); pfn = pmd_pfn(*pmd); pfn_valid = true; @@ -1588,25 +1633,14 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; - if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ - if (pmd_young(*pmd)) { - *pmd = pmd_mkold(*pmd); - return 1; - } - - return 0; - } + if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + return stage2_pmdp_test_and_clear_young(pmd); pte = pte_offset_kernel(pmd, gpa); if (pte_none(*pte)) return 0; - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); /* Just a page... */ - return 1; - } - - return 0; + return stage2_ptep_test_and_clear_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) @@ -1618,7 +1652,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; - if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */ + if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ return pmd_young(*pmd); pte = pte_offset_kernel(pmd, gpa); diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3f29887..ffde15f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -96,32 +96,37 @@ SCTLR_EL2_SA | SCTLR_EL2_I) /* TCR_EL2 Registers bits */ -#define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) -#define TCR_EL2_TBI (1 << 20) -#define TCR_EL2_PS (7 << 16) -#define TCR_EL2_PS_40B (2 << 16) -#define TCR_EL2_TG0 (1 << 14) -#define TCR_EL2_SH0 (3 << 12) -#define TCR_EL2_ORGN0 (3 << 10) -#define TCR_EL2_IRGN0 (3 << 8) -#define TCR_EL2_T0SZ 0x3f -#define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ - TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) +#define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) +#define TCR_EL2_TBI (1 << 20) +#define TCR_EL2_PS_SHIFT 16 +#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) +#define TCR_EL2_PS_40B (2 << TCR_EL2_PS_SHIFT) +#define TCR_EL2_TG0_MASK TCR_TG0_MASK +#define TCR_EL2_SH0_MASK TCR_SH0_MASK +#define TCR_EL2_ORGN0_MASK TCR_ORGN0_MASK +#define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK +#define TCR_EL2_T0SZ_MASK 0x3f +#define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \ + TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ #define VTCR_EL2_RES1 (1 << 31) -#define VTCR_EL2_PS_MASK (7 << 16) -#define VTCR_EL2_TG0_MASK (1 << 14) -#define VTCR_EL2_TG0_4K (0 << 14) -#define VTCR_EL2_TG0_64K (1 << 14) -#define VTCR_EL2_SH0_MASK (3 << 12) -#define VTCR_EL2_SH0_INNER (3 << 12) -#define VTCR_EL2_ORGN0_MASK (3 << 10) -#define VTCR_EL2_ORGN0_WBWA (1 << 10) -#define VTCR_EL2_IRGN0_MASK (3 << 8) -#define VTCR_EL2_IRGN0_WBWA (1 << 8) -#define VTCR_EL2_SL0_MASK (3 << 6) -#define VTCR_EL2_SL0_LVL1 (1 << 6) +#define VTCR_EL2_HD (1 << 22) +#define VTCR_EL2_HA (1 << 21) +#define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK +#define VTCR_EL2_TG0_MASK TCR_TG0_MASK +#define VTCR_EL2_TG0_4K TCR_TG0_4K +#define VTCR_EL2_TG0_16K TCR_TG0_16K +#define VTCR_EL2_TG0_64K TCR_TG0_64K +#define VTCR_EL2_SH0_MASK TCR_SH0_MASK +#define VTCR_EL2_SH0_INNER TCR_SH0_INNER +#define VTCR_EL2_ORGN0_MASK TCR_ORGN0_MASK +#define VTCR_EL2_ORGN0_WBWA TCR_ORGN0_WBWA +#define VTCR_EL2_IRGN0_MASK TCR_IRGN0_MASK +#define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA +#define VTCR_EL2_SL0_SHIFT 6 +#define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f #define VTCR_EL2_T0SZ_40B 24 #define VTCR_EL2_VS_SHIFT 19 @@ -137,35 +142,45 @@ * (see hyp-init.S). * * Note that when using 4K pages, we concatenate two first level page tables - * together. + * together. With 16K pages, we concatenate 16 first level page tables. * * The magic numbers used for VTTBR_X in this patch can be found in Tables * D4-23 and D4-25 in ARM DDI 0487A.b. */ + +#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B +#define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ + VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) + #ifdef CONFIG_ARM64_64K_PAGES /* * Stage2 translation configuration: - * 40bits input (T0SZ = 24) * 64kB pages (TG0 = 1) * 2 level page tables (SL = 1) */ -#define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ - VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1) -#define VTTBR_X (38 - VTCR_EL2_T0SZ_40B) -#else +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 38 +#elif defined(CONFIG_ARM64_16K_PAGES) +/* + * Stage2 translation configuration: + * 16kB pages (TG0 = 2) + * 2 level page tables (SL = 1) + */ +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 42 +#else /* 4K */ /* * Stage2 translation configuration: - * 40bits input (T0SZ = 24) * 4kB pages (TG0 = 0) * 3 level page tables (SL = 1) */ -#define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ - VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1) -#define VTTBR_X (37 - VTCR_EL2_T0SZ_40B) +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 37 #endif +#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) +#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) + #define VTTBR_BADDR_SHIFT (VTTBR_X - 1) #define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) #define VTTBR_VMID_SHIFT (UL(48)) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 22732a5..844fe5d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -45,18 +45,6 @@ */ #define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation - * levels in addition to the PGD and potentially the PUD which are - * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2 - * tables use one level of tables less than the kernel. - */ -#ifdef CONFIG_ARM64_64K_PAGES -#define KVM_MMU_CACHE_MIN_PAGES 1 -#else -#define KVM_MMU_CACHE_MIN_PAGES 2 -#endif - #ifdef __ASSEMBLY__ #include <asm/alternative.h> @@ -91,6 +79,8 @@ alternative_endif #define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) +#include <asm/stage2_pgtable.h> + int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); void free_boot_hyp_pgd(void); @@ -121,19 +111,32 @@ static inline void kvm_clean_pmd_entry(pmd_t *pmd) {} static inline void kvm_clean_pte(pte_t *pte) {} static inline void kvm_clean_pte_entry(pte_t *pte) {} -static inline void kvm_set_s2pte_writable(pte_t *pte) +static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { - pte_val(*pte) |= PTE_S2_RDWR; + pte_val(pte) |= PTE_S2_RDWR; + return pte; } -static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) { - pmd_val(*pmd) |= PMD_S2_RDWR; + pmd_val(pmd) |= PMD_S2_RDWR; + return pmd; } static inline void kvm_set_s2pte_readonly(pte_t *pte) { - pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY; + pteval_t pteval; + unsigned long tmp; + + asm volatile("// kvm_set_s2pte_readonly\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_S2_RDWR\n" + " orr %0, %0, %4 // set PTE_S2_RDONLY\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte)) + : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY)); } static inline bool kvm_s2pte_readonly(pte_t *pte) @@ -143,69 +146,12 @@ static inline bool kvm_s2pte_readonly(pte_t *pte) static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) { - pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY; + kvm_set_s2pte_readonly((pte_t *)pmd); } static inline bool kvm_s2pmd_readonly(pmd_t *pmd) { - return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY; -} - - -#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end) -#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end) -#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end) - -/* - * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address - * the entire IPA input range with a single pgd entry, and we would only need - * one pgd entry. Note that in this case, the pgd is actually not used by - * the MMU for Stage-2 translations, but is merely a fake pgd used as a data - * structure for the kernel pgtable macros to work. - */ -#if PGDIR_SHIFT > KVM_PHYS_SHIFT -#define PTRS_PER_S2_PGD_SHIFT 0 -#else -#define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT) -#endif -#define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) - -#define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) - -/* - * If we are concatenating first level stage-2 page tables, we would have less - * than or equal to 16 pointers in the fake PGD, because that's what the - * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS) - * represents the first level for the host, and we add 1 to go to the next - * level (which uses contatenation) for the stage-2 tables. - */ -#if PTRS_PER_S2_PGD <= 16 -#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1) -#else -#define KVM_PREALLOC_LEVEL (0) -#endif - -static inline void *kvm_get_hwpgd(struct kvm *kvm) -{ - pgd_t *pgd = kvm->arch.pgd; - pud_t *pud; - - if (KVM_PREALLOC_LEVEL == 0) - return pgd; - - pud = pud_offset(pgd, 0); - if (KVM_PREALLOC_LEVEL == 1) - return pud; - - BUG_ON(KVM_PREALLOC_LEVEL != 2); - return pmd_offset(pud, 0); -} - -static inline unsigned int kvm_get_hwpgd_size(void) -{ - if (KVM_PREALLOC_LEVEL > 0) - return PTRS_PER_S2_PGD * PAGE_SIZE; - return PTRS_PER_S2_PGD * sizeof(pgd_t); + return kvm_s2pte_readonly((pte_t *)pmd); } static inline bool kvm_page_empty(void *ptr) @@ -214,23 +160,20 @@ static inline bool kvm_page_empty(void *ptr) return page_count(ptr_page) == 1; } -#define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) +#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED -#define kvm_pmd_table_empty(kvm, pmdp) (0) +#define hyp_pmd_table_empty(pmdp) (0) #else -#define kvm_pmd_table_empty(kvm, pmdp) \ - (kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2)) +#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) #endif #ifdef __PAGETABLE_PUD_FOLDED -#define kvm_pud_table_empty(kvm, pudp) (0) +#define hyp_pud_table_empty(pudp) (0) #else -#define kvm_pud_table_empty(kvm, pudp) \ - (kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1)) +#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) #endif - struct kvm; #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5c25b83..936f173 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -208,23 +208,69 @@ #define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_T1SZ_OFFSET) #define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x)) #define TCR_TxSZ_WIDTH 6 -#define TCR_IRGN_NC ((UL(0) << 8) | (UL(0) << 24)) -#define TCR_IRGN_WBWA ((UL(1) << 8) | (UL(1) << 24)) -#define TCR_IRGN_WT ((UL(2) << 8) | (UL(2) << 24)) -#define TCR_IRGN_WBnWA ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_IRGN_MASK ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_ORGN_NC ((UL(0) << 10) | (UL(0) << 26)) -#define TCR_ORGN_WBWA ((UL(1) << 10) | (UL(1) << 26)) -#define TCR_ORGN_WT ((UL(2) << 10) | (UL(2) << 26)) -#define TCR_ORGN_WBnWA ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_ORGN_MASK ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_SHARED ((UL(3) << 12) | (UL(3) << 28)) -#define TCR_TG0_4K (UL(0) << 14) -#define TCR_TG0_64K (UL(1) << 14) -#define TCR_TG0_16K (UL(2) << 14) -#define TCR_TG1_16K (UL(1) << 30) -#define TCR_TG1_4K (UL(2) << 30) -#define TCR_TG1_64K (UL(3) << 30) + +#define TCR_IRGN0_SHIFT 8 +#define TCR_IRGN0_MASK (UL(3) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_NC (UL(0) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBWA (UL(1) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WT (UL(2) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBnWA (UL(3) << TCR_IRGN0_SHIFT) + +#define TCR_IRGN1_SHIFT 24 +#define TCR_IRGN1_MASK (UL(3) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_NC (UL(0) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WBWA (UL(1) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WT (UL(2) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WBnWA (UL(3) << TCR_IRGN1_SHIFT) + +#define TCR_IRGN_NC (TCR_IRGN0_NC | TCR_IRGN1_NC) +#define TCR_IRGN_WBWA (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA) +#define TCR_IRGN_WT (TCR_IRGN0_WT | TCR_IRGN1_WT) +#define TCR_IRGN_WBnWA (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA) +#define TCR_IRGN_MASK (TCR_IRGN0_MASK | TCR_IRGN1_MASK) + + +#define TCR_ORGN0_SHIFT 10 +#define TCR_ORGN0_MASK (UL(3) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_NC (UL(0) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBWA (UL(1) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WT (UL(2) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBnWA (UL(3) << TCR_ORGN0_SHIFT) + +#define TCR_ORGN1_SHIFT 26 +#define TCR_ORGN1_MASK (UL(3) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_NC (UL(0) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WBWA (UL(1) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WT (UL(2) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WBnWA (UL(3) << TCR_ORGN1_SHIFT) + +#define TCR_ORGN_NC (TCR_ORGN0_NC | TCR_ORGN1_NC) +#define TCR_ORGN_WBWA (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA) +#define TCR_ORGN_WT (TCR_ORGN0_WT | TCR_ORGN1_WT) +#define TCR_ORGN_WBnWA (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA) +#define TCR_ORGN_MASK (TCR_ORGN0_MASK | TCR_ORGN1_MASK) + +#define TCR_SH0_SHIFT 12 +#define TCR_SH0_MASK (UL(3) << TCR_SH0_SHIFT) +#define TCR_SH0_INNER (UL(3) << TCR_SH0_SHIFT) + +#define TCR_SH1_SHIFT 28 +#define TCR_SH1_MASK (UL(3) << TCR_SH1_SHIFT) +#define TCR_SH1_INNER (UL(3) << TCR_SH1_SHIFT) +#define TCR_SHARED (TCR_SH0_INNER | TCR_SH1_INNER) + +#define TCR_TG0_SHIFT 14 +#define TCR_TG0_MASK (UL(3) << TCR_TG0_SHIFT) +#define TCR_TG0_4K (UL(0) << TCR_TG0_SHIFT) +#define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT) +#define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) + +#define TCR_TG1_SHIFT 30 +#define TCR_TG1_MASK (UL(3) << TCR_TG1_SHIFT) +#define TCR_TG1_16K (UL(1) << TCR_TG1_SHIFT) +#define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) +#define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) + #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 989fef1..f1d5afd 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -290,6 +290,8 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) +#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) + #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) @@ -530,14 +532,12 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) * Atomic pte/pmd modifications. */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline int __ptep_test_and_clear_young(pte_t *ptep) { pteval_t pteval; unsigned int tmp, res; - asm volatile("// ptep_test_and_clear_young\n" + asm volatile("// __ptep_test_and_clear_young\n" " prfm pstl1strm, %2\n" "1: ldxr %0, %2\n" " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n" @@ -550,6 +550,13 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, return res; } +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + return __ptep_test_and_clear_young(ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h new file mode 100644 index 0000000..2656a0f --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable-nopmd.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ +#define __ARM64_S2_PGTABLE_NOPMD_H_ + +#include <asm/stage2_pgtable-nopud.h> + +#define __S2_PGTABLE_PMD_FOLDED + +#define S2_PMD_SHIFT S2_PUD_SHIFT +#define S2_PTRS_PER_PMD 1 +#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) +#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) + +#define stage2_pud_none(pud) (0) +#define stage2_pud_present(pud) (1) +#define stage2_pud_clear(pud) do { } while (0) +#define stage2_pud_populate(pud, pmd) do { } while (0) +#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) + +#define stage2_pmd_free(pmd) do { } while (0) + +#define stage2_pmd_addr_end(addr, end) (end) + +#define stage2_pud_huge(pud) (0) +#define stage2_pmd_table_empty(pmdp) (0) + +#endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h new file mode 100644 index 0000000..5ee87b5 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable-nopud.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ +#define __ARM64_S2_PGTABLE_NOPUD_H_ + +#define __S2_PGTABLE_PUD_FOLDED + +#define S2_PUD_SHIFT S2_PGDIR_SHIFT +#define S2_PTRS_PER_PUD 1 +#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) + +#define stage2_pgd_none(pgd) (0) +#define stage2_pgd_present(pgd) (1) +#define stage2_pgd_clear(pgd) do { } while (0) +#define stage2_pgd_populate(pgd, pud) do { } while (0) + +#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) + +#define stage2_pud_free(x) do { } while (0) + +#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_table_empty(pmdp) (0) + +#endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h new file mode 100644 index 0000000..8b68099 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * stage2 page table helpers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM64_S2_PGTABLE_H_ +#define __ARM64_S2_PGTABLE_H_ + +#include <asm/pgtable.h> + +/* + * The hardware supports concatenation of up to 16 tables at stage2 entry level + * and we use the feature whenever possible. + * + * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). + * On arm64, the smallest PAGE_SIZE supported is 4k, which means + * (PAGE_SHIFT - 3) > 4 holds for all page sizes. + * This implies, the total number of page table levels at stage2 expected + * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) + * in normal translations(e.g, stage1), since we cannot have another level in + * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). + */ +#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) + +/* + * With all the supported VA_BITs and 40bit guest IPA, the following condition + * is always true: + * + * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS + * + * We base our stage-2 page table walker helpers on this assumption and + * fall back to using the host version of the helper wherever possible. + * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back + * to using the host version, since it is guaranteed it is not folded at host. + * + * If the condition breaks in the future, we can rearrange the host level + * definitions and reuse them for stage2. Till then... + */ +#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS +#error "Unsupported combination of guest IPA and host VA_BITS." +#endif + +/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ +#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) +#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) +#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) + +/* + * The number of PTRS across all concatenated stage2 tables given by the + * number of bits resolved at the initial level. + */ +#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) + +/* + * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation + * levels in addition to the PGD. + */ +#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) + + +#if STAGE2_PGTABLE_LEVELS > 3 + +#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) +#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) + +#define stage2_pgd_none(pgd) pgd_none(pgd) +#define stage2_pgd_clear(pgd) pgd_clear(pgd) +#define stage2_pgd_present(pgd) pgd_present(pgd) +#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(pud) pud_free(NULL, pud) + +#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) + +static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* STAGE2_PGTABLE_LEVELS > 3 */ + + +#if STAGE2_PGTABLE_LEVELS > 2 + +#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) +#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) +#define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) + +#define stage2_pud_none(pud) pud_none(pud) +#define stage2_pud_clear(pud) pud_clear(pud) +#define stage2_pud_present(pud) pud_present(pud) +#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(pud) pud_huge(pud) +#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) + +static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* STAGE2_PGTABLE_LEVELS > 2 */ + +#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) + +#if STAGE2_PGTABLE_LEVELS == 2 +#include <asm/stage2_pgtable-nopmd.h> +#elif STAGE2_PGTABLE_LEVELS == 3 +#include <asm/stage2_pgtable-nopud.h> +#endif + + +#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) + +static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index de7450d..aa2e34e 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM_ARM_VGIC_V3 config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF - depends on !ARM64_16K_PAGES select MMU_NOTIFIER select PREEMPT_NOTIFIERS select ANON_INODES diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index bcbe761..b81f409 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -66,6 +66,14 @@ u32 __hyp_text __init_stage2_translation(void) val |= 64 - (parange > 40 ? 40 : parange); /* + * Check the availability of Hardware Access Flag / Dirty Bit + * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. + */ + tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; + if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp) + val |= VTCR_EL2_HA; + + /* * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS * bit in VTCR_EL2. */ |