From 178a787502123b01499c5a4617b94bb69ad49dd5 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Mon, 1 Feb 2016 11:14:15 +1100 Subject: vfio: Enable VFIO device for powerpc ec53500f "kvm: Add VFIO device" added a special KVM pseudo-device which is used to handle any necessary interactions between KVM and VFIO. Currently that device is built on x86 and ARM, but not powerpc, although powerpc does support both KVM and VFIO. This makes things awkward in userspace. Currently qemu prints an alarming error message if you attempt to use VFIO and it can't initialize the KVM VFIO device. We don't want to remove the warning, because lack of the KVM VFIO device could mean coherency problems on x86. On powerpc, however, the error is harmless but looks disturbing, and a test based on host architecture in qemu would be ugly, and break if we do need the KVM VFIO device for something important in future. There's nothing preventing the KVM VFIO device from being built for powerpc, so this patch turns it on. It won't actually do anything, since we don't define any of the arch_*() hooks, but it will make qemu happy and we can extend it in future if we need to. Signed-off-by: David Gibson Reviewed-by: Eric Auger Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 0570eef..7f7b6d8 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o + $(KVM)/eventfd.o $(KVM)/vfio.o CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I. -- cgit v0.10.2 From e9ab1a1cafb7911df1550a285f2f733ea5920f55 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:03 +1100 Subject: powerpc: Make vmalloc_to_phys() public This makes vmalloc_to_phys() public as there will be another user (KVM in-kernel VFIO acceleration) for it soon.
As this new user can be compiled as a module, this exports the symbol. As a little optimization, this changes the helper to call vmalloc_to_pfn() instead of vmalloc_to_page() as the size of the struct page may not be power-of-two aligned which will make gcc use multiply instructions instead of shifts. Signed-off-by: Alexey Kardashevskiy Acked-by: Michael Ellerman Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index ac9fb11..47897a3 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -78,6 +78,9 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); } + +unsigned long vmalloc_to_phys(void *vmalloc_addr); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 83dfd79..de37ff4 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -243,3 +243,11 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) } #endif /* CONFIG_DEBUG_VM */ +unsigned long vmalloc_to_phys(void *va) +{ + unsigned long pfn = vmalloc_to_pfn(va); + + BUG_ON(!pfn); + return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); +} +EXPORT_SYMBOL_GPL(vmalloc_to_phys); diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 9f9dfda..3b09ecf 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -493,14 +493,6 @@ static size_t event_to_attr_ct(struct hv_24x7_event_data *event) } } -static unsigned long vmalloc_to_phys(void *v) -{ - struct page *p = vmalloc_to_page(v); - - BUG_ON(!p); - return page_to_phys(p) + offset_in_page(v); -} - /* */ struct event_uniq { struct rb_node node; -- cgit v0.10.2 From fcbb2ce672848481275c1f014ad44ccd1e43a7a2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:04 +1100 Subject: 
KVM: PPC: Rework H_PUT_TCE/H_GET_TCE handlers This reworks the existing H_PUT_TCE/H_GET_TCE handlers to have following patches applied nicer. This moves the ioba boundaries check to a helper and adds a check for least bits which have to be zeros. The patch is pretty mechanical (only check for least ioba bits is added) so no change in behaviour is expected. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 89e96b3..f29ba2c 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -35,71 +35,104 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +/* + * Finds a TCE table descriptor by LIOBN. + * + * WARNING: This will be called in real or virtual mode on HV KVM and virtual + * mode on PR KVM + */ +static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, + unsigned long liobn) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) + if (stt->liobn == liobn) + return stt; + + return NULL; +} + +/* + * Validates IO address. 
+ * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages) +{ + unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1; + unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K; + unsigned long size = stt->window_size >> IOMMU_PAGE_SHIFT_4K; + + if ((ioba & mask) || (idx + npages > size) || (idx + npages < idx)) + return H_PARAMETER; + + return H_SUCCESS; +} + /* WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM */ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvm *kvm = vcpu->kvm; - struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; - - /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */ - /* liobn, stt, stt->window_size); */ - if (ioba >= stt->window_size) - return H_PARAMETER; - - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); - - /* FIXME: Need to validate the TCE itself */ - /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ - tbl[idx % TCES_PER_PAGE] = tce; - return H_SUCCESS; - } - } - - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; + + idx = ioba >> SPAPR_TCE_SHIFT; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + /* FIXME: Need to validate the TCE itself */ + /* udbg_printf("tce @ %p\n", &tbl[idx % 
TCES_PER_PAGE]); */ + tbl[idx % TCES_PER_PAGE] = tce; + + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvm *kvm = vcpu->kvm; - struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; + if (!stt) + return H_TOO_HARD; - if (ioba >= stt->window_size) - return H_PARAMETER; + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + idx = ioba >> SPAPR_TCE_SHIFT; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); - vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - return H_SUCCESS; - } - } + vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); -- cgit v0.10.2 From 366baf28ee3fc22dea504a0bddf8edd1e9bcee70 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:05 +1100 Subject: KVM: PPC: Use RCU for arch.spapr_tce_tables At the moment only spapr_tce_tables updates are protected against races but not lookups. This fixes missing protection by using RCU for the list. As lookups also happen in real mode, this uses list_for_each_entry_lockless() (which is expected not to access any vmalloc'd memory). This converts release_spapr_tce_table() to a RCU scheduled handler. 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9d08d8c..ffdbc2d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -183,6 +183,7 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 window_size; + struct rcu_head rcu; struct page *pages[0]; }; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 638c6d9..b34220d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) { #ifdef CONFIG_PPC64 - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables); INIT_LIST_HEAD(&kvm->arch.rtas_tokens); #endif diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc..9526c34 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -45,19 +45,16 @@ static long kvmppc_stt_npages(unsigned long window_size) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } -static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +static void release_spapr_tce_table(struct rcu_head *head) { - struct kvm *kvm = stt->kvm; + struct kvmppc_spapr_tce_table *stt = container_of(head, + struct kvmppc_spapr_tce_table, rcu); int i; - mutex_lock(&kvm->lock); - list_del(&stt->list); for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) __free_page(stt->pages[i]); - kfree(stt); - mutex_unlock(&kvm->lock); - kvm_put_kvm(kvm); + kfree(stt); } static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -88,7 +85,12 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; - release_spapr_tce_table(stt); + list_del_rcu(&stt->list); + + kvm_put_kvm(stt->kvm); + + call_rcu(&stt->rcu, release_spapr_tce_table); + return 0; 
} @@ -131,7 +133,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kvm_get_kvm(kvm); mutex_lock(&kvm->lock); - list_add(&stt->list, &kvm->arch.spapr_tce_tables); + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); mutex_unlock(&kvm->lock); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index f29ba2c..124d692 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -51,7 +51,7 @@ static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) + list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) if (stt->liobn == liobn) return stt; -- cgit v0.10.2 From f8626985c7c2485c423ce9f448028f81535b0ecc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:06 +1100 Subject: KVM: PPC: Account TCE-containing pages in locked_vm At the moment pages used for TCE tables (in addition to pages addressed by TCEs) are not counted in locked_vm counter so a malicious userspace tool can call ioctl(KVM_CREATE_SPAPR_TCE) as many times as RLIMIT_NOFILE and lock a lot of memory. This adds counting for pages used for TCE tables. This counts the number of pages required for a table plus pages for the kvmppc_spapr_tce_table struct (TCE table descriptor) itself. This changes release_spapr_tce_table() to store @npages on stack to avoid calling kvmppc_stt_npages() in the loop (tiny optimization, probably). This does not change the amount of used memory. 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 9526c34..1a1e14f 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -39,19 +39,65 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) -static long kvmppc_stt_npages(unsigned long window_size) +static unsigned long kvmppc_tce_pages(unsigned long window_size) { return ALIGN((window_size >> SPAPR_TCE_SHIFT) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } +static unsigned long kvmppc_stt_pages(unsigned long tce_pages) +{ + unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) + + (tce_pages * sizeof(struct page *)); + + return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE; +} + +static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) +{ + long ret = 0; + + if (!current || !current->mm) + return ret; /* process exited */ + + down_write(&current->mm->mmap_sem); + + if (inc) { + unsigned long locked, lock_limit; + + locked = current->mm->locked_vm + stt_pages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + current->mm->locked_vm += stt_pages; + } else { + if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm)) + stt_pages = current->mm->locked_vm; + + current->mm->locked_vm -= stt_pages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, + inc ? '+' : '-', + stt_pages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), + ret ?
" - exceeded" : ""); + + up_write(&current->mm->mmap_sem); + + return ret; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, struct kvmppc_spapr_tce_table, rcu); int i; + unsigned long npages = kvmppc_tce_pages(stt->window_size); - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + for (i = 0; i < npages; i++) __free_page(stt->pages[i]); kfree(stt); @@ -62,7 +108,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; struct page *page; - if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) + if (vmf->pgoff >= kvmppc_tce_pages(stt->window_size)) return VM_FAULT_SIGBUS; page = stt->pages[vmf->pgoff]; @@ -89,6 +135,8 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) kvm_put_kvm(stt->kvm); + kvmppc_account_memlimit( + kvmppc_stt_pages(kvmppc_tce_pages(stt->window_size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -103,7 +151,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; - long npages; + unsigned long npages; int ret = -ENOMEM; int i; @@ -113,7 +161,12 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - npages = kvmppc_stt_npages(args->window_size); + npages = kvmppc_tce_pages(args->window_size); + ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); + if (ret) { + stt = NULL; + goto fail; + } stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); -- cgit v0.10.2 From 462ee11e58c96b81707d98fb1d02a8a3e84290ce Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:07 +1100 Subject: KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K SPAPR_TCE_SHIFT is used in few places only and since IOMMU_PAGE_SHIFT_4K can be easily used instead, remove SPAPR_TCE_SHIFT.
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 2aa79c8..7529aab 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) } #endif -#define SPAPR_TCE_SHIFT 12 - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 1a1e14f..84993d1 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -36,12 +36,13 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) static unsigned long kvmppc_tce_pages(unsigned long window_size) { - return ALIGN((window_size >> SPAPR_TCE_SHIFT) + return ALIGN((window_size >> IOMMU_PAGE_SHIFT_4K) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 124d692..0ce4ffb 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -99,7 +99,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> SPAPR_TCE_SHIFT; + idx = ioba >> IOMMU_PAGE_SHIFT_4K; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); @@ -127,7 +127,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> SPAPR_TCE_SHIFT; + idx = ioba >> IOMMU_PAGE_SHIFT_4K; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- cgit v0.10.2 From 5ee7af18642ce38c79b35927872f13d292cc3e27 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:08 +1100 Subject: KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers 
Upcoming multi-tce support (H_PUT_TCE_INDIRECT/H_STUFF_TCE hypercalls) will validate TCE (not to have unexpected bits) and IO address (to be within the DMA window boundaries). This introduces helpers to validate TCE and IO address. The helpers are exported as they compile into vmlinux (to work in realmode) and will be used later by KVM kernel module in virtual mode. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2241d53..9513911 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -166,6 +166,10 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages); +extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, + unsigned long tce); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 0ce4ffb..b608fdd 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -36,6 +36,7 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) @@ -64,7 +65,7 @@ static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, * WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM */ -static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, +long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages) { unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1; @@ -76,6 +77,79 @@ static long 
kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); + +/* + * Validates TCE address. + * At the moment flags and page mask are validated. + * As the host kernel does not access those addresses (just puts them + * to the table and user space is supposed to process them), we can skip + * checking other things (such as TCE is a guest RAM address or the page + * was actually allocated). + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +{ + unsigned long mask = + ~(IOMMU_PAGE_MASK_4K | TCE_PCI_WRITE | TCE_PCI_READ); + + if (tce & mask) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_validate); + +/* Note on the use of page_address() in real mode, + * + * It is safe to use page_address() in real mode on ppc64 because + * page_address() is always defined as lowmem_page_address() + * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic + * operation and does not access page struct. + * + * Theoretically page_address() could be defined different + * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL + * would have to be enabled. + * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, + * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only + * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP + * is not expected to be enabled on ppc32, page_address() + * is safe for ppc32 as well. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static u64 *kvmppc_page_address(struct page *page) +{ +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) +#error TODO: fix to avoid page_address() here +#endif + return (u64 *) page_address(page); +} + +/* + * Handles TCE requests for emulated devices. + * Puts guest TCE values to the table and expects user space to convert them. 
+ * Called in both real and virtual modes. + * Cannot fail so kvmppc_tce_validate must be called before it. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, + unsigned long idx, unsigned long tce) +{ + struct page *page; + u64 *tbl; + + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = kvmppc_page_address(page); + + tbl[idx % TCES_PER_PAGE] = tce; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_put); /* WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM @@ -85,9 +159,6 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, { struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); long ret; - unsigned long idx; - struct page *page; - u64 *tbl; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -99,13 +170,11 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> IOMMU_PAGE_SHIFT_4K; - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + return ret; - /* FIXME: Need to validate the TCE itself */ - /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ - tbl[idx % TCES_PER_PAGE] = tce; + kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce); return H_SUCCESS; } -- cgit v0.10.2 From d3695aa4f452bc09c834a5010484f65fca37d87c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:09 +1100 Subject: KVM: PPC: Add support for multiple-TCE hcalls This adds real and virtual mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for user space emulated devices such as IBMVIO devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition between kernel and user space. 
The current implementation of kvmppc_h_stuff_tce() allows it to be executed in both real and virtual modes so there is one helper. The kvmppc_rm_h_put_tce_indirect() needs to translate the guest address to the host address and since the translation is different, there are 2 helpers - one for each mode. This implements the KVM_CAP_PPC_MULTITCE capability. When present, the kernel will try handling H_PUT_TCE_INDIRECT and H_STUFF_TCE if these are enabled by the userspace via KVM_CAP_PPC_ENABLE_HCALL. If they can not be handled by the kernel, they are passed on to the user space. The user space still has to have an implementation for these. Both HV and PR-style KVM are supported. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 07e4cdf..da39435 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3035,6 +3035,31 @@ Returns: 0 on success, -1 on error Queues an SMI on the thread's vcpu. +4.97 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significantly accelerates DMA operations for PPC KVM guests. +User space should expect that its handlers for these hypercalls +are not going to be called if user space previously registered LIOBN +in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is +present in the "ibm,hypertas-functions" device-tree property. + +The hypercalls mentioned above may or may not be processed successfully +in the kernel based fast path.
If they can not be handled by the kernel, +they will get passed on to user space. So user space still has to have +an implementation for these despite the in kernel acceleration. + +This capability is always enabled. + 5. The kvm_run structure ------------------------ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9513911..4cadee5 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -166,12 +166,24 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern struct kvmppc_spapr_tce_table *kvmppc_find_table( + struct kvm_vcpu *vcpu, unsigned long liobn); extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages); extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, unsigned long tce); +extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, + unsigned long *ua, unsigned long **prmap); +extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, + unsigned long idx, unsigned long tce); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); +extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages); +extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba); extern struct page *kvm_alloc_hpt(unsigned long nr_pages); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 84993d1..94c8e7e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. 
* Copyright 2011 David Gibson, IBM Corporation + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation */ #include @@ -37,8 +38,7 @@ #include #include #include - -#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +#include static unsigned long kvmppc_tce_pages(unsigned long window_size) { @@ -204,3 +204,59 @@ fail: } return ret; } + +long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS, idx; + unsigned long entry, ua = 0; + u64 __user *tces, tce; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> IOMMU_PAGE_SHIFT_4K; + /* + * SPAPR spec says that the maximum size of the list is 512 TCEs + * so the whole table fits in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tces = (u64 __user *) ua; + + for (i = 0; i < npages; ++i) { + if (get_user(tce, tces + i)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tce = be64_to_cpu(tce); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; + + kvmppc_tce_put(stt, entry + i, tce); + } + +unlock_exit: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index b608fdd..0486aa2 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. 
* Copyright 2011 David Gibson, IBM Corporation + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation */ #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +39,7 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) @@ -46,7 +49,7 @@ * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, unsigned long liobn) { struct kvm *kvm = vcpu->kvm; @@ -58,6 +61,7 @@ static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, return NULL; } +EXPORT_SYMBOL_GPL(kvmppc_find_table); /* * Validates IO address. @@ -151,9 +155,29 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, } EXPORT_SYMBOL_GPL(kvmppc_tce_put); -/* WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM - */ +long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, + unsigned long *ua, unsigned long **prmap) +{ + unsigned long gfn = gpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + memslot = search_memslots(kvm_memslots(kvm), gfn); + if (!memslot) + return -EINVAL; + + *ua = __gfn_to_hva_memslot(memslot, gfn) | + (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (prmap) + *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; +#endif + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { @@ -180,6 +204,122 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); +static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, + unsigned long ua, unsigned long *phpa) +{ + pte_t *ptep, pte; + unsigned shift = 0; + + ptep = 
__find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift); + if (!ptep || !pte_present(*ptep)) + return -ENXIO; + pte = *ptep; + + if (!shift) + shift = PAGE_SHIFT; + + /* Avoid handling anything potentially complicated in realmode */ + if (shift > PAGE_SHIFT) + return -EAGAIN; + + if (!pte_young(pte)) + return -EAGAIN; + + *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) | + (ua & ~PAGE_MASK); + + return 0; +} + +long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS; + unsigned long tces, entry, ua = 0; + unsigned long *rmap = NULL; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> IOMMU_PAGE_SHIFT_4K; + /* + * The spec says that the maximum size of the list is 512 TCEs + * so the whole table addressed resides in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; + + rmap = (void *) vmalloc_to_phys(rmap); + + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. 
+ */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; + + kvmppc_tce_put(stt, entry + i, tce); + } + +unlock_exit: + unlock_rmap(rmap); + + return ret; +} + +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + /* Check permission bits only to allow userspace poison TCE for debug */ + if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) + return H_PARAMETER; + + for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) + kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce_value); + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); + long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { @@ -205,3 +345,5 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); + +#endif /* KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index baeddb0..33b491e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -768,7 +768,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (kvmppc_xics_enabled(vcpu)) { ret = kvmppc_xics_hcall(vcpu, req); break; - } /* fallthrough */ + } + return RESUME_HOST; + case H_PUT_TCE: + ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_PUT_TCE_INDIRECT: + ret = kvmppc_h_put_tce_indirect(vcpu, 
kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_STUFF_TCE: + ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; default: return RESUME_HOST; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6ee26de..ed16182 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2006,8 +2006,8 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table - .long 0 /* 0x138 */ - .long 0 /* 0x13c */ + .long DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f2c75a1..02176fd 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; + + rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba, + tce, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce_value = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; 
+ + rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { long rc = kvmppc_xics_hcall(vcpu, cmd); @@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) return kvmppc_h_pr_bulk_remove(vcpu); case H_PUT_TCE: return kvmppc_h_pr_put_tce(vcpu); + case H_PUT_TCE_INDIRECT: + return kvmppc_h_pr_put_tce_indirect(vcpu); + case H_STUFF_TCE: + return kvmppc_h_pr_stuff_tce(vcpu); case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a3b182d..69f897d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -569,6 +569,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_GET_SMMU_INFO: r = 1; break; + case KVM_CAP_SPAPR_MULTITCE: + r = 1; + break; #endif default: r = 0; -- cgit v0.10.2 From bd7f561f76563f0b21701628874d8adc863b0c25 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:03 -0600 Subject: powerpc/smp: Support more IPI messages This patch increases the number of demuxed messages for a controller with a single ipi to 8 for 64-bit systems. This is required because we want to use the IPI mechanism to send messages from a CPU running in KVM real mode in a guest to a CPU in the host to take some action. Currently, we only support 4 messages and all 4 are already taken. Define a fifth message PPC_MSG_RM_HOST_ACTION for this purpose. 
Signed-off-by: Suresh Warrier Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 825663c..9ef9c37 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -114,6 +114,9 @@ extern int cpu_to_core_id(int cpu); #define PPC_MSG_TICK_BROADCAST 2 #define PPC_MSG_DEBUGGER_BREAK 3 +/* This is only used by the powernv kernel */ +#define PPC_MSG_RM_HOST_ACTION 4 + /* for irq controllers that have dedicated ipis per message (4) */ extern int smp_request_message_ipi(int virq, int message); extern const char *smp_ipi_name[]; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ec9ec20..a53a130 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -206,7 +206,7 @@ int smp_request_message_ipi(int virq, int msg) #ifdef CONFIG_PPC_SMP_MUXED_IPI struct cpu_messages { - int messages; /* current messages */ + long messages; /* current messages */ unsigned long data; /* data for cause ipi */ }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message); @@ -236,15 +236,15 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) } #ifdef __BIG_ENDIAN__ -#define IPI_MESSAGE(A) (1 << (24 - 8 * (A))) +#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A))) #else -#define IPI_MESSAGE(A) (1 << (8 * (A))) +#define IPI_MESSAGE(A) (1uL << (8 * (A))) #endif irqreturn_t smp_ipi_demux(void) { struct cpu_messages *info = this_cpu_ptr(&ipi_message); - unsigned int all; + unsigned long all; mb(); /* order any irq clear */ -- cgit v0.10.2 From 31639c77e0a7f9f742c813ae697f337b44981ed2 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:04 -0600 Subject: powerpc/smp: Add smp_muxed_ipi_set_message smp_muxed_ipi_message_pass() invokes smp_ops->cause_ipi, which uses an ioremapped address to access registers on the XICS interrupt controller to cause the IPI. 
Because of this real mode callers cannot call smp_muxed_ipi_message_pass() for IPI messaging. This patch creates a separate function smp_muxed_ipi_set_message just to set the IPI message without the cause_ipi routine. After calling this function to set the IPI message, real mode callers must cause the IPI by writing to the XICS registers directly. As part of this, we also change smp_muxed_ipi_message_pass to call smp_muxed_ipi_set_message to set the message instead of doing it directly inside the routine. Signed-off-by: Suresh Warrier Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 9ef9c37..78083ed 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -124,6 +124,7 @@ extern const char *smp_ipi_name[]; /* for irq controllers with only a single ipi */ extern void smp_muxed_ipi_set_data(int cpu, unsigned long data); extern void smp_muxed_ipi_message_pass(int cpu, int msg); +extern void smp_muxed_ipi_set_message(int cpu, int msg); extern irqreturn_t smp_ipi_demux(void); void smp_init_pSeries(void); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index a53a130..e222efc 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -218,7 +218,7 @@ void smp_muxed_ipi_set_data(int cpu, unsigned long data) info->data = data; } -void smp_muxed_ipi_message_pass(int cpu, int msg) +void smp_muxed_ipi_set_message(int cpu, int msg) { struct cpu_messages *info = &per_cpu(ipi_message, cpu); char *message = (char *)&info->messages; @@ -228,6 +228,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) */ smp_mb(); message[msg] = 1; +} + +void smp_muxed_ipi_message_pass(int cpu, int msg) +{ + struct cpu_messages *info = &per_cpu(ipi_message, cpu); + + smp_muxed_ipi_set_message(cpu, msg); /* * cause_ipi functions are required to include a full barrier * before doing whatever causes the IPI. 
-- cgit v0.10.2 From ec13e9b6b13d66c54951fec7f1158bf85f68fecd Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:05 -0600 Subject: powerpc/xics: Add icp_native_cause_ipi_rm Function to cause an IPI by directly updating the MFRR register in the XICS. The function is meant for real-mode callers since they cannot use the smp_ops->cause_ipi function which uses an ioremapped address. Normal usage is for the KVM real mode code to set the IPI message using smp_muxed_ipi_set_message and then invoke icp_native_cause_ipi_rm to cause the actual IPI. The function requires kvm_hstate.xics_phys to have been initialized with the physical address of XICS. Signed-off-by: Suresh Warrier Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index 0e25bdb..2546048 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -30,6 +30,7 @@ #ifdef CONFIG_PPC_ICP_NATIVE extern int icp_native_init(void); extern void icp_native_flush_interrupt(void); +extern void icp_native_cause_ipi_rm(int cpu); #else static inline int icp_native_init(void) { return -ENODEV; } #endif diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index eae3265..afdf62f 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -159,6 +159,27 @@ static void icp_native_cause_ipi(int cpu, unsigned long data) icp_native_set_qirr(cpu, IPI_PRIORITY); } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void icp_native_cause_ipi_rm(int cpu) +{ + /* + * Currently not used to send IPIs to another CPU + * on the same core. Only caller is KVM real mode. + * Need the physical address of the XICS to be + * previously saved in kvm_hstate in the paca. + */ + unsigned long xics_phys; + + /* + * Just like the cause_ipi functions, it is required to + * include a full barrier (out8 includes a sync) before + * causing the IPI.
+ */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY); +} +#endif + /* * Called when an interrupt is received on an off-line CPU to * clear the interrupt, so that the CPU can go back to nap mode. -- cgit v0.10.2 From 79b6c247e9afe35714c1f83cfcecf40a438ca4a4 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:06 -0600 Subject: KVM: PPC: Book3S HV: Host-side RM data structures This patch defines the data structures to support the setting up of host side operations while running in real mode in the guest, and also the functions to allocate and free it. The operations are for now limited to virtual XICS operations. Currently, we have only defined one operation in the data structure: - Wake up a VCPU sleeping in the host when it receives a virtual interrupt The operations are assigned at the core level because PowerKVM requires that the host run in SMT off mode. For each core, we will need to manage its state atomically - where the state is defined by: 1. Is the core running in the host? 2. Is there a Real Mode (RM) operation pending on the host? Currently, core state is only managed at the whole-core level even when the system is in split-core mode. This just limits the number of free or "available" cores in the host to perform any host-side operations. The kvmppc_host_rm_core.rm_data allows any data to be passed by KVM in real mode to the host core along with the operation to be performed. The kvmppc_host_rm_ops structure is allocated the very first time a guest VM is started. Initial core state is also set - all online cores are in the host. This structure is never deleted, not even when there are no active guests. However, it needs to be freed when the module is unloaded because the kvmppc_host_rm_ops_hv can contain function pointers to kvm-hv.ko functions for the different supported host operations. 
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4cadee5..ded8dda 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -453,6 +453,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; } +extern void kvmppc_alloc_host_rm_ops(void); +extern void kvmppc_free_host_rm_ops(void); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -462,6 +464,8 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); #else +static inline void kvmppc_alloc_host_rm_ops(void) {}; +static inline void kvmppc_free_host_rm_ops(void) {}; static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } @@ -475,6 +479,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } #endif +/* + * Host-side operations we want to set up while running in real + * mode in the guest operating on the xics. + * Currently only VCPU wakeup is supported. 
+ */ + +union kvmppc_rm_state { + unsigned long raw; + struct { + u32 in_host; + u32 rm_action; + }; +}; + +struct kvmppc_host_rm_core { + union kvmppc_rm_state rm_state; + void *rm_data; + char pad[112]; +}; + +struct kvmppc_host_rm_ops { + struct kvmppc_host_rm_core *rm_core; + void (*vcpu_kick)(struct kvm_vcpu *vcpu); +}; + +extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; + static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_BOOKE_HV diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 33b491e..8b3332f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3008,6 +3008,73 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +#ifdef CONFIG_KVM_XICS +/* + * Allocate a per-core structure for managing state about which cores are + * running in the host versus the guest and for exchanging data between + * real mode KVM and CPU running in the host. + * This is only done for the first VM. + * The allocated structure stays even if all VMs have stopped. + * It is only freed when the kvm-hv module is unloaded. + * It's OK for this routine to fail, we just don't support host + * core operations like redirecting H_IPI wakeups. + */ +void kvmppc_alloc_host_rm_ops(void) +{ + struct kvmppc_host_rm_ops *ops; + unsigned long l_ops; + int cpu, core; + int size; + + /* Not the first time here ? 
*/ + if (kvmppc_host_rm_ops_hv != NULL) + return; + + ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); + if (!ops) + return; + + size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core); + ops->rm_core = kzalloc(size, GFP_KERNEL); + + if (!ops->rm_core) { + kfree(ops); + return; + } + + for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { + if (!cpu_online(cpu)) + continue; + + core = cpu >> threads_shift; + ops->rm_core[core].rm_state.in_host = 1; + } + + /* + * Make the contents of the kvmppc_host_rm_ops structure visible + * to other CPUs before we assign it to the global variable. + * Do an atomic assignment (no locks used here), but if someone + * beats us to it, just free our copy and return. + */ + smp_wmb(); + l_ops = (unsigned long) ops; + + if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { + kfree(ops->rm_core); + kfree(ops); + } +} + +void kvmppc_free_host_rm_ops(void) +{ + if (kvmppc_host_rm_ops_hv) { + kfree(kvmppc_host_rm_ops_hv->rm_core); + kfree(kvmppc_host_rm_ops_hv); + kvmppc_host_rm_ops_hv = NULL; + } +} +#endif + static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; @@ -3020,6 +3087,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) return -ENOMEM; kvm->arch.lpid = lpid; + kvmppc_alloc_host_rm_ops(); + /* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, @@ -3253,6 +3322,7 @@ static int kvmppc_book3s_init_hv(void) static void kvmppc_book3s_exit_hv(void) { + kvmppc_free_host_rm_ops(); kvmppc_hv_ops = NULL; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fd7006b..5f0380d 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap) kvmhv_interrupt_vcore(vc, ee); } } + +struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; +EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); -- cgit v0.10.2 From 
b8e6a87c82927ed9ccf0f3ee42946a41cb9d75fe Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:07 -0600 Subject: KVM: PPC: Book3S HV: Manage core host state Update the core host state in kvmppc_host_rm_ops whenever the primary thread of the core enters the guest or returns back. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8b3332f..542ec97 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2303,6 +2303,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) } /* + * Clear core from the list of active host cores as we are about to + * enter the guest. Only do this if it is the primary thread of the + * core (not if a subcore) that is entering the guest. + */ +static inline void kvmppc_clear_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + /* + * Memory barrier can be omitted here as we will do a smp_wmb() + * later in kvmppc_start_thread and we need ensure that state is + * visible to other CPUs only after we enter guest. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; +} + +/* + * Advertise this core as an active host core since we exited the guest + * Only need to do this if it is the primary thread of the core that is + * exiting. + */ +static inline void kvmppc_set_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + + /* + * Memory barrier can be omitted here because we do a spin_unlock + * immediately after this which provides the memory barrier. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; +} + +/* * Run a set of guest threads on a physical core. * Called with vc->lock held. 
*/ @@ -2414,6 +2454,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } } + kvmppc_clear_host_core(pcpu); + /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { @@ -2510,6 +2552,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_ipi_thread(pcpu + i); } + kvmppc_set_host_core(pcpu); + spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ -- cgit v0.10.2 From 6f3bb80944148012cbac1f98da249f591cbcae43 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:08 -0600 Subject: KVM: PPC: Book3S HV: kvmppc_host_rm_ops - handle offlining CPUs The kvmppc_host_rm_ops structure keeps track of which cores are in the host by maintaining a bitmask of active/runnable online CPUs that have not entered the guest. This patch adds support to manage the bitmask when a CPU is offlined or onlined in the host. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 542ec97..16304d2 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3053,6 +3053,36 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_XICS +static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + kvmppc_set_host_core(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + kvmppc_clear_host_core(cpu); + break; +#endif + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block kvmppc_cpu_notifier = { + .notifier_call = kvmppc_cpu_notify, +}; + /* * Allocate a per-core structure for managing state about which cores are * running in the host versus the guest and for exchanging data between @@ -3086,6
+3116,8 @@ void kvmppc_alloc_host_rm_ops(void) return; } + get_online_cpus(); + for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { if (!cpu_online(cpu)) continue; @@ -3104,14 +3136,21 @@ void kvmppc_alloc_host_rm_ops(void) l_ops = (unsigned long) ops; if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { + put_online_cpus(); kfree(ops->rm_core); kfree(ops); + return; } + + register_cpu_notifier(&kvmppc_cpu_notifier); + + put_online_cpus(); } void kvmppc_free_host_rm_ops(void) { if (kvmppc_host_rm_ops_hv) { + unregister_cpu_notifier(&kvmppc_cpu_notifier); kfree(kvmppc_host_rm_ops_hv->rm_core); kfree(kvmppc_host_rm_ops_hv); kvmppc_host_rm_ops_hv = NULL; -- cgit v0.10.2 From 0c2a66062470cd1f6d11ae6db31059f59d3f725f Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:09 -0600 Subject: KVM: PPC: Book3S HV: Host side kick VCPU when poked by real-mode KVM This patch adds the support for the kick VCPU operation for kvmppc_host_rm_ops. The kvmppc_xics_ipi_action() function provides the function to be invoked for a host side operation when poked by the real mode KVM. This is initiated by KVM by sending an IPI to any free host core. KVM real mode must set the rm_action to XICS_RM_KICK_VCPU and rm_data to point to the VCPU to be woken up before sending the IPI. Note that we have allocated one kvmppc_host_rm_core structure per core. The above values need to be set in the structure corresponding to the core to which the IPI will be sent. 
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ded8dda..bc14e9e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -463,6 +463,7 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xics_ipi_action(void); #else static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 16304d2..c3c7310 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3126,6 +3126,8 @@ void kvmppc_alloc_host_rm_ops(void) ops->rm_core[core].rm_state.in_host = 1; } + ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv; + /* * Make the contents of the kvmppc_host_rm_ops structure visible * to other CPUs before we assign it to the global variable. 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 24f5807..43ffbfe 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "book3s_xics.h" @@ -623,3 +624,38 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) bail: return check_too_hard(xics, icp); } + +/* --- Non-real mode XICS-related built-in routines --- */ + +/** + * Host Operations poked by RM KVM + */ +static void rm_host_ipi_action(int action, void *data) +{ + switch (action) { + case XICS_RM_KICK_VCPU: + kvmppc_host_rm_ops_hv->vcpu_kick(data); + break; + default: + WARN(1, "Unexpected rm_action=%d data=%p\n", action, data); + break; + } + +} + +void kvmppc_xics_ipi_action(void) +{ + int core; + unsigned int cpu = smp_processor_id(); + struct kvmppc_host_rm_core *rm_corep; + + core = cpu >> threads_shift; + rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core]; + + if (rm_corep->rm_data) { + rm_host_ipi_action(rm_corep->rm_state.rm_action, + rm_corep->rm_data); + rm_corep->rm_data = NULL; + rm_corep->rm_state.rm_action = 0; + } +} -- cgit v0.10.2 From e17769eb8c897101e2c6df62ec397e450b6e53b4 Mon Sep 17 00:00:00 2001 From: "Suresh E. Warrier" Date: Mon, 21 Dec 2015 16:22:51 -0600 Subject: KVM: PPC: Book3S HV: Send IPI to host core to wake VCPU This patch adds support to real-mode KVM to search for a core running in the host partition and send it an IPI message with VCPU to be woken. This avoids having to switch to the host partition to complete an H_IPI hypercall when the VCPU which is the target of the H_IPI is not loaded (is not running in the guest). The patch also includes the support in the IPI handler running in the host to do the wakeup by calling kvmppc_xics_ipi_action for the PPC_MSG_RM_HOST_ACTION message.
When a guest is being destroyed, we need to ensure that there are no pending IPIs waiting to wake up a VCPU before we free the VCPUs of the guest. This is accomplished by: - Forces a PPC_MSG_CALL_FUNCTION IPI to be completed by all CPUs before freeing any VCPUs in kvm_arch_destroy_vm(). - Any PPC_MSG_RM_HOST_ACTION messages must be executed first before any other PPC_MSG_CALL_FUNCTION messages. Signed-off-by: Suresh Warrier Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index e222efc..cb8be5d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -257,6 +257,17 @@ irqreturn_t smp_ipi_demux(void) do { all = xchg(&info->messages, 0); +#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) + /* + * Must check for PPC_MSG_RM_HOST_ACTION messages + * before PPC_MSG_CALL_FUNCTION messages because when + * a VM is destroyed, we call kick_all_cpus_sync() + * to ensure that any pending PPC_MSG_RM_HOST_ACTION + * messages have completed before we free any VCPUs. 
+ */ + if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION)) + kvmppc_xics_ipi_action(); +#endif if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION)) generic_smp_call_function_interrupt(); if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 43ffbfe..e673fb9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -51,11 +51,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, /* -- ICP routines -- */ +#ifdef CONFIG_SMP +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) +{ + int hcpu; + + hcpu = hcore << threads_shift; + kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; + smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); + icp_native_cause_ipi_rm(hcpu); +} +#else +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } +#endif + +/* + * We start the search from our current CPU Id in the core map + * and go in a circle until we get back to our ID looking for a + * core that is running in host context and that hasn't already + * been targeted for another rm_host_ops. + * + * In the future, could consider using a fairer algorithm (one + * that distributes the IPIs better) + * + * Returns -1, if no CPU could be found in the host + * Else, returns a CPU Id which has been reserved for use + */ +static inline int grab_next_hostcore(int start, + struct kvmppc_host_rm_core *rm_core, int max, int action) +{ + bool success; + int core; + union kvmppc_rm_state old, new; + + for (core = start + 1; core < max; core++) { + old = new = READ_ONCE(rm_core[core].rm_state); + + if (!old.in_host || old.rm_action) + continue; + + /* Try to grab this host core if not taken already. 
*/ + new.rm_action = action; + + success = cmpxchg64(&rm_core[core].rm_state.raw, + old.raw, new.raw) == old.raw; + if (success) { + /* + * Make sure that the store to the rm_action is made + * visible before we return to caller (and the + * subsequent store to rm_data) to synchronize with + * the IPI handler. + */ + smp_wmb(); + return core; + } + } + + return -1; +} + +static inline int find_available_hostcore(int action) +{ + int core; + int my_core = smp_processor_id() >> threads_shift; + struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core; + + core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action); + if (core == -1) + core = grab_next_hostcore(core, rm_core, my_core, action); + + return core; +} + static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu) { struct kvmppc_icp *this_icp = this_vcpu->arch.icp; int cpu; + int hcore; /* Mark the target VCPU as having an interrupt pending */ vcpu->stat.queue_intr++; @@ -67,11 +140,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, return; } - /* Check if the core is loaded, if not, too hard */ + /* + * Check if the core is loaded, + * if not, find an available host core to post to wake the VCPU, + * if we can't find one, set up state to eventually return too hard. 
+ */ cpu = vcpu->arch.thread_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) { - this_icp->rm_action |= XICS_RM_KICK_VCPU; - this_icp->rm_kick_target = vcpu; + hcore = -1; + if (kvmppc_host_rm_ops_hv) + hcore = find_available_hostcore(XICS_RM_KICK_VCPU); + if (hcore != -1) { + icp_send_hcore_msg(hcore, vcpu); + } else { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + } return; } @@ -655,7 +739,9 @@ void kvmppc_xics_ipi_action(void) if (rm_corep->rm_data) { rm_host_ipi_action(rm_corep->rm_state.rm_action, rm_corep->rm_data); + /* Order these stores against the real mode KVM */ rm_corep->rm_data = NULL; + smp_wmb(); rm_corep->rm_state.rm_action = 0; } } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 69f897d..9258675 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -437,6 +437,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm) unsigned int i; struct kvm_vcpu *vcpu; +#ifdef CONFIG_KVM_XICS + /* + * We call kick_all_cpus_sync() to ensure that all + * CPUs have executed any pending IPIs before we + * continue and free VCPUs structures below. + */ + if (is_kvmppc_hv_enabled(kvm)) + kick_all_cpus_sync(); +#endif + kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_free(vcpu); -- cgit v0.10.2 From 520fe9c607d3acea96391aad27e17518bd7d39bd Mon Sep 17 00:00:00 2001 From: "Suresh E. Warrier" Date: Mon, 21 Dec 2015 16:33:57 -0600 Subject: KVM: PPC: Book3S HV: Add tunable to control H_IPI redirection Redirecting the wakeup of a VCPU from the H_IPI hypercall to a core running in the host is usually a good idea, most workloads seemed to benefit. However, in one heavily interrupt-driven SMT1 workload, some regression was observed. This patch adds a kvm_hv module parameter called h_ipi_redirect to control this feature. The default value for this tunable is 1 - that is enable the feature. 
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bc14e9e..197a8ac 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -464,6 +464,7 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xics_ipi_action(void); +extern int h_ipi_redirect; #else static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c3c7310..f47fffe 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -81,6 +81,17 @@ static int target_smt_mode; module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); +#ifdef CONFIG_KVM_XICS +static struct kernel_param_ops module_param_ops = { + .set = param_set_int, + .get = param_get_int, +}; + +module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); +#endif + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index e673fb9..980d8a6 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -24,6 +24,9 @@ #define DEBUG_PASSUP +int h_ipi_redirect = 1; +EXPORT_SYMBOL(h_ipi_redirect); + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); @@ -148,7 +151,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, cpu = vcpu->arch.thread_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) { hcore = -1; - if (kvmppc_host_rm_ops_hv) + if (kvmppc_host_rm_ops_hv 
&& h_ipi_redirect) hcore = find_available_hostcore(XICS_RM_KICK_VCPU); if (hcore != -1) { icp_send_hcore_msg(hcore, vcpu); -- cgit v0.10.2 From 01d01d69192e417447dee97891d670804bedd2c8 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 1 Mar 2016 17:54:37 +1100 Subject: KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_64 capability number This adds a capability number for 64-bit TCE tables support. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 9da9051..8ce5f64 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -850,6 +850,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122 #define KVM_CAP_HYPERV_SYNIC 123 #define KVM_CAP_S390_RI 124 +#define KVM_CAP_SPAPR_TCE_64 125 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v0.10.2 From fe26e52712ccab6648df17ecc029a68a69a01a85 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 1 Mar 2016 17:54:38 +1100 Subject: KVM: PPC: Add @page_shift to kvmppc_spapr_tce_table At the moment the kvmppc_spapr_tce_table struct can only describe 4GB windows and handle fixed size (4K) pages. Dynamic DMA windows support more so these limits need to be extended. This replaces window_size (in bytes, 4GB max) with page_shift (32bit) and size (64bit, in pages). This should cause no behavioural change as this is changing the internal structures only - the user interface still only allows one to create a 32-bit table with 4KiB pages at this stage. 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ffdbc2d..edf66f7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -182,8 +182,9 @@ struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; u64 liobn; - u32 window_size; struct rcu_head rcu; + u32 page_shift; + u64 size; /* window size in pages */ struct page *pages[0]; }; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 94c8e7e..61cbc44 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -40,10 +40,9 @@ #include #include -static unsigned long kvmppc_tce_pages(unsigned long window_size) +static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { - return ALIGN((window_size >> IOMMU_PAGE_SHIFT_4K) - * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; + return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } static unsigned long kvmppc_stt_pages(unsigned long tce_pages) @@ -95,8 +94,7 @@ static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, struct kvmppc_spapr_tce_table, rcu); - int i; - unsigned long npages = kvmppc_tce_pages(stt->window_size); + unsigned long i, npages = kvmppc_tce_pages(stt->size); for (i = 0; i < npages; i++) __free_page(stt->pages[i]); @@ -109,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; struct page *page; - if (vmf->pgoff >= kvmppc_tce_pages(stt->window_size)) + if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) return VM_FAULT_SIGBUS; page = stt->pages[vmf->pgoff]; @@ -137,7 +135,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) kvm_put_kvm(stt->kvm); kvmppc_account_memlimit( - 
kvmppc_stt_pages(kvmppc_tce_pages(stt->window_size)), false); + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -152,7 +150,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; - unsigned long npages; + unsigned long npages, size; int ret = -ENOMEM; int i; @@ -162,7 +160,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - npages = kvmppc_tce_pages(args->window_size); + size = args->window_size >> IOMMU_PAGE_SHIFT_4K; + npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) { stt = NULL; @@ -175,7 +174,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt->liobn = args->liobn; - stt->window_size = args->window_size; + stt->page_shift = IOMMU_PAGE_SHIFT_4K; + stt->size = size; stt->kvm = kvm; for (i = 0; i < npages; i++) { @@ -218,7 +218,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (!stt) return H_TOO_HARD; - entry = ioba >> IOMMU_PAGE_SHIFT_4K; + entry = ioba >> stt->page_shift; /* * SPAPR spec says that the maximum size of the list is 512 TCEs * so the whole table fits in 4K page diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 0486aa2..c786a58 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -72,11 +72,10 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table); long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages) { - unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1; - unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K; - unsigned long size = stt->window_size >> IOMMU_PAGE_SHIFT_4K; + unsigned long mask = (1ULL << stt->page_shift) - 1; + unsigned long idx = ioba >> stt->page_shift; - if ((ioba & mask) || (idx + npages > size) || (idx + npages < idx)) + if ((ioba & mask) || (idx + npages 
> stt->size) || (idx + npages < idx)) return H_PARAMETER; return H_SUCCESS; @@ -96,8 +95,8 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); */ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { - unsigned long mask = - ~(IOMMU_PAGE_MASK_4K | TCE_PCI_WRITE | TCE_PCI_READ); + unsigned long page_mask = ~((1ULL << stt->page_shift) - 1); + unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ); if (tce & mask) return H_PARAMETER; @@ -198,7 +197,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce); + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); return H_SUCCESS; } @@ -244,7 +243,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (!stt) return H_TOO_HARD; - entry = ioba >> IOMMU_PAGE_SHIFT_4K; + entry = ioba >> stt->page_shift; /* * The spec says that the maximum size of the list is 512 TCEs * so the whole table addressed resides in 4K page @@ -313,8 +312,8 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; - for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) - kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce_value); + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); return H_SUCCESS; } @@ -336,7 +335,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> IOMMU_PAGE_SHIFT_4K; + idx = ioba >> stt->page_shift; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- cgit v0.10.2 From 14f853f1b257b69cf0213ad8c49c01038ccf7ef9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 1 Mar 2016 17:54:39 +1100 Subject: KVM: PPC: Add @offset to kvmppc_spapr_tce_table This enables userspace view of TCE tables to start from non-zero offset on a bus. 
This will be used for huge DMA windows. This only changes the internal structure; the user interface needs to change in order to use an offset. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index edf66f7..2e7c791 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -184,6 +184,7 @@ struct kvmppc_spapr_tce_table { u64 liobn; struct rcu_head rcu; u32 page_shift; + u64 offset; /* in pages */ u64 size; /* window size in pages */ struct page *pages[0]; }; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index c786a58..44be73e 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -75,7 +75,9 @@ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long mask = (1ULL << stt->page_shift) - 1; unsigned long idx = ioba >> stt->page_shift; - if ((ioba & mask) || (idx + npages > stt->size) || (idx + npages < idx)) + if ((ioba & mask) || (idx < stt->offset) || + (idx - stt->offset + npages > stt->size) || + (idx + npages < idx)) return H_PARAMETER; return H_SUCCESS; @@ -147,6 +149,7 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, struct page *page; u64 *tbl; + idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; tbl = kvmppc_page_address(page); @@ -335,7 +338,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> stt->page_shift; + idx = (ioba >> stt->page_shift) - stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- cgit v0.10.2 From 58ded4201ff028b15f6b317228faa5f154a0663f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 1 Mar 2016 17:54:40 +1100 Subject: KVM: PPC: Add support for 64bit TCE windows The existing KVM_CREATE_SPAPR_TCE only supports 32bit windows which is not enough
for directly mapped windows as the guest can get more than 4GB. This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it via KVM_CAP_SPAPR_TCE_64 capability. The table size is checked against the locked memory limit. Since 64bit windows are to support Dynamic DMA windows (DDW), let's add @bus_offset and @page_shift which are also required by DDW. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index da39435..bc78652 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3060,6 +3060,38 @@ an implementation for these despite the in kernel acceleration. This capability is always enabled. +4.98 KVM_CREATE_SPAPR_TCE_64 + +Capability: KVM_CAP_SPAPR_TCE_64 +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_64 (in) +Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows, described in 4.62 KVM_CREATE_SPAPR_TCE + +This capability uses extended struct in ioctl interface: + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ +}; + +The aim of extension is to support an additional bigger DMA window with +a variable page size. +KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and +a bus offset of the corresponding DMA window, @size and @offset are numbers +of IOMMU pages. + +@flags are not used at the moment. + +The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. + 5. 
The kvm_run structure ------------------------ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 197a8ac..2544eda 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -165,7 +165,7 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args); + struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_table( struct kvm_vcpu *vcpu, unsigned long liobn); extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index ab4d473..c93cf35 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 61cbc44..2c2d103 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -147,20 +147,23 @@ static const struct file_operations kvm_spapr_tce_fops = { }; long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) + struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; unsigned long npages, size; int ret = -ENOMEM; int i; + if (!args->size) + return -EINVAL; + /* Check this LIOBN hasn't been previously allocated */ list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { if (stt->liobn == args->liobn) return -EBUSY; } - size = args->window_size >> IOMMU_PAGE_SHIFT_4K; + size = args->size; npages = 
kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) { @@ -174,7 +177,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt->liobn = args->liobn; - stt->page_shift = IOMMU_PAGE_SHIFT_4K; + stt->page_shift = args->page_shift; + stt->offset = args->offset; stt->size = size; stt->kvm = kvm; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9258675..19aa59b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -519,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: + case KVM_CAP_SPAPR_TCE_64: case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: @@ -1344,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } #ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CREATE_SPAPR_TCE_64: { + struct kvm_create_spapr_tce_64 create_tce_64; + + r = -EFAULT; + if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64))) + goto out; + if (create_tce_64.flags) { + r = -EINVAL; + goto out; + } + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); + goto out; + } case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; + struct kvm_create_spapr_tce_64 create_tce_64; r = -EFAULT; if (copy_from_user(&create_tce, argp, sizeof(create_tce))) goto out; - r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); + + create_tce_64.liobn = create_tce.liobn; + create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K; + create_tce_64.offset = 0; + create_tce_64.size = create_tce.window_size >> + IOMMU_PAGE_SHIFT_4K; + create_tce_64.flags = 0; + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); goto out; } case KVM_PPC_GET_SMMU_INFO: { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8ce5f64..b06208b 100644 --- a/include/uapi/linux/kvm.h +++ 
b/include/uapi/linux/kvm.h @@ -1143,6 +1143,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PPC_ALLOC_HTAB */ #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +#define KVM_CREATE_SPAPR_TCE_64 _IOW(KVMIO, 0xa8, \ + struct kvm_create_spapr_tce_64) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* Available with KVM_CAP_PPC_HTAB_FD */ -- cgit v0.10.2