From 4ed0d3e6c64cfd9ba4ceb2099b10d1cf8ece4320 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 24 Apr 2009 17:30:20 -0700 Subject: Intel IOMMU Pass Through Support The patch adds kernel parameter intel_iommu=pt to set up pass through mode in context mapping entry. This disables DMAR in linux kernel; but KVM still runs on VT-d and interrupt remapping still works. In this mode, kernel uses swiotlb for DMA API functions but other VT-d functionalities are enabled for KVM. KVM always uses multi level translation page table in VT-d. By default, pass though mode is disabled in kernel. This is useful when people don't want to enable VT-d DMAR in kernel but still want to use KVM and interrupt remapping for reasons like DMAR performance concern or debug purpose. Signed-off-by: Fenghua Yu Acked-by: Weidong Han Signed-off-by: David Woodhouse diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 600cdd7..fa4faeb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -965,6 +965,7 @@ and is between 256 and 4096 characters. It is defined in the file nomerge forcesac soft + pt [x86, IA64] io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 0490794..37d41ca 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -9,6 +9,7 @@ extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); extern int force_iommu, no_iommu; extern int iommu_detected; +extern int iommu_pass_through; extern void iommu_dma_init(void); extern void machvec_init(const char *name); diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index 285aae8..223abb1 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -46,7 +46,7 @@ void __init swiotlb_dma_init(void) void __init pci_swiotlb_init(void) { - if (!iommu_detected) { + if (!iommu_detected || iommu_pass_through) { #ifdef CONFIG_IA64_GENERIC swiotlb = 1; printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index af326a2..fd6d21b 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -6,6 +6,7 @@ extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; +extern int iommu_pass_through; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579b..8cad0d8 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -160,6 +160,8 @@ again: return page_address(page); } +extern int iommu_pass_through; + /* * See for the iommu kernel parameter * documentation. @@ -209,6 +211,10 @@ static __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) swiotlb = 1; + if (!strncmp(p, "pt", 2)) { + iommu_pass_through = 1; + return 1; + } #endif gart_parse_options(p); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a385..3a0c51e 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || + iommu_pass_through) swiotlb = 1; #endif if (swiotlb_force) diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index fa3a113..d3d86b7 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -515,6 +515,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) u32 ver; static int iommu_allocated = 0; int agaw = 0; + int msagaw = 0; iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); if (!iommu) @@ -535,12 +536,20 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) agaw = iommu_calculate_agaw(iommu); if (agaw < 0) { printk(KERN_ERR - "Cannot get a valid agaw for iommu (seq_id = %d)\n", + "Cannot get a valid agaw for iommu (seq_id = %d)\n", + iommu->seq_id); + goto error; + } + msagaw = iommu_calculate_max_sagaw(iommu); + if (msagaw < 0) { + printk(KERN_ERR + "Cannot get a valid max agaw for iommu (seq_id = %d)\n", iommu->seq_id); goto error; } #endif iommu->agaw = agaw; + iommu->msagaw = msagaw; /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 001b328..1312182 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -53,6 +53,8 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 +#define MAX_AGAW_WIDTH 64 + #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) @@ -127,8 +129,6 @@ static inline void context_set_fault_enable(struct context_entry *context) context->lo &= (((u64)-1) << 2) | 1; } -#define CONTEXT_TT_MULTI_LEVEL 0 - static inline void context_set_translation_type(struct context_entry *context, unsigned long value) { @@ -288,6 +288,7 @@ int dmar_disabled = 1; static int __initdata dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; +int iommu_pass_through; #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) static DEFINE_SPINLOCK(device_domain_lock); @@ -397,17 +398,13 @@ void free_iova_mem(struct iova *iova) static inline int width_to_agaw(int width); -/* calculate agaw for each iommu. - * "SAGAW" may be different across iommus, use a default agaw, and - * get a supported less agaw for iommus that don't support the default agaw. - */ -int iommu_calculate_agaw(struct intel_iommu *iommu) +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; int agaw = -1; sagaw = cap_sagaw(iommu->cap); - for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { if (test_bit(agaw, &sagaw)) break; @@ -416,6 +413,24 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return agaw; } +/* + * Calculate max SAGAW for each iommu. + */ +int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); +} + +/* + * calculate agaw for each iommu. + * "SAGAW" may be different across iommus, use a default agaw, and + * get a supported less agaw for iommus that don't support the default agaw. + */ +int iommu_calculate_agaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); +} + /* in native case, each domain is related to only one iommu */ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) { @@ -1321,8 +1336,8 @@ static void domain_exit(struct dmar_domain *domain) free_domain_mem(domain); } -static int domain_context_mapping_one(struct dmar_domain *domain, - int segment, u8 bus, u8 devfn) +static int domain_context_mapping_one(struct dmar_domain *domain, int segment, + u8 bus, u8 devfn, int translation) { struct context_entry *context; unsigned long flags; @@ -1335,7 +1350,10 @@ static int domain_context_mapping_one(struct dmar_domain *domain, pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + BUG_ON(!domain->pgd); + BUG_ON(translation != CONTEXT_TT_PASS_THROUGH && + translation != CONTEXT_TT_MULTI_LEVEL); iommu = device_to_iommu(segment, bus, devfn); if (!iommu) @@ -1395,9 +1413,18 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } context_set_domain_id(context, id); - context_set_address_width(context, iommu->agaw); - context_set_address_root(context, virt_to_phys(pgd)); - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); + + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. + */ + if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) { + context_set_address_width(context, iommu->agaw); + context_set_address_root(context, virt_to_phys(pgd)); + } else + context_set_address_width(context, iommu->msagaw); + + context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); domain_flush_cache(domain, context, sizeof(*context)); @@ -1422,13 +1449,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } static int -domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) +domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev, + int translation) { int ret; struct pci_dev *tmp, *parent; ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus), - pdev->bus->number, pdev->devfn); + pdev->bus->number, pdev->devfn, + translation); if (ret) return ret; @@ -1442,7 +1471,7 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) ret = domain_context_mapping_one(domain, pci_domain_nr(parent->bus), parent->bus->number, - parent->devfn); + parent->devfn, translation); if (ret) return ret; parent = parent->bus->self; @@ -1450,12 +1479,14 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->subordinate), - tmp->subordinate->number, 0); + tmp->subordinate->number, 0, + translation); else /* this is a legacy PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->bus), tmp->bus->number, - tmp->devfn); + tmp->devfn, + translation); } static int domain_context_mapped(struct pci_dev *pdev) @@ -1752,7 +1783,7 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, goto error; /* context entry init */ - ret = domain_context_mapping(domain, pdev); + ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL); if (!ret) return 0; error: @@ -1853,6 +1884,23 @@ static inline void iommu_prepare_isa(void) } #endif /* !CONFIG_DMAR_FLPY_WA */ +/* Initialize each context entry as pass through.*/ +static int __init init_context_pass_through(void) +{ + struct pci_dev *pdev = NULL; + struct dmar_domain *domain; + int ret; + + for_each_pci_dev(pdev) { + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_PASS_THROUGH); + if (ret) + return ret; + } + return 0; +} + static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; @@ -1860,6 +1908,7 @@ static int __init init_dmars(void) struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; + int pass_through = 1; /* * for each drhd @@ -1913,7 +1962,15 @@ static int __init init_dmars(void) printk(KERN_ERR "IOMMU: allocate root entry failed\n"); goto error; } + if (!ecap_pass_through(iommu->ecap)) + pass_through = 0; } + if (iommu_pass_through) + if (!pass_through) { + printk(KERN_INFO + "Pass Through is not supported by hardware.\n"); + iommu_pass_through = 0; + } /* * Start from the sane iommu hardware state. @@ -1976,37 +2033,57 @@ static int __init init_dmars(void) "IOMMU: enable interrupt remapping failed\n"); } #endif + /* + * If pass through is set and enabled, context entries of all pci + * devices are intialized by pass through translation type. + */ + if (iommu_pass_through) { + ret = init_context_pass_through(); + if (ret) { + printk(KERN_ERR "IOMMU: Pass through init failed.\n"); + iommu_pass_through = 0; + } + } /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor + * If pass through is not set or not enabled, setup context entries for + * identity mappings for rmrr, gfx, and isa. */ - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* some BIOS lists non-exist devices in DMAR table */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR + if (!iommu_pass_through) { + /* + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor + */ + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOS lists non-exist devices in DMAR + * table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + } } - } - iommu_prepare_gfx_mapping(); + iommu_prepare_gfx_mapping(); - iommu_prepare_isa(); + iommu_prepare_isa(); + } /* * for each drhd @@ -2117,7 +2194,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev) /* make sure context mapping is ok */ if (unlikely(!domain_context_mapped(pdev))) { - ret = domain_context_mapping(domain, pdev); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_MULTI_LEVEL); if (ret) { printk(KERN_ERR "Domain context map for %s failed", @@ -2786,7 +2864,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || swiotlb || dmar_disabled) + if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -2806,7 +2884,15 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - dma_ops = &intel_dma_ops; + + if (!iommu_pass_through) { + printk(KERN_INFO + "Multi-level page-table translation for DMAR.\n"); + dma_ops = &intel_dma_ops; + } else + printk(KERN_INFO + "DMAR: Pass through translation for DMAR.\n"); + init_iommu_sysfs(); register_iommu(&intel_iommu_ops); @@ -3146,7 +3232,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EFAULT; } - ret = domain_context_mapping(dmar_domain, pdev); + ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); if (ret) return ret; diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 1a455f1..e0a03af 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -13,6 +13,9 @@ #define DMA_PTE_WRITE (2) #define DMA_PTE_SNP (1 << 11) +#define CONTEXT_TT_MULTI_LEVEL 0 +#define CONTEXT_TT_PASS_THROUGH 2 + struct intel_iommu; struct dmar_domain; struct root_entry; @@ -21,11 +24,16 @@ extern void free_dmar_iommu(struct intel_iommu *iommu); #ifdef CONFIG_DMAR extern int iommu_calculate_agaw(struct intel_iommu *iommu); +extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { return 0; } +static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return 0; +} #endif extern int dmar_disabled; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index aa8c531..7246971 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -120,6 +120,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16) #define ecap_coherent(e) ((e) & 0x1) #define ecap_qis(e) ((e) & 0x2) +#define ecap_pass_through(e) ((e >> 6) & 0x1) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) ((e >> 3) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) @@ -302,6 +303,7 @@ struct intel_iommu { spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ int agaw; /* agaw of this iommu */ + int msagaw; /* max sagaw of this iommu */ unsigned int irq; unsigned char name[13]; /* Device Name */ -- cgit v0.10.2 From aed5d5f4c5ea5da01a774e42cff08c4b4fa59072 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 30 Apr 2009 17:57:11 -0700 Subject: Fix !CONFIG_DMAR build failure introduced by Intel IOMMU Pass Through Support This updated patch should fix the compiling errors and remove the extern iommu_pass_through from drivers/pci/intel-iommu.c file. Signed-off-by: Fenghua Yu Signed-off-by: David Woodhouse diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index eb98738..ecdde25 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -32,6 +32,8 @@ int force_iommu __read_mostly = 1; int force_iommu __read_mostly; #endif +int iommu_pass_through; + /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8cad0d8..049005e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -32,6 +32,8 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; +int iommu_pass_through; + dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -160,8 +162,6 @@ again: return page_address(page); } -extern int iommu_pass_through; - /* * See for the iommu kernel parameter * documentation. diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 1312182..d3edd6a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -288,7 +288,6 @@ int dmar_disabled = 1; static int __initdata dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; -int iommu_pass_through; #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) static DEFINE_SPINLOCK(device_domain_lock); -- cgit v0.10.2 From fa3b6dcd5298db2e7b63c17795c9e5570d3df8d9 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 8 May 2009 10:33:38 +0800 Subject: VT-d: fix invalid domain id for KVM context flush The domain->id is a sequence number associated with the KVM guest and should not be used for the context flush. This patch replaces the domain->id with a proper id value for both bare metal and KVM. Signed-off-by: Yu Zhao Acked-by: Weidong Han Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index d3edd6a..d6f4ee5 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1429,7 +1429,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, domain_flush_cache(domain, context, sizeof(*context)); /* it's a non-present to present mapping */ - if (iommu->flush.flush_context(iommu, domain->id, + if (iommu->flush.flush_context(iommu, id, (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL, 1)) iommu_flush_write_buffer(iommu); -- cgit v0.10.2 From 4c25a2c1b90bf785fc2e2f0f0c74a80b3e070d39 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 10 May 2009 17:16:06 +0100 Subject: intel-iommu: Clean up handling of "caching mode" vs. context flushing. It really doesn't make a lot of sense to have some of the logic to handle caching vs. non-caching mode duplicated in qi_flush_context() and __iommu_flush_context(), while the return value indicates whether the caller should take other action which depends on the same thing. Especially since qi_flush_context() thought it was returning something entirely different anyway. This patch makes qi_flush_context() and __iommu_flush_context() both return void, removes the 'non_present_entry_flush' argument and makes the only call site which _set_ that argument to 1 do the right thing. Signed-off-by: David Woodhouse diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index d3d86b7..10a071b 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -723,23 +723,16 @@ void qi_global_iec(struct intel_iommu *iommu) qi_submit_sync(&desc, iommu); } -int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, - u64 type, int non_present_entry_flush) +void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, + u64 type) { struct qi_desc desc; - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) | QI_CC_GRAN(type) | QI_CC_TYPE; desc.high = 0; - return qi_submit_sync(&desc, iommu); + qi_submit_sync(&desc, iommu); } int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index d6f4ee5..9f5d915 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -857,26 +857,13 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_context(struct intel_iommu *iommu, - u16 did, u16 source_id, u8 function_mask, u64 type, - int non_present_entry_flush) +static void __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, + u64 type) { u64 val = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_CCMD_GLOBAL_INVL: val = DMA_CCMD_GLOBAL_INVL; @@ -901,9 +888,6 @@ static int __iommu_flush_context(struct intel_iommu *iommu, dmar_readq, (!(val & DMA_CCMD_ICC)), val); spin_unlock_irqrestore(&iommu->register_lock, flag); - - /* flush context entry will implicitly flush write buffer */ - return 0; } /* return value determine if we need a write buffer flush */ @@ -1428,14 +1412,21 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, context_set_present(context); domain_flush_cache(domain, context, sizeof(*context)); - /* it's a non-present to present mapping */ - if (iommu->flush.flush_context(iommu, id, - (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL, 1)) - iommu_flush_write_buffer(iommu); - else + /* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); - + } else { + iommu_flush_write_buffer(iommu); + } spin_unlock_irqrestore(&iommu->lock, flags); spin_lock_irqsave(&domain->iommu_lock, flags); @@ -1566,7 +1557,7 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) clear_context_table(iommu, bus, devfn); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); } @@ -2104,8 +2095,7 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, - 0); + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); iommu_disable_protect_mem_regions(iommu); @@ -2721,7 +2711,7 @@ static int init_iommu_hw(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); iommu_disable_protect_mem_regions(iommu); @@ -2738,7 +2728,7 @@ static void iommu_flush_all(void) for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 7246971..f2b94da 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -281,8 +281,8 @@ struct ir_table { #endif struct iommu_flush { - int (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, - u64 type, int non_present_entry_flush); + void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, + u8 fm, u64 type); int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int non_present_entry_flush); }; @@ -339,8 +339,8 @@ extern void dmar_disable_qi(struct intel_iommu *iommu); extern int dmar_reenable_qi(struct intel_iommu *iommu); extern void qi_global_iec(struct intel_iommu *iommu); -extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, - u8 fm, u64 type, int non_present_entry_flush); +extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, + u8 fm, u64 type); extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int non_present_entry_flush); -- cgit v0.10.2 From 1f0ef2aa18802a8ce7eb5a5164aaaf4d59073801 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 10 May 2009 19:58:49 +0100 Subject: intel-iommu: Clean up handling of "caching mode" vs. IOTLB flushing. As we just did for context cache flushing, clean up the logic around whether we need to flush the iotlb or just the write-buffer, depending on caching mode. Fix the same bug in qi_flush_iotlb() that qi_flush_context() had -- it isn't supposed to be returning an error; it's supposed to be returning a flag which triggers a write-buffer flush. Remove some superfluous conditional write-buffer flushes which could never have happened because they weren't for non-present-to-present mapping changes anyway. Signed-off-by: David Woodhouse diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 10a071b..df6af0d 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -735,22 +735,14 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, qi_submit_sync(&desc, iommu); } -int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, - unsigned int size_order, u64 type, - int non_present_entry_flush) +void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) { u8 dw = 0, dr = 0; struct qi_desc desc; int ih = 0; - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - if (cap_write_drain(iommu->cap)) dw = 1; @@ -762,7 +754,7 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) | QI_IOTLB_AM(size_order); - return qi_submit_sync(&desc, iommu); + qi_submit_sync(&desc, iommu); } /* diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 9f5d915..f47d04a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -891,27 +891,13 @@ static void __iommu_flush_context(struct intel_iommu *iommu, } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type, - int non_present_entry_flush) +static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */ @@ -959,12 +945,10 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", (unsigned long long)DMA_TLB_IIRG(type), (unsigned long long)DMA_TLB_IAIG(val)); - /* flush iotlb entry will implicitly flush write buffer */ - return 0; } -static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int pages, int non_present_entry_flush) +static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int pages) { unsigned int mask; @@ -974,8 +958,7 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, /* Fallback to domain selective flush if no PSI support */ if (!cap_pgsel_inv(iommu->cap)) return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, - non_present_entry_flush); + DMA_TLB_DSI_FLUSH); /* * PSI requires page size to be 2 ^ x, and the base address is naturally @@ -985,11 +968,10 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, /* Fallback to domain selective flush if size is too big */ if (mask > cap_max_amask_val(iommu->cap)) return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, non_present_entry_flush); + DMA_TLB_DSI_FLUSH); return iommu->flush.flush_iotlb(iommu, did, addr, mask, - DMA_TLB_PSI_FLUSH, - non_present_entry_flush); + DMA_TLB_PSI_FLUSH); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1423,7 +1405,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH); } else { iommu_flush_write_buffer(iommu); } @@ -1558,8 +1540,7 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) clear_context_table(iommu, bus, devfn); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } static void domain_remove_dev_info(struct dmar_domain *domain) @@ -2096,8 +2077,7 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, - 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); ret = iommu_enable_translation(iommu); @@ -2244,10 +2224,11 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, if (ret) goto error; - /* it's a non-present to present mapping */ - ret = iommu_flush_iotlb_psi(iommu, domain->id, - start_paddr, size >> VTD_PAGE_SHIFT, 1); - if (ret) + /* it's a non-present to present mapping. Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, start_paddr, + size >> VTD_PAGE_SHIFT); + else iommu_flush_write_buffer(iommu); return start_paddr + ((u64)paddr & (~PAGE_MASK)); @@ -2283,7 +2264,7 @@ static void flush_unmaps(void) if (deferred_flush[i].next) { iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); for (j = 0; j < deferred_flush[i].next; j++) { __free_iova(&deferred_flush[i].domain[j]->iovad, deferred_flush[i].iova[j]); @@ -2362,9 +2343,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, /* free page tables */ dma_pte_free_pagetable(domain, start_addr, start_addr + size); if (intel_iommu_strict) { - if (iommu_flush_iotlb_psi(iommu, - domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_addr, + size >> VTD_PAGE_SHIFT); /* free iova */ __free_iova(&domain->iovad, iova); } else { @@ -2455,9 +2435,8 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, /* free page tables */ dma_pte_free_pagetable(domain, start_addr, start_addr + size); - if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr, - size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_addr, + size >> VTD_PAGE_SHIFT); /* free iova */ __free_iova(&domain->iovad, iova); @@ -2549,10 +2528,13 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne offset += size; } - /* it's a non-present to present mapping */ - if (iommu_flush_iotlb_psi(iommu, domain->id, - start_addr, offset >> VTD_PAGE_SHIFT, 1)) + /* it's a non-present to present mapping. Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, start_addr, + offset >> VTD_PAGE_SHIFT); + else iommu_flush_write_buffer(iommu); + return nelems; } @@ -2711,9 +2693,9 @@ static int init_iommu_hw(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); iommu_enable_translation(iommu); } @@ -2728,9 +2710,9 @@ static void iommu_flush_all(void) for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); } } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index f2b94da..29e05a0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -283,8 +283,8 @@ struct ir_table { struct iommu_flush { void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); - int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, - unsigned int size_order, u64 type, int non_present_entry_flush); + void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); }; enum { @@ -341,9 +341,8 @@ extern void qi_global_iec(struct intel_iommu *iommu); extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); -extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, - unsigned int size_order, u64 type, - int non_present_entry_flush); +extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); -- cgit v0.10.2 From 462b60f6ccc685f7e8aa04ff430e6b4ffedf629f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 10 May 2009 20:18:18 +0100 Subject: intel-iommu: Fix tiny theoretical race in write-buffer flush. In iommu_flush_write_buffer() we read iommu->gcmd before taking the register_lock, and then we mask in the WBF bit and write it to the register. There is a tiny chance that something else could have _changed_ iommu->gcmd before we take the lock, but after we read it. So we could be undoing that change. Never actually going to have happened in practice, since nothing else changes that register at runtime -- aside from the write-buffer flush it's only ever touched at startup for enabling translation, etc. But worth fixing anyway. Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f47d04a..2e2c740 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -844,10 +844,8 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; - val = iommu->gcmd | DMA_GCMD_WBF; - spin_lock_irqsave(&iommu->register_lock, flag); - writel(val, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, -- cgit v0.10.2 From c416daa98a584596df21ee2c26fac6579ee58f57 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 10 May 2009 20:30:58 +0100 Subject: intel-iommu: Tidy up iommu->gcmd handling Signed-off-by: David Woodhouse diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index df6af0d..faf77a0 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -784,7 +784,6 @@ void dmar_disable_qi(struct intel_iommu *iommu) cpu_relax(); iommu->gcmd &= ~DMA_GCMD_QIE; - writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, @@ -798,7 +797,7 @@ end: */ static void __dmar_enable_qi(struct intel_iommu *iommu) { - u32 cmd, sts; + u32 sts; unsigned long flags; struct q_inval *qi = iommu->qi; @@ -812,9 +811,8 @@ static void __dmar_enable_qi(struct intel_iommu *iommu) dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc)); - cmd = iommu->gcmd | DMA_GCMD_QIE; iommu->gcmd |= DMA_GCMD_QIE; - writel(cmd, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 2e2c740..bc99b1e 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -819,7 +819,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) static void iommu_set_root_entry(struct intel_iommu *iommu) { void *addr; - u32 cmd, sts; + u32 sts; unsigned long flag; addr = iommu->root_entry; @@ -827,12 +827,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) spin_lock_irqsave(&iommu->register_lock, flag); dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr)); - cmd = iommu->gcmd | DMA_GCMD_SRTP; - writel(cmd, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_RTPS), sts); + readl, (sts & DMA_GSTS_RTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flag); } @@ -844,12 +843,13 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; + spin_lock_irqsave(&iommu->register_lock, flag); writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (!(val & DMA_GSTS_WBFS)), val); + readl, (!(val & DMA_GSTS_WBFS)), val); spin_unlock_irqrestore(&iommu->register_lock, flag); } @@ -995,13 +995,13 @@ static int iommu_enable_translation(struct intel_iommu *iommu) unsigned long flags; spin_lock_irqsave(&iommu->register_lock, flags); - writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG); + iommu->gcmd |= DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_TES), sts); + readl, (sts & DMA_GSTS_TES), sts); - iommu->gcmd |= DMA_GCMD_TE; spin_unlock_irqrestore(&iommu->register_lock, flags); return 0; } @@ -1017,7 +1017,7 @@ static int iommu_disable_translation(struct intel_iommu *iommu) /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (!(sts & DMA_GSTS_TES)), sts); + readl, (!(sts & DMA_GSTS_TES)), sts); spin_unlock_irqrestore(&iommu->register_lock, flag); return 0; diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f5e0ea7..1669596 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -404,7 +404,7 @@ int free_irte(int irq) static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) { u64 addr; - u32 cmd, sts; + u32 sts; unsigned long flags; addr = virt_to_phys((void *)iommu->ir_table->base); @@ -415,9 +415,8 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); /* Set interrupt-remapping table pointer */ - cmd = iommu->gcmd | DMA_GCMD_SIRTP; iommu->gcmd |= DMA_GCMD_SIRTP; - writel(cmd, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_IRTPS), sts); @@ -427,9 +426,8 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) spin_lock_irqsave(&iommu->register_lock, flags); /* enable comaptiblity format interrupt pass through */ - cmd = iommu->gcmd | DMA_GCMD_CFI; iommu->gcmd |= DMA_GCMD_CFI; - writel(cmd, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_CFIS), sts); @@ -446,9 +444,8 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) spin_lock_irqsave(&iommu->register_lock, flags); /* Enable interrupt-remapping */ - cmd = iommu->gcmd | DMA_GCMD_IRE; iommu->gcmd |= DMA_GCMD_IRE; - writel(cmd, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_IRES), sts); -- cgit v0.10.2 From dd7264355a203c3456dbba04db471947d3b55e7e Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 13 May 2009 15:55:52 -0700 Subject: intel-iommu: dmar_set_interrupt return error value dmar_set_interrupt feigns success when arch_setup_dmar_msi fails, return error value. Signed-off-by: Chris Wright Signed-off-by: David Woodhouse diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index faf77a0..f23460a 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -1088,7 +1088,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu) set_irq_data(irq, NULL); iommu->irq = 0; destroy_irq(irq); - return 0; + return ret; } ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu); -- cgit v0.10.2 From 302b4215daa0a704c843da40fd2529e5757a72da Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:32 +0800 Subject: PCI: support the ATS capability The PCIe ATS capability makes the Endpoint be able to request the DMA address translation from the IOMMU and cache the translation in the device side, thus alleviate IOMMU pressure and improve the hardware performance in the I/O virtualization environment. Signed-off-by: Yu Zhao Acked-by: Jesse Barnes Signed-off-by: David Woodhouse diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index b497daa..0a7a1b4 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -5,6 +5,7 @@ * * PCI Express I/O Virtualization (IOV) support. * Single Root IOV 1.0 + * Address Translation Service 1.0 */ #include @@ -679,3 +680,107 @@ irqreturn_t pci_sriov_migration(struct pci_dev *dev) return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE; } EXPORT_SYMBOL_GPL(pci_sriov_migration); + +static int ats_alloc_one(struct pci_dev *dev, int ps) +{ + int pos; + u16 cap; + struct pci_ats *ats; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); + if (!pos) + return -ENODEV; + + ats = kzalloc(sizeof(*ats), GFP_KERNEL); + if (!ats) + return -ENOMEM; + + ats->pos = pos; + ats->stu = ps; + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); + ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : + PCI_ATS_MAX_QDEP; + dev->ats = ats; + + return 0; +} + +static void ats_free_one(struct pci_dev *dev) +{ + kfree(dev->ats); + dev->ats = NULL; +} + +/** + * pci_enable_ats - enable the ATS capability + * @dev: the PCI device + * @ps: the IOMMU page shift + * + * Returns 0 on success, or negative on failure. + */ +int pci_enable_ats(struct pci_dev *dev, int ps) +{ + int rc; + u16 ctrl; + + BUG_ON(dev->ats); + + if (ps < PCI_ATS_MIN_STU) + return -EINVAL; + + rc = ats_alloc_one(dev, ps); + if (rc) + return rc; + + ctrl = PCI_ATS_CTRL_ENABLE; + ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU); + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); + + return 0; +} + +/** + * pci_disable_ats - disable the ATS capability + * @dev: the PCI device + */ +void pci_disable_ats(struct pci_dev *dev) +{ + u16 ctrl; + + BUG_ON(!dev->ats); + + pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl); + ctrl &= ~PCI_ATS_CTRL_ENABLE; + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); + + ats_free_one(dev); +} + +/** + * pci_ats_queue_depth - query the ATS Invalidate Queue Depth + * @dev: the PCI device + * + * Returns the queue depth on success, or negative on failure. + * + * The ATS spec uses 0 in the Invalidate Queue Depth field to + * indicate that the function can accept 32 Invalidate Request. + * But here we use the `real' values (i.e. 1~32) for the Queue + * Depth. + */ +int pci_ats_queue_depth(struct pci_dev *dev) +{ + int pos; + u16 cap; + + if (dev->ats) + return dev->ats->qdep; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); + if (!pos) + return -ENODEV; + + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); + + return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : + PCI_ATS_MAX_QDEP; +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d03f6b9..3c2ec64 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -229,6 +229,13 @@ struct pci_sriov { u8 __iomem *mstate; /* VF Migration State Array */ }; +/* Address Translation Service */ +struct pci_ats { + int pos; /* capability position */ + int stu; /* Smallest Translation Unit */ + int qdep; /* Invalidate Queue Depth */ +}; + #ifdef CONFIG_PCI_IOV extern int pci_iov_init(struct pci_dev *dev); extern void pci_iov_release(struct pci_dev *dev); @@ -236,6 +243,20 @@ extern int pci_iov_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type); extern void pci_restore_iov_state(struct pci_dev *dev); extern int pci_iov_bus_range(struct pci_bus *bus); + +extern int pci_enable_ats(struct pci_dev *dev, int ps); +extern void pci_disable_ats(struct pci_dev *dev); +extern int pci_ats_queue_depth(struct pci_dev *dev); +/** + * pci_ats_enabled - query the ATS status + * @dev: the PCI device + * + * Returns 1 if ATS capability is enabled, or 0 if not. + */ +static inline int pci_ats_enabled(struct pci_dev *dev) +{ + return !!dev->ats; +} #else static inline int pci_iov_init(struct pci_dev *dev) { @@ -257,6 +278,22 @@ static inline int pci_iov_bus_range(struct pci_bus *bus) { return 0; } + +static inline int pci_enable_ats(struct pci_dev *dev, int ps) +{ + return -ENODEV; +} +static inline void pci_disable_ats(struct pci_dev *dev) +{ +} +static inline int pci_ats_queue_depth(struct pci_dev *dev) +{ + return -ENODEV; +} +static inline int pci_ats_enabled(struct pci_dev *dev) +{ + return 0; +} #endif /* CONFIG_PCI_IOV */ #endif /* DRIVERS_PCI_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 72698d8..bd3e4a7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -188,6 +188,7 @@ struct pci_cap_saved_state { struct pcie_link_state; struct pci_vpd; struct pci_sriov; +struct pci_ats; /* * The pci_dev structure is used to describe PCI devices. @@ -285,6 +286,7 @@ struct pci_dev { struct pci_sriov *sriov; /* SR-IOV capability related */ struct pci_dev *physfn; /* the PF this VF is associated with */ }; + struct pci_ats *ats; /* Address Translation Service */ #endif }; diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index e4d08c1..c03189c 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -501,6 +501,7 @@ #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 #define PCI_EXT_CAP_ID_ARI 14 +#define PCI_EXT_CAP_ID_ATS 15 #define PCI_EXT_CAP_ID_SRIOV 16 /* Advanced Error Reporting */ @@ -619,6 +620,15 @@ #define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */ #define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */ +/* Address Translation Service */ +#define PCI_ATS_CAP 0x04 /* ATS Capability Register */ +#define PCI_ATS_CAP_QDEP(x) ((x) & 0x1f) /* Invalidate Queue Depth */ +#define PCI_ATS_MAX_QDEP 32 /* Max Invalidate Queue Depth */ +#define PCI_ATS_CTRL 0x06 /* ATS Control Register */ +#define PCI_ATS_CTRL_ENABLE 0x8000 /* ATS Enable */ +#define PCI_ATS_CTRL_STU(x) ((x) & 0x1f) /* Smallest Translation Unit */ +#define PCI_ATS_MIN_STU 12 /* shift of minimum STU block */ + /* Single Root I/O Virtualization */ #define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ #define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */ -- cgit v0.10.2 From e277d2fc79d6abb86fafadb58dca0b9c498a9aa7 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:33 +0800 Subject: PCI: handle Virtual Function ATS enabling The SR-IOV spec requires that the Smallest Translation Unit and the Invalidate Queue Depth fields in the Virtual Function ATS capability are hardwired to 0. If a function is a Virtual Function, then and set its Physical Function's STU before enabling the ATS. Signed-off-by: Yu Zhao Acked-by: Jesse Barnes Signed-off-by: David Woodhouse diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 0a7a1b4..4151404 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -491,10 +491,10 @@ found: if (pdev) iov->dev = pci_dev_get(pdev); - else { + else iov->dev = dev; - mutex_init(&iov->lock); - } + + mutex_init(&iov->lock); dev->sriov = iov; dev->is_physfn = 1; @@ -514,11 +514,11 @@ static void sriov_release(struct pci_dev *dev) { BUG_ON(dev->sriov->nr_virtfn); - if (dev == dev->sriov->dev) - mutex_destroy(&dev->sriov->lock); - else + if (dev != dev->sriov->dev) pci_dev_put(dev->sriov->dev); + mutex_destroy(&dev->sriov->lock); + kfree(dev->sriov); dev->sriov = NULL; } @@ -723,19 +723,40 @@ int pci_enable_ats(struct pci_dev *dev, int ps) int rc; u16 ctrl; - BUG_ON(dev->ats); + BUG_ON(dev->ats && dev->ats->is_enabled); if (ps < PCI_ATS_MIN_STU) return -EINVAL; - rc = ats_alloc_one(dev, ps); - if (rc) - return rc; + if (dev->is_physfn || dev->is_virtfn) { + struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn; + + mutex_lock(&pdev->sriov->lock); + if (pdev->ats) + rc = pdev->ats->stu == ps ? 0 : -EINVAL; + else + rc = ats_alloc_one(pdev, ps); + + if (!rc) + pdev->ats->ref_cnt++; + mutex_unlock(&pdev->sriov->lock); + if (rc) + return rc; + } + + if (!dev->is_physfn) { + rc = ats_alloc_one(dev, ps); + if (rc) + return rc; + } ctrl = PCI_ATS_CTRL_ENABLE; - ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU); + if (!dev->is_virtfn) + ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU); pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); + dev->ats->is_enabled = 1; + return 0; } @@ -747,13 +768,26 @@ void pci_disable_ats(struct pci_dev *dev) { u16 ctrl; - BUG_ON(!dev->ats); + BUG_ON(!dev->ats || !dev->ats->is_enabled); pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl); ctrl &= ~PCI_ATS_CTRL_ENABLE; pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); - ats_free_one(dev); + dev->ats->is_enabled = 0; + + if (dev->is_physfn || dev->is_virtfn) { + struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn; + + mutex_lock(&pdev->sriov->lock); + pdev->ats->ref_cnt--; + if (!pdev->ats->ref_cnt) + ats_free_one(pdev); + mutex_unlock(&pdev->sriov->lock); + } + + if (!dev->is_physfn) + ats_free_one(dev); } /** @@ -765,13 +799,17 @@ void pci_disable_ats(struct pci_dev *dev) * The ATS spec uses 0 in the Invalidate Queue Depth field to * indicate that the function can accept 32 Invalidate Request. * But here we use the `real' values (i.e. 1~32) for the Queue - * Depth. + * Depth; and 0 indicates the function shares the Queue with + * other functions (doesn't exclusively own a Queue). */ int pci_ats_queue_depth(struct pci_dev *dev) { int pos; u16 cap; + if (dev->is_virtfn) + return 0; + if (dev->ats) return dev->ats->qdep; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 3c2ec64..f73bcbe 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -234,6 +234,8 @@ struct pci_ats { int pos; /* capability position */ int stu; /* Smallest Translation Unit */ int qdep; /* Invalidate Queue Depth */ + int ref_cnt; /* Physical Function reference count */ + int is_enabled:1; /* Enable bit is set */ }; #ifdef CONFIG_PCI_IOV @@ -255,7 +257,7 @@ extern int pci_ats_queue_depth(struct pci_dev *dev); */ static inline int pci_ats_enabled(struct pci_dev *dev) { - return !!dev->ats; + return dev->ats && dev->ats->is_enabled; } #else static inline int pci_iov_init(struct pci_dev *dev) -- cgit v0.10.2 From aa5d2b515b6fca5f8a56eac84f7fa0a68c1ce9b7 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:34 +0800 Subject: VT-d: parse ATSR in DMA Remapping Reporting Structure Parse the Root Port ATS Capability Reporting Structure in the DMA Remapping Reporting Structure ACPI table. Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index f23460a..6d7f9619 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -267,6 +267,84 @@ rmrr_parse_dev(struct dmar_rmrr_unit *rmrru) } return ret; } + +static LIST_HEAD(dmar_atsr_units); + +static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr) +{ + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = kzalloc(sizeof(*atsru), GFP_KERNEL); + if (!atsru) + return -ENOMEM; + + atsru->hdr = hdr; + atsru->include_all = atsr->flags & 0x1; + + list_add(&atsru->list, &dmar_atsr_units); + + return 0; +} + +static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru) +{ + int rc; + struct acpi_dmar_atsr *atsr; + + if (atsru->include_all) + return 0; + + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + rc = dmar_parse_dev_scope((void *)(atsr + 1), + (void *)atsr + atsr->header.length, + &atsru->devices_cnt, &atsru->devices, + atsr->segment); + if (rc || !atsru->devices_cnt) { + list_del(&atsru->list); + kfree(atsru); + } + + return rc; +} + +int dmar_find_matched_atsr_unit(struct pci_dev *dev) +{ + int i; + struct pci_bus *bus; + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + list_for_each_entry(atsru, &dmar_atsr_units, list) { + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (atsr->segment == pci_domain_nr(dev->bus)) + goto found; + } + + return 0; + +found: + for (bus = dev->bus; bus; bus = bus->parent) { + struct pci_dev *bridge = bus->self; + + if (!bridge || !bridge->is_pcie || + bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + return 0; + + if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) { + for (i = 0; i < atsru->devices_cnt; i++) + if (atsru->devices[i] == bridge) + return 1; + break; + } + } + + if (atsru->include_all) + return 1; + + return 0; +} #endif static void __init @@ -274,22 +352,28 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) { struct acpi_dmar_hardware_unit *drhd; struct acpi_dmar_reserved_memory *rmrr; + struct acpi_dmar_atsr *atsr; switch (header->type) { case ACPI_DMAR_TYPE_HARDWARE_UNIT: - drhd = (struct acpi_dmar_hardware_unit *)header; + drhd = container_of(header, struct acpi_dmar_hardware_unit, + header); printk (KERN_INFO PREFIX - "DRHD (flags: 0x%08x)base: 0x%016Lx\n", - drhd->flags, (unsigned long long)drhd->address); + "DRHD base: %#016Lx flags: %#x\n", + (unsigned long long)drhd->address, drhd->flags); break; case ACPI_DMAR_TYPE_RESERVED_MEMORY: - rmrr = (struct acpi_dmar_reserved_memory *)header; - + rmrr = container_of(header, struct acpi_dmar_reserved_memory, + header); printk (KERN_INFO PREFIX - "RMRR base: 0x%016Lx end: 0x%016Lx\n", + "RMRR base: %#016Lx end: %#016Lx\n", (unsigned long long)rmrr->base_address, (unsigned long long)rmrr->end_address); break; + case ACPI_DMAR_TYPE_ATSR: + atsr = container_of(header, struct acpi_dmar_atsr, header); + printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags); + break; } } @@ -363,6 +447,11 @@ parse_dmar_table(void) ret = dmar_parse_one_rmrr(entry_header); #endif break; + case ACPI_DMAR_TYPE_ATSR: +#ifdef CONFIG_DMAR + ret = dmar_parse_one_atsr(entry_header); +#endif + break; default: printk(KERN_WARNING PREFIX "Unknown DMAR structure type\n"); @@ -431,11 +520,19 @@ int __init dmar_dev_scope_init(void) #ifdef CONFIG_DMAR { struct dmar_rmrr_unit *rmrr, *rmrr_n; + struct dmar_atsr_unit *atsr, *atsr_n; + list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) { ret = rmrr_parse_dev(rmrr); if (ret) return ret; } + + list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) { + ret = atsr_parse_dev(atsr); + if (ret) + return ret; + } } #endif @@ -468,6 +565,9 @@ int __init dmar_table_init(void) #ifdef CONFIG_DMAR if (list_empty(&dmar_rmrr_units)) printk(KERN_INFO PREFIX "No RMRR found\n"); + + if (list_empty(&dmar_atsr_units)) + printk(KERN_INFO PREFIX "No ATSR found\n"); #endif #ifdef CONFIG_INTR_REMAP diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e397dc3..7c9a207 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -185,6 +185,15 @@ struct dmar_rmrr_unit { #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) + +struct dmar_atsr_unit { + struct list_head list; /* list of ATSR units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct pci_dev **devices; /* target devices */ + int devices_cnt; /* target device count */ + u8 include_all:1; /* include all ports */ +}; + /* Intel DMAR initialization functions */ extern int intel_iommu_init(void); #else diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 29e05a0..0a1939f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -331,6 +331,7 @@ static inline void __iommu_flush_cache( } extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); +extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); extern int alloc_iommu(struct dmar_drhd_unit *drhd); extern void free_iommu(struct intel_iommu *iommu); -- cgit v0.10.2 From 6ba6c3a4cacfd68bf970e3e04e2ff0d66fa0f695 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:35 +0800 Subject: VT-d: add device IOTLB invalidation support Support device IOTLB invalidation to flush the translation cached in the Endpoint. Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 6d7f9619..7b287cb 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -699,7 +699,8 @@ void free_iommu(struct intel_iommu *iommu) */ static inline void reclaim_free_desc(struct q_inval *qi) { - while (qi->desc_status[qi->free_tail] == QI_DONE) { + while (qi->desc_status[qi->free_tail] == QI_DONE || + qi->desc_status[qi->free_tail] == QI_ABORT) { qi->desc_status[qi->free_tail] = QI_FREE; qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; qi->free_cnt++; @@ -709,10 +710,13 @@ static inline void reclaim_free_desc(struct q_inval *qi) static int qi_check_fault(struct intel_iommu *iommu, int index) { u32 fault; - int head; + int head, tail; struct q_inval *qi = iommu->qi; int wait_index = (index + 1) % QI_LENGTH; + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + fault = readl(iommu->reg + DMAR_FSTS_REG); /* @@ -722,7 +726,11 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) */ if (fault & DMA_FSTS_IQE) { head = readl(iommu->reg + DMAR_IQH_REG); - if ((head >> 4) == index) { + if ((head >> DMAR_IQ_SHIFT) == index) { + printk(KERN_ERR "VT-d detected invalid descriptor: " + "low=%llx, high=%llx\n", + (unsigned long long)qi->desc[index].low, + (unsigned long long)qi->desc[index].high); memcpy(&qi->desc[index], &qi->desc[wait_index], sizeof(struct qi_desc)); __iommu_flush_cache(iommu, &qi->desc[index], @@ -732,6 +740,32 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) } } + /* + * If ITE happens, all pending wait_desc commands are aborted. + * No new descriptors are fetched until the ITE is cleared. + */ + if (fault & DMA_FSTS_ITE) { + head = readl(iommu->reg + DMAR_IQH_REG); + head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; + head |= 1; + tail = readl(iommu->reg + DMAR_IQT_REG); + tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; + + writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); + + do { + if (qi->desc_status[head] == QI_IN_USE) + qi->desc_status[head] = QI_ABORT; + head = (head - 2 + QI_LENGTH) % QI_LENGTH; + } while (head != tail); + + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + } + + if (fault & DMA_FSTS_ICE) + writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); + return 0; } @@ -741,7 +775,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) */ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) { - int rc = 0; + int rc; struct q_inval *qi = iommu->qi; struct qi_desc *hw, wait_desc; int wait_index, index; @@ -752,6 +786,9 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) hw = qi->desc; +restart: + rc = 0; + spin_lock_irqsave(&qi->q_lock, flags); while (qi->free_cnt < 3) { spin_unlock_irqrestore(&qi->q_lock, flags); @@ -782,7 +819,7 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) * update the HW tail register indicating the presence of * new descriptors. */ - writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG); + writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG); while (qi->desc_status[wait_index] != QI_DONE) { /* @@ -794,18 +831,21 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) */ rc = qi_check_fault(iommu, index); if (rc) - goto out; + break; spin_unlock(&qi->q_lock); cpu_relax(); spin_lock(&qi->q_lock); } -out: - qi->desc_status[index] = qi->desc_status[wait_index] = QI_DONE; + + qi->desc_status[index] = QI_DONE; reclaim_free_desc(qi); spin_unlock_irqrestore(&qi->q_lock, flags); + if (rc == -EAGAIN) + goto restart; + return rc; } @@ -857,6 +897,27 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, qi_submit_sync(&desc, iommu); } +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, + u64 addr, unsigned mask) +{ + struct qi_desc desc; + + if (mask) { + BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1)); + addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1; + desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + } else + desc.high = QI_DEV_IOTLB_ADDR(addr); + + if (qdep >= QI_DEV_IOTLB_MAX_INVS) + qdep = 0; + + desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + QI_DIOTLB_TYPE; + + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface. */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0a1939f..40561b2 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -53,6 +53,7 @@ #define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ #define DMAR_IQH_REG 0x80 /* Invalidation queue head register */ #define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */ +#define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ #define DMAR_ICS_REG 0x98 /* Invalidation complete status register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ @@ -198,6 +199,8 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define DMA_FSTS_PPF ((u32)2) #define DMA_FSTS_PFO ((u32)1) #define DMA_FSTS_IQE (1 << 4) +#define DMA_FSTS_ICE (1 << 5) +#define DMA_FSTS_ITE (1 << 6) #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) /* FRCD_REG, 32 bits access */ @@ -226,7 +229,8 @@ do { \ enum { QI_FREE, QI_IN_USE, - QI_DONE + QI_DONE, + QI_ABORT }; #define QI_CC_TYPE 0x1 @@ -255,6 +259,12 @@ enum { #define QI_CC_DID(did) (((u64)did) << 16) #define QI_CC_GRAN(gran) (((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4)) +#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) +#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) +#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_DEV_IOTLB_SIZE 1 +#define QI_DEV_IOTLB_MAX_INVS 32 + struct qi_desc { u64 low, high; }; @@ -344,6 +354,8 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, + u64 addr, unsigned mask); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); -- cgit v0.10.2 From 9dd2fe89062c90a964d122b8be5615d6f2203bbe Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:36 +0800 Subject: VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps Make iommu_flush_iotlb_psi() and flush_unmaps() more readable. Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index bc99b1e..6d7cb84 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -948,28 +948,23 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int pages) { - unsigned int mask; + unsigned int mask = ilog2(__roundup_pow_of_two(pages)); BUG_ON(addr & (~VTD_PAGE_MASK)); BUG_ON(pages == 0); - /* Fallback to domain selective flush if no PSI support */ - if (!cap_pgsel_inv(iommu->cap)) - return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - /* + * Fallback to domain selective flush if no PSI support or the size is + * too big. * PSI requires page size to be 2 ^ x, and the base address is naturally * aligned to the size */ - mask = ilog2(__roundup_pow_of_two(pages)); - /* Fallback to domain selective flush if size is too big */ - if (mask > cap_max_amask_val(iommu->cap)) - return iommu->flush.flush_iotlb(iommu, did, 0, 0, + if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - - return iommu->flush.flush_iotlb(iommu, did, addr, mask, - DMA_TLB_PSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr, mask, + DMA_TLB_PSI_FLUSH); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -2260,15 +2255,16 @@ static void flush_unmaps(void) if (!iommu) continue; - if (deferred_flush[i].next) { - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH); - for (j = 0; j < deferred_flush[i].next; j++) { - __free_iova(&deferred_flush[i].domain[j]->iovad, - deferred_flush[i].iova[j]); - } - deferred_flush[i].next = 0; + if (!deferred_flush[i].next) + continue; + + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH, 0); + for (j = 0; j < deferred_flush[i].next; j++) { + __free_iova(&deferred_flush[i].domain[j]->iovad, + deferred_flush[i].iova[j]); } + deferred_flush[i].next = 0; } list_size = 0; -- cgit v0.10.2 From 93a23a7271dfb811b3adb72779054c3a24433112 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:37 +0800 Subject: VT-d: support the device IOTLB Enable the device IOTLB (i.e. ATS) for both the bare metal and KVM environments. Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 6d7cb84..c3cdfc9 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -252,6 +252,7 @@ struct device_domain_info { u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ + struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ }; @@ -945,6 +946,77 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, (unsigned long long)DMA_TLB_IAIG(val)); } +static struct device_domain_info *iommu_support_dev_iotlb( + struct dmar_domain *domain, int segment, u8 bus, u8 devfn) +{ + int found = 0; + unsigned long flags; + struct device_domain_info *info; + struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn); + + if (!ecap_dev_iotlb_support(iommu->ecap)) + return NULL; + + if (!iommu->qi) + return NULL; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) + if (info->bus == bus && info->devfn == devfn) { + found = 1; + break; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + + if (!found || !info->dev) + return NULL; + + if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS)) + return NULL; + + if (!dmar_find_matched_atsr_unit(info->dev)) + return NULL; + + info->iommu = iommu; + + return info; +} + +static void iommu_enable_dev_iotlb(struct device_domain_info *info) +{ + if (!info) + return; + + pci_enable_ats(info->dev, VTD_PAGE_SHIFT); +} + +static void iommu_disable_dev_iotlb(struct device_domain_info *info) +{ + if (!info->dev || !pci_ats_enabled(info->dev)) + return; + + pci_disable_ats(info->dev); +} + +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, + u64 addr, unsigned mask) +{ + u16 sid, qdep; + unsigned long flags; + struct device_domain_info *info; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (!info->dev || !pci_ats_enabled(info->dev)) + continue; + + sid = info->bus << 8 | info->devfn; + qdep = pci_ats_queue_depth(info->dev); + qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); + } + spin_unlock_irqrestore(&device_domain_lock, flags); +} + static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int pages) { @@ -965,6 +1037,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, else iommu->flush.flush_iotlb(iommu, did, addr, mask, DMA_TLB_PSI_FLUSH); + if (did) + iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1305,6 +1379,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, unsigned long ndomains; int id; int agaw; + struct device_domain_info *info = NULL; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1372,15 +1447,21 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, context_set_domain_id(context, id); + if (translation != CONTEXT_TT_PASS_THROUGH) { + info = iommu_support_dev_iotlb(domain, segment, bus, devfn); + translation = info ? CONTEXT_TT_DEV_IOTLB : + CONTEXT_TT_MULTI_LEVEL; + } /* * In pass through mode, AW must be programmed to indicate the largest * AGAW value supported by hardware. And ASR is ignored by hardware. */ - if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) { - context_set_address_width(context, iommu->agaw); - context_set_address_root(context, virt_to_phys(pgd)); - } else + if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) context_set_address_width(context, iommu->msagaw); + else { + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, iommu->agaw); + } context_set_translation_type(context, translation); context_set_fault_enable(context); @@ -1402,6 +1483,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, } else { iommu_flush_write_buffer(iommu); } + iommu_enable_dev_iotlb(info); spin_unlock_irqrestore(&iommu->lock, flags); spin_lock_irqsave(&domain->iommu_lock, flags); @@ -1552,6 +1634,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); free_devinfo_mem(info); @@ -2259,10 +2342,16 @@ static void flush_unmaps(void) continue; iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); for (j = 0; j < deferred_flush[i].next; j++) { - __free_iova(&deferred_flush[i].domain[j]->iovad, - deferred_flush[i].iova[j]); + unsigned long mask; + struct iova *iova = deferred_flush[i].iova[j]; + + mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT; + mask = ilog2(mask >> VTD_PAGE_SHIFT); + iommu_flush_dev_iotlb(deferred_flush[i].domain[j], + iova->pfn_lo << PAGE_SHIFT, mask); + __free_iova(&deferred_flush[i].domain[j]->iovad, iova); } deferred_flush[i].next = 0; } @@ -2943,6 +3032,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, pdev); free_devinfo_mem(info); @@ -2993,6 +3083,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags1); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, info->dev); @@ -3197,11 +3288,11 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EFAULT; } - ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); + ret = vm_domain_add_dev_info(dmar_domain, pdev); if (ret) return ret; - ret = vm_domain_add_dev_info(dmar_domain, pdev); + ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); return ret; } diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index e0a03af..5619f85 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -14,6 +14,7 @@ #define DMA_PTE_SNP (1 << 11) #define CONTEXT_TT_MULTI_LEVEL 0 +#define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 struct intel_iommu; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 40561b2..482dc91 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -124,6 +124,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define ecap_pass_through(e) ((e >> 6) & 0x1) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) ((e >> 3) & 0x1) +#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) #define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ -- cgit v0.10.2 From 008fe148cb0fb51d266baabe2c09997b21cf90c6 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Tue, 26 May 2009 15:45:12 -0700 Subject: intel-iommu: Fix one last ia64 build problem in Pass Through Support On ia64 with CONFIG_DMAR=n and CONFIG_SWIOTLB=y (as used in arch/ia64/configs/tiger_defconfig) there is still a link error with iommu_pass_through listed as an undefined symbol: arch/ia64/kernel/built-in.o: In function `pci_swiotlb_init': (.init.text+0x7f70): undefined reference to `iommu_pass_through' Fix it by #defining iommu_pass_through away in asm/iommu.h Signed-off-by: Tony Luck Signed-off-by: David Woodhouse diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 37d41ca..745e095 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -9,7 +9,11 @@ extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); extern int force_iommu, no_iommu; extern int iommu_detected; +#ifdef CONFIG_DMAR extern int iommu_pass_through; +#else +#define iommu_pass_through (0) +#endif extern void iommu_dma_init(void); extern void machvec_init(const char *name); -- cgit v0.10.2