From b779260b097dec295e8798277ed08ec5ecd3b2c1 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 3 May 2011 00:08:37 -0700 Subject: intel-iommu: fix VT-d PMR disable for TXT on S3 resume This patch is a follow on to https://lkml.org/lkml/2011/3/21/239, which was merged as commit 51a63e67da6056c13b5b597dcc9e1b3bd7ceaa55. This patch adds support for S3, as pointed out by Chris Wright. Signed-off-by: Joseph Cihula Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index d552d2c..4e4e020 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -142,6 +142,12 @@ static void __init check_tylersburg_isoch(void); static int rwbf_quirk; /* + * set to 1 to panic kernel if can't successfully enable VT-d + * (used when kernel is launched w/ TXT) + */ +static int force_on = 0; + +/* * 0: Present * 1-11: Reserved * 12-63: Context Ptr (12 - (haw-1)) @@ -2217,7 +2223,7 @@ static int __init iommu_prepare_static_identity_mapping(int hw) return 0; } -static int __init init_dmars(int force_on) +static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; struct dmar_rmrr_unit *rmrr; @@ -3117,7 +3123,17 @@ static int init_iommu_hw(void) if (iommu->qi) dmar_reenable_qi(iommu); - for_each_active_iommu(iommu, drhd) { + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + /* + * we always have to disable PMRs or DMA may fail on + * this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + continue; + } + iommu_flush_write_buffer(iommu); iommu_set_root_entry(iommu); @@ -3126,7 +3142,8 @@ static int init_iommu_hw(void) DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - iommu_enable_translation(iommu); + if (iommu_enable_translation(iommu)) + return 1; iommu_disable_protect_mem_regions(iommu); } @@ -3193,7 +3210,10 @@ static void iommu_resume(void) unsigned long flag; if (init_iommu_hw()) { - WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + if (force_on) + panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); + else + WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); return; } @@ -3270,7 +3290,6 @@ static struct notifier_block device_nb = { int __init intel_iommu_init(void) { int ret = 0; - int force_on = 0; /* VT-d is required for a TXT/tboot launch, so enforce that */ force_on = tboot_force_iommu(); @@ -3308,7 +3327,7 @@ int __init intel_iommu_init(void) init_no_remapping_devices(); - ret = init_dmars(force_on); + ret = init_dmars(); if (ret) { if (force_on) panic("tboot: Failed to initialize DMARs\n"); -- cgit v0.10.2 From b3a530e4e734cab7043a3ab7d135f539042443a7 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 15 May 2011 12:34:55 +0200 Subject: intel-iommu: Remove obsolete comment from detect_intel_iommu Since cacd4213d8, this comment no longer applies. Signed-off-by: Jan Kiszka Signed-off-by: David Woodhouse diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 12e02bf..3dc9bef 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -698,12 +698,7 @@ int __init detect_intel_iommu(void) { #ifdef CONFIG_INTR_REMAP struct acpi_table_dmar *dmar; - /* - * for now we will disable dma-remapping when interrupt - * remapping is enabled. - * When support for queued invalidation for IOTLB invalidation - * is added, we will not need this any more. - */ + dmar = (struct acpi_table_dmar *) dmar_tbl; if (ret && cpu_has_x2apic && dmar->flags & 0x1) printk(KERN_INFO -- cgit v0.10.2 From 7b668357810ecb5fdda4418689d50f5d95aea6a8 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 May 2011 12:02:41 +0100 Subject: intel-iommu: Flush unmaps at domain_exit We typically batch unmaps to be lazily flushed out at regular intervals. When we destroy a domain, we need to force a flush of these lazy unmaps to be sure none reference the domain we're about to free. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=35062 Signed-off-by: Alex Williamson Signed-off-by: David Woodhouse Cc: stable@kernel.org diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 4e4e020..395f253 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1422,6 +1422,10 @@ static void domain_exit(struct dmar_domain *domain) if (!domain) return; + /* Flush any lazy unmaps that may reference this domain */ + if (!intel_iommu_strict) + flush_unmaps_timeout(0); + domain_remove_dev_info(domain); /* destroy iovas */ put_iova_domain(&domain->iovad); -- cgit v0.10.2 From 6dd9a7c73761a8a5f5475d5cfdc15368a0f4c06d Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Wed, 25 May 2011 19:13:49 +0100 Subject: intel-iommu: Enable super page (2MiB, 1GiB, etc.) support There are no externally-visible changes with this. In the loop in the internal __domain_mapping() function, we simply detect if we are mapping: - size >= 2MiB, and - virtual address aligned to 2MiB, and - physical address aligned to 2MiB, and - on hardware that supports superpages. (and likewise for larger superpages). We automatically use a superpage for such mappings. We never have to worry about *breaking* superpages, since we trust that we will always *unmap* the same range that was mapped. So all we need to do is ensure that dma_pte_clear_range() will also cope with superpages. Adjust pfn_to_dma_pte() to take a superpage 'level' as an argument, so it can return a PTE at the appropriate level rather than always extending the page tables all the way down to level 1. Again, this is simplified by the fact that we should never encounter existing small pages when we're creating a mapping; any old mapping that used the same virtual range will have been entirely removed and its obsolete page tables freed. Provide an 'intel_iommu=sp_off' argument on the command line as a chicken bit. Not that it should ever be required. == The original commit seen in the iommu-2.6.git was Youquan's implementation (and completion) of my own half-baked code which I'd typed into an email. Followed by half a dozen subsequent 'fixes'. I've taken the unusual step of rewriting history and collapsing the original commits in order to keep the main history simpler, and make life easier for the people who are going to have to backport this to older kernels. And also so I can give it a more coherent commit comment which (hopefully) gives a better explanation of what's going on. The original sequence of commits leading to identical code was: Youquan Song (3): intel-iommu: super page support intel-iommu: Fix superpage alignment calculation error intel-iommu: Fix superpage level calculation error in dma_pfn_level_pte() David Woodhouse (4): intel-iommu: Precalculate superpage support for dmar_domain intel-iommu: Fix hardware_largepage_caps() intel-iommu: Fix inappropriate use of superpages in __domain_mapping() intel-iommu: Fix phys_pfn in __domain_mapping for sglist pages Signed-off-by: Youquan Song Signed-off-by: David Woodhouse diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cc85a92..d005487 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -999,7 +999,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. With this option on every unmap_single operation will result in a hardware IOTLB flush operation as opposed to batching them for performance. - + sp_off [Default Off] + By default, super page will be supported if Intel IOMMU + has the capability. With this option, super page will + not be supported. intremap= [X86-64, Intel-IOMMU] Format: { on (default) | off | nosid } on enable Interrupt Remapping (default) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 395f253..e6fe199 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -115,6 +115,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level) return (pfn + level_size(level) - 1) & level_mask(level); } +static inline unsigned long lvl_to_nr_pages(unsigned int lvl) +{ + return 1 << ((lvl - 1) * LEVEL_STRIDE); +} + /* VT-d pages must always be _smaller_ than MM pages. Otherwise things are never going to work. */ static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) @@ -343,6 +348,9 @@ struct dmar_domain { int iommu_coherency;/* indicate coherency of iommu access */ int iommu_snooping; /* indicate snooping control feature*/ int iommu_count; /* reference count of iommu */ + int iommu_superpage;/* Level of superpages supported: + 0 == 4KiB (no superpages), 1 == 2MiB, + 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ spinlock_t iommu_lock; /* protect iommu set in domain */ u64 max_addr; /* maximum mapped address */ }; @@ -392,6 +400,7 @@ int dmar_disabled = 1; static int dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; +static int intel_iommu_superpage = 1; #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) static DEFINE_SPINLOCK(device_domain_lock); @@ -422,6 +431,10 @@ static int __init intel_iommu_setup(char *str) printk(KERN_INFO "Intel-IOMMU: disable batched IOTLB flush\n"); intel_iommu_strict = 1; + } else if (!strncmp(str, "sp_off", 6)) { + printk(KERN_INFO + "Intel-IOMMU: disable supported super page\n"); + intel_iommu_superpage = 0; } str += strcspn(str, ","); @@ -560,11 +573,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain) } } +static void domain_update_iommu_superpage(struct dmar_domain *domain) +{ + int i, mask = 0xf; + + if (!intel_iommu_superpage) { + domain->iommu_superpage = 0; + return; + } + + domain->iommu_superpage = 4; /* 1TiB */ + + for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) { + mask |= cap_super_page_val(g_iommus[i]->cap); + if (!mask) { + break; + } + } + domain->iommu_superpage = fls(mask); +} + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain_update_iommu_snooping(domain); + domain_update_iommu_superpage(domain); } static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn) @@ -694,23 +728,31 @@ out: } static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, - unsigned long pfn) + unsigned long pfn, int large_level) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; struct dma_pte *parent, *pte = NULL; int level = agaw_to_level(domain->agaw); - int offset; + int offset, target_level; BUG_ON(!domain->pgd); BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width); parent = domain->pgd; + /* Search pte */ + if (!large_level) + target_level = 1; + else + target_level = large_level; + while (level > 0) { void *tmp_page; offset = pfn_level_offset(pfn, level); pte = &parent[offset]; - if (level == 1) + if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE)) + break; + if (level == target_level) break; if (!dma_pte_present(pte)) { @@ -738,10 +780,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, return pte; } + /* return address's pte at specific level */ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, unsigned long pfn, - int level) + int level, int *large_page) { struct dma_pte *parent, *pte = NULL; int total = agaw_to_level(domain->agaw); @@ -754,8 +797,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, if (level == total) return pte; - if (!dma_pte_present(pte)) + if (!dma_pte_present(pte)) { + *large_page = total; break; + } + + if (pte->val & DMA_PTE_LARGE_PAGE) { + *large_page = total; + return pte; + } + parent = phys_to_virt(dma_pte_addr(pte)); total--; } @@ -768,6 +819,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain, unsigned long last_pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + unsigned int large_page = 1; struct dma_pte *first_pte, *pte; BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); @@ -776,14 +828,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain, /* we don't need lock here; nobody else touches the iova range */ do { - first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1); + large_page = 1; + first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); if (!pte) { - start_pfn = align_to_level(start_pfn + 1, 2); + start_pfn = align_to_level(start_pfn + 1, large_page + 1); continue; } - do { + do { dma_clear_pte(pte); - start_pfn++; + start_pfn += lvl_to_nr_pages(large_page); pte++; } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); @@ -803,6 +856,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, int total = agaw_to_level(domain->agaw); int level; unsigned long tmp; + int large_page = 2; BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); @@ -818,7 +872,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, return; do { - first_pte = pte = dma_pfn_level_pte(domain, tmp, level); + large_page = level; + first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page); + if (large_page > level) + level = large_page + 1; if (!pte) { tmp = align_to_level(tmp + 1, level + 1); continue; @@ -1402,6 +1459,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width) else domain->iommu_snooping = 0; + domain->iommu_superpage = fls(cap_super_page_val(iommu->cap)); domain->iommu_count = 1; domain->nid = iommu->node; @@ -1657,6 +1715,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr, return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; } +/* Return largest possible superpage level for a given mapping */ +static inline int hardware_largepage_caps(struct dmar_domain *domain, + unsigned long iov_pfn, + unsigned long phy_pfn, + unsigned long pages) +{ + int support, level = 1; + unsigned long pfnmerge; + + support = domain->iommu_superpage; + + /* To use a large page, the virtual *and* physical addresses + must be aligned to 2MiB/1GiB/etc. Lower bits set in either + of them will mean we have to use smaller pages. So just + merge them and check both at once. */ + pfnmerge = iov_pfn | phy_pfn; + + while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { + pages >>= VTD_STRIDE_SHIFT; + if (!pages) + break; + pfnmerge >>= VTD_STRIDE_SHIFT; + level++; + support--; + } + return level; +} + static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, struct scatterlist *sg, unsigned long phys_pfn, unsigned long nr_pages, int prot) @@ -1665,6 +1751,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, phys_addr_t uninitialized_var(pteval); int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; unsigned long sg_res; + unsigned int largepage_lvl = 0; + unsigned long lvl_pages = 0; BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); @@ -1680,7 +1768,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; } - while (nr_pages--) { + while (nr_pages > 0) { uint64_t tmp; if (!sg_res) { @@ -1688,11 +1776,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset; sg->dma_length = sg->length; pteval = page_to_phys(sg_page(sg)) | prot; + phys_pfn = pteval >> VTD_PAGE_SHIFT; } + if (!pte) { - first_pte = pte = pfn_to_dma_pte(domain, iov_pfn); + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); + + first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl); if (!pte) return -ENOMEM; + /* It is large page*/ + if (largepage_lvl > 1) + pteval |= DMA_PTE_LARGE_PAGE; + else + pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; + } /* We don't need lock here, nobody else * touches the iova range @@ -1708,16 +1806,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, } WARN_ON(1); } + + lvl_pages = lvl_to_nr_pages(largepage_lvl); + + BUG_ON(nr_pages < lvl_pages); + BUG_ON(sg_res < lvl_pages); + + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + phys_pfn += lvl_pages; + pteval += lvl_pages * VTD_PAGE_SIZE; + sg_res -= lvl_pages; + + /* If the next PTE would be the first in a new page, then we + need to flush the cache on the entries we've just written. + And then we'll need to recalculate 'pte', so clear it and + let it get set again in the if (!pte) block above. + + If we're done (!nr_pages) we need to flush the cache too. + + Also if we've been setting superpages, we may need to + recalculate 'pte' and switch back to smaller pages for the + end of the mapping, if the trailing size is not enough to + use another superpage (i.e. sg_res < lvl_pages). */ pte++; - if (!nr_pages || first_pte_in_page(pte)) { + if (!nr_pages || first_pte_in_page(pte) || + (largepage_lvl > 1 && sg_res < lvl_pages)) { domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); pte = NULL; } - iov_pfn++; - pteval += VTD_PAGE_SIZE; - sg_res--; - if (!sg_res) + + if (!sg_res && nr_pages) sg = sg_next(sg); } return 0; @@ -3527,6 +3647,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) domain->iommu_count = 0; domain->iommu_coherency = 0; domain->iommu_snooping = 0; + domain->iommu_superpage = 0; domain->max_addr = 0; domain->nid = -1; @@ -3742,7 +3863,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, struct dma_pte *pte; u64 phys = 0; - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT); + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0); if (pte) phys = dma_pte_addr(pte); diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 5619f85..bbd8661 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -9,8 +9,12 @@ #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) +#define VTD_STRIDE_SHIFT (9) +#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) + #define DMA_PTE_READ (1) #define DMA_PTE_WRITE (2) +#define DMA_PTE_LARGE_PAGE (1 << 7) #define DMA_PTE_SNP (1 << 11) #define CONTEXT_TT_MULTI_LEVEL 0 -- cgit v0.10.2 From 9b4554b21ed07e8556405510638171f0c787742a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 May 2011 12:19:04 -0400 Subject: intel-iommu: Only unlink device domains from iommu Commit a97590e5 added unlinking domains from iommus to reciprocate the iommu from domains unlinking that was already done. We actually want to only do this for device domains and never for the static identity map domain or VM domains. The SI domain is special and never freed, while VM domain->id lives in their own special address space, separate from iommu->domain_ids. In the current code, a VM can get domain->id zero, then mark that domain unused when unbound from pci-stub. This leads to DMAR write faults when the device is re-bound to the host driver. Signed-off-by: Alex Williamson Cc: stable@kernel.org Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index e6fe199..4eaec2f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3561,10 +3561,13 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain, domain_update_iommu_cap(domain); spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags); - spin_lock_irqsave(&iommu->lock, tmp_flags); - clear_bit(domain->id, iommu->domain_ids); - iommu->domains[domain->id] = NULL; - spin_unlock_irqrestore(&iommu->lock, tmp_flags); + if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) && + !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) { + spin_lock_irqsave(&iommu->lock, tmp_flags); + clear_bit(domain->id, iommu->domain_ids); + iommu->domains[domain->id] = NULL; + spin_unlock_irqrestore(&iommu->lock, tmp_flags); + } } spin_unlock_irqrestore(&device_domain_lock, flags); -- cgit v0.10.2 From 8fcc5372fbac085199d84a880503ed67aba3fe49 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sat, 28 May 2011 13:15:02 -0500 Subject: intel-iommu: Check for identity mapping candidate using system dma mask The identity mapping code appears to make the assumption that if the devices dma_mask is greater than 32bits the device can use identity mapping. But that is not true: take the case where we have a 40bit device in a 44bit architecture. The device can potentially receive a physical address that it will truncate and cause incorrect addresses to be used. Instead check to see if the device's dma_mask is large enough to address the system's dma_mask. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 4eaec2f..dcf051d 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2316,8 +2316,19 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup) * Assume that they will -- if they turn out not to be, then we can * take them out of the 1:1 domain later. */ - if (!startup) - return pdev->dma_mask > DMA_BIT_MASK(32); + if (!startup) { + /* + * If the device's dma_mask is less than the system's memory + * size then this is not a candidate for identity mapping. + */ + u64 dma_mask = pdev->dma_mask; + + if (pdev->dev.coherent_dma_mask && + pdev->dev.coherent_dma_mask < dma_mask) + dma_mask = pdev->dev.coherent_dma_mask; + + return dma_mask >= dma_get_required_mask(&pdev->dev); + } return 1; } -- cgit v0.10.2 From cb452a4040bb051d92e85d6e7eb60c11734c1781 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 28 May 2011 13:15:03 -0500 Subject: intel-iommu: Speed up processing of the identity_mapping function When there are a large count of PCI devices, and the pass through option for iommu is set, much time is spent in the identity_mapping function hunting though the iommu domains to check if a specific device is "identity mapped". Speed up the function by checking the cached info to see if it's mapped to the static identity domain. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index dcf051d..98be0b5 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2235,10 +2235,10 @@ static int identity_mapping(struct pci_dev *pdev) if (likely(!iommu_identity_mapping)) return 0; + info = pdev->dev.archdata.iommu; + if (info && info != DUMMY_DEVICE_DOMAIN_INFO) + return (info->domain == si_domain); - list_for_each_entry(info, &si_domain->devices, link) - if (info->dev == pdev) - return 1; return 0; } -- cgit v0.10.2 From 1c9fc3d11b84fbd0c4f4aa7855702c2a1f098ebb Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sat, 28 May 2011 13:15:04 -0500 Subject: intel-iommu: Dont cache iova above 32bit Mike Travis and Mike Habeck reported an issue where iova allocation would return a range that was larger than a device's dma mask. https://lkml.org/lkml/2011/3/29/423 The dmar initialization code will reserve all PCI MMIO regions and copy those reservations into a domain specific iova tree. It is possible for one of those regions to be above the dma mask of a device. It is typical to allocate iovas with a 32bit mask (despite device's dma mask possibly being larger) and cache the result until it exhausts the lower 32bit address space. Freeing the iova range that is >= the last iova in the lower 32bit range when there is still an iova above the 32bit range will corrupt the cached iova by pointing it to a region that is above 32bit. If that region is also larger than the device's dma mask, a subsequent allocation will return an unusable iova and cause dma failure. Simply don't cache an iova that is above the 32bit caching boundary. Reported-by: Mike Travis Reported-by: Mike Habeck Cc: stable@kernel.org Acked-by: Mike Travis Tested-by: Mike Habeck Signed-off-by: Chris Wright Signed-off-by: David Woodhouse diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c index 9606e59..c5c274a 100644 --- a/drivers/pci/iova.c +++ b/drivers/pci/iova.c @@ -63,8 +63,16 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) curr = iovad->cached32_node; cached_iova = container_of(curr, struct iova, node); - if (free->pfn_lo >= cached_iova->pfn_lo) - iovad->cached32_node = rb_next(&free->node); + if (free->pfn_lo >= cached_iova->pfn_lo) { + struct rb_node *node = rb_next(&free->node); + struct iova *iova = container_of(node, struct iova, node); + + /* only cache if it's below 32bit pfn */ + if (node && iova->pfn_lo < iovad->dma_32bit_pfn) + iovad->cached32_node = node; + else + iovad->cached32_node = NULL; + } } /* Computes the padding size required, to make the -- cgit v0.10.2 From c681d0ba1252954208220ad32248a3e8e2fc98e4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 28 May 2011 13:15:05 -0500 Subject: intel-iommu: Use coherent DMA mask when requested The __intel_map_single function is not honoring the passed in DMA mask. This results in not using the coherent DMA mask when called from intel_alloc_coherent(). Signed-off-by: Mike Travis Acked-by: Chris Wright Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 98be0b5..5cbab7f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2732,8 +2732,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, iommu = domain_get_iommu(domain); size = aligned_nrpages(paddr, size); - iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), - pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask); if (!iova) goto error; -- cgit v0.10.2 From 825507d6d059f1cbe2503e0e5a3926225b983aec Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 28 May 2011 13:15:06 -0500 Subject: intel-iommu: Remove Host Bridge devices from identity mapping When using the 1:1 (identity) PCI DMA remapping, PCI Host Bridge devices that do not use the IOMMU causes a kernel panic. Fix that by not inserting those devices into the si_domain. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 5cbab7f..7f757ce 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -46,6 +46,8 @@ #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE +#define IS_BRIDGE_HOST_DEVICE(pdev) \ + ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST) #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) @@ -2343,6 +2345,9 @@ static int __init iommu_prepare_static_identity_mapping(int hw) return -EFAULT; for_each_pci_dev(pdev) { + /* Skip Host/PCI Bridge devices */ + if (IS_BRIDGE_HOST_DEVICE(pdev)) + continue; if (iommu_should_identity_map(pdev, 1)) { printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n", hw ? "hardware" : "software", pci_name(pdev)); -- cgit v0.10.2 From 8519dc4401ddf8a5399f979870bbeeadbc111186 Mon Sep 17 00:00:00 2001 From: Mike Habeck Date: Sat, 28 May 2011 13:15:07 -0500 Subject: intel-iommu: Add domain check in domain_remove_one_dev_info The comment in domain_remove_one_dev_info() states "No need to compare PCI domain; it has to be the same". But for the si_domain that isn't going to be true, as it consists of all the PCI devices that are identity mapped thus multiple PCI domains can be in si_domain. The code needs to validate the PCI domain too. Signed-off-by: Mike Habeck Signed-off-by: Mike Travis Cc: stable@kernel.org Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 7f757ce..b0c96d3 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3537,8 +3537,8 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain, spin_lock_irqsave(&device_domain_lock, flags); list_for_each_safe(entry, tmp, &domain->devices) { info = list_entry(entry, struct device_domain_info, link); - /* No need to compare PCI domain; it has to be the same */ - if (info->bus == pdev->bus->number && + if (info->segment == pci_domain_nr(pdev->bus) && + info->bus == pdev->bus->number && info->devfn == pdev->devfn) { list_del(&info->link); list_del(&info->global); -- cgit v0.10.2 From 70e535d1e5d1e4317e894d6228b762cf9c3fbc6a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 31 May 2011 00:22:52 +0100 Subject: intel-iommu: Fix off-by-one in RMRR setup We were mapping an extra byte (and hence usually an extra page): iommu_prepare_identity_map() expects to be given an 'end' argument which is the last byte to be mapped; not the first byte *not* to be mapped. Signed-off-by: David Woodhouse diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index b0c96d3..a8867bd 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2147,7 +2147,7 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) return 0; return iommu_prepare_identity_map(pdev, rmrr->base_address, - rmrr->end_address + 1); + rmrr->end_address); } #ifdef CONFIG_DMAR_FLOPPY_WA @@ -2161,7 +2161,7 @@ static inline void iommu_prepare_isa(void) return; printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n"); - ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); + ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1); if (ret) printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; " -- cgit v0.10.2