From 7752d5cfe3d11ca0bb9c673ec38bd78ba6578f8e Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Fri, 15 Feb 2008 01:27:20 -0800 Subject: x86: validate against acpi motherboard resources This path adds validation of the MMCONFIG table against the ACPI reserved motherboard resources. If the MMCONFIG table is found to be reserved in ACPI, we don't bother checking the E820 table. The PCI Express firmware spec apparently tells BIOS developers that reservation in ACPI is required and E820 reservation is optional, so checking against ACPI first makes sense. Many BIOSes don't reserve the MMCONFIG region in E820 even though it is perfectly functional, the existing check needlessly disables MMCONFIG in these cases. In order to do this, MMCONFIG setup has been split into two phases. If PCI configuration type 1 is not available then MMCONFIG is enabled early as before. Otherwise, it is enabled later after the ACPI interpreter is enabled, since we need to be able to execute control methods in order to check the ACPI reserved resources. Presently this is just triggered off the end of ACPI interpreter initialization. There are a few other behavioral changes here: - Validate all MMCONFIG configurations provided, not just the first one. - Validate the entire required length of each configuration according to the provided ending bus number is reserved, not just the minimum required allocation. - Validate that the area is reserved even if we read it from the chipset directly and not from the MCFG table. This catches the case where the BIOS didn't set the location properly in the chipset and has mapped it over other things it shouldn't have. This also cleans up the MMCONFIG initialization functions so that they simply do nothing if MMCONFIG is not compiled in. Based on an original patch by Rajesh Shah from Intel. [akpm@linux-foundation.org: many fixes and cleanups] Signed-off-by: Robert Hancock Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Greg KH Signed-off-by: Thomas Gleixner Tested-by: Andi Kleen Cc: Rajesh Shah Cc: Jesse Barnes Acked-by: Linus Torvalds Cc: Andi Kleen Cc: Greg KH Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 3de9f9b..2080b04 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -11,9 +11,7 @@ static __init int pci_access_init(void) #ifdef CONFIG_PCI_DIRECT type = pci_direct_probe(); #endif -#ifdef CONFIG_PCI_MMCONFIG - pci_mmcfg_init(type); -#endif + pci_mmcfg_early_init(type); if (raw_pci_ops) return 0; #ifdef CONFIG_PCI_BIOS diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8d54df4..498e35e 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -173,9 +173,78 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) pci_mmcfg_resources_inserted = 1; } -static void __init pci_mmcfg_reject_broken(int type) +static acpi_status __init check_mcfg_resource(struct acpi_resource *res, + void *data) +{ + struct resource *mcfg_res = data; + struct acpi_resource_address64 address; + acpi_status status; + + if (res->type == ACPI_RESOURCE_TYPE_FIXED_MEMORY32) { + struct acpi_resource_fixed_memory32 *fixmem32 = + &res->data.fixed_memory32; + if (!fixmem32) + return AE_OK; + if ((mcfg_res->start >= fixmem32->address) && + (mcfg_res->end < (fixmem32->address + + fixmem32->address_length))) { + mcfg_res->flags = 1; + return AE_CTRL_TERMINATE; + } + } + if ((res->type != ACPI_RESOURCE_TYPE_ADDRESS32) && + (res->type != ACPI_RESOURCE_TYPE_ADDRESS64)) + return AE_OK; + + status = acpi_resource_to_address64(res, &address); + if (ACPI_FAILURE(status) || + (address.address_length <= 0) || + (address.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + if ((mcfg_res->start >= address.minimum) && + (mcfg_res->end < (address.minimum + address.address_length))) { + mcfg_res->flags = 1; + return AE_CTRL_TERMINATE; + } + return AE_OK; +} + +static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, + void *context, void **rv) +{ + struct resource *mcfg_res = context; + + acpi_walk_resources(handle, METHOD_NAME__CRS, + check_mcfg_resource, context); + + if (mcfg_res->flags) + return AE_CTRL_TERMINATE; + + return AE_OK; +} + +static int __init is_acpi_reserved(unsigned long start, unsigned long end) +{ + struct resource mcfg_res; + + mcfg_res.start = start; + mcfg_res.end = end; + mcfg_res.flags = 0; + + acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL); + + if (!mcfg_res.flags) + acpi_get_devices("PNP0C02", find_mboard_resource, &mcfg_res, + NULL); + + return mcfg_res.flags; +} + +static void __init pci_mmcfg_reject_broken(void) { typeof(pci_mmcfg_config[0]) *cfg; + int i; if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || @@ -196,17 +265,37 @@ static void __init pci_mmcfg_reject_broken(int type) goto reject; } - /* - * Only do this check when type 1 works. If it doesn't work - * assume we run on a Mac and always use MCFG - */ - if (type == 1 && !e820_all_mapped(cfg->address, - cfg->address + MMCONFIG_APER_MIN, - E820_RESERVED)) { - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" - " E820-reserved\n", cfg->address); - goto reject; + for (i = 0; i < pci_mmcfg_config_num; i++) { + u32 size = (cfg->end_bus_number + 1) << 20; + cfg = &pci_mmcfg_config[i]; + printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lu " + "segment %hu buses %u - %u\n", + i, (unsigned long)cfg->address, cfg->pci_segment, + (unsigned int)cfg->start_bus_number, + (unsigned int)cfg->end_bus_number); + if (is_acpi_reserved(cfg->address, cfg->address + size - 1)) { + printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved " + "in ACPI motherboard resources\n", + cfg->address); + } else { + printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" + " reserved in ACPI motherboard resources\n", + cfg->address); + /* Don't try to do this check unless configuration + type 1 is available. */ + if ((pci_probe & PCI_PROBE_CONF1) && + e820_all_mapped(cfg->address, + cfg->address + size - 1, + E820_RESERVED)) + printk(KERN_NOTICE + "PCI: MCFG area at %Lx reserved in " + "E820\n", + cfg->address); + else + goto reject; + } } + return; reject: @@ -216,20 +305,46 @@ reject: pci_mmcfg_config_num = 0; } -void __init pci_mmcfg_init(int type) +void __init pci_mmcfg_early_init(int type) +{ + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return; + + /* If type 1 access is available, no need to enable MMCONFIG yet, we can + defer until later when the ACPI interpreter is available to better + validate things. */ + if (type == 1) + return; + + acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return; + + if (pci_mmcfg_arch_init()) + pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; +} + +void __init pci_mmcfg_late_init(void) { int known_bridge = 0; + /* MMCONFIG disabled */ if ((pci_probe & PCI_PROBE_MMCONF) == 0) return; - if (type == 1 && pci_mmcfg_check_hostbridge()) - known_bridge = 1; + /* MMCONFIG already enabled */ + if (!(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF)) + return; - if (!known_bridge) { + if ((pci_probe & PCI_PROBE_CONF1) && pci_mmcfg_check_hostbridge()) + known_bridge = 1; + else acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - pci_mmcfg_reject_broken(type); - } + + pci_mmcfg_reject_broken(); if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index c4bddae..28b9b72 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -97,7 +97,6 @@ extern struct pci_raw_ops pci_direct_conf1; extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern void pci_mmcfg_init(int type); /* pci-mmconfig.c */ diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 2d1955c..a6dbcf4 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -35,6 +35,7 @@ #ifdef CONFIG_X86 #include #endif +#include #include #include @@ -784,6 +785,7 @@ static int __init acpi_init(void) result = acpi_bus_init(); if (!result) { + pci_mmcfg_late_init(); if (!(pm_flags & PM_APM)) pm_flags |= PM_ACPI; else { diff --git a/include/linux/pci.h b/include/linux/pci.h index 2924913..43a4f9c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1053,5 +1053,13 @@ extern unsigned long pci_cardbus_mem_size; extern int pcibios_add_platform_entries(struct pci_dev *dev); +#ifdef CONFIG_PCI_MMCONFIG +extern void __init pci_mmcfg_early_init(int type); +extern void __init pci_mmcfg_late_init(void); +#else +static inline void pci_mmcfg_early_init(int type) { } +static inline void pci_mmcfg_late_init(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ -- cgit v0.10.2 From 0b64ad7123eb013c3de26750f2d4c356cd566231 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 Feb 2008 01:28:41 -0800 Subject: x86: clear pci_mmcfg_virt when mmcfg get rejected For x86_64, need to free pci_mmcfg_virt, and iounmap some pointers when MMCONF is not reserved in E820 or acpi _CRS and get rejected. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Greg KH Cc: Greg KH Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 498e35e..8f20495 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -300,6 +300,7 @@ static void __init pci_mmcfg_reject_broken(void) reject: printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + pci_mmcfg_arch_free(); kfree(pci_mmcfg_config); pci_mmcfg_config = NULL; pci_mmcfg_config_num = 0; diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 081816a..f3c761d 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -136,3 +136,7 @@ int __init pci_mmcfg_arch_init(void) raw_pci_ext_ops = &pci_mmcfg; return 1; } + +void __init pci_mmcfg_arch_free(void) +{ +} diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 9207fd4..a199416 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -127,7 +127,7 @@ static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) int __init pci_mmcfg_arch_init(void) { int i; - pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * + pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); @@ -141,9 +141,29 @@ int __init pci_mmcfg_arch_init(void) printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " "segment %d\n", pci_mmcfg_config[i].pci_segment); + pci_mmcfg_arch_free(); return 0; } } raw_pci_ext_ops = &pci_mmcfg; return 1; } + +void __init pci_mmcfg_arch_free(void) +{ + int i; + + if (pci_mmcfg_virt == NULL) + return; + + for (i = 0; i < pci_mmcfg_config_num; ++i) { + if (pci_mmcfg_virt[i].virt) { + iounmap(pci_mmcfg_virt[i].virt); + pci_mmcfg_virt[i].virt = NULL; + pci_mmcfg_virt[i].cfg = NULL; + } + } + + kfree(pci_mmcfg_virt); + pci_mmcfg_virt = NULL; +} diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 28b9b72..c8b89a8 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -101,6 +101,7 @@ extern void pci_pcbios_init(void); /* pci-mmconfig.c */ extern int __init pci_mmcfg_arch_init(void); +extern void __init pci_mmcfg_arch_free(void); /* * AMD Fam10h CPUs are buggy, and cannot access MMIO config space -- cgit v0.10.2 From 05c58b8ac77639c17205f0b2a2d9eb1971dc47ad Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 Feb 2008 01:30:14 -0800 Subject: x86: mmconf enable mcfg early Patch "x86: validate against ACPI motherboard resources" changed the mmconf init sequence, and init MMCONF late in acpi_init. here change it back to old sequence: 1. check hostbridge in early 2. check MCFG with e820 in early 3. if all fail, will check MCFg with acpi _CRS in acpi_init So we can make MCONF working again when acpi=off is set if hostbridge support that. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Greg KH Cc: Greg KH Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8f20495..36a4a75 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -241,7 +241,7 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end) return mcfg_res.flags; } -static void __init pci_mmcfg_reject_broken(void) +static void __init pci_mmcfg_reject_broken(int type, int early) { typeof(pci_mmcfg_config[0]) *cfg; int i; @@ -266,34 +266,43 @@ static void __init pci_mmcfg_reject_broken(void) } for (i = 0; i < pci_mmcfg_config_num; i++) { + int valid = 0; u32 size = (cfg->end_bus_number + 1) << 20; cfg = &pci_mmcfg_config[i]; - printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lu " + printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->pci_segment, (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (is_acpi_reserved(cfg->address, cfg->address + size - 1)) { + + if (!early && + is_acpi_reserved(cfg->address, cfg->address + size - 1)) { printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved " "in ACPI motherboard resources\n", cfg->address); - } else { + valid = 1; + } + + if (valid) + continue; + + if (!early) printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" " reserved in ACPI motherboard resources\n", cfg->address); - /* Don't try to do this check unless configuration - type 1 is available. */ - if ((pci_probe & PCI_PROBE_CONF1) && - e820_all_mapped(cfg->address, - cfg->address + size - 1, - E820_RESERVED)) - printk(KERN_NOTICE - "PCI: MCFG area at %Lx reserved in " - "E820\n", - cfg->address); - else - goto reject; + /* Don't try to do this check unless configuration + type 1 is available. */ + if (type == 1 && e820_all_mapped(cfg->address, + cfg->address + size - 1, + E820_RESERVED)) { + printk(KERN_NOTICE + "PCI: MCFG area at %Lx reserved in E820\n", + cfg->address); + valid = 1; } + + if (!valid) + goto reject; } return; @@ -306,46 +315,31 @@ reject: pci_mmcfg_config_num = 0; } -void __init pci_mmcfg_early_init(int type) -{ - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return; - - /* If type 1 access is available, no need to enable MMCONFIG yet, we can - defer until later when the ACPI interpreter is available to better - validate things. */ - if (type == 1) - return; - - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; +static int __initdata known_bridge; - if (pci_mmcfg_arch_init()) - pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; -} - -void __init pci_mmcfg_late_init(void) +void __init __pci_mmcfg_init(int type, int early) { - int known_bridge = 0; - /* MMCONFIG disabled */ if ((pci_probe & PCI_PROBE_MMCONF) == 0) return; /* MMCONFIG already enabled */ - if (!(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF)) + if (!early && !(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF)) return; - if ((pci_probe & PCI_PROBE_CONF1) && pci_mmcfg_check_hostbridge()) - known_bridge = 1; - else - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + /* for late to exit */ + if (known_bridge) + return; - pci_mmcfg_reject_broken(); + if (early && type == 1) { + if (pci_mmcfg_check_hostbridge()) + known_bridge = 1; + } + + if (!known_bridge) { + acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + pci_mmcfg_reject_broken(type, early); + } if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || @@ -365,6 +359,21 @@ void __init pci_mmcfg_late_init(void) } } +void __init pci_mmcfg_early_init(int type) +{ + __pci_mmcfg_init(type, 1); +} + +void __init pci_mmcfg_late_init(void) +{ + int type = 0; + + if (pci_probe & PCI_PROBE_CONF1) + type = 1; + + __pci_mmcfg_init(type, 0); +} + static int __init pci_mmcfg_late_insert_resources(void) { /* -- cgit v0.10.2 From 57741a779070e0b141b6148136b420c8d35ccbce Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 Feb 2008 01:32:50 -0800 Subject: x86_64: set cfg_size for AMD Family 10h in case MMCONFIG reuse pci_cfg_space_size but skip check pci express and pci-x CAP ID. Signed-off-by: Yinghai Lu Cc: Andrew Morton Acked-by: Greg Kroah-Hartman Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index a5ef5f5..b60b2ab 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -493,3 +493,20 @@ static void __devinit pci_siemens_interrupt_controller(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015, pci_siemens_interrupt_controller); + +/* + * Regular PCI devices have 256 bytes, but AMD Family 10h Opteron ext config + * have 4096 bytes. Even if the device is capable, that doesn't mean we can + * access it. Maybe we don't have a way to generate extended config space + * accesses. So check it + */ +static void fam10h_pci_cfg_space_size(struct pci_dev *dev) +{ + dev->cfg_size = pci_cfg_space_size_ext(dev, 0); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, fam10h_pci_cfg_space_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f991359..a8efdae 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -842,11 +842,14 @@ static void set_pcie_port_type(struct pci_dev *pdev) * reading the dword at 0x100 which must either be 0 or a valid extended * capability header. */ -int pci_cfg_space_size(struct pci_dev *dev) +int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix) { int pos; u32 status; + if (!check_exp_pcix) + goto skip; + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); if (!pos) { pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); @@ -858,6 +861,7 @@ int pci_cfg_space_size(struct pci_dev *dev) goto fail; } + skip: if (pci_read_config_dword(dev, 256, &status) != PCIBIOS_SUCCESSFUL) goto fail; if (status == 0xffffffff) @@ -869,6 +873,11 @@ int pci_cfg_space_size(struct pci_dev *dev) return PCI_CFG_SPACE_SIZE; } +int pci_cfg_space_size(struct pci_dev *dev) +{ + return pci_cfg_space_size_ext(dev, 1); +} + static void pci_release_bus_bridge_dev(struct device *dev) { kfree(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 43a4f9c..2b8f745 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -666,6 +666,7 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *), void *userdata); +int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix); int pci_cfg_space_size(struct pci_dev *dev); unsigned char pci_bus_max_busnr(struct pci_bus *bus); -- cgit v0.10.2 From eee206c3bfd0888f22ae9da3172487c61d72187d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 03:13:43 -0800 Subject: x86_64: check and enable MMCONFIG for AMD Family 10h So we can use MMCONF when MMCONF is not set by BIOS using TOP_MEM2 msr to get memory top, and try to scan fam10h mmio routing to make sure the range is not conflicted with some prefetch MMIO that is above 4G. (current only LinuxBIOS assign 64 bit mmio above 4G for some co-processor) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 60e64c8..185d3cc 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include @@ -577,6 +579,205 @@ static int __cpuinit nearby_node(int apicid) } #endif +#ifdef CONFIG_PCI_MMCONFIG +struct pci_hostbridge_probe { + u32 bus; + u32 slot; + u32 vendor; + u32 device; +}; + +static u64 __cpuinitdata fam10h_pci_mmconf_base; +static int __cpuinitdata fam10h_pci_mmconf_base_status; + +static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, +}; + +struct range { + u64 start; + u64 end; +}; + +static int __cpuinit cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + int start1, start2; + + start1 = r1->start >> 32; + start2 = r2->start >> 32; + + return start1 - start2; +} + +/*[47:0] */ +/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ +#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) +#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) +static void __cpuinit get_fam10h_pci_mmconf_base(void) +{ + int i; + unsigned bus; + unsigned slot; + int found; + + u64 val; + u32 address; + u64 tom2; + u64 base = FAM10H_PCI_MMCONF_BASE; + + int hi_mmio_num; + struct range range[8]; + + /* only try to get setting from BSP */ + /* -1 or 1 */ + if (fam10h_pci_mmconf_base_status) + return; + + if (!early_pci_allowed()) + goto fail; + + found = 0; + for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + u32 id; + u16 device; + u16 vendor; + + bus = pci_probes[i].bus; + slot = pci_probes[i].slot; + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); + + vendor = id & 0xffff; + device = (id>>16) & 0xffff; + if (pci_probes[i].vendor == vendor && + pci_probes[i].device == device) { + found = 1; + break; + } + } + + if (!found) + goto fail; + + /* SYS_CFG */ + address = MSR_K8_SYSCFG; + rdmsrl(address, val); + + /* TOP_MEM2 is not enabled? */ + if (!(val & (1<<21))) { + tom2 = 0; + } else { + /* TOP_MEM2 */ + address = MSR_K8_TOP_MEM2; + rdmsrl(address, val); + tom2 = val & (0xffffULL<<32); + } + + if (base <= tom2) + base = tom2 + (1ULL<<32); + + /* + * need to check if the range is in the high mmio range that is + * above 4G + */ + hi_mmio_num = 0; + for (i = 0; i < 8; i++) { + u32 reg; + u64 start; + u64 end; + reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); + if (!(reg & 3)) + continue; + + start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); + end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + + if (!end) + continue; + + range[hi_mmio_num].start = start; + range[hi_mmio_num].end = end; + hi_mmio_num++; + } + + if (!hi_mmio_num) + goto out; + + /* sort the range */ + sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL); + + if (range[hi_mmio_num - 1].end < base) + goto out; + if (range[0].start > base) + goto out; + + /* need to find one window */ + base = range[0].start - (1ULL << 32); + if ((base > tom2) && BASE_VALID(base)) + goto out; + base = range[hi_mmio_num - 1].end + (1ULL << 32); + if ((base > tom2) && BASE_VALID(base)) + goto out; + /* need to find window between ranges */ + if (hi_mmio_num > 1) + for (i = 0; i < hi_mmio_num - 1; i++) { + if (range[i + 1].start > (range[i].end + (1ULL << 32))) { + base = range[i].end + (1ULL << 32); + if ((base > tom2) && BASE_VALID(base)) + goto out; + } + } + +fail: + fam10h_pci_mmconf_base_status = -1; + return; +out: + fam10h_pci_mmconf_base = base; + fam10h_pci_mmconf_base_status = 1; +} +#endif + +static void __cpuinit fam10h_check_enable_mmcfg(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_PCI_MMCONFIG + u64 val; + u32 address; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, val); + + /* try to make sure that AP's setting is identical to BSP setting */ + if (val & FAM10H_MMIO_CONF_ENABLE) { + u64 base; + base = val & (0xffffULL << 32); + if (fam10h_pci_mmconf_base_status <= 0) { + fam10h_pci_mmconf_base = base; + fam10h_pci_mmconf_base_status = 1; + return; + } else if (fam10h_pci_mmconf_base == base) + return; + } + + /* + * if it is not enabled, try to enable it and assume only one segment + * with 256 buses + */ + get_fam10h_pci_mmconf_base(); + if (fam10h_pci_mmconf_base_status <= 0) + return; + + printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); + val &= ~((FAM10H_MMIO_CONF_BASE_MASK<x86 == 0x10) + fam10h_check_enable_mmcfg(c); + if (amd_apic_timer_broken()) disable_apic_timer = 1; -- cgit v0.10.2 From 7fd0da4085d5b012a6bdcbbd63da7ead9fc69ad4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 03:13:02 -0800 Subject: x86_64: check MSR to get MMCONFIG for AMD Family 10h so even booting kernel with acpi=off or even MCFG is not there, we still can use MMCONFIG. Signed-off-by: Yinghai Lu Cc: Andi Kleen Cc: Greg KH Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 36a4a75..8707e24 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -100,33 +100,96 @@ static const char __init *pci_mmcfg_intel_945(void) return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; } +static const char __init *pci_mmcfg_amd_fam10h(void) +{ + u32 low, high, address; + u64 base, msr; + int i; + unsigned segnbits = 0, busnbits; + + address = MSR_FAM10H_MMIO_CONF_BASE; + if (rdmsr_safe(address, &low, &high)) + return NULL; + + msr = high; + msr <<= 32; + msr |= low; + + /* mmconfig is not enable */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + /* + * only handle bus 0 ? + * need to skip it + */ + if (!busnbits) + return NULL; + + if (busnbits > 8) { + segnbits = busnbits - 8; + busnbits = 8; + } + + pci_mmcfg_config_num = (1 << segnbits); + pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]) * + pci_mmcfg_config_num, GFP_KERNEL); + if (!pci_mmcfg_config) + return NULL; + + for (i = 0; i < (1 << segnbits); i++) { + pci_mmcfg_config[i].address = base + (1<<28) * i; + pci_mmcfg_config[i].pci_segment = i; + pci_mmcfg_config[i].start_bus_number = 0; + pci_mmcfg_config[i].end_bus_number = (1 << busnbits) - 1; + } + + return "AMD Family 10h NB"; +} + struct pci_mmcfg_hostbridge_probe { + u32 bus; + u32 devfn; u32 vendor; u32 device; const char *(*probe)(void); }; static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { - { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 }, - { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 }, + { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 }, + { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 }, + { 0, PCI_DEVFN(0x18, 0), PCI_VENDOR_ID_AMD, + 0x1200, pci_mmcfg_amd_fam10h }, + { 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD, + 0x1200, pci_mmcfg_amd_fam10h }, }; static int __init pci_mmcfg_check_hostbridge(void) { u32 l; + u32 bus, devfn; u16 vendor, device; int i; const char *name; - pci_direct_conf1.read(0, 0, PCI_DEVFN(0,0), 0, 4, &l); - vendor = l & 0xffff; - device = (l >> 16) & 0xffff; - pci_mmcfg_config_num = 0; pci_mmcfg_config = NULL; name = NULL; for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) { + bus = pci_mmcfg_probes[i].bus; + devfn = pci_mmcfg_probes[i].devfn; + pci_direct_conf1.read(0, bus, devfn, 0, 4, &l); + vendor = l & 0xffff; + device = (l >> 16) & 0xffff; + if (pci_mmcfg_probes[i].vendor == vendor && pci_mmcfg_probes[i].device == device) name = pci_mmcfg_probes[i].probe(); -- cgit v0.10.2 From d4c4d09415c48ecb621804cd4ec4a7a4d9a3662f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 Feb 2008 18:41:35 -0800 Subject: x86: if acpi=off, force setting the mmconf for fam10h some BIOS only let AMD fam 10h handle bus0, and nvidia mcp55/ck804 to handle other buses. at that case MCFG will cover all over them. but with acpi=off, we can not use MCFG. this patch will double check the busnbits, and if it is less handling 256 bues, and acpi=off will forcely reset the mmconf in msr, so we still use mmconf in above case. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 185d3cc..5e269a5 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -751,14 +751,21 @@ static void __cpuinit fam10h_check_enable_mmcfg(struct cpuinfo_x86 *c) /* try to make sure that AP's setting is identical to BSP setting */ if (val & FAM10H_MMIO_CONF_ENABLE) { - u64 base; - base = val & (0xffffULL << 32); - if (fam10h_pci_mmconf_base_status <= 0) { - fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; - return; - } else if (fam10h_pci_mmconf_base == base) - return; + unsigned busnbits; + busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + /* only trust the one handle 256 buses, if acpi=off */ + if (!acpi_pci_disabled || busnbits >= 8) { + u64 base; + base = val & (0xffffULL << 32); + if (fam10h_pci_mmconf_base_status <= 0) { + fam10h_pci_mmconf_base = base; + fam10h_pci_mmconf_base_status = 1; + return; + } else if (fam10h_pci_mmconf_base == base) + return; + } } /* -- cgit v0.10.2 From d39398a333ddc490f842ccdd4b76c9674682aa5d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 26 Feb 2008 11:04:17 -0800 Subject: x86: seperate mmconf for fam10h out from setup_64.c Separate mmconf for fam10h out from setup_64.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90e092d..815b650 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -99,4 +99,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o + + obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o endif diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c new file mode 100644 index 0000000..3789792 --- /dev/null +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -0,0 +1,215 @@ +/* + * AMD Family 10h mmconfig enablement + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pci_hostbridge_probe { + u32 bus; + u32 slot; + u32 vendor; + u32 device; +}; + +static u64 __cpuinitdata fam10h_pci_mmconf_base; +static int __cpuinitdata fam10h_pci_mmconf_base_status; + +static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, +}; + +struct range { + u64 start; + u64 end; +}; + +static int __cpuinit cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + int start1, start2; + + start1 = r1->start >> 32; + start2 = r2->start >> 32; + + return start1 - start2; +} + +/*[47:0] */ +/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ +#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) +#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) +static void __cpuinit get_fam10h_pci_mmconf_base(void) +{ + int i; + unsigned bus; + unsigned slot; + int found; + + u64 val; + u32 address; + u64 tom2; + u64 base = FAM10H_PCI_MMCONF_BASE; + + int hi_mmio_num; + struct range range[8]; + + /* only try to get setting from BSP */ + /* -1 or 1 */ + if (fam10h_pci_mmconf_base_status) + return; + + if (!early_pci_allowed()) + goto fail; + + found = 0; + for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + u32 id; + u16 device; + u16 vendor; + + bus = pci_probes[i].bus; + slot = pci_probes[i].slot; + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); + + vendor = id & 0xffff; + device = (id>>16) & 0xffff; + if (pci_probes[i].vendor == vendor && + pci_probes[i].device == device) { + found = 1; + break; + } + } + + if (!found) + goto fail; + + /* SYS_CFG */ + address = MSR_K8_SYSCFG; + rdmsrl(address, val); + + /* TOP_MEM2 is not enabled? */ + if (!(val & (1<<21))) { + tom2 = 0; + } else { + /* TOP_MEM2 */ + address = MSR_K8_TOP_MEM2; + rdmsrl(address, val); + tom2 = val & (0xffffULL<<32); + } + + if (base <= tom2) + base = tom2 + (1ULL<<32); + + /* + * need to check if the range is in the high mmio range that is + * above 4G + */ + hi_mmio_num = 0; + for (i = 0; i < 8; i++) { + u32 reg; + u64 start; + u64 end; + reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); + if (!(reg & 3)) + continue; + + start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); + end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + + if (!end) + continue; + + range[hi_mmio_num].start = start; + range[hi_mmio_num].end = end; + hi_mmio_num++; + } + + if (!hi_mmio_num) + goto out; + + /* sort the range */ + sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL); + + if (range[hi_mmio_num - 1].end < base) + goto out; + if (range[0].start > base) + goto out; + + /* need to find one window */ + base = range[0].start - (1ULL << 32); + if ((base > tom2) && BASE_VALID(base)) + goto out; + base = range[hi_mmio_num - 1].end + (1ULL << 32); + if ((base > tom2) && BASE_VALID(base)) + goto out; + /* need to find window between ranges */ + if (hi_mmio_num > 1) + for (i = 0; i < hi_mmio_num - 1; i++) { + if (range[i + 1].start > (range[i].end + (1ULL << 32))) { + base = range[i].end + (1ULL << 32); + if ((base > tom2) && BASE_VALID(base)) + goto out; + } + } + +fail: + fam10h_pci_mmconf_base_status = -1; + return; +out: + fam10h_pci_mmconf_base = base; + fam10h_pci_mmconf_base_status = 1; +} + +void __cpuinit fam10h_check_enable_mmcfg(void) +{ + u64 val; + u32 address; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, val); + + /* try to make sure that AP's setting is identical to BSP setting */ + if (val & FAM10H_MMIO_CONF_ENABLE) { + unsigned busnbits; + busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + /* only trust the one handle 256 buses, if acpi=off */ + if (!acpi_pci_disabled || busnbits >= 8) { + u64 base; + base = val & (0xffffULL << 32); + if (fam10h_pci_mmconf_base_status <= 0) { + fam10h_pci_mmconf_base = base; + fam10h_pci_mmconf_base_status = 1; + return; + } else if (fam10h_pci_mmconf_base == base) + return; + } + } + + /* + * if it is not enabled, try to enable it and assume only one segment + * with 256 buses + */ + get_fam10h_pci_mmconf_base(); + if (fam10h_pci_mmconf_base_status <= 0) + return; + + printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); + val &= ~((FAM10H_MMIO_CONF_BASE_MASK<start >> 32; - start2 = r2->start >> 32; - - return start1 - start2; -} - -/*[47:0] */ -/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ -#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) -#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) -static void __cpuinit get_fam10h_pci_mmconf_base(void) -{ - int i; - unsigned bus; - unsigned slot; - int found; - - u64 val; - u32 address; - u64 tom2; - u64 base = FAM10H_PCI_MMCONF_BASE; - - int hi_mmio_num; - struct range range[8]; - - /* only try to get setting from BSP */ - /* -1 or 1 */ - if (fam10h_pci_mmconf_base_status) - return; - - if (!early_pci_allowed()) - goto fail; - - found = 0; - for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { - u32 id; - u16 device; - u16 vendor; - - bus = pci_probes[i].bus; - slot = pci_probes[i].slot; - id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); - - vendor = id & 0xffff; - device = (id>>16) & 0xffff; - if (pci_probes[i].vendor == vendor && - pci_probes[i].device == device) { - found = 1; - break; - } - } - - if (!found) - goto fail; - - /* SYS_CFG */ - address = MSR_K8_SYSCFG; - rdmsrl(address, val); - - /* TOP_MEM2 is not enabled? */ - if (!(val & (1<<21))) { - tom2 = 0; - } else { - /* TOP_MEM2 */ - address = MSR_K8_TOP_MEM2; - rdmsrl(address, val); - tom2 = val & (0xffffULL<<32); - } - - if (base <= tom2) - base = tom2 + (1ULL<<32); - - /* - * need to check if the range is in the high mmio range that is - * above 4G - */ - hi_mmio_num = 0; - for (i = 0; i < 8; i++) { - u32 reg; - u64 start; - u64 end; - reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); - if (!(reg & 3)) - continue; - - start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ - reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); - end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ - - if (!end) - continue; - - range[hi_mmio_num].start = start; - range[hi_mmio_num].end = end; - hi_mmio_num++; - } - - if (!hi_mmio_num) - goto out; - - /* sort the range */ - sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL); - - if (range[hi_mmio_num - 1].end < base) - goto out; - if (range[0].start > base) - goto out; - - /* need to find one window */ - base = range[0].start - (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) - goto out; - base = range[hi_mmio_num - 1].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) - goto out; - /* need to find window between ranges */ - if (hi_mmio_num > 1) - for (i = 0; i < hi_mmio_num - 1; i++) { - if (range[i + 1].start > (range[i].end + (1ULL << 32))) { - base = range[i].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) - goto out; - } - } - -fail: - fam10h_pci_mmconf_base_status = -1; - return; -out: - fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; -} -#endif - -static void __cpuinit fam10h_check_enable_mmcfg(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_PCI_MMCONFIG - u64 val; - u32 address; - - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, val); - - /* try to make sure that AP's setting is identical to BSP setting */ - if (val & FAM10H_MMIO_CONF_ENABLE) { - unsigned busnbits; - busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & - FAM10H_MMIO_CONF_BUSRANGE_MASK; - - /* only trust the one handle 256 buses, if acpi=off */ - if (!acpi_pci_disabled || busnbits >= 8) { - u64 base; - base = val & (0xffffULL << 32); - if (fam10h_pci_mmconf_base_status <= 0) { - fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; - return; - } else if (fam10h_pci_mmconf_base == base) - return; - } - } - - /* - * if it is not enabled, try to enable it and assume only one segment - * with 256 buses - */ - get_fam10h_pci_mmconf_base(); - if (fam10h_pci_mmconf_base_status <= 0) - return; - - printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); - val &= ~((FAM10H_MMIO_CONF_BASE_MASK<x86 == 0x10) - fam10h_check_enable_mmcfg(c); + fam10h_check_enable_mmcfg(); if (amd_apic_timer_broken()) disable_apic_timer = 1; -- cgit v0.10.2 From 0d358f22f6c8f03ab215eee8d52b74f78cc3c7db Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 03:20:41 -0800 Subject: driver core: try parent numa_node at first before using default in the device_add, we try to use use parent numa_node. need to make sure pci root bus's bridge device numa_node is set. then we could use device->numa_node direclty for all device. and don't need to call pcibus_to_node(). Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar diff --git a/drivers/base/core.c b/drivers/base/core.c index 9248e09..be288b5 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -787,6 +787,10 @@ int device_add(struct device *dev) parent = get_device(dev->parent); setup_parent(dev, parent); + /* use parent numa_node */ + if (parent) + set_dev_node(dev, dev_to_node(parent)); + /* first, register with generic layer. */ error = kobject_add(&dev->kobj, dev->kobj.parent, "%s", dev->bus_id); if (error) @@ -1306,8 +1310,11 @@ int device_move(struct device *dev, struct device *new_parent) dev->parent = new_parent; if (old_parent) klist_remove(&dev->knode_parent); - if (new_parent) + if (new_parent) { klist_add_tail(&dev->knode_parent, &new_parent->klist_children); + set_dev_node(dev, dev_to_node(new_parent)); + } + if (!dev->class) goto out_put; error = device_move_class_links(dev, old_parent, new_parent); @@ -1317,9 +1324,12 @@ int device_move(struct device *dev, struct device *new_parent) if (!kobject_move(&dev->kobj, &old_parent->kobj)) { if (new_parent) klist_remove(&dev->knode_parent); - if (old_parent) + dev->parent = old_parent; + if (old_parent) { klist_add_tail(&dev->knode_parent, &old_parent->klist_children); + set_dev_node(dev, dev_to_node(old_parent)); + } } cleanup_glue_dir(dev, new_parent_kobj); put_device(new_parent); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index a8efdae..a40043b 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -973,7 +973,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) dev->dev.release = pci_release_dev; pci_dev_get(dev); - set_dev_node(&dev->dev, pcibus_to_node(bus)); dev->dev.dma_mask = &dev->dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; @@ -1128,6 +1127,9 @@ struct pci_bus * pci_create_bus(struct device *parent, goto dev_reg_err; b->bridge = get_device(dev); + if (!parent) + set_dev_node(b->bridge, pcibus_to_node(b)); + b->dev.class = &pcibus_class; b->dev.parent = b->bridge; sprintf(b->dev.bus_id, "%04x:%02x", pci_domain_nr(b), bus); -- cgit v0.10.2 From d2ebdf4bae4f1d7c30e71fd74f270ca4cda024fc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Feb 2008 22:21:57 -0800 Subject: x86: remove unneeded check in mmconf reject mmconfig is only used to access extended configuration space. so don't need to reject MFG that only have one entry and only handle bus0. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8707e24..6f68658 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -316,18 +316,6 @@ static void __init pci_mmcfg_reject_broken(int type, int early) cfg = &pci_mmcfg_config[0]; - /* - * Handle more broken MCFG tables on Asus etc. - * They only contain a single entry for bus 0-0. - */ - if (pci_mmcfg_config_num == 1 && - cfg->pci_segment == 0 && - (cfg->start_bus_number | cfg->end_bus_number) == 0) { - printk(KERN_ERR "PCI: start and end of bus number is 0. " - "Rejected as broken MCFG.\n"); - goto reject; - } - for (i = 0; i < pci_mmcfg_config_num; i++) { int valid = 0; u32 size = (cfg->end_bus_number + 1) << 20; -- cgit v0.10.2 From bb63b4219976d48ed6d22ac33c18be334fb5a78c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Feb 2008 23:56:50 -0800 Subject: x86 pci: remove checking type for mmconfig probe doesn't need to check if it is type1 or type2, we can use raw_pci_ops directly. also make pci_direct_conf1 static again. anyway is there system with type 2 and mmconf support? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 42f3e4c..21d1e0e 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -258,7 +258,8 @@ void __init pci_direct_init(int type) { if (type == 0) return; - printk(KERN_INFO "PCI: Using configuration type %d\n", type); + printk(KERN_INFO "PCI: Using configuration type %d for base access\n", + type); if (type == 1) raw_pci_ops = &pci_direct_conf1; else @@ -275,8 +276,10 @@ int __init pci_direct_probe(void) if (!region) goto type2; - if (pci_check_type1()) + if (pci_check_type1()) { + raw_pci_ops = &pci_direct_conf1; return 1; + } release_resource(region); type2: @@ -290,7 +293,6 @@ int __init pci_direct_probe(void) goto fail2; if (pci_check_type2()) { - printk(KERN_INFO "PCI: Using configuration type 2\n"); raw_pci_ops = &pci_direct_conf2; return 2; } diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 2080b04..343c363 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -6,14 +6,13 @@ in the right sequence from here. */ static __init int pci_access_init(void) { - int type __maybe_unused = 0; - #ifdef CONFIG_PCI_DIRECT + int type = 0; + type = pci_direct_probe(); #endif - pci_mmcfg_early_init(type); - if (raw_pci_ops) - return 0; + pci_mmcfg_early_init(); + #ifdef CONFIG_PCI_BIOS pci_pcbios_init(); #endif @@ -26,7 +25,7 @@ static __init int pci_access_init(void) #ifdef CONFIG_PCI_DIRECT pci_direct_init(type); #endif - if (!raw_pci_ops) + if (!raw_pci_ops && !raw_pci_ext_ops) printk(KERN_ERR "PCI: Fatal: No config space access function found\n"); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 6f68658..bdf6224 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -28,7 +28,7 @@ static int __initdata pci_mmcfg_resources_inserted; static const char __init *pci_mmcfg_e7520(void) { u32 win; - pci_direct_conf1.read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); + raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win); win = win & 0xf000; if(win == 0x0000 || win == 0xf000) @@ -53,7 +53,7 @@ static const char __init *pci_mmcfg_intel_945(void) pci_mmcfg_config_num = 1; - pci_direct_conf1.read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar); + raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0x48, 4, &pciexbar); /* Enable bit */ if (!(pciexbar & 1)) @@ -179,6 +179,9 @@ static int __init pci_mmcfg_check_hostbridge(void) int i; const char *name; + if (!raw_pci_ops) + return 0; + pci_mmcfg_config_num = 0; pci_mmcfg_config = NULL; name = NULL; @@ -186,7 +189,7 @@ static int __init pci_mmcfg_check_hostbridge(void) for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) { bus = pci_mmcfg_probes[i].bus; devfn = pci_mmcfg_probes[i].devfn; - pci_direct_conf1.read(0, bus, devfn, 0, 4, &l); + raw_pci_ops->read(0, bus, devfn, 0, 4, &l); vendor = l & 0xffff; device = (l >> 16) & 0xffff; @@ -304,7 +307,7 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end) return mcfg_res.flags; } -static void __init pci_mmcfg_reject_broken(int type, int early) +static void __init pci_mmcfg_reject_broken(int early) { typeof(pci_mmcfg_config[0]) *cfg; int i; @@ -342,8 +345,8 @@ static void __init pci_mmcfg_reject_broken(int type, int early) " reserved in ACPI motherboard resources\n", cfg->address); /* Don't try to do this check unless configuration - type 1 is available. */ - if (type == 1 && e820_all_mapped(cfg->address, + type 1 is available. how about type 2 ?*/ + if (raw_pci_ops && e820_all_mapped(cfg->address, cfg->address + size - 1, E820_RESERVED)) { printk(KERN_NOTICE @@ -368,7 +371,7 @@ reject: static int __initdata known_bridge; -void __init __pci_mmcfg_init(int type, int early) +void __init __pci_mmcfg_init(int early) { /* MMCONFIG disabled */ if ((pci_probe & PCI_PROBE_MMCONF) == 0) @@ -382,14 +385,14 @@ void __init __pci_mmcfg_init(int type, int early) if (known_bridge) return; - if (early && type == 1) { + if (early) { if (pci_mmcfg_check_hostbridge()) known_bridge = 1; } if (!known_bridge) { acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - pci_mmcfg_reject_broken(type, early); + pci_mmcfg_reject_broken(early); } if ((pci_mmcfg_config_num == 0) || @@ -410,19 +413,14 @@ void __init __pci_mmcfg_init(int type, int early) } } -void __init pci_mmcfg_early_init(int type) +void __init pci_mmcfg_early_init(void) { - __pci_mmcfg_init(type, 1); + __pci_mmcfg_init(1); } void __init pci_mmcfg_late_init(void) { - int type = 0; - - if (pci_probe & PCI_PROBE_CONF1) - type = 1; - - __pci_mmcfg_init(type, 0); + __pci_mmcfg_init(0); } static int __init pci_mmcfg_late_insert_resources(void) diff --git a/include/linux/pci.h b/include/linux/pci.h index 2b8f745..a71954a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1055,10 +1055,10 @@ extern unsigned long pci_cardbus_mem_size; extern int pcibios_add_platform_entries(struct pci_dev *dev); #ifdef CONFIG_PCI_MMCONFIG -extern void __init pci_mmcfg_early_init(int type); +extern void __init pci_mmcfg_early_init(void); extern void __init pci_mmcfg_late_init(void); #else -static inline void pci_mmcfg_early_init(int type) { } +static inline void pci_mmcfg_early_init(void) { } static inline void pci_mmcfg_late_init(void) { } #endif -- cgit v0.10.2 From 871d5f8dd0f7647f03facd4cb79485938d1b61ab Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 03:20:09 -0800 Subject: x86: get mp_bus_to_node early Currently, on an amd k8 system with multi ht chains, the numa_node of pci devices under /sys/devices/pci0000:80/* is always 0, even if that chain is on node 1 or 2 or 3. Workaround: pcibus_to_node(bus) is used when we want to get the node that pci_device is on. In struct device, we already have numa_node member, and we could use dev_to_node()/set_dev_node() to get and set numa_node in the device. set_dev_node is called in pci_device_add() with pcibus_to_node(bus), and pcibus_to_node uses bus->sysdata for nodeid. The problem is when pci_add_device is called, bus->sysdata is not assigned correct nodeid yet. The result is that numa_node will always be 0. pcibios_scan_root and pci_scan_root could take sysdata. So we need to get mp_bus_to_node mapping before these two are called, and thus get_mp_bus_to_node could get correct node for sysdata in root bus. In scanning of the root bus, all child busses will take parent bus sysdata. So all pci_device->dev.numa_node will be assigned correctly and automatically. Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we could also could make other bus specific device get the correct numa_node too. This is an updated version of pci_sysdata and Jeff's pci_domain patch. [ mingo@elte.hu: build fix ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32 index cdd6828..e9c5caf 100644 --- a/arch/x86/pci/Makefile_32 +++ b/arch/x86/pci/Makefile_32 @@ -10,5 +10,6 @@ pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o +pci-$(CONFIG_NUMA) += mp_bus_to_node.o obj-y += $(pci-y) common.o early.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2664cb3..1a9c0c6 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -191,7 +191,10 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do { struct pci_bus *bus; struct pci_sysdata *sd; + int node; +#ifdef CONFIG_ACPI_NUMA int pxm; +#endif dmi_check_system(acpi_pciprobe_dmi_table); @@ -201,6 +204,17 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do return NULL; } + node = -1; +#ifdef CONFIG_ACPI_NUMA + pxm = acpi_get_pxm(device->handle); + if (pxm >= 0) + node = pxm_to_node(pxm); + if (node != -1) + set_mp_bus_to_node(busnum, node); + else + node = get_mp_bus_to_node(busnum); +#endif + /* Allocate per-root-bus (not per bus) arch-specific data. * TODO: leak; this memory is never freed. * It's arguable whether it's worth the trouble to care. @@ -212,13 +226,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do } sd->domain = domain; - sd->node = -1; - - pxm = acpi_get_pxm(device->handle); -#ifdef CONFIG_ACPI_NUMA - if (pxm >= 0) - sd->node = pxm_to_node(pxm); -#endif + sd->node = node; /* * Maybe the desired pci bus has been already scanned. In such case * it is unnecessary to scan the pci bus with the given domain,busnum. @@ -238,9 +246,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do kfree(sd); #ifdef CONFIG_ACPI_NUMA - if (bus != NULL) { + if (bus) { if (pxm >= 0) { - printk("bus %d -> pxm %d -> node %d\n", + printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n", busnum, pxm, pxm_to_node(pxm)); } } @@ -248,7 +256,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do if (bus && (pci_probe & PCI_USE__CRS)) get_current_resources(device, busnum, domain, bus); - return bus; } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 75fcc29..07d5318 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -342,9 +342,14 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) return NULL; } + sd->node = get_mp_bus_to_node(busnum); + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); + bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + if (!bus) + kfree(sd); - return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + return bus; } extern u8 pci_cache_line_size; @@ -480,7 +485,7 @@ void pcibios_disable_device (struct pci_dev *dev) pcibios_disable_irq(dev); } -struct pci_bus *__devinit pci_scan_bus_with_sysdata(int busno) +struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) { struct pci_bus *bus = NULL; struct pci_sysdata *sd; @@ -495,10 +500,15 @@ struct pci_bus *__devinit pci_scan_bus_with_sysdata(int busno) printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno); return NULL; } - sd->node = -1; - bus = pci_scan_bus(busno, &pci_root_ops, sd); + sd->node = node; + bus = pci_scan_bus(busno, ops, sd); if (!bus) kfree(sd); return bus; } + +struct pci_bus *pci_scan_bus_with_sysdata(int busno) +{ + return pci_scan_bus_on_node(busno, &pci_root_ops, -1); +} diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 579745c..0908fca 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -136,9 +136,11 @@ static void __init pirq_peer_trick(void) busmap[e->bus] = 1; } for(i = 1; i < 256; i++) { + int node; if (!busmap[i] || pci_find_bus(0, i)) continue; - if (pci_scan_bus_with_sysdata(i)) + node = get_mp_bus_to_node(i); + if (pci_scan_bus_on_node(i, &pci_root_ops, node)) printk(KERN_INFO "PCI: Discovered primary peer " "bus %02x [IRQ]\n", i); } diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index 9cc813e..3903efbc 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -1,7 +1,9 @@ #include #include +#include #include #include +#include /* * This discovers the pcibus <-> node mapping on AMD K8. @@ -20,64 +22,102 @@ #define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) #define PCI_DEVICE_ID_K8HTCONFIG 0x1100 +#ifdef CONFIG_NUMA + +#define BUS_NR 256 + +static int mp_bus_to_node[BUS_NR]; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node = -1; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return node; + + node = mp_bus_to_node[busnum]; + + /* + * let numa_node_id to decide it later in dma_alloc_pages + * if there is no ram on that node + */ + if (node != -1 && !node_online(node)) + node = -1; + + return node; +} + +#endif + /** - * fill_mp_bus_to_cpumask() + * early_fill_mp_bus_to_node() + * called before pcibios_scan_root and pci_scan_bus * fills the mp_bus_to_cpumask array based according to the LDT Bus Number * Registers found in the K8 northbridge */ __init static int -fill_mp_bus_to_cpumask(void) +early_fill_mp_bus_to_node(void) { - struct pci_dev *nb_dev = NULL; +#ifdef CONFIG_NUMA int i, j; + unsigned slot; u32 ldtbus, nid; + u32 id; static int lbnr[3] = { LDT_BUS_NUMBER_REGISTER_0, LDT_BUS_NUMBER_REGISTER_1, LDT_BUS_NUMBER_REGISTER_2 }; - while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) { - pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid); + for (i = 0; i < BUS_NR; i++) + mp_bus_to_node[i] = -1; + + if (!early_pci_allowed()) + return -1; + + for (slot = 0x18; slot < 0x20; slot++) { + id = read_pci_config(0, slot, 0, PCI_VENDOR_ID); + if (id != (PCI_VENDOR_ID_AMD | (PCI_DEVICE_ID_K8HTCONFIG<<16))) + break; + nid = read_pci_config(0, slot, 0, NODE_ID_REGISTER); for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { - pci_read_config_dword(nb_dev, lbnr[i], &ldtbus); + ldtbus = read_pci_config(0, slot, 0, lbnr[i]); /* * if there are no busses hanging off of the current * ldt link then both the secondary and subordinate * bus number fields are set to 0. - * + * * RED-PEN * This is slightly broken because it assumes - * HT node IDs == Linux node ids, which is not always + * HT node IDs == Linux node ids, which is not always * true. However it is probably mostly true. */ if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0 && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) { for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); - j++) { - struct pci_bus *bus; - struct pci_sysdata *sd; - - long node = NODE_ID(nid); - /* Algorithm a bit dumb, but - it shouldn't matter here */ - bus = pci_find_bus(0, j); - if (!bus) - continue; - if (!node_online(node)) - node = 0; - - sd = bus->sysdata; - sd->node = node; - } + j++) { + int node = NODE_ID(nid); + mp_bus_to_node[j] = (unsigned char)node; + } } } } + for (i = 0; i < BUS_NR; i++) { + int node = mp_bus_to_node[i]; + if (node >= 0) + printk(KERN_DEBUG "bus: %02x to node: %02x\n", i, node); + } +#endif return 0; } -fs_initcall(fill_mp_bus_to_cpumask); +postcore_initcall(early_fill_mp_bus_to_node); diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index e041ced..a67921c 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -12,6 +12,7 @@ static void __devinit pcibios_fixup_peer_bridges(void) { int n, devfn; + long node; if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff) return; @@ -21,12 +22,13 @@ static void __devinit pcibios_fixup_peer_bridges(void) u32 l; if (pci_find_bus(0, n)) continue; + node = get_mp_bus_to_node(n); for (devfn = 0; devfn < 256; devfn += 8) { if (!raw_pci_read(0, n, devfn, PCI_VENDOR_ID, 2, &l) && l != 0x0000 && l != 0xffff) { DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l); printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n); - pci_scan_bus_with_sysdata(n); + pci_scan_bus_on_node(n, &pci_root_ops, node); break; } } diff --git a/arch/x86/pci/mp_bus_to_node.c b/arch/x86/pci/mp_bus_to_node.c new file mode 100644 index 0000000..0229439 --- /dev/null +++ b/arch/x86/pci/mp_bus_to_node.c @@ -0,0 +1,23 @@ +#include +#include +#include + +#define BUS_NR 256 + +static unsigned char mp_bus_to_node[BUS_NR]; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = (unsigned char) node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return 0; + node = mp_bus_to_node[busnum]; + return node; +} diff --git a/include/asm-x86/pci.h b/include/asm-x86/pci.h index ddd8e24..30bbde0 100644 --- a/include/asm-x86/pci.h +++ b/include/asm-x86/pci.h @@ -19,6 +19,8 @@ struct pci_sysdata { }; /* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, + int node); extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); static inline int pci_domain_nr(struct pci_bus *bus) diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 2207326..4793ae7 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -198,4 +198,17 @@ extern cpumask_t cpu_coregroup_map(int cpu); #define smt_capable() (smp_num_siblings > 1) #endif +#ifdef CONFIG_NUMA +extern int get_mp_bus_to_node(int busnum); +extern void set_mp_bus_to_node(int busnum, int node); +#else +static inline int get_mp_bus_to_node(int busnum) +{ + return 0; +} +static inline void set_mp_bus_to_node(int busnum, int node) +{ +} +#endif + #endif -- cgit v0.10.2 From 35ddd068fb94b187e94a3fc497ccecf27bdda9ae Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 03:15:08 -0800 Subject: x86: use bus conf in NB conf fun1 to get bus range on, on 64-bit ... so we use the same code with Quad core cpu as old opteron. This patch is useful when acpi=off or _PXM is not there in DSDT. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index 3903efbc..dab3831 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -12,15 +12,18 @@ * RED-PEN empty cpus get reported wrong */ -#define NODE_ID_REGISTER 0x60 -#define NODE_ID(dword) (dword & 0x07) -#define LDT_BUS_NUMBER_REGISTER_0 0x94 -#define LDT_BUS_NUMBER_REGISTER_1 0xB4 -#define LDT_BUS_NUMBER_REGISTER_2 0xD4 -#define NR_LDT_BUS_NUMBER_REGISTERS 3 -#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF) -#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) +#define NODE_ID(dword) ((dword>>4) & 0x07) +#define LDT_BUS_NUMBER_REGISTER_0 0xE0 +#define LDT_BUS_NUMBER_REGISTER_1 0xE4 +#define LDT_BUS_NUMBER_REGISTER_2 0xE8 +#define LDT_BUS_NUMBER_REGISTER_3 0xEC +#define NR_LDT_BUS_NUMBER_REGISTERS 4 +#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) +#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 24) & 0xFF) + #define PCI_DEVICE_ID_K8HTCONFIG 0x1100 +#define PCI_DEVICE_ID_K8_10H_HTCONFIG 0x1200 +#define PCI_DEVICE_ID_K8_11H_HTCONFIG 0x1300 #ifdef CONFIG_NUMA @@ -67,12 +70,19 @@ early_fill_mp_bus_to_node(void) #ifdef CONFIG_NUMA int i, j; unsigned slot; - u32 ldtbus, nid; + u32 ldtbus; u32 id; - static int lbnr[3] = { + int node; + u16 deviceid; + u16 vendorid; + int min_bus; + int max_bus; + + static int lbnr[NR_LDT_BUS_NUMBER_REGISTERS] = { LDT_BUS_NUMBER_REGISTER_0, LDT_BUS_NUMBER_REGISTER_1, - LDT_BUS_NUMBER_REGISTER_2 + LDT_BUS_NUMBER_REGISTER_2, + LDT_BUS_NUMBER_REGISTER_3 }; for (i = 0; i < BUS_NR; i++) @@ -81,38 +91,36 @@ early_fill_mp_bus_to_node(void) if (!early_pci_allowed()) return -1; - for (slot = 0x18; slot < 0x20; slot++) { - id = read_pci_config(0, slot, 0, PCI_VENDOR_ID); - if (id != (PCI_VENDOR_ID_AMD | (PCI_DEVICE_ID_K8HTCONFIG<<16))) - break; - nid = read_pci_config(0, slot, 0, NODE_ID_REGISTER); - - for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { - ldtbus = read_pci_config(0, slot, 0, lbnr[i]); - /* - * if there are no busses hanging off of the current - * ldt link then both the secondary and subordinate - * bus number fields are set to 0. - * - * RED-PEN - * This is slightly broken because it assumes - * HT node IDs == Linux node ids, which is not always - * true. However it is probably mostly true. - */ - if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0 - && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) { - for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); - j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); - j++) { - int node = NODE_ID(nid); - mp_bus_to_node[j] = (unsigned char)node; - } - } - } + slot = 0x18; + id = read_pci_config(0, slot, 0, PCI_VENDOR_ID); + + vendorid = id & 0xffff; + if (vendorid != PCI_VENDOR_ID_AMD) + goto out; + + deviceid = (id>>16) & 0xffff; + if ((deviceid != PCI_DEVICE_ID_K8HTCONFIG) && + (deviceid != PCI_DEVICE_ID_K8_10H_HTCONFIG) && + (deviceid != PCI_DEVICE_ID_K8_11H_HTCONFIG)) + goto out; + + for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { + ldtbus = read_pci_config(0, slot, 1, lbnr[i]); + + /* Check if that register is enabled for bus range */ + if ((ldtbus & 7) != 3) + continue; + + min_bus = SECONDARY_LDT_BUS_NUMBER(ldtbus); + max_bus = SUBORDINATE_LDT_BUS_NUMBER(ldtbus); + node = NODE_ID(ldtbus); + for (j = min_bus; j <= max_bus; j++) + mp_bus_to_node[j] = (unsigned char) node; } +out: for (i = 0; i < BUS_NR; i++) { - int node = mp_bus_to_node[i]; + node = mp_bus_to_node[i]; if (node >= 0) printk(KERN_DEBUG "bus: %02x to node: %02x\n", i, node); } -- cgit v0.10.2 From 30a18d6c3f1e774de656ebd8ff219d53e2ba4029 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 03:21:20 -0800 Subject: x86: multi pci root bus with different io resource range, on 64-bit scan AMD opteron io/mmio routing to make sure every pci root bus get correct resource range. Thus later pci scan could assign correct resource to device with unassigned resource. this can fix a system without _CRS for multi pci root bus. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64 index 7d8c467..8fbd198 100644 --- a/arch/x86/pci/Makefile_64 +++ b/arch/x86/pci/Makefile_64 @@ -13,5 +13,5 @@ obj-y += legacy.o irq.o common.o early.o # mmconfig has a 64bit special obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o -obj-$(CONFIG_NUMA) += k8-bus_64.o +obj-y += k8-bus_64.o diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index dab3831..5e8a9d1 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -7,23 +7,29 @@ /* * This discovers the pcibus <-> node mapping on AMD K8. - * - * RED-PEN need to call this again on PCI hotplug - * RED-PEN empty cpus get reported wrong + * also get peer root bus resource for io,mmio */ -#define NODE_ID(dword) ((dword>>4) & 0x07) -#define LDT_BUS_NUMBER_REGISTER_0 0xE0 -#define LDT_BUS_NUMBER_REGISTER_1 0xE4 -#define LDT_BUS_NUMBER_REGISTER_2 0xE8 -#define LDT_BUS_NUMBER_REGISTER_3 0xEC -#define NR_LDT_BUS_NUMBER_REGISTERS 4 -#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) -#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 24) & 0xFF) -#define PCI_DEVICE_ID_K8HTCONFIG 0x1100 -#define PCI_DEVICE_ID_K8_10H_HTCONFIG 0x1200 -#define PCI_DEVICE_ID_K8_11H_HTCONFIG 0x1300 +/* + * sub bus (transparent) will use entres from 3 to store extra from root, + * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? + */ +#define RES_NUM 16 +struct pci_root_info { + char name[12]; + unsigned int res_num; + struct resource res[RES_NUM]; + int bus_min; + int bus_max; + int node; + int link; +}; + +/* 4 at this time, it may become to 32 */ +#define PCI_ROOT_NR 4 +static int pci_root_num; +static struct pci_root_info pci_root_info[PCI_ROOT_NR]; #ifdef CONFIG_NUMA @@ -55,77 +61,375 @@ int get_mp_bus_to_node(int busnum) return node; } - #endif +void set_pci_bus_resources_arch_default(struct pci_bus *b) +{ + int i; + int j; + struct pci_root_info *info; + + if (!pci_root_num) + return; + + for (i = 0; i < pci_root_num; i++) { + if (pci_root_info[i].bus_min == b->number) + break; + } + + if (i == pci_root_num) + return; + + info = &pci_root_info[i]; + for (j = 0; j < info->res_num; j++) { + struct resource *res; + struct resource *root; + + res = &info->res[j]; + b->resource[j] = res; + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + root = &iomem_resource; + insert_resource(root, res); + } +} + +#define RANGE_NUM 16 + +struct res_range { + size_t start; + size_t end; +}; + +static void __init update_range(struct res_range *range, size_t start, + size_t end) +{ + int i; + int j; + + for (j = 0; j < RANGE_NUM; j++) { + if (!range[j].end) + continue; + if (start == range[j].start && end < range[j].end) { + range[j].start = end + 1; + break; + } else if (start == range[j].start && end == range[j].end) { + range[j].start = 0; + range[j].end = 0; + break; + } else if (start > range[j].start && end == range[j].end) { + range[j].end = start - 1; + break; + } else if (start > range[j].start && end < range[j].end) { + /* find the new spare */ + for (i = 0; i < RANGE_NUM; i++) { + if (range[i].end == 0) + break; + } + if (i < RANGE_NUM) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + break; + } + } +} + +static void __init update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge) +{ + int i; + struct resource *res; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + for (i = 0; i < info->res_num; i++) { + res = &info->res[i]; + if (res->flags != flags) + continue; + if (res->end + 1 == start) { + res->end = end; + return; + } else if (end + 1 == res->start) { + res->start = start; + return; + } + } + +addit: + + /* need to add that */ + if (info->res_num >= RES_NUM) + return; + + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + info->res_num++; +} + +struct pci_hostbridge_probe { + u32 bus; + u32 slot; + u32 vendor; + u32 device; +}; + +static struct pci_hostbridge_probe pci_probes[] __initdata = { + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, +}; + /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus * fills the mp_bus_to_cpumask array based according to the LDT Bus Number * Registers found in the K8 northbridge */ -__init static int -early_fill_mp_bus_to_node(void) +static int __init early_fill_mp_bus_info(void) { -#ifdef CONFIG_NUMA - int i, j; + int i; + int j; + unsigned bus; unsigned slot; - u32 ldtbus; - u32 id; + int found; int node; - u16 deviceid; - u16 vendorid; - int min_bus; - int max_bus; - - static int lbnr[NR_LDT_BUS_NUMBER_REGISTERS] = { - LDT_BUS_NUMBER_REGISTER_0, - LDT_BUS_NUMBER_REGISTER_1, - LDT_BUS_NUMBER_REGISTER_2, - LDT_BUS_NUMBER_REGISTER_3 - }; + int link; + int def_node; + int def_link; + struct pci_root_info *info; + u32 reg; + struct resource *res; + size_t start; + size_t end; + struct res_range range[RANGE_NUM]; + u64 val; + u32 address; +#ifdef CONFIG_NUMA for (i = 0; i < BUS_NR; i++) mp_bus_to_node[i] = -1; +#endif if (!early_pci_allowed()) return -1; - slot = 0x18; - id = read_pci_config(0, slot, 0, PCI_VENDOR_ID); + found = 0; + for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + u32 id; + u16 device; + u16 vendor; - vendorid = id & 0xffff; - if (vendorid != PCI_VENDOR_ID_AMD) - goto out; + bus = pci_probes[i].bus; + slot = pci_probes[i].slot; + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); - deviceid = (id>>16) & 0xffff; - if ((deviceid != PCI_DEVICE_ID_K8HTCONFIG) && - (deviceid != PCI_DEVICE_ID_K8_10H_HTCONFIG) && - (deviceid != PCI_DEVICE_ID_K8_11H_HTCONFIG)) - goto out; + vendor = id & 0xffff; + device = (id>>16) & 0xffff; + if (pci_probes[i].vendor == vendor && + pci_probes[i].device == device) { + found = 1; + break; + } + } + + if (!found) + return 0; - for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { - ldtbus = read_pci_config(0, slot, 1, lbnr[i]); + pci_root_num = 0; + for (i = 0; i < 4; i++) { + int min_bus; + int max_bus; + reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); /* Check if that register is enabled for bus range */ - if ((ldtbus & 7) != 3) + if ((reg & 7) != 3) continue; - min_bus = SECONDARY_LDT_BUS_NUMBER(ldtbus); - max_bus = SUBORDINATE_LDT_BUS_NUMBER(ldtbus); - node = NODE_ID(ldtbus); + min_bus = (reg >> 16) & 0xff; + max_bus = (reg >> 24) & 0xff; + node = (reg >> 4) & 0x07; +#ifdef CONFIG_NUMA for (j = min_bus; j <= max_bus; j++) mp_bus_to_node[j] = (unsigned char) node; +#endif + link = (reg >> 8) & 0x03; + + info = &pci_root_info[pci_root_num]; + info->bus_min = min_bus; + info->bus_max = max_bus; + info->node = node; + info->link = link; + sprintf(info->name, "PCI Bus #%02x", min_bus); + pci_root_num++; } -out: + /* get the default node and link for left over res */ + reg = read_pci_config(bus, slot, 0, 0x60); + def_node = (reg >> 8) & 0x07; + reg = read_pci_config(bus, slot, 0, 0x64); + def_link = (reg >> 8) & 0x03; + + memset(range, 0, sizeof(range)); + range[0].end = 0xffff; + /* io port resource */ + for (i = 0; i < 4; i++) { + reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); + if (!(reg & 3)) + continue; + + start = reg & 0xfff000; + reg = read_pci_config(bus, slot, 1, 0xc4 + (i << 3)); + node = reg & 0x07; + link = (reg >> 4) & 0x03; + end = (reg & 0xfff000) | 0xfff; + + /* find the position */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == node && info->link == link) + break; + } + if (j == pci_root_num) + continue; /* not found */ + + info = &pci_root_info[j]; + update_res(info, start, end, IORESOURCE_IO, 0); + update_range(range, start, end); + } + /* add left over io port range to def node/link, [0, 0xffff] */ + /* find the position */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == def_node && info->link == def_link) + break; + } + if (j < pci_root_num) { + info = &pci_root_info[j]; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + + update_res(info, range[i].start, range[i].end, + IORESOURCE_IO, 1); + } + } + + memset(range, 0, sizeof(range)); + /* 0xfd00000000-0xffffffffff for HT */ + /* 0xfc00000000-0xfcffffffff for Family 10h mmconfig*/ + range[0].end = 0xfbffffffffULL; + + /* need to take out [0, TOM) for RAM*/ + address = MSR_K8_TOP_MEM1; + rdmsrl(address, val); + end = (val & 0xffffff8000000ULL); + printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); + if (end < (1ULL<<32)) + update_range(range, 0, end - 1); + + /* mmio resource */ + for (i = 0; i < 8; i++) { + reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); + if (!(reg & 3)) + continue; + + start = reg & 0xffffff00; /* 39:16 on 31:8*/ + start <<= 8; + reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); + node = reg & 0x07; + link = (reg >> 4) & 0x03; + end = (reg & 0xffffff00); + end <<= 8; + end |= 0xffff; + + /* find the position */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == node && info->link == link) + break; + } + if (j == pci_root_num) + continue; /* not found */ + + info = &pci_root_info[j]; + update_res(info, start, end, IORESOURCE_MEM, 0); + update_range(range, start, end); + } + + /* need to take out [4G, TOM2) for RAM*/ + /* SYS_CFG */ + address = MSR_K8_SYSCFG; + rdmsrl(address, val); + /* TOP_MEM2 is enabled? */ + if (val & (1<<21)) { + /* TOP_MEM2 */ + address = MSR_K8_TOP_MEM2; + rdmsrl(address, val); + end = (val & 0xffffff8000000ULL); + printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); + update_range(range, 1ULL<<32, end - 1); + } + + /* + * add left over mmio range to def node/link ? + * that is tricky, just record range in from start_min to 4G + */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == def_node && info->link == def_link) + break; + } + if (j < pci_root_num) { + info = &pci_root_info[j]; + + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + + update_res(info, range[i].start, range[i].end, + IORESOURCE_MEM, 1); + } + } + +#ifdef CONFIG_NUMA for (i = 0; i < BUS_NR; i++) { node = mp_bus_to_node[i]; if (node >= 0) printk(KERN_DEBUG "bus: %02x to node: %02x\n", i, node); } #endif + + for (i = 0; i < pci_root_num; i++) { + int res_num; + int busnum; + + info = &pci_root_info[i]; + res_num = info->res_num; + busnum = info->bus_min; + printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", + info->bus_min, info->bus_max, info->node, info->link); + for (j = 0; j < res_num; j++) { + res = &info->res[j]; + printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", + busnum, j, + (res->flags & IORESOURCE_IO)?"io port":"mmio", + res->start, res->end); + } + } + return 0; } -postcore_initcall(early_fill_mp_bus_to_node); +postcore_initcall(early_fill_mp_bus_info); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index a40043b..4a55bf3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1088,6 +1088,10 @@ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) return max; } +void __attribute__((weak)) set_pci_bus_resources_arch_default(struct pci_bus *b) +{ +} + struct pci_bus * pci_create_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata) { @@ -1147,6 +1151,8 @@ struct pci_bus * pci_create_bus(struct device *parent, b->resource[0] = &ioport_resource; b->resource[1] = &iomem_resource; + set_pci_bus_resources_arch_default(b); + return b; dev_create_file_err: diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 4793ae7..0e6d6b0 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -193,6 +193,9 @@ extern cpumask_t cpu_coregroup_map(int cpu); #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) #endif +struct pci_bus; +void set_pci_bus_resources_arch_default(struct pci_bus *b); + #ifdef CONFIG_SMP #define mc_capable() (boot_cpu_data.x86_max_cores > 1) #define smt_capable() (smp_num_siblings > 1) diff --git a/include/linux/pci.h b/include/linux/pci.h index a71954a..abc998f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -254,7 +254,7 @@ static inline void pci_add_saved_cap(struct pci_dev *pci_dev, #define PCI_NUM_RESOURCES 11 #ifndef PCI_BUS_NUM_RESOURCES -#define PCI_BUS_NUM_RESOURCES 8 +#define PCI_BUS_NUM_RESOURCES 16 #endif #define PCI_REGION_FLAG_MASK 0x0fU /* These bits of resource flags tell us the PCI region flags */ -- cgit v0.10.2 From 6e184f299d696203bc40545b9db216089d88bef7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 6 Mar 2008 01:15:31 -0800 Subject: x86: double check the multi root bus with fam10h mmconf some bioses give same range to mmconf for fam10h msr, and mmio for node/link. fam10h msr will overide mmio for node/link. so we can not assign range to devices under node/link for unassigned resources. this patch will take range out from the mmio for node/link Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index 5e8a9d1..84b2030 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -191,6 +191,34 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, }; +static u64 __initdata fam10h_mmconf_start; +static u64 __initdata fam10h_mmconf_end; +static void __init get_pci_mmcfg_amd_fam10h_range(void) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + /* assume all cpus from fam10h have mmconf */ + if (boot_cpu_data.x86 < 0x10) + return; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enable */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + fam10h_mmconf_start = base; + fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; +} + /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus @@ -305,6 +333,8 @@ static int __init early_fill_mp_bus_info(void) continue; /* not found */ info = &pci_root_info[j]; + printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", + node, link, (u64)start, (u64)end); update_res(info, start, end, IORESOURCE_IO, 0); update_range(range, start, end); } @@ -328,8 +358,7 @@ static int __init early_fill_mp_bus_info(void) memset(range, 0, sizeof(range)); /* 0xfd00000000-0xffffffffff for HT */ - /* 0xfc00000000-0xfcffffffff for Family 10h mmconfig*/ - range[0].end = 0xfbffffffffULL; + range[0].end = (0xfdULL<<32) - 1; /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; @@ -339,6 +368,14 @@ static int __init early_fill_mp_bus_info(void) if (end < (1ULL<<32)) update_range(range, 0, end - 1); + /* get mmconfig */ + get_pci_mmcfg_amd_fam10h_range(); + /* need to take out mmconf range */ + if (fam10h_mmconf_end) { + printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); + update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); + } + /* mmio resource */ for (i = 0; i < 8; i++) { reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); @@ -364,8 +401,51 @@ static int __init early_fill_mp_bus_info(void) continue; /* not found */ info = &pci_root_info[j]; + + printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", + node, link, (u64)start, (u64)end); + /* + * some sick allocation would have range overlap with fam10h + * mmconf range, so need to update start and end. + */ + if (fam10h_mmconf_end) { + int changed = 0; + u64 endx = 0; + if (start >= fam10h_mmconf_start && + start <= fam10h_mmconf_end) { + start = fam10h_mmconf_end + 1; + changed = 1; + } + + if (end >= fam10h_mmconf_start && + end <= fam10h_mmconf_end) { + end = fam10h_mmconf_start - 1; + changed = 1; + } + + if (start < fam10h_mmconf_start && + end > fam10h_mmconf_end) { + /* we got a hole */ + endx = fam10h_mmconf_start - 1; + update_res(info, start, endx, IORESOURCE_MEM, 0); + update_range(range, start, endx); + printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); + start = fam10h_mmconf_end + 1; + changed = 1; + } + if (changed) { + if (start <= end) { + printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); + } else { + printk(KERN_CONT "%s\n", endx?"":" ==> none"); + continue; + } + } + } + update_res(info, start, end, IORESOURCE_MEM, 0); update_range(range, start, end); + printk(KERN_CONT "\n"); } /* need to take out [4G, TOM2) for RAM*/ -- cgit v0.10.2 From 4cf19463745fad81ef2eed3e4e0038da5fd153f8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 11 Apr 2008 15:14:52 -0700 Subject: x86_64: don't need set default res if only have one root bus if there's only one root bus there's no need to split resources. This patch fixes the issue described at: http://lkml.org/lkml/2008/4/10/304 Reported-and-bisected-by: Rafael J. Wysocki Tested-by: Rafael J. Wysocki Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index 84b2030..4e64c58 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -69,7 +69,8 @@ void set_pci_bus_resources_arch_default(struct pci_bus *b) int j; struct pci_root_info *info; - if (!pci_root_num) + /* if only one root bus, don't need to anything */ + if (pci_root_num < 2) return; for (i = 0; i < pci_root_num; i++) { -- cgit v0.10.2 From cbf9bd603ab1fc4d2ecb1c6a4b7bd1cc50a7e82a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 03:21:06 -0800 Subject: acpi: get boot_cpu_id as early for k8_scan_nodes [mingo@elte.hu: split from "x86_64: get boot_cpu_id as early for k8_scan_nodes] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 977ed5c..c49ebcc 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -771,6 +771,32 @@ static void __init acpi_register_lapic_address(unsigned long address) boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); } +static int __init early_acpi_parse_madt_lapic_addr_ovr(void) +{ + int count; + + if (!cpu_has_apic) + return -ENODEV; + + /* + * Note that the LAPIC address is obtained from the MADT (32-bit value) + * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + */ + + count = + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC address override entry\n"); + return count; + } + + acpi_register_lapic_address(acpi_lapic_addr); + + return count; +} + static int __init acpi_parse_madt_lapic_entries(void) { int count; @@ -901,6 +927,33 @@ static inline int acpi_parse_madt_ioapic_entries(void) } #endif /* !CONFIG_X86_IO_APIC */ +static void __init early_acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int error; + + if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + + /* + * Parse MADT LAPIC entries + */ + error = early_acpi_parse_madt_lapic_addr_ovr(); + if (!error) { + acpi_lapic = 1; + smp_found_config = 1; + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + printk(KERN_ERR PREFIX + "Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } +#endif +} + static void __init acpi_process_madt(void) { #ifdef CONFIG_X86_LOCAL_APIC @@ -1233,6 +1286,23 @@ int __init acpi_boot_table_init(void) return 0; } +int __init early_acpi_boot_init(void) +{ + /* + * If acpi_disabled, bail out + * One exception: acpi=ht continues far enough to enumerate LAPICs + */ + if (acpi_disabled && !acpi_ht) + return 1; + + /* + * Process the Multiple APIC Description Table (MADT), if present + */ + early_acpi_process_madt(); + + return 0; +} + int __init acpi_boot_init(void) { /* diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 86808e6..1f476e4 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -13,12 +13,15 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include static __init int find_northbridge(void) { @@ -44,6 +47,30 @@ static __init int find_northbridge(void) return -1; } +static __init void early_get_boot_cpu_id(void) +{ + /* + * need to get boot_cpu_id so can use that to create apicid_to_node + * in k8_scan_nodes() + */ + /* + * Find possible boot-time SMP configuration: + */ + early_find_smp_config(); +#ifdef CONFIG_ACPI + /* + * Read APIC information from ACPI tables. + */ + early_acpi_boot_init(); +#endif + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); + early_init_lapic_mapping(); +} + int __init k8_scan_nodes(unsigned long start, unsigned long end) { unsigned long prevbase; @@ -56,6 +83,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) unsigned cores; unsigned bits; int j; + unsigned apicid_base; if (!early_pci_allowed()) return -1; @@ -174,11 +202,19 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) /* use the coreid bits from early_identify_cpu */ bits = boot_cpu_data.x86_coreid_bits; cores = (1< 0) { + printk(KERN_INFO "BSP APIC ID: %02x\n", + boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } for (i = 0; i < 8; i++) { if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; - for (j = 0; j < cores; j++) + for (j = apicid_base; j < cores + apicid_base; j++) apicid_to_node[(nodeid << bits) + j] = i; setup_node_bootmem(i, nodes[i].start, nodes[i].end); } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2c7e003..41f7ce7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -79,6 +79,7 @@ typedef int (*acpi_table_handler) (struct acpi_table_header *table); typedef int (*acpi_table_entry_handler) (struct acpi_subtable_header *header, const unsigned long end); char * __acpi_map_table (unsigned long phys_addr, unsigned long size); +int early_acpi_boot_init(void); int acpi_boot_init (void); int acpi_boot_table_init (void); int acpi_numa_init (void); @@ -235,6 +236,10 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n, #else /* CONFIG_ACPI */ +static inline int early_acpi_boot_init(void) +{ + return 0; +} static inline int acpi_boot_init(void) { return 0; -- cgit v0.10.2 From e8ee6f0ae5cd860e8e6c02807edfa3c1fa01bcb5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Apr 2008 01:41:58 -0700 Subject: x86: work around io allocation overlap of HT links normally BIOSes assign io/mmio range to different HT links without overlapping, even same node same link should get non overlapping entries. but Rafael L. Wysocki's buggy BIOS creates a link with overlapping entries for mmio and io: node 0 link 0: io port [1000, ffffff] node 0 link 0: mmio [e0000000, efffffff] node 0 link 0: mmio [a0000, bffff] node 0 link 0: mmio [80000000, ffffffff] try to merge them and we will get: bus: [00, ff] on node 0 link 0 bus: 00 index 0 io port: [0, ffff] bus: 00 index 1 mmio: [80000000, fcffffffff] bus: 00 index 2 mmio: [a0000, bffff] so later we will reduce the chance to assign used resource to unassigned device. Reported-by: "Rafael J. Wysocki" Signed-off-by: Yinghai Lu Tested-by: "Rafael J. Wysocki" Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index 4e64c58..ab6d4b1 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -112,17 +112,25 @@ static void __init update_range(struct res_range *range, size_t start, for (j = 0; j < RANGE_NUM; j++) { if (!range[j].end) continue; - if (start == range[j].start && end < range[j].end) { - range[j].start = end + 1; - break; - } else if (start == range[j].start && end == range[j].end) { + + if (start <= range[j].start && end >= range[j].end) { range[j].start = 0; range[j].end = 0; - break; - } else if (start > range[j].start && end == range[j].end) { + continue; + } + + if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { range[j].end = start - 1; - break; - } else if (start > range[j].start && end < range[j].end) { + continue; + } + + if (start > range[j].start && end < range[j].end) { /* find the new spare */ for (i = 0; i < RANGE_NUM; i++) { if (range[i].end == 0) @@ -135,7 +143,7 @@ static void __init update_range(struct res_range *range, size_t start, printk(KERN_ERR "run of slot in ranges\n"); } range[j].end = start - 1; - break; + continue; } } } @@ -151,16 +159,24 @@ static void __init update_res(struct pci_root_info *info, size_t start, /* try to merge it with old one */ for (i = 0; i < info->res_num; i++) { + size_t final_start, final_end; + size_t common_start, common_end; + res = &info->res[i]; if (res->flags != flags) continue; - if (res->end + 1 == start) { - res->end = end; - return; - } else if (end + 1 == res->start) { - res->start = start; - return; - } + + common_start = max((size_t)res->start, start); + common_end = min((size_t)res->end, end); + if (common_start > common_end + 1) + continue; + + final_start = min((size_t)res->start, start); + final_end = max((size_t)res->end, end); + + res->start = final_start; + res->end = final_end; + return; } addit: @@ -336,7 +352,11 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", node, link, (u64)start, (u64)end); - update_res(info, start, end, IORESOURCE_IO, 0); + + /* kernel only handle 16 bit only */ + if (end > 0xffff) + end = 0xffff; + update_res(info, start, end, IORESOURCE_IO, 1); update_range(range, start, end); } /* add left over io port range to def node/link, [0, 0xffff] */ @@ -444,7 +464,7 @@ static int __init early_fill_mp_bus_info(void) } } - update_res(info, start, end, IORESOURCE_MEM, 0); + update_res(info, start, end, IORESOURCE_MEM, 1); update_range(range, start, end); printk(KERN_CONT "\n"); } -- cgit v0.10.2 From 5f0b2976cb2b62668a076f54419c24b8ab677167 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 14 Apr 2008 16:08:25 -0700 Subject: x86: add pci=check_enable_amd_mmconf and dmi check so will disable that feature by default, and only enable that via pci=check_enable_amd_mmconf or for system match with dmi table. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 3789792..edc5fbf 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -6,12 +6,15 @@ #include #include #include +#include #include #include #include #include #include +#include "../pci/pci.h" + struct pci_hostbridge_probe { u32 bus; u32 slot; @@ -176,6 +179,9 @@ void __cpuinit fam10h_check_enable_mmcfg(void) u64 val; u32 address; + if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) + return; + address = MSR_FAM10H_MMIO_CONF_BASE; rdmsrl(address, val); @@ -213,3 +219,25 @@ void __cpuinit fam10h_check_enable_mmcfg(void) FAM10H_MMIO_CONF_ENABLE; wrmsrl(address, val); } + +static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) +{ + pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; + return 0; +} + +static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { + { + .callback = set_check_enable_amd_mmconf, + .ident = "Sun Microsystems Machine", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sun Microsystems"), + }, + }, + {} +}; + +void __init check_enable_amd_mmconf_dmi(void) +{ + dmi_check_system(mmconf_dmi_table); +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index d8a9ee7..2f5c488 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -289,6 +289,18 @@ static void __init parse_setup_data(void) } } +#ifdef CONFIG_PCI_MMCONFIG +extern void __cpuinit fam10h_check_enable_mmcfg(void); +extern void __init check_enable_amd_mmconf_dmi(void); +#else +void __cpuinit fam10h_check_enable_mmcfg(void) +{ +} +void __init check_enable_amd_mmconf_dmi(void) +{ +} +#endif + /* * setup_arch - architecture-specific boot-time initializations * @@ -510,6 +522,9 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + + /* do this before identify_cpu for boot cpu */ + check_enable_amd_mmconf_dmi(); } static int __cpuinit get_model_name(struct cpuinfo_x86 *c) @@ -697,14 +712,6 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } -#ifdef CONFIG_PCI_MMCONFIG -extern void __cpuinit fam10h_check_enable_mmcfg(void); -#else -void __cpuinit fam10h_check_enable_mmcfg(void) -{ -} -#endif - static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 07d5318..2a4d751 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -425,6 +425,10 @@ char * __devinit pcibios_setup(char *str) pci_probe &= ~PCI_PROBE_MMCONF; return NULL; } + else if (!strcmp(str, "check_enable_amd_mmconf")) { + pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; + return NULL; + } #endif else if (!strcmp(str, "noacpi")) { acpi_noirq_set(); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index bdf6224..0cfebec 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -107,6 +107,9 @@ static const char __init *pci_mmcfg_amd_fam10h(void) int i; unsigned segnbits = 0, busnbits; + if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) + return NULL; + address = MSR_FAM10H_MMIO_CONF_BASE; if (rdmsr_safe(address, &low, &high)) return NULL; diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index c8b89a8..8ef86b5 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -26,6 +26,7 @@ #define PCI_ASSIGN_ALL_BUSSES 0x4000 #define PCI_CAN_SKIP_ISA_ALIGN 0x8000 #define PCI_USE__CRS 0x10000 +#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; -- cgit v0.10.2