From 50043e257cedb54af8e1545de38f1289d0a52bc4 Mon Sep 17 00:00:00 2001 From: Jim Snow Date: Tue, 18 Nov 2014 14:51:08 +0100 Subject: sb_edac: Fix off-by-one error in number of channels This prevented edac sysfs code from properly handling 6 channels per memory controller. Signed-off-by: Jim Snow Signed-off-by: Lukasz Anaczkowski Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index a6cd361..670d282 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -372,7 +372,7 @@ static int edac_create_csrow_object(struct mem_ctl_info *mci, { int err, chan; - if (csrow->nr_channels >= EDAC_NR_CHANNELS) + if (csrow->nr_channels > EDAC_NR_CHANNELS) return -ENODEV; csrow->dev.type = &csrow_attr_type; -- cgit v0.10.2 From 8c009100295597f23978c224aec5751a365bc965 Mon Sep 17 00:00:00 2001 From: Jim Snow Date: Tue, 18 Nov 2014 14:51:09 +0100 Subject: sb_edac: Fix erroneous bytes->gigabytes conversion Signed-off-by: Jim Snow Signed-off-by: Lukasz Anaczkowski Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index e9bb1af..f37d01f 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -909,7 +909,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci) u32 reg; u64 limit, prv = 0; u64 tmp_mb; - u32 mb, kb; + u32 gb, mb; u32 rir_way; /* @@ -919,15 +919,17 @@ static void get_memory_layout(const struct mem_ctl_info *mci) pvt->tolm = pvt->info.get_tolm(pvt); tmp_mb = (1 + pvt->tolm) >> 20; - mb = div_u64_rem(tmp_mb, 1000, &kb); - edac_dbg(0, "TOLM: %u.%03u GB (0x%016Lx)\n", mb, kb, (u64)pvt->tolm); + gb = div_u64_rem(tmp_mb, 1024, &mb); + edac_dbg(0, "TOLM: %u.%03u GB (0x%016Lx)\n", + gb, (mb*1000)/1024, (u64)pvt->tolm); /* Address range is already 45:25 */ pvt->tohm = pvt->info.get_tohm(pvt); tmp_mb = (1 + pvt->tohm) >> 20; - mb = div_u64_rem(tmp_mb, 1000, &kb); - edac_dbg(0, "TOHM: %u.%03u GB (0x%016Lx)\n", mb, kb, (u64)pvt->tohm); + gb = div_u64_rem(tmp_mb, 1024, &mb); + edac_dbg(0, "TOHM: %u.%03u GB (0x%016Lx)\n", + gb, (mb*1000)/1024, (u64)pvt->tohm); /* * Step 2) Get SAD range and SAD Interleave list @@ -949,11 +951,11 @@ static void get_memory_layout(const struct mem_ctl_info *mci) break; tmp_mb = (limit + 1) >> 20; - mb = div_u64_rem(tmp_mb, 1000, &kb); + gb = div_u64_rem(tmp_mb, 1024, &mb); edac_dbg(0, "SAD#%d %s up to %u.%03u GB (0x%016Lx) Interleave: %s reg=0x%08x\n", n_sads, get_dram_attr(reg), - mb, kb, + gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, INTERLEAVE_MODE(reg) ? "8:6" : "[8:6]XOR[18:16]", reg); @@ -984,9 +986,9 @@ static void get_memory_layout(const struct mem_ctl_info *mci) break; tmp_mb = (limit + 1) >> 20; - mb = div_u64_rem(tmp_mb, 1000, &kb); + gb = div_u64_rem(tmp_mb, 1024, &mb); edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n", - n_tads, mb, kb, + n_tads, gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, (u32)TAD_SOCK(reg), (u32)TAD_CH(reg), @@ -1009,10 +1011,10 @@ static void get_memory_layout(const struct mem_ctl_info *mci) tad_ch_nilv_offset[j], ®); tmp_mb = TAD_OFFSET(reg) >> 20; - mb = div_u64_rem(tmp_mb, 1000, &kb); + gb = div_u64_rem(tmp_mb, 1024, &mb); edac_dbg(0, "TAD CH#%d, offset #%d: %u.%03u GB (0x%016Lx), reg=0x%08x\n", i, j, - mb, kb, + gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, reg); } @@ -1034,10 +1036,10 @@ static void get_memory_layout(const struct mem_ctl_info *mci) tmp_mb = pvt->info.rir_limit(reg) >> 20; rir_way = 1 << RIR_WAY(reg); - mb = div_u64_rem(tmp_mb, 1000, &kb); + gb = div_u64_rem(tmp_mb, 1024, &mb); edac_dbg(0, "CH#%d RIR#%d, limit: %u.%03u GB (0x%016Lx), way: %d, reg=0x%08x\n", i, j, - mb, kb, + gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, rir_way, reg); @@ -1048,10 +1050,10 @@ static void get_memory_layout(const struct mem_ctl_info *mci) ®); tmp_mb = RIR_OFFSET(reg) << 6; - mb = div_u64_rem(tmp_mb, 1000, &kb); + gb = div_u64_rem(tmp_mb, 1024, &mb); edac_dbg(0, "CH#%d RIR#%d INTL#%d, offset %u.%03u GB (0x%016Lx), tgt: %d, reg=0x%08x\n", i, j, k, - mb, kb, + gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, (u32)RIR_RNK_TGT(reg), reg); @@ -1089,7 +1091,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, u8 ch_way, sck_way, pkg, sad_ha = 0; u32 tad_offset; u32 rir_way; - u32 mb, kb; + u32 mb, gb; u64 ch_addr, offset, limit = 0, prv = 0; @@ -1358,10 +1360,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci, continue; limit = pvt->info.rir_limit(reg); - mb = div_u64_rem(limit >> 20, 1000, &kb); + gb = div_u64_rem(limit >> 20, 1024, &mb); edac_dbg(0, "RIR#%d, limit: %u.%03u GB (0x%016Lx), way: %d\n", n_rir, - mb, kb, + gb, (mb*1000)/1024, limit, 1 << RIR_WAY(reg)); if (ch_addr <= limit) -- cgit v0.10.2 From f7cf2a22a2896d3b3595b71d7936b6d7a3316b00 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 29 Oct 2014 10:36:50 -0700 Subject: sb_edac: Fix discovery of top-of-low-memory for Haswell Haswell moved the TOLM/TOHM registers to a different device and offset. The sb_edac driver accounted for the change of device, but not for the new offset. There was also a typo in the constant to fill in the low 26 bits (was 0x1ffffff, should be 0x3ffffff). This resulted in a bogus value for the top of low memory: EDAC DEBUG: get_memory_layout: TOLM: 0.032 GB (0x0000000001ffffff) which would result in EDAC refusing to translate addresses for errors above the bogus value and below 4GB: sbridge MC3: HANDLING MCE MEMORY ERROR sbridge MC3: CPU 0: Machine Check Event: 0 Bank 7: 8c00004000010090 sbridge MC3: TSC 0 sbridge MC3: ADDR 2000000 sbridge MC3: MISC 523eac86 sbridge MC3: PROCESSOR 0:306f3 TIME 1414600951 SOCKET 0 APIC 0 MC3: 1 CE Error at TOLM area, on addr 0x02000000 on any memory ( page:0x0 offset:0x0 grain:32 syndrome:0x0) With the fix we see the correct TOLM value: DEBUG: get_memory_layout: TOLM: 2.048 GB (0x000000007fffffff) and we decode address 2000000 correctly: sbridge MC3: HANDLING MCE MEMORY ERROR sbridge MC3: CPU 0: Machine Check Event: 0 Bank 7: 8c00004000010090 sbridge MC3: TSC 0 sbridge MC3: ADDR 2000000 sbridge MC3: MISC 523e1086 sbridge MC3: PROCESSOR 0:306f3 TIME 1414601319 SOCKET 0 APIC 0 DEBUG: get_memory_error_data: SAD interleave package: 0 = CPU socket 0, HA 0, shiftup: 0 DEBUG: get_memory_error_data: TAD#0: address 0x0000000002000000 < 0x000000007fffffff, socket interleave 1, channel interleave 4 (offset 0x00000000), index 0, base ch: 0, ch mask: 0x01 DEBUG: get_memory_error_data: RIR#0, limit: 4.095 GB (0x00000000ffffffff), way: 1 DEBUG: get_memory_error_data: RIR#0: channel address 0x00200000 < 0xffffffff, RIR interleave 0, index 0 DEBUG: sbridge_mce_output_error: area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0 MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x2000 offset:0x0 grain:32 syndrome:0x0 - area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0) Signed-off-by: Tony Luck Acked-by: Aristeu Rozanski Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index f37d01f..ead0bf9 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -135,6 +135,7 @@ static inline int sad_pkg(const struct interleave_pkg *table, u32 reg, #define TOLM 0x80 #define TOHM 0x84 +#define HASWELL_TOLM 0xd0 #define HASWELL_TOHM_0 0xd4 #define HASWELL_TOHM_1 0xd8 @@ -706,8 +707,8 @@ static u64 haswell_get_tolm(struct sbridge_pvt *pvt) { u32 reg; - pci_read_config_dword(pvt->info.pci_vtd, TOLM, ®); - return (GET_BITFIELD(reg, 26, 31) << 26) | 0x1ffffff; + pci_read_config_dword(pvt->info.pci_vtd, HASWELL_TOLM, ®); + return (GET_BITFIELD(reg, 26, 31) << 26) | 0x3ffffff; } static u64 haswell_get_tohm(struct sbridge_pvt *pvt) -- cgit v0.10.2 From 1f39581a9a7a3387f40f379478d8827878c57530 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 2 Dec 2014 09:27:30 -0800 Subject: sb_edac: Add support for Broadwell-DE processor Broadwell-DE is the microserver version of next generation Xeon processors. A whole bunch of new PCIe device ids, but otherwise pretty much the same as Haswell. Acked-by: Aristeu Rozanski Signed-off-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index ead0bf9..993e8b6 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -262,6 +262,7 @@ enum type { SANDY_BRIDGE, IVY_BRIDGE, HASWELL, + BROADWELL, }; struct sbridge_pvt; @@ -446,7 +447,7 @@ static const struct pci_id_table pci_dev_descr_ibridge_table[] = { * - each SMI channel interfaces with a scalable memory buffer * - each scalable memory buffer supports 4 DDR3/DDR4 channels, 3 DPC */ -#define HASWELL_DDRCRCLKCONTROLS 0xa10 +#define HASWELL_DDRCRCLKCONTROLS 0xa10 /* Ditto on Broadwell */ #define HASWELL_HASYSDEFEATURE2 0x84 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_VTD_MISC 0x2f28 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0 0x2fa0 @@ -498,12 +499,53 @@ static const struct pci_id_table pci_dev_descr_haswell_table[] = { }; /* + * Broadwell support + * + * DE processor: + * - 1 IMC + * - 2 DDR3 channels, 2 DPC per channel + */ +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_VTD_MISC 0x6f28 +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0 0x6fa0 +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA 0x6fa8 +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_THERMAL 0x6f71 +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD0 0x6ffc +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD1 0x6ffd +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0 0x6faa +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD1 0x6fab +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD2 0x6fac +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD3 0x6fad +#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_DDRIO0 0x6faf + +static const struct pci_id_descr pci_dev_descr_broadwell[] = { + /* first item must be the HA */ + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0, 0) }, + + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD0, 0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD1, 0) }, + + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA, 0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_THERMAL, 0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0, 0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD1, 0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD2, 0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD3, 0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_DDRIO0, 1) }, +}; + +static const struct pci_id_table pci_dev_descr_broadwell_table[] = { + PCI_ID_TABLE_ENTRY(pci_dev_descr_broadwell), + {0,} /* 0 terminated list. */ +}; + +/* * pci_device_id table for which devices we are looking for */ static const struct pci_device_id sbridge_pci_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0)}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0)}, {0,} /* 0 terminated list. */ }; @@ -768,12 +810,22 @@ static int check_if_ecc_is_active(const u8 bus, enum type type) struct pci_dev *pdev = NULL; u32 mcmtr, id; - if (type == IVY_BRIDGE) + switch (type) { + case IVY_BRIDGE: id = PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA; - else if (type == HASWELL) + break; + case HASWELL: id = PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TA; - else + break; + case SANDY_BRIDGE: id = PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA; + break; + case BROADWELL: + id = PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA; + break; + default: + return -ENODEV; + } pdev = get_pdev_same_bus(bus, id); if (!pdev) { @@ -801,7 +853,7 @@ static int get_dimm_config(struct mem_ctl_info *mci) enum edac_type mode; enum mem_type mtype; - if (pvt->info.type == HASWELL) + if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) pci_read_config_dword(pvt->pci_sad1, SAD_TARGET, ®); else pci_read_config_dword(pvt->pci_br0, SAD_TARGET, ®); @@ -1182,7 +1234,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, *socket = sad_interleave[idx]; edac_dbg(0, "SAD interleave index: %d (wayness %d) = CPU socket %d\n", idx, sad_way, *socket); - } else if (pvt->info.type == HASWELL) { + } else if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) { int bits, a7mode = A7MODE(dram_rule); if (a7mode) { @@ -1831,6 +1883,82 @@ enodev: return -ENODEV; } +static int broadwell_mci_bind_devs(struct mem_ctl_info *mci, + struct sbridge_dev *sbridge_dev) +{ + struct sbridge_pvt *pvt = mci->pvt_info; + struct pci_dev *pdev; + int i; + + /* there's only one device per system; not tied to any bus */ + if (pvt->info.pci_vtd == NULL) + /* result will be checked later */ + pvt->info.pci_vtd = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_BROADWELL_IMC_VTD_MISC, + NULL); + + for (i = 0; i < sbridge_dev->n_devs; i++) { + pdev = sbridge_dev->pdev[i]; + if (!pdev) + continue; + + switch (pdev->device) { + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD0: + pvt->pci_sad0 = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD1: + pvt->pci_sad1 = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0: + pvt->pci_ha0 = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA: + pvt->pci_ta = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_THERMAL: + pvt->pci_ras = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0: + pvt->pci_tad[0] = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD1: + pvt->pci_tad[1] = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD2: + pvt->pci_tad[2] = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD3: + pvt->pci_tad[3] = pdev; + break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_DDRIO0: + pvt->pci_ddrio = pdev; + break; + default: + break; + } + + edac_dbg(0, "Associated PCI %02x.%02d.%d with dev = %p\n", + sbridge_dev->bus, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), + pdev); + } + + /* Check if everything were registered */ + if (!pvt->pci_sad0 || !pvt->pci_ha0 || !pvt->pci_sad1 || + !pvt->pci_ras || !pvt->pci_ta || !pvt->info.pci_vtd) + goto enodev; + + for (i = 0; i < NUM_CHANNELS; i++) { + if (!pvt->pci_tad[i]) + goto enodev; + } + return 0; + +enodev: + sbridge_printk(KERN_ERR, "Some needed devices are missing\n"); + return -ENODEV; +} + /**************************************************************************** Error check routines ****************************************************************************/ @@ -2243,6 +2371,25 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) if (unlikely(rc < 0)) goto fail0; break; + case BROADWELL: + /* rankcfgr isn't used */ + pvt->info.get_tolm = haswell_get_tolm; + pvt->info.get_tohm = haswell_get_tohm; + pvt->info.dram_rule = ibridge_dram_rule; + pvt->info.get_memory_type = haswell_get_memory_type; + pvt->info.get_node_id = haswell_get_node_id; + pvt->info.rir_limit = haswell_rir_limit; + pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule); + pvt->info.interleave_list = ibridge_interleave_list; + pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list); + pvt->info.interleave_pkg = ibridge_interleave_pkg; + mci->ctl_name = kasprintf(GFP_KERNEL, "Broadwell Socket#%d", mci->mc_idx); + + /* Store pci devices at mci for faster access */ + rc = broadwell_mci_bind_devs(mci, sbridge_dev); + if (unlikely(rc < 0)) + goto fail0; + break; } /* Get dimm basic config and the memory layout */ @@ -2308,6 +2455,10 @@ static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id) rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_haswell_table); type = HASWELL; break; + case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0: + rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_broadwell_table); + type = BROADWELL; + break; } if (unlikely(rc < 0)) goto fail0; -- cgit v0.10.2 From fec53af531dd040e41fe358abe00b33747af2688 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 2 Dec 2014 09:41:58 -0800 Subject: sb_edac: Fix typo computing number of banks Code will always think there are 16 banks because of a typo Reported-by: Misha Signed-off-by: Tony Luck Acked-by: Aristeu Rozanski Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 993e8b6..63aa673 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -901,7 +901,7 @@ static int get_dimm_config(struct mem_ctl_info *mci) else edac_dbg(0, "Memory is unregistered\n"); - if (mtype == MEM_DDR4 || MEM_RDDR4) + if (mtype == MEM_DDR4 || mtype == MEM_RDDR4) banks = 16; else banks = 8; -- cgit v0.10.2