From 5b889e379f340f43b1252abde5d37a7084c846b9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 7 Nov 2011 18:26:53 -0300 Subject: Fix sb_edac compilation with 32 bits kernels As reported by Josh Boyer : > drivers/edac/sb_edac.c: In function 'get_memory_error_data': > drivers/edac/sb_edac.c:861:2: warning: left shift count >= width of type > [enabled by default] > > ERROR: "__udivdi3" [drivers/edac/sb_edac.ko] undefined! > make[1]: *** [__modpost] Error 1 > make: *** [modules] Error 2 PS.: compile-tested only Reported-by: Josh Boyer Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 5948a21..203361e 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -214,7 +214,7 @@ config EDAC_I7300 config EDAC_SBRIDGE tristate "Intel Sandy-Bridge Integrated MC" - depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL + depends on EDAC_MM_EDAC && PCI && X86 && X86_MCE_INTEL depends on EXPERIMENTAL help Support for error detection and correction the Intel diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 1dc118d..f256a12 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -670,6 +671,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci) u32 reg; u64 limit, prv = 0; u64 tmp_mb; + u32 mb, kb; u32 rir_way; /* @@ -682,8 +684,9 @@ static void get_memory_layout(const struct mem_ctl_info *mci) pvt->tolm = GET_TOLM(reg); tmp_mb = (1 + pvt->tolm) >> 20; - debugf0("TOLM: %Lu.%03Lu GB (0x%016Lx)\n", - tmp_mb / 1000, tmp_mb % 1000, (u64)pvt->tolm); + mb = div_u64_rem(tmp_mb, 1000, &kb); + debugf0("TOLM: %u.%03u GB (0x%016Lx)\n", + mb, kb, (u64)pvt->tolm); /* Address range is already 45:25 */ pci_read_config_dword(pvt->pci_sad1, TOHM, @@ -691,8 +694,9 @@ static void get_memory_layout(const struct mem_ctl_info *mci) pvt->tohm = GET_TOHM(reg); tmp_mb = (1 + pvt->tohm) >> 20; - debugf0("TOHM: %Lu.%03Lu GB (0x%016Lx)", - tmp_mb / 1000, tmp_mb % 1000, (u64)pvt->tohm); + mb = div_u64_rem(tmp_mb, 1000, &kb); + debugf0("TOHM: %u.%03u GB (0x%016Lx)", + mb, kb, (u64)pvt->tohm); /* * Step 2) Get SAD range and SAD Interleave list @@ -714,10 +718,11 @@ static void get_memory_layout(const struct mem_ctl_info *mci) break; tmp_mb = (limit + 1) >> 20; - debugf0("SAD#%d %s up to %Lu.%03Lu GB (0x%016Lx) %s reg=0x%08x\n", + mb = div_u64_rem(tmp_mb, 1000, &kb); + debugf0("SAD#%d %s up to %u.%03u GB (0x%016Lx) %s reg=0x%08x\n", n_sads, get_dram_attr(reg), - tmp_mb / 1000, tmp_mb % 1000, + mb, kb, ((u64)tmp_mb) << 20L, INTERLEAVE_MODE(reg) ? "Interleave: 8:6" : "Interleave: [8:6]XOR[18:16]", reg); @@ -747,8 +752,9 @@ static void get_memory_layout(const struct mem_ctl_info *mci) break; tmp_mb = (limit + 1) >> 20; - debugf0("TAD#%d: up to %Lu.%03Lu GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n", - n_tads, tmp_mb / 1000, tmp_mb % 1000, + mb = div_u64_rem(tmp_mb, 1000, &kb); + debugf0("TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n", + n_tads, mb, kb, ((u64)tmp_mb) << 20L, (u32)TAD_SOCK(reg), (u32)TAD_CH(reg), @@ -771,9 +777,10 @@ static void get_memory_layout(const struct mem_ctl_info *mci) tad_ch_nilv_offset[j], ®); tmp_mb = TAD_OFFSET(reg) >> 20; - debugf0("TAD CH#%d, offset #%d: %Lu.%03Lu GB (0x%016Lx), reg=0x%08x\n", + mb = div_u64_rem(tmp_mb, 1000, &kb); + debugf0("TAD CH#%d, offset #%d: %u.%03u GB (0x%016Lx), reg=0x%08x\n", i, j, - tmp_mb / 1000, tmp_mb % 1000, + mb, kb, ((u64)tmp_mb) << 20L, reg); } @@ -795,9 +802,10 @@ static void get_memory_layout(const struct mem_ctl_info *mci) tmp_mb = RIR_LIMIT(reg) >> 20; rir_way = 1 << RIR_WAY(reg); - debugf0("CH#%d RIR#%d, limit: %Lu.%03Lu GB (0x%016Lx), way: %d, reg=0x%08x\n", + mb = div_u64_rem(tmp_mb, 1000, &kb); + debugf0("CH#%d RIR#%d, limit: %u.%03u GB (0x%016Lx), way: %d, reg=0x%08x\n", i, j, - tmp_mb / 1000, tmp_mb % 1000, + mb, kb, ((u64)tmp_mb) << 20L, rir_way, reg); @@ -808,9 +816,10 @@ static void get_memory_layout(const struct mem_ctl_info *mci) ®); tmp_mb = RIR_OFFSET(reg) << 6; - debugf0("CH#%d RIR#%d INTL#%d, offset %Lu.%03Lu GB (0x%016Lx), tgt: %d, reg=0x%08x\n", + mb = div_u64_rem(tmp_mb, 1000, &kb); + debugf0("CH#%d RIR#%d INTL#%d, offset %u.%03u GB (0x%016Lx), tgt: %d, reg=0x%08x\n", i, j, k, - tmp_mb / 1000, tmp_mb % 1000, + mb, kb, ((u64)tmp_mb) << 20L, (u32)RIR_RNK_TGT(reg), reg); @@ -848,6 +857,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, u8 ch_way,sck_way; u32 tad_offset; u32 rir_way; + u32 mb, kb; u64 ch_addr, offset, limit, prv = 0; @@ -858,7 +868,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, * range (e. g. VGA addresses). It is unlikely, however, that the * memory controller would generate an error on that range. */ - if ((addr > (u64) pvt->tolm) && (addr < (1L << 32))) { + if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) { sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr); edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; @@ -1053,7 +1063,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, ch_addr = addr & 0x7f; /* Remove socket wayness and remove 6 bits */ addr >>= 6; - addr /= sck_xch; + addr = div_u64(addr, sck_xch); #if 0 /* Divide by channel way */ addr = addr / ch_way; @@ -1073,10 +1083,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci, continue; limit = RIR_LIMIT(reg); - - debugf0("RIR#%d, limit: %Lu.%03Lu GB (0x%016Lx), way: %d\n", + mb = div_u64_rem(limit >> 20, 1000, &kb); + debugf0("RIR#%d, limit: %u.%03u GB (0x%016Lx), way: %d\n", n_rir, - (limit >> 20) / 1000, (limit >> 20) % 1000, + mb, kb, limit, 1 << RIR_WAY(reg)); if (ch_addr <= limit) -- cgit v0.10.2 From b877763ea01d740f6c62a35c45b7351fec9c288d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 31 Jan 2012 16:36:08 -0300 Subject: edac/ppc4xx_edac: Fix compilation It seems that nobody is cross-compiling for this arch anymore... drivers/edac/ppc4xx_edac.c: In function 'ppc4xx_edac_probe': drivers/edac/ppc4xx_edac.c:188:12: error: storage class specified for parameter 'ppc4xx_edac_remove' ... drivers/edac/ppc4xx_edac.c:1068:19: error: 'match' undeclared (first use in this function) drivers/edac/ppc4xx_edac.c:1068:19: note: each undeclared identifier is reported only once for each function it appears in drivers/edac/ppc4xx_edac.c:1068:36: warning: left-hand operand of comma expression has no effect [-Wunused-value] Acked-by: Josh Boyer Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index fc75706..d427c69 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -184,7 +184,7 @@ struct ppc4xx_ecc_status { /* Function Prototypes */ -static int ppc4xx_edac_probe(struct platform_device *device) +static int ppc4xx_edac_probe(struct platform_device *device); static int ppc4xx_edac_remove(struct platform_device *device); /* Global Variables */ @@ -1068,7 +1068,7 @@ ppc4xx_edac_mc_init(struct mem_ctl_info *mci, mci->mod_name = PPC4XX_EDAC_MODULE_NAME; mci->mod_ver = PPC4XX_EDAC_MODULE_REVISION; - mci->ctl_name = match->compatible, + mci->ctl_name = ppc4xx_edac_match->compatible, mci->dev_name = np->full_name; /* Initialize callbacks */ -- cgit v0.10.2 From 01a6e28b5096aea6801a21bdc20bf1de32833af5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 3 Feb 2012 13:17:48 -0300 Subject: edac: Improve the comments to better describe the memory concepts The Computer memory terminology has changed with time since EDAC was originally written: new concepts were introduced, and some things have different meanings, depending on the memory architecture. Improve the definition of all related terms. Also, describe each memory type in a more detailed fashion. No functional changes. Just comments were touched. Acked-by: Borislav Petkov Signed-off-by: Mauro Carvalho Chehab diff --git a/include/linux/edac.h b/include/linux/edac.h index 1cd3947..0714d67 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -66,25 +66,64 @@ enum dev_type { #define DEV_FLAG_X32 BIT(DEV_X32) #define DEV_FLAG_X64 BIT(DEV_X64) -/* memory types */ +/** + * enum mem_type - memory types. For a more detailed reference, please see + * http://en.wikipedia.org/wiki/DRAM + * + * @MEM_EMPTY Empty csrow + * @MEM_RESERVED: Reserved csrow type + * @MEM_UNKNOWN: Unknown csrow type + * @MEM_FPM: FPM - Fast Page Mode, used on systems up to 1995. + * @MEM_EDO: EDO - Extended data out, used on systems up to 1998. + * @MEM_BEDO: BEDO - Burst Extended data out, an EDO variant. + * @MEM_SDR: SDR - Single data rate SDRAM + * http://en.wikipedia.org/wiki/Synchronous_dynamic_random-access_memory + * They use 3 pins for chip select: Pins 0 and 2 are + * for rank 0; pins 1 and 3 are for rank 1, if the memory + * is dual-rank. + * @MEM_RDR: Registered SDR SDRAM + * @MEM_DDR: Double data rate SDRAM + * http://en.wikipedia.org/wiki/DDR_SDRAM + * @MEM_RDDR: Registered Double data rate SDRAM + * This is a variant of the DDR memories. + * A registered memory has a buffer inside it, hiding + * part of the memory details to the memory controller. + * @MEM_RMBS: Rambus DRAM, used on a few Pentium III/IV controllers. + * @MEM_DDR2: DDR2 RAM, as described at JEDEC JESD79-2F. + * Those memories are labed as "PC2-" instead of "PC" to + * differenciate from DDR. + * @MEM_FB_DDR2: Fully-Buffered DDR2, as described at JEDEC Std No. 205 + * and JESD206. + * Those memories are accessed per DIMM slot, and not by + * a chip select signal. + * @MEM_RDDR2: Registered DDR2 RAM + * This is a variant of the DDR2 memories. + * @MEM_XDR: Rambus XDR + * It is an evolution of the original RAMBUS memories, + * created to compete with DDR2. Weren't used on any + * x86 arch, but cell_edac PPC memory controller uses it. + * @MEM_DDR3: DDR3 RAM + * @MEM_RDDR3: Registered DDR3 RAM + * This is a variant of the DDR3 memories. + */ enum mem_type { - MEM_EMPTY = 0, /* Empty csrow */ - MEM_RESERVED, /* Reserved csrow type */ - MEM_UNKNOWN, /* Unknown csrow type */ - MEM_FPM, /* Fast page mode */ - MEM_EDO, /* Extended data out */ - MEM_BEDO, /* Burst Extended data out */ - MEM_SDR, /* Single data rate SDRAM */ - MEM_RDR, /* Registered single data rate SDRAM */ - MEM_DDR, /* Double data rate SDRAM */ - MEM_RDDR, /* Registered Double data rate SDRAM */ - MEM_RMBS, /* Rambus DRAM */ - MEM_DDR2, /* DDR2 RAM */ - MEM_FB_DDR2, /* fully buffered DDR2 */ - MEM_RDDR2, /* Registered DDR2 RAM */ - MEM_XDR, /* Rambus XDR */ - MEM_DDR3, /* DDR3 RAM */ - MEM_RDDR3, /* Registered DDR3 RAM */ + MEM_EMPTY = 0, + MEM_RESERVED, + MEM_UNKNOWN, + MEM_FPM, + MEM_EDO, + MEM_BEDO, + MEM_SDR, + MEM_RDR, + MEM_DDR, + MEM_RDDR, + MEM_RMBS, + MEM_DDR2, + MEM_FB_DDR2, + MEM_RDDR2, + MEM_XDR, + MEM_DDR3, + MEM_RDDR3, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -162,8 +201,9 @@ enum scrub_type { #define OP_OFFLINE 0x300 /* - * There are several things to be aware of that aren't at all obvious: + * Concepts used at the EDAC subsystem * + * There are several things to be aware of that aren't at all obvious: * * SOCKETS, SOCKET SETS, BANKS, ROWS, CHIP-SELECT ROWS, CHANNELS, etc.. * @@ -172,36 +212,61 @@ enum scrub_type { * creating a common ground for discussion, terms and their definitions * will be established. * - * Memory devices: The individual chip on a memory stick. These devices - * commonly output 4 and 8 bits each. Grouping several - * of these in parallel provides 64 bits which is common - * for a memory stick. + * Memory devices: The individual DRAM chips on a memory stick. These + * devices commonly output 4 and 8 bits each (x4, x8). + * Grouping several of these in parallel provides the + * number of bits that the memory controller expects: + * typically 72 bits, in order to provide 64 bits + + * 8 bits of ECC data. * * Memory Stick: A printed circuit board that aggregates multiple - * memory devices in parallel. This is the atomic - * memory component that is purchaseable by Joe consumer - * and loaded into a memory socket. + * memory devices in parallel. In general, this is the + * Field Replaceable Unit (FRU) which gets replaced, in + * the case of excessive errors. Most often it is also + * called DIMM (Dual Inline Memory Module). * - * Socket: A physical connector on the motherboard that accepts - * a single memory stick. + * Memory Socket: A physical connector on the motherboard that accepts + * a single memory stick. Also called as "slot" on several + * datasheets. * - * Channel: Set of memory devices on a memory stick that must be - * grouped in parallel with one or more additional - * channels from other memory sticks. This parallel - * grouping of the output from multiple channels are - * necessary for the smallest granularity of memory access. - * Some memory controllers are capable of single channel - - * which means that memory sticks can be loaded - * individually. Other memory controllers are only - * capable of dual channel - which means that memory - * sticks must be loaded as pairs (see "socket set"). + * Channel: A memory controller channel, responsible to communicate + * with a group of DIMMs. Each channel has its own + * independent control (command) and data bus, and can + * be used independently or grouped with other channels. * - * Chip-select row: All of the memory devices that are selected together. - * for a single, minimum grain of memory access. - * This selects all of the parallel memory devices across - * all of the parallel channels. Common chip-select rows - * for single channel are 64 bits, for dual channel 128 - * bits. + * Branch: It is typically the highest hierarchy on a + * Fully-Buffered DIMM memory controller. + * Typically, it contains two channels. + * Two channels at the same branch can be used in single + * mode or in lockstep mode. + * When lockstep is enabled, the cacheline is doubled, + * but it generally brings some performance penalty. + * Also, it is generally not possible to point to just one + * memory stick when an error occurs, as the error + * correction code is calculated using two DIMMs instead + * of one. Due to that, it is capable of correcting more + * errors than on single mode. + * + * Single-channel: The data accessed by the memory controller is contained + * into one dimm only. E. g. if the data is 64 bits-wide, + * the data flows to the CPU using one 64 bits parallel + * access. + * Typically used with SDR, DDR, DDR2 and DDR3 memories. + * FB-DIMM and RAMBUS use a different concept for channel, + * so this concept doesn't apply there. + * + * Double-channel: The data size accessed by the memory controller is + * interlaced into two dimms, accessed at the same time. + * E. g. if the DIMM is 64 bits-wide (72 bits with ECC), + * the data flows to the CPU using a 128 bits parallel + * access. + * + * Chip-select row: This is the name of the DRAM signal used to select the + * DRAM ranks to be accessed. Common chip-select rows for + * single channel are 64 bits, for dual channel 128 bits. + * It may not be visible by the memory controller, as some + * DIMM types have a memory buffer that can hide direct + * access to it from the Memory Controller. * * Single-Ranked stick: A Single-ranked stick has 1 chip-select row of memory. * Motherboards commonly drive two chip-select pins to @@ -214,8 +279,8 @@ enum scrub_type { * * Double-sided stick: DEPRECATED TERM, see Double-Ranked stick. * A double-sided stick has two chip-select rows which - * access different sets of memory devices. The two - * rows cannot be accessed concurrently. "Double-sided" + * access different sets of memory devices. The two + * rows cannot be accessed concurrently. "Double-sided" * is irrespective of the memory devices being mounted * on both sides of the memory stick. * -- cgit v0.10.2 From 22a5c27bec0161c70cc2d8d1730f9b14b604757d Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Mon, 6 Feb 2012 04:10:59 -0300 Subject: edac: sb_edac: Let the driver depend on PCI_MMCONFIG This driver needs to access PCIe Extended Configuration Space Registers (0x100~0xfff), to correctly access those registers, we need to enable PCI_MMCONFIG option. Since this option is not enabled for X86_64 by default, we let the driver depend on it to prevent users forgetting to enable this option. Signed-off-by: Hui Wang Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 203361e..fdffa1b 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -214,8 +214,8 @@ config EDAC_I7300 config EDAC_SBRIDGE tristate "Intel Sandy-Bridge Integrated MC" - depends on EDAC_MM_EDAC && PCI && X86 && X86_MCE_INTEL - depends on EXPERIMENTAL + depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL + depends on PCI_MMCONFIG && EXPERIMENTAL help Support for error detection and correction the Intel Sandy Bridge Integrated Memory Controller. -- cgit v0.10.2 From ad9c40b7dd749d6429c8b21178c125004c6dc75d Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Mon, 6 Feb 2012 04:11:00 -0300 Subject: edac: sb_edac: Fix a INTERLEAVE_MODE() misuse We can identify dram interleave mode from the Dram Rule register rather than Dram Interleave list register. In this context, the reg of INTERLEAVE_MODE(reg) contains the Dram Interleave list register, we can't get interleave mode from the reg, while the variable interleave_mode saves the the mode got from the Dram Rule register, so we use the variable to replace INTERLEAVE_MDDE(reg) here. Signed-off-by: Hui Wang Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index f256a12..843545b 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -923,7 +923,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, addr, limit, sad_way + 7, - INTERLEAVE_MODE(reg) ? "" : "XOR[18:16]"); + interleave_mode ? "" : "XOR[18:16]"); if (interleave_mode) idx = ((addr >> 6) ^ (addr >> 16)) & 7; else -- cgit v0.10.2 From 7fae0db43906ef182b3aee22415c9042ab18d6e6 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Mon, 6 Feb 2012 04:11:01 -0300 Subject: edac: sb_edac: Fix a wrong value setting for the previous value >From the driver design, the variable limit wants to compare with its previous value, we should set the value of limit instead of the value of tmp_mb to the variable prev. Signed-off-by: Hui Wang Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 843545b..2917887 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -763,7 +763,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci) (u32)TAD_TGT2(reg), (u32)TAD_TGT3(reg), reg); - prv = tmp_mb; + prv = limit; } /* -- cgit v0.10.2 From b6378cb3e545912a19e6355aa9171326fdc004d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Fri, 17 Feb 2012 07:36:54 -0300 Subject: edac: i5100 fix erroneous define for M1Err MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to [1] the define for M1Err in the FERR_NF_MEM register is wrong. It should be at position 1 not 0. [1] Intel 5100 Memory Controller Hub Chipset Doc.Nr: 318378 http://www.intel.com/content/dam/doc/datasheet/5100- memory-controller-hub-chipset-datasheet.pdf Reported-by: Ba Thang Nguyen Signed-off-by: Niklas Söderlund Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index bcbdeec..ab2f906 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -49,7 +49,7 @@ #define I5100_FERR_NF_MEM_M6ERR_MASK (1 << 6) #define I5100_FERR_NF_MEM_M5ERR_MASK (1 << 5) #define I5100_FERR_NF_MEM_M4ERR_MASK (1 << 4) -#define I5100_FERR_NF_MEM_M1ERR_MASK 1 +#define I5100_FERR_NF_MEM_M1ERR_MASK (1 << 1) #define I5100_FERR_NF_MEM_ANY_MASK \ (I5100_FERR_NF_MEM_M16ERR_MASK | \ I5100_FERR_NF_MEM_M15ERR_MASK | \ -- cgit v0.10.2 From df95e42e1f20a561f2fe0a632d5b8fd6c26f1bb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Fri, 9 Dec 2011 13:12:15 -0300 Subject: edac: i5100 ack error detection register after each read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If I only ack the detection register after a error have been detected I'm unable to reliably detect errors. I have verified this behavior using both an error injection DIMM and software to inject errors. I can't find any documentation supporting this behavior in Intel 5100 Memory Controller Hub Chipset, see 1. So this is all based on experimentation. [1] Intel® 5100 Memory Controller Hub Chipset http://www.intel.com/content/dam/doc/datasheet/5100- memory-controller-hub-chipset-datasheet.pdf Signed-off-by: Niklas Söderlund Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index ab2f906..2a6e7ff 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -535,23 +535,20 @@ static void i5100_read_log(struct mem_ctl_info *mci, int chan, static void i5100_check_error(struct mem_ctl_info *mci) { struct i5100_priv *priv = mci->pvt_info; - u32 dw; - + u32 dw, dw2; pci_read_config_dword(priv->mc, I5100_FERR_NF_MEM, &dw); if (i5100_ferr_nf_mem_any(dw)) { - u32 dw2; pci_read_config_dword(priv->mc, I5100_NERR_NF_MEM, &dw2); - if (dw2) - pci_write_config_dword(priv->mc, I5100_NERR_NF_MEM, - dw2); - pci_write_config_dword(priv->mc, I5100_FERR_NF_MEM, dw); i5100_read_log(mci, i5100_ferr_nf_mem_chan_indx(dw), i5100_ferr_nf_mem_any(dw), i5100_nerr_nf_mem_any(dw2)); + + pci_write_config_dword(priv->mc, I5100_NERR_NF_MEM, dw2); } + pci_write_config_dword(priv->mc, I5100_FERR_NF_MEM, dw); } /* The i5100 chipset will scrub the entire memory once, then -- cgit v0.10.2 From 0142877aa4e54dd9943fb727e9b386c36c8e3ab7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 12 Feb 2012 08:21:34 -0300 Subject: i5400_edac: Avoid calling pci_put_device() twice When i5400_edac driver is removed and re-loaded a few times, it causes an OOPS, as it is currently decrementing some PCI device usage two times. When called inside a loop, pci_get_device() will call pci_put_device(). That mangles the error count. In this specific case, it seems easier to just duplicate the call. Also fixes the error logic when pci_get_device fails. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 74d6ec34..b44a5de 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -735,7 +735,7 @@ static int i5400_get_devices(struct mem_ctl_info *mci, int dev_idx) /* Attempt to 'get' the MCH register we want */ pdev = NULL; - while (!pvt->branchmap_werrors || !pvt->fsb_error_regs) { + while (1) { pdev = pci_get_device(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5400_ERR, pdev); if (!pdev) { @@ -743,23 +743,42 @@ static int i5400_get_devices(struct mem_ctl_info *mci, int dev_idx) i5400_printk(KERN_ERR, "'system address,Process Bus' " "device not found:" - "vendor 0x%x device 0x%x ERR funcs " + "vendor 0x%x device 0x%x ERR func 1 " "(broken BIOS?)\n", PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5400_ERR); - goto error; + return -ENODEV; } - /* Store device 16 funcs 1 and 2 */ - switch (PCI_FUNC(pdev->devfn)) { - case 1: - pvt->branchmap_werrors = pdev; - break; - case 2: - pvt->fsb_error_regs = pdev; + /* Store device 16 func 1 */ + if (PCI_FUNC(pdev->devfn) == 1) break; + } + pvt->branchmap_werrors = pdev; + + pdev = NULL; + while (1) { + pdev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_5400_ERR, pdev); + if (!pdev) { + /* End of list, leave */ + i5400_printk(KERN_ERR, + "'system address,Process Bus' " + "device not found:" + "vendor 0x%x device 0x%x ERR func 2 " + "(broken BIOS?)\n", + PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_5400_ERR); + + pci_dev_put(pvt->branchmap_werrors); + return -ENODEV; } + + /* Store device 16 func 2 */ + if (PCI_FUNC(pdev->devfn) == 2) + break; } + pvt->fsb_error_regs = pdev; debugf1("System Address, processor bus- PCI Bus ID: %s %x:%x\n", pci_name(pvt->system_address), @@ -778,7 +797,10 @@ static int i5400_get_devices(struct mem_ctl_info *mci, int dev_idx) "MC: 'BRANCH 0' device not found:" "vendor 0x%x device 0x%x Func 0 (broken BIOS?)\n", PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5400_FBD0); - goto error; + + pci_dev_put(pvt->fsb_error_regs); + pci_dev_put(pvt->branchmap_werrors); + return -ENODEV; } /* If this device claims to have more than 2 channels then @@ -796,14 +818,14 @@ static int i5400_get_devices(struct mem_ctl_info *mci, int dev_idx) "(broken BIOS?)\n", PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5400_FBD1); - goto error; + + pci_dev_put(pvt->branch_0); + pci_dev_put(pvt->fsb_error_regs); + pci_dev_put(pvt->branchmap_werrors); + return -ENODEV; } return 0; - -error: - i5400_put_devices(mci); - return -ENODEV; } /* -- cgit v0.10.2 From a4b4be3fd7a76021f67380b03d8bccebf067db72 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 27 Jan 2012 10:26:13 -0300 Subject: edac: rename channel_info to rank_info What it is pointed by a csrow/channel vector is a rank information, and not a channel information. On a traditional architecture, the memory controller directly access the memory ranks, via chip select rows. Different ranks at the same DIMM is selected via different chip select rows. So, typically, one csrow/channel pair means one different DIMM. On FB-DIMMs, there's a microcontroller chip at the DIMM, called Advanced Memory Buffer (AMB) that serves as the interface between the memory controller and the memory chips. The AMB selection is via the DIMM slot, and not via a csrow. It is up to the AMB to talk with the csrows of the DRAM chips. So, the FB-DIMM memory controllers see the DIMM slot, and not the DIMM rank. RAMBUS is similar. Newer memory controllers, like the ones found on Intel Sandy Bridge and Nehalem, even working with normal DDR3 DIMM's, don't use the usual channel A/channel B interleaving schema to provide 128 bits data access. Instead, they have more channels (3 or 4 channels), and they can use several interleaving schemas. Such memory controllers see the DIMMs directly on their registers, instead of the ranks, which is better for the driver, as its main usageis to point to a broken DIMM stick (the Field Repleceable Unit), and not to point to a broken DRAM chip. The drivers that support such such newer memory architecture models currently need to fake information and to abuse on EDAC structures, as the subsystem was conceived with the idea that the csrow would always be visible by the CPU. To make things a little worse, those drivers don't currently fake csrows/channels on a consistent way, as the concepts there don't apply to the memory controllers they're talking with. So, each driver author interpreted the concepts using a different logic. In order to fix it, let's rename the data structure that points into a DIMM rank to "rank_info", in order to be clearer about what's stored there. Latter patches will provide a better way to represent the memory hierarchy for the other types of memory controller. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index ca6c04d..690cbf1 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -39,7 +39,7 @@ static LIST_HEAD(mc_devices); #ifdef CONFIG_EDAC_DEBUG -static void edac_mc_dump_channel(struct channel_info *chan) +static void edac_mc_dump_channel(struct rank_info *chan) { debugf4("\tchannel = %p\n", chan); debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx); @@ -156,7 +156,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, { struct mem_ctl_info *mci; struct csrow_info *csi, *csrow; - struct channel_info *chi, *chp, *chan; + struct rank_info *chi, *chp, *chan; void *pvt; unsigned size; int row, chn; @@ -181,7 +181,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, * rather than an imaginary chunk of memory located at address 0. */ csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi)); - chi = (struct channel_info *)(((char *)mci) + ((unsigned long)chi)); + chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi)); pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; /* setup index and various internal pointers */ diff --git a/include/linux/edac.h b/include/linux/edac.h index 0714d67..e3e3d26 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -308,10 +308,22 @@ enum scrub_type { * PS - I enjoyed writing all that about as much as you enjoyed reading it. */ -struct channel_info { - int chan_idx; /* channel index */ - u32 ce_count; /* Correctable Errors for this CHANNEL */ - char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */ +/** + * struct rank_info - contains the information for one DIMM rank + * + * @chan_idx: channel number where the rank is (typically, 0 or 1) + * @ce_count: number of correctable errors for this rank + * @label: DIMM label. Different ranks for the same DIMM should be + * filled, on userspace, with the same label. + * FIXME: The core currently won't enforce it. + * @csrow: A pointer to the chip select row structure (the parent + * structure). The location of the rank is given by + * the (csrow->csrow_idx, chan_idx) vector. + */ +struct rank_info { + int chan_idx; + u32 ce_count; + char label[EDAC_MC_LABEL_LEN + 1]; struct csrow_info *csrow; /* the parent */ }; @@ -335,7 +347,7 @@ struct csrow_info { /* channel information for this csrow */ u32 nr_channels; - struct channel_info *channels; + struct rank_info *channels; }; struct mcidev_sysfs_group { -- cgit v0.10.2