From 885b976fada5bc6595a9fd3e67e3cb1a3d11f50b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 21 Feb 2011 13:54:41 +0800 Subject: ACPI, APEI, Add ERST record ID cache APEI ERST firmware interface and implementation has no multiple users in mind. For example, if there is four records in storage with ID: 1, 2, 3 and 4, if two ERST readers enumerate the records via GET_NEXT_RECORD_ID as follow, reader 1 reader 2 1 2 3 4 -1 -1 where -1 signals there is no more record ID. Reader 1 has no chance to check record 2 and 4, while reader 2 has no chance to check record 1 and 3. And any other GET_NEXT_RECORD_ID will return -1, that is, other readers will has no chance to check any record even they are not cleared by anyone. This makes raw GET_NEXT_RECORD_ID not suitable for used by multiple users. To solve the issue, an in-memory ERST record ID cache is designed and implemented. When enumerating record ID, the ID returned by GET_NEXT_RECORD_ID is added into cache in addition to be returned to caller. So other readers can check the cache to get all record ID available. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Signed-off-by: Len Brown diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472..83930de 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c index de73caf..a4cfb64 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -43,12 +43,27 @@ static DEFINE_MUTEX(erst_dbg_mutex); static int erst_dbg_open(struct inode *inode, struct file *file) { + int rc, *pos; + if (erst_disable) return -ENODEV; + pos = (int *)&file->private_data; + + rc = erst_get_record_id_begin(pos); + if (rc) + return rc; + return nonseekable_open(inode, file); } +static int erst_dbg_release(struct inode *inode, struct file *file) +{ + erst_get_record_id_end(); + + return 0; +} + static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { int rc; @@ -79,18 +94,20 @@ static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg) static ssize_t erst_dbg_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { - int rc; + int rc, *pos; ssize_t len = 0; u64 id; - if (*off != 0) + if (*off) return -EINVAL; if (mutex_lock_interruptible(&erst_dbg_mutex) != 0) return -EINTR; + pos = (int *)&filp->private_data; + retry_next: - rc = erst_get_next_record_id(&id); + rc = erst_get_record_id_next(pos, &id); if (rc) goto out; /* no more record */ @@ -181,6 +198,7 @@ out: static const struct file_operations erst_dbg_ops = { .owner = THIS_MODULE, .open = erst_dbg_open, + .release = erst_dbg_release, .read = erst_dbg_read, .write = erst_dbg_write, .unlocked_ioctl = erst_dbg_ioctl, diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index cf6db6b..8ff8c32 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -429,6 +429,22 @@ ssize_t erst_get_record_count(void) } EXPORT_SYMBOL_GPL(erst_get_record_count); +#define ERST_RECORD_ID_CACHE_SIZE_MIN 16 +#define ERST_RECORD_ID_CACHE_SIZE_MAX 1024 + +struct erst_record_id_cache { + struct mutex lock; + u64 *entries; + int len; + int size; + int refcount; +}; + +static struct erst_record_id_cache erst_record_id_cache = { + .lock = __MUTEX_INITIALIZER(erst_record_id_cache.lock), + .refcount = 0, +}; + static int __erst_get_next_record_id(u64 *record_id) { struct apei_exec_context ctx; @@ -443,26 +459,179 @@ static int __erst_get_next_record_id(u64 *record_id) return 0; } +int erst_get_record_id_begin(int *pos) +{ + int rc; + + if (erst_disable) + return -ENODEV; + + rc = mutex_lock_interruptible(&erst_record_id_cache.lock); + if (rc) + return rc; + erst_record_id_cache.refcount++; + mutex_unlock(&erst_record_id_cache.lock); + + *pos = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(erst_get_record_id_begin); + +/* erst_record_id_cache.lock must be held by caller */ +static int __erst_record_id_cache_add_one(void) +{ + u64 id, prev_id, first_id; + int i, rc; + u64 *entries; + unsigned long flags; + + id = prev_id = first_id = APEI_ERST_INVALID_RECORD_ID; +retry: + raw_spin_lock_irqsave(&erst_lock, flags); + rc = __erst_get_next_record_id(&id); + raw_spin_unlock_irqrestore(&erst_lock, flags); + if (rc == -ENOENT) + return 0; + if (rc) + return rc; + if (id == APEI_ERST_INVALID_RECORD_ID) + return 0; + /* can not skip current ID, or loop back to first ID */ + if (id == prev_id || id == first_id) + return 0; + if (first_id == APEI_ERST_INVALID_RECORD_ID) + first_id = id; + prev_id = id; + + entries = erst_record_id_cache.entries; + for (i = 0; i < erst_record_id_cache.len; i++) { + if (entries[i] == id) + break; + } + /* record id already in cache, try next */ + if (i < erst_record_id_cache.len) + goto retry; + if (erst_record_id_cache.len >= erst_record_id_cache.size) { + int new_size, alloc_size; + u64 *new_entries; + + new_size = erst_record_id_cache.size * 2; + new_size = clamp_val(new_size, ERST_RECORD_ID_CACHE_SIZE_MIN, + ERST_RECORD_ID_CACHE_SIZE_MAX); + if (new_size <= erst_record_id_cache.size) { + if (printk_ratelimit()) + pr_warning(FW_WARN ERST_PFX + "too many record ID!\n"); + return 0; + } + alloc_size = new_size * sizeof(entries[0]); + if (alloc_size < PAGE_SIZE) + new_entries = kmalloc(alloc_size, GFP_KERNEL); + else + new_entries = vmalloc(alloc_size); + if (!new_entries) + return -ENOMEM; + memcpy(new_entries, entries, + erst_record_id_cache.len * sizeof(entries[0])); + if (erst_record_id_cache.size < PAGE_SIZE) + kfree(entries); + else + vfree(entries); + erst_record_id_cache.entries = entries = new_entries; + erst_record_id_cache.size = new_size; + } + entries[i] = id; + erst_record_id_cache.len++; + + return 1; +} + /* * Get the record ID of an existing error record on the persistent * storage. If there is no error record on the persistent storage, the * returned record_id is APEI_ERST_INVALID_RECORD_ID. */ -int erst_get_next_record_id(u64 *record_id) +int erst_get_record_id_next(int *pos, u64 *record_id) { - int rc; - unsigned long flags; + int rc = 0; + u64 *entries; if (erst_disable) return -ENODEV; - raw_spin_lock_irqsave(&erst_lock, flags); - rc = __erst_get_next_record_id(record_id); - raw_spin_unlock_irqrestore(&erst_lock, flags); + /* must be enclosed by erst_get_record_id_begin/end */ + BUG_ON(!erst_record_id_cache.refcount); + BUG_ON(*pos < 0 || *pos > erst_record_id_cache.len); + + mutex_lock(&erst_record_id_cache.lock); + entries = erst_record_id_cache.entries; + for (; *pos < erst_record_id_cache.len; (*pos)++) + if (entries[*pos] != APEI_ERST_INVALID_RECORD_ID) + break; + /* found next record id in cache */ + if (*pos < erst_record_id_cache.len) { + *record_id = entries[*pos]; + (*pos)++; + goto out_unlock; + } + + /* Try to add one more record ID to cache */ + rc = __erst_record_id_cache_add_one(); + if (rc < 0) + goto out_unlock; + /* successfully add one new ID */ + if (rc == 1) { + *record_id = erst_record_id_cache.entries[*pos]; + (*pos)++; + rc = 0; + } else { + *pos = -1; + *record_id = APEI_ERST_INVALID_RECORD_ID; + } +out_unlock: + mutex_unlock(&erst_record_id_cache.lock); return rc; } -EXPORT_SYMBOL_GPL(erst_get_next_record_id); +EXPORT_SYMBOL_GPL(erst_get_record_id_next); + +/* erst_record_id_cache.lock must be held by caller */ +static void __erst_record_id_cache_compact(void) +{ + int i, wpos = 0; + u64 *entries; + + if (erst_record_id_cache.refcount) + return; + + entries = erst_record_id_cache.entries; + for (i = 0; i < erst_record_id_cache.len; i++) { + if (entries[i] == APEI_ERST_INVALID_RECORD_ID) + continue; + if (wpos != i) + memcpy(&entries[wpos], &entries[i], sizeof(entries[i])); + wpos++; + } + erst_record_id_cache.len = wpos; +} + +void erst_get_record_id_end(void) +{ + /* + * erst_disable != 0 should be detected by invoker via the + * return value of erst_get_record_id_begin/next, so this + * function should not be called for erst_disable != 0. + */ + BUG_ON(erst_disable); + + mutex_lock(&erst_record_id_cache.lock); + erst_record_id_cache.refcount--; + BUG_ON(erst_record_id_cache.refcount < 0); + __erst_record_id_cache_compact(); + mutex_unlock(&erst_record_id_cache.lock); +} +EXPORT_SYMBOL_GPL(erst_get_record_id_end); static int __erst_write_to_storage(u64 offset) { @@ -703,56 +872,34 @@ ssize_t erst_read(u64 record_id, struct cper_record_header *record, } EXPORT_SYMBOL_GPL(erst_read); -/* - * If return value > buflen, the buffer size is not big enough, - * else if return value = 0, there is no more record to read, - * else if return value < 0, something goes wrong, - * else everything is OK, and return value is record length - */ -ssize_t erst_read_next(struct cper_record_header *record, size_t buflen) -{ - int rc; - ssize_t len; - unsigned long flags; - u64 record_id; - - if (erst_disable) - return -ENODEV; - - raw_spin_lock_irqsave(&erst_lock, flags); - rc = __erst_get_next_record_id(&record_id); - if (rc) { - raw_spin_unlock_irqrestore(&erst_lock, flags); - return rc; - } - /* no more record */ - if (record_id == APEI_ERST_INVALID_RECORD_ID) { - raw_spin_unlock_irqrestore(&erst_lock, flags); - return 0; - } - - len = __erst_read(record_id, record, buflen); - raw_spin_unlock_irqrestore(&erst_lock, flags); - - return len; -} -EXPORT_SYMBOL_GPL(erst_read_next); - int erst_clear(u64 record_id) { - int rc; + int rc, i; unsigned long flags; + u64 *entries; if (erst_disable) return -ENODEV; + rc = mutex_lock_interruptible(&erst_record_id_cache.lock); + if (rc) + return rc; raw_spin_lock_irqsave(&erst_lock, flags); if (erst_erange.attr & ERST_RANGE_NVRAM) rc = __erst_clear_from_nvram(record_id); else rc = __erst_clear_from_storage(record_id); raw_spin_unlock_irqrestore(&erst_lock, flags); - + if (rc) + goto out; + entries = erst_record_id_cache.entries; + for (i = 0; i < erst_record_id_cache.len; i++) { + if (entries[i] == record_id) + entries[i] = APEI_ERST_INVALID_RECORD_ID; + } + __erst_record_id_cache_compact(); +out: + mutex_unlock(&erst_record_id_cache.lock); return rc; } EXPORT_SYMBOL_GPL(erst_clear); diff --git a/include/acpi/apei.h b/include/acpi/apei.h index c4dbb13..e67b523 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -30,10 +30,11 @@ int apei_hest_parse(apei_hest_func_t func, void *data); int erst_write(const struct cper_record_header *record); ssize_t erst_get_record_count(void); -int erst_get_next_record_id(u64 *record_id); +int erst_get_record_id_begin(int *pos); +int erst_get_record_id_next(int *pos, u64 *record_id); +void erst_get_record_id_end(void); ssize_t erst_read(u64 record_id, struct cper_record_header *record, size_t buflen); -ssize_t erst_read_next(struct cper_record_header *record, size_t buflen); int erst_clear(u64 record_id); #endif -- cgit v0.10.2 From b64a44146540a4761bb1cf8047fffd9dbf0c3090 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 21 Feb 2011 13:54:42 +0800 Subject: PCIe, AER, use pre-generated prefix in error information printing When printing PCIe AER error information, each line is prefixed with PCIe device and driver information. In original implementation, the prefix is generated when each line is printed. In fact, all lines share the same prefix. So this patch pre-generated the prefix, and use that one when each line is printed. In addition to common prefix can be pre-generated, the trailing white spaces in string constants and NULLs in char * array constants can be removed too. These can reduce the object file size further. The size of object file before and after changing is as follow: text data bss dec before: 3038 0 0 3038 after: 2118 0 0 2118 Signed-off-by: Huang Ying CC: Jesse Barnes CC: Zhang Yanmin Signed-off-by: Len Brown diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c index 9d3e4c8..7a237f6 100644 --- a/drivers/pci/pcie/aer/aerdrv_errprint.c +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c @@ -57,86 +57,44 @@ (e & AER_DATA_LINK_LAYER_ERROR_MASK(t)) ? AER_DATA_LINK_LAYER_ERROR : \ AER_TRANSACTION_LAYER_ERROR) -#define AER_PR(info, pdev, fmt, args...) \ - printk("%s%s %s: " fmt, (info->severity == AER_CORRECTABLE) ? \ - KERN_WARNING : KERN_ERR, dev_driver_string(&pdev->dev), \ - dev_name(&pdev->dev), ## args) - /* * AER error strings */ -static char *aer_error_severity_string[] = { +static const char *aer_error_severity_string[] = { "Uncorrected (Non-Fatal)", "Uncorrected (Fatal)", "Corrected" }; -static char *aer_error_layer[] = { +static const char *aer_error_layer[] = { "Physical Layer", "Data Link Layer", "Transaction Layer" }; -static char *aer_correctable_error_string[] = { - "Receiver Error ", /* Bit Position 0 */ - NULL, - NULL, - NULL, - NULL, - NULL, - "Bad TLP ", /* Bit Position 6 */ - "Bad DLLP ", /* Bit Position 7 */ - "RELAY_NUM Rollover ", /* Bit Position 8 */ - NULL, - NULL, - NULL, - "Replay Timer Timeout ", /* Bit Position 12 */ - "Advisory Non-Fatal ", /* Bit Position 13 */ - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, + +static const char *aer_correctable_error_string[] = { + "Receiver Error", /* Bit Position 0 */ NULL, NULL, NULL, NULL, NULL, + "Bad TLP", /* Bit Position 6 */ + "Bad DLLP", /* Bit Position 7 */ + "RELAY_NUM Rollover", /* Bit Position 8 */ NULL, NULL, NULL, + "Replay Timer Timeout", /* Bit Position 12 */ + "Advisory Non-Fatal", /* Bit Position 13 */ }; -static char *aer_uncorrectable_error_string[] = { - NULL, - NULL, - NULL, - NULL, - "Data Link Protocol ", /* Bit Position 4 */ - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - "Poisoned TLP ", /* Bit Position 12 */ - "Flow Control Protocol ", /* Bit Position 13 */ - "Completion Timeout ", /* Bit Position 14 */ - "Completer Abort ", /* Bit Position 15 */ - "Unexpected Completion ", /* Bit Position 16 */ - "Receiver Overflow ", /* Bit Position 17 */ - "Malformed TLP ", /* Bit Position 18 */ - "ECRC ", /* Bit Position 19 */ - "Unsupported Request ", /* Bit Position 20 */ +static const char *aer_uncorrectable_error_string[] = { NULL, NULL, NULL, NULL, + "Data Link Protocol", /* Bit Position 4 */ NULL, NULL, NULL, @@ -144,19 +102,29 @@ static char *aer_uncorrectable_error_string[] = { NULL, NULL, NULL, + "Poisoned TLP", /* Bit Position 12 */ + "Flow Control Protocol", /* Bit Position 13 */ + "Completion Timeout", /* Bit Position 14 */ + "Completer Abort", /* Bit Position 15 */ + "Unexpected Completion", /* Bit Position 16 */ + "Receiver Overflow", /* Bit Position 17 */ + "Malformed TLP", /* Bit Position 18 */ + "ECRC", /* Bit Position 19 */ + "Unsupported Request", /* Bit Position 20 */ }; -static char *aer_agent_string[] = { +static const char *aer_agent_string[] = { "Receiver ID", "Requester ID", "Completer ID", "Transmitter ID" }; -static void __aer_print_error(struct aer_err_info *info, struct pci_dev *dev) +static void __aer_print_error(const char *prefix, + struct aer_err_info *info) { int i, status; - char *errmsg = NULL; + const char *errmsg = NULL; status = (info->status & ~info->mask); @@ -165,15 +133,17 @@ static void __aer_print_error(struct aer_err_info *info, struct pci_dev *dev) continue; if (info->severity == AER_CORRECTABLE) - errmsg = aer_correctable_error_string[i]; + errmsg = i < ARRAY_SIZE(aer_correctable_error_string) ? + aer_correctable_error_string[i] : NULL; else - errmsg = aer_uncorrectable_error_string[i]; + errmsg = i < ARRAY_SIZE(aer_uncorrectable_error_string) ? + aer_uncorrectable_error_string[i] : NULL; if (errmsg) - AER_PR(info, dev, " [%2d] %s%s\n", i, errmsg, + printk("%s"" [%2d] %-22s%s\n", prefix, i, errmsg, info->first_error == i ? " (First)" : ""); else - AER_PR(info, dev, " [%2d] Unknown Error Bit%s\n", i, + printk("%s"" [%2d] Unknown Error Bit%s\n", prefix, i, info->first_error == i ? " (First)" : ""); } } @@ -181,11 +151,15 @@ static void __aer_print_error(struct aer_err_info *info, struct pci_dev *dev) void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { int id = ((dev->bus->number << 8) | dev->devfn); + char prefix[44]; + + snprintf(prefix, sizeof(prefix), "%s%s %s: ", + (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR, + dev_driver_string(&dev->dev), dev_name(&dev->dev)); if (info->status == 0) { - AER_PR(info, dev, - "PCIe Bus Error: severity=%s, type=Unaccessible, " - "id=%04x(Unregistered Agent ID)\n", + printk("%s""PCIe Bus Error: severity=%s, type=Unaccessible, " + "id=%04x(Unregistered Agent ID)\n", prefix, aer_error_severity_string[info->severity], id); } else { int layer, agent; @@ -193,23 +167,22 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) layer = AER_GET_LAYER_ERROR(info->severity, info->status); agent = AER_GET_AGENT(info->severity, info->status); - AER_PR(info, dev, - "PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n", - aer_error_severity_string[info->severity], + printk("%s""PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n", + prefix, aer_error_severity_string[info->severity], aer_error_layer[layer], id, aer_agent_string[agent]); - AER_PR(info, dev, - " device [%04x:%04x] error status/mask=%08x/%08x\n", - dev->vendor, dev->device, info->status, info->mask); + printk("%s"" device [%04x:%04x] error status/mask=%08x/%08x\n", + prefix, dev->vendor, dev->device, + info->status, info->mask); - __aer_print_error(info, dev); + __aer_print_error(prefix, info); if (info->tlp_header_valid) { unsigned char *tlp = (unsigned char *) &info->tlp; - AER_PR(info, dev, " TLP Header:" + printk("%s"" TLP Header:" " %02x%02x%02x%02x %02x%02x%02x%02x" " %02x%02x%02x%02x %02x%02x%02x%02x\n", - *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp, + prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp, *(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4), *(tlp + 11), *(tlp + 10), *(tlp + 9), *(tlp + 8), *(tlp + 15), *(tlp + 14), @@ -218,8 +191,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) } if (info->id && info->error_dev_num > 1 && info->id == id) - AER_PR(info, dev, - " Error of this Agent(%04x) is reported first\n", id); + printk("%s"" Error of this Agent(%04x) is reported first\n", + prefix, id); } void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info) -- cgit v0.10.2 From c413d7682020a127f54744a1b30f597692aea1fd Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 21 Feb 2011 13:54:43 +0800 Subject: ACPI, APEI, Add PCIe AER error information printing support The AER error information printing support is implemented in drivers/pci/pcie/aer/aer_print.c. So some string constants, functions and macros definitions can be re-used without being exported. The original PCIe AER error information printing function is not re-used directly because the overall format is quite different. And changing the original printing format may make some original users' scripts broken. Signed-off-by: Huang Ying CC: Jesse Barnes CC: Zhang Yanmin Signed-off-by: Len Brown diff --git a/Documentation/acpi/apei/output_format.txt b/Documentation/acpi/apei/output_format.txt index 9146952..0c49c19 100644 --- a/Documentation/acpi/apei/output_format.txt +++ b/Documentation/acpi/apei/output_format.txt @@ -92,6 +92,11 @@ vendor_id: , device_id: class_code: ] [serial number: , ] [bridge: secondary_status: , control: ] +[aer_status: , aer_mask: + +[aer_uncor_severity: ] +aer_layer=, aer_agent= +aer_tlp_header: ] * := PCIe end point | legacy PCI end point | \ unknown | unknown | root port | upstream switch port | \ @@ -99,6 +104,26 @@ downstream switch port | PCIe to PCI/PCI-X bridge | \ PCI/PCI-X to PCIe bridge | root complex integrated endpoint device | \ root complex event collector +if section severity is fatal or recoverable +# := +unknown | unknown | unknown | unknown | Data Link Protocol | \ +unknown | unknown | unknown | unknown | unknown | unknown | unknown | \ +Poisoned TLP | Flow Control Protocol | Completion Timeout | \ +Completer Abort | Unexpected Completion | Receiver Overflow | \ +Malformed TLP | ECRC | Unsupported Request +else +# := +Receiver Error | unknown | unknown | unknown | unknown | unknown | \ +Bad TLP | Bad DLLP | RELAY_NUM Rollover | unknown | unknown | unknown | \ +Replay Timer Timeout | Advisory Non-Fatal +fi + + := +Physical Layer | Data Link Layer | Transaction Layer + + := +Receiver ID | Requester ID | Completer ID | Transmitter ID + Where, [] designate corresponding content is optional All description with * has the following format: diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index fca34cc..9ecf6fe 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig @@ -21,6 +21,13 @@ config ACPI_APEI_GHES by firmware to produce more valuable hardware error information for Linux. +config ACPI_APEI_PCIEAER + bool "APEI PCIe AER logging/recovering support" + depends on ACPI_APEI && PCIEAER + help + PCIe AER errors may be reported via APEI firmware first mode. + Turn on this option to enable the corresponding support. + config ACPI_APEI_EINJ tristate "APEI Error INJection (EINJ)" depends on ACPI_APEI && DEBUG_FS diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c index 31464a0..5d41894 100644 --- a/drivers/acpi/apei/cper.c +++ b/drivers/acpi/apei/cper.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * CPER record ID need to be unique even after reboot, because record @@ -70,8 +71,8 @@ static const char *cper_severity_str(unsigned int severity) * If the output length is longer than 80, multiple line will be * printed, with @pfx is printed at the beginning of each line. */ -static void cper_print_bits(const char *pfx, unsigned int bits, - const char *strs[], unsigned int strs_size) +void cper_print_bits(const char *pfx, unsigned int bits, + const char *strs[], unsigned int strs_size) { int i, len = 0; const char *str; @@ -81,6 +82,8 @@ static void cper_print_bits(const char *pfx, unsigned int bits, if (!(bits & (1U << i))) continue; str = strs[i]; + if (!str) + continue; if (len && len + strlen(str) + 2 > 80) { printk("%s\n", buf); len = 0; @@ -243,7 +246,8 @@ static const char *cper_pcie_port_type_strs[] = { "root complex event collector", }; -static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie) +static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, + const struct acpi_hest_generic_data *gdata) { if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, @@ -276,6 +280,12 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie) printk( "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n", pfx, pcie->bridge.secondary_status, pcie->bridge.control); +#ifdef CONFIG_ACPI_APEI_PCIEAER + if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) { + struct aer_capability_regs *aer_regs = (void *)pcie->aer_info; + cper_print_aer(pfx, gdata->error_severity, aer_regs); + } +#endif } static const char *apei_estatus_section_flag_strs[] = { @@ -322,7 +332,7 @@ static void apei_estatus_print_section( struct cper_sec_pcie *pcie = (void *)(gdata + 1); printk("%s""section_type: PCIe error\n", pfx); if (gdata->error_data_length >= sizeof(*pcie)) - cper_print_pcie(pfx, pcie); + cper_print_pcie(pfx, pcie, gdata); else goto err_section_too_small; } else diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h index 80c11d1..3eb7708 100644 --- a/drivers/pci/pcie/aer/aerdrv.h +++ b/drivers/pci/pcie/aer/aerdrv.h @@ -35,13 +35,6 @@ PCI_ERR_UNC_UNX_COMP| \ PCI_ERR_UNC_MALF_TLP) -struct header_log_regs { - unsigned int dw0; - unsigned int dw1; - unsigned int dw2; - unsigned int dw3; -}; - #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; @@ -59,7 +52,7 @@ struct aer_err_info { unsigned int status; /* COR/UNCOR Error Status */ unsigned int mask; /* COR/UNCOR Error Mask */ - struct header_log_regs tlp; /* TLP Header */ + struct aer_header_log_regs tlp; /* TLP Header */ }; struct aer_err_source { diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c index 7a237f6..b07a42e 100644 --- a/drivers/pci/pcie/aer/aerdrv_errprint.c +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "aerdrv.h" @@ -201,3 +202,61 @@ void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info) info->multi_error_valid ? "Multiple " : "", aer_error_severity_string[info->severity], info->id); } + +#ifdef CONFIG_ACPI_APEI_PCIEAER +static int cper_severity_to_aer(int cper_severity) +{ + switch (cper_severity) { + case CPER_SEV_RECOVERABLE: + return AER_NONFATAL; + case CPER_SEV_FATAL: + return AER_FATAL; + default: + return AER_CORRECTABLE; + } +} + +void cper_print_aer(const char *prefix, int cper_severity, + struct aer_capability_regs *aer) +{ + int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0; + u32 status, mask; + const char **status_strs; + + aer_severity = cper_severity_to_aer(cper_severity); + if (aer_severity == AER_CORRECTABLE) { + status = aer->cor_status; + mask = aer->cor_mask; + status_strs = aer_correctable_error_string; + status_strs_size = ARRAY_SIZE(aer_correctable_error_string); + } else { + status = aer->uncor_status; + mask = aer->uncor_mask; + status_strs = aer_uncorrectable_error_string; + status_strs_size = ARRAY_SIZE(aer_uncorrectable_error_string); + tlp_header_valid = status & AER_LOG_TLP_MASKS; + } + layer = AER_GET_LAYER_ERROR(aer_severity, status); + agent = AER_GET_AGENT(aer_severity, status); + printk("%s""aer_status: 0x%08x, aer_mask: 0x%08x\n", + prefix, status, mask); + cper_print_bits(prefix, status, status_strs, status_strs_size); + printk("%s""aer_layer=%s, aer_agent=%s\n", prefix, + aer_error_layer[layer], aer_agent_string[agent]); + if (aer_severity != AER_CORRECTABLE) + printk("%s""aer_uncor_severity: 0x%08x\n", + prefix, aer->uncor_severity); + if (tlp_header_valid) { + const unsigned char *tlp; + tlp = (const unsigned char *)&aer->header_log; + printk("%s""aer_tlp_header:" + " %02x%02x%02x%02x %02x%02x%02x%02x" + " %02x%02x%02x%02x %02x%02x%02x%02x\n", + prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp, + *(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4), + *(tlp + 11), *(tlp + 10), *(tlp + 9), + *(tlp + 8), *(tlp + 15), *(tlp + 14), + *(tlp + 13), *(tlp + 12)); + } +} +#endif diff --git a/include/linux/aer.h b/include/linux/aer.h index f7df1ee..8414de2 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -7,6 +7,28 @@ #ifndef _AER_H_ #define _AER_H_ +struct aer_header_log_regs { + unsigned int dw0; + unsigned int dw1; + unsigned int dw2; + unsigned int dw3; +}; + +struct aer_capability_regs { + u32 header; + u32 uncor_status; + u32 uncor_mask; + u32 uncor_severity; + u32 cor_status; + u32 cor_mask; + u32 cap_control; + struct aer_header_log_regs header_log; + u32 root_command; + u32 root_status; + u16 cor_err_source; + u16 uncor_err_source; +}; + #if defined(CONFIG_PCIEAER) /* pci-e port driver needs this function to enable aer */ extern int pci_enable_pcie_error_reporting(struct pci_dev *dev); @@ -27,5 +49,7 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) } #endif +extern void cper_print_aer(const char *prefix, int cper_severity, + struct aer_capability_regs *aer); #endif //_AER_H_ diff --git a/include/linux/cper.h b/include/linux/cper.h index 3104aaf..372a258 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -388,5 +388,7 @@ struct cper_sec_pcie { #pragma pack() u64 cper_next_record_id(void); +void cper_print_bits(const char *prefix, unsigned int bits, + const char *strs[], unsigned int strs_size); #endif -- cgit v0.10.2