From 8c9ce606a60e4a0cb447bdc082ce383b96b227b4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 25 May 2012 16:11:09 -0400 Subject: xen/blkback: Copy id field when doing BLKIF_DISCARD. We weren't copying the id field so when we sent the response back to the frontend (especially with a 64-bit host and 32-bit guest), we ended up using a random value. This led to the frontend crashing as it would try to pass to __blk_end_request_all a NULL 'struct request' (b/c it would use the 'id' to find the proper 'struct request' in its shadow array) and end up crashing: BUG: unable to handle kernel NULL pointer dereference at 000000e4 IP: [] __blk_end_request_all+0xc/0x40 .. snip.. EIP is at __blk_end_request_all+0xc/0x40 .. snip.. [] blkif_interrupt+0x172/0x330 [xen_blkfront] This fixes the bug by passing in the proper id for the response. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=824641 CC: stable@kernel.org Tested-by: William Dauchy Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 773cf27..9ad3b5e 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -257,6 +257,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, break; case BLKIF_OP_DISCARD: dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; @@ -287,6 +288,7 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, break; case BLKIF_OP_DISCARD: dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; -- cgit v0.10.2 From 87c9ea76a242c2f9063e2a8f3e90846c932c61a7 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Sun, 3 Jun 
2012 21:56:21 +0530 Subject: mtip32xx: Remove version.h header file inclusion version.h header file inclusion is no longer required. Signed-off-by: Sachin Kamat diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index b2c88da..adb1aae 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -26,7 +26,6 @@ #include #include #include -#include /* Offset of Subsystem Device ID in pci confoguration space */ #define PCI_SUBSYSTEM_DEVICEID 0x2E -- cgit v0.10.2 From fd7949564ced88385ca7758a4c1f47c274233dd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Jun 2012 10:01:38 +0200 Subject: block: fix return value on cfq_init() failure cfq_init() would return zero after kmem cache creation failure. Fix so that it returns -ENOMEM. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 673c977..ae5113d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4202,6 +4202,7 @@ static int __init cfq_init(void) if (ret) return ret; + ret = -ENOMEM; cfq_pool = KMEM_CACHE(cfq_queue, 0); if (!cfq_pool) goto err_pol_unreg; -- cgit v0.10.2 From ffea73fc723a12fdde4c9fb3fcce5d154d1104a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Jun 2012 10:02:29 +0200 Subject: block: blkcg_policy_cfq shouldn't be used if !CONFIG_CFQ_GROUP_IOSCHED cfq may be built w/ or w/o blkcg support depending on CONFIG_CFQ_GROUP_IOSCHED. If blkcg support is disabled, most of the related code is ifdef'd out but some part is left dangling - blkcg_policy_cfq is left zero-filled and blkcg_policy_[un]register() calls are made on it. Feeding zero filled policy to blkcg_policy_register() is incorrect and triggers the following WARN_ON() if CONFIG_BLK_CGROUP && !CONFIG_CFQ_GROUP_IOSCHED. 
------------[ cut here ]------------ WARNING: at block/blk-cgroup.c:867 Modules linked in: Modules linked in: CPU: 3 Not tainted 3.4.0-09547-gfb21aff #1 Process swapper/0 (pid: 1, task: 000000003ff80000, ksp: 000000003ff7f8b8) Krnl PSW : 0704100180000000 00000000003d76ca (blkcg_policy_register+0xca/0xe0) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 EA:3 Krnl GPRS: 0000000000000000 00000000014b85ec 00000000014b85b0 0000000000000000 000000000096fb60 0000000000000000 00000000009a8e78 0000000000000048 000000000099c070 0000000000b6f000 0000000000000000 000000000099c0b8 00000000014b85b0 0000000000667580 000000003ff7fd98 000000003ff7fd70 Krnl Code: 00000000003d76be: a7280001 lhi %r2,1 00000000003d76c2: a7f4ffdf brc 15,3d7680 #00000000003d76c6: a7f40001 brc 15,3d76c8 >00000000003d76ca: a7c8ffea lhi %r12,-22 00000000003d76ce: a7f4ffce brc 15,3d766a 00000000003d76d2: a7f40001 brc 15,3d76d4 00000000003d76d6: a7c80000 lhi %r12,0 00000000003d76da: a7f4ffc2 brc 15,3d765e Call Trace: ([<0000000000b6f000>] initcall_debug+0x0/0x4) [<0000000000989e8a>] cfq_init+0x62/0xd4 [<00000000001000ba>] do_one_initcall+0x3a/0x170 [<000000000096fb60>] kernel_init+0x214/0x2bc [<0000000000623202>] kernel_thread_starter+0x6/0xc [<00000000006231fc>] kernel_thread_starter+0x0/0xc no locks held by swapper/0/1. Last Breaking-Event-Address: [<00000000003d76c6>] blkcg_policy_register+0xc6/0xe0 ---[ end trace b8ef4903fcbf9dd3 ]--- This patch fixes the problem by ensuring all blkcg support code is inside CONFIG_CFQ_GROUP_IOSCHED. * blkcg_policy_cfq declaration and blkg_to_cfqg() definition are moved inside the first CONFIG_CFQ_GROUP_IOSCHED block. __maybe_unused is dropped from blkcg_policy_cfq decl. * blkcg_deactivate_policy() invocation is moved inside ifdef. This also makes the activation logic match cfq_init_queue(). * All blkcg_policy_[un]register() invocations are moved inside ifdef. 
Signed-off-by: Tejun Heo Reported-by: Heiko Carstens LKML-Reference: <20120601112954.GC3535@osiris.boeblingen.de.ibm.com> Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ae5113d..fb52df9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -17,8 +17,6 @@ #include "blk.h" #include "blk-cgroup.h" -static struct blkcg_policy blkcg_policy_cfq __maybe_unused; - /* * tunables */ @@ -418,11 +416,6 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) return pd ? container_of(pd, struct cfq_group, pd) : NULL; } -static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) -{ - return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); -} - static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) { return pd_to_blkg(&cfqg->pd); @@ -572,6 +565,13 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_cfq; + +static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) +{ + return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); +} + static inline void cfqg_get(struct cfq_group *cfqg) { return blkg_get(cfqg_to_blkg(cfqg)); @@ -3951,10 +3951,11 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_shutdown_timer_wq(cfqd); -#ifndef CONFIG_CFQ_GROUP_IOSCHED +#ifdef CONFIG_CFQ_GROUP_IOSCHED + blkcg_deactivate_policy(q, &blkcg_policy_cfq); +#else kfree(cfqd->root_group); #endif - blkcg_deactivate_policy(q, &blkcg_policy_cfq); kfree(cfqd); } @@ -4194,13 +4195,13 @@ static int __init cfq_init(void) #ifdef CONFIG_CFQ_GROUP_IOSCHED if (!cfq_group_idle) cfq_group_idle = 1; -#else - cfq_group_idle = 0; -#endif ret = blkcg_policy_register(&blkcg_policy_cfq); if (ret) return ret; +#else + cfq_group_idle = 0; +#endif ret = -ENOMEM; cfq_pool = KMEM_CACHE(cfq_queue, 0); @@ -4216,13 +4217,17 @@ static int __init cfq_init(void) err_free_pool: kmem_cache_destroy(cfq_pool); err_pol_unreg: +#ifdef 
CONFIG_CFQ_GROUP_IOSCHED blkcg_policy_unregister(&blkcg_policy_cfq); +#endif return ret; } static void __exit cfq_exit(void) { +#ifdef CONFIG_CFQ_GROUP_IOSCHED blkcg_policy_unregister(&blkcg_policy_cfq); +#endif elv_unregister(&iosched_cfq); kmem_cache_destroy(cfq_pool); } -- cgit v0.10.2 From 9b2ea86bc9e940950a088e9795ab28f006e73276 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Jun 2012 15:21:00 +0900 Subject: blkcg: fix blkg_alloc() failure path When policy data allocation fails in the middle, blkg_alloc() invokes blkg_free() to destroy the half constructed blkg. This ends up calling pd_exit_fn() on policy datas which didn't go through pd_init_fn(). Fix it by making blkg_alloc() call pd_init_fn() immediately after each policy data allocation. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 02cf633..4ab7420 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -125,12 +125,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) blkg->pd[i] = pd; pd->blkg = blkg; - } - - /* invoke per-policy init */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; + /* invoke per-policy init */ if (blkcg_policy_enabled(blkg->q, pol)) pol->pd_init_fn(blkg); } -- cgit v0.10.2 From 9f132652d94c96476b0b0a8caf0c10e96ab10fa8 Mon Sep 17 00:00:00 2001 From: Pavel Vasilyev Date: Tue, 5 Jun 2012 00:02:05 -0400 Subject: ACPI sysfs.c strlen fix Current code is ignoring the last character of "enable" and "disable" in comparisons. 
https://bugzilla.kernel.org/show_bug.cgi?id=33732 Signed-off-by: Len Brown diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 9f66181..240a244 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -173,7 +173,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp) { int result = 0; - if (!strncmp(val, "enable", strlen("enable") - 1)) { + if (!strncmp(val, "enable", strlen("enable"))) { result = acpi_debug_trace(trace_method_name, trace_debug_level, trace_debug_layer, 0); if (result) @@ -181,7 +181,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp) goto exit; } - if (!strncmp(val, "disable", strlen("disable") - 1)) { + if (!strncmp(val, "disable", strlen("disable"))) { int name = 0; result = acpi_debug_trace((char *)&name, trace_debug_level, trace_debug_layer, 0); -- cgit v0.10.2 From 76eb9a30db4bc8fd172f9155247264b5f2686d7b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 20 Feb 2012 14:20:06 +0800 Subject: ACPI, x86: fix Dell M6600 ACPI reboot regression via DMI Dell Precision M6600 is known to require PCI reboot, so add it to the reboot blacklist in pci_reboot_dmi_table[]. https://bugzilla.kernel.org/show_bug.cgi?id=42749 cc: x86@kernel.org Signed-off-by: Zhang Rui Signed-off-by: Len Brown diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af..412db57 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -451,6 +451,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, + { /* Handle problems with rebooting on the Precision M6600. 
*/ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, { } }; -- cgit v0.10.2 From 7412ff139d73f5561492478e89a22aede7252b7b Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Mon, 4 Jun 2012 12:43:03 -0700 Subject: mtip32xx: Remove 'registers' and 'flags' from sysfs This patch removes entries 'registers' and 'flags' from sysfs. Updated ABI file to reflect this change. Reported-by: Greg Kroah-Hartman Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe diff --git a/Documentation/ABI/testing/sysfs-block-rssd b/Documentation/ABI/testing/sysfs-block-rssd index 679ce35..beef30c 100644 --- a/Documentation/ABI/testing/sysfs-block-rssd +++ b/Documentation/ABI/testing/sysfs-block-rssd @@ -1,26 +1,5 @@ -What: /sys/block/rssd*/registers -Date: March 2012 -KernelVersion: 3.3 -Contact: Asai Thambi S P -Description: This is a read-only file. Dumps below driver information and - hardware registers. - - S ACTive - - Command Issue - - Completed - - PORT IRQ STAT - - HOST IRQ STAT - - Allocated - - Commands in Q - What: /sys/block/rssd*/status Date: April 2012 KernelVersion: 3.4 Contact: Asai Thambi S P Description: This is a read-only file. Indicates the status of the device. - -What: /sys/block/rssd*/flags -Date: May 2012 -KernelVersion: 3.5 -Contact: Asai Thambi S P -Description: This is a read-only file. Dumps the flags in port and driver - data structure diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 264bc77..b6e95b9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2546,7 +2546,7 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, } /* - * Sysfs register/status dump. + * Sysfs status dump. * * @dev Pointer to the device structure, passed by the kernrel. * @attr Pointer to the device_attribute structure passed by the kernel. 
@@ -2555,71 +2555,6 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, * return value * The size, in bytes, of the data copied into buf. */ -static ssize_t mtip_hw_show_registers(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - u32 group_allocated; - struct driver_data *dd = dev_to_disk(dev)->private_data; - int size = 0; - int n; - - size += sprintf(&buf[size], "Hardware\n--------\n"); - size += sprintf(&buf[size], "S ACTive : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) - size += sprintf(&buf[size], "%08X ", - readl(dd->port->s_active[n])); - - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "Command Issue : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) - size += sprintf(&buf[size], "%08X ", - readl(dd->port->cmd_issue[n])); - - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "Completed : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) - size += sprintf(&buf[size], "%08X ", - readl(dd->port->completed[n])); - - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "PORT IRQ STAT : [ 0x%08X ]\n", - readl(dd->port->mmio + PORT_IRQ_STAT)); - size += sprintf(&buf[size], "HOST IRQ STAT : [ 0x%08X ]\n", - readl(dd->mmio + HOST_IRQ_STAT)); - size += sprintf(&buf[size], "\n"); - - size += sprintf(&buf[size], "Local\n-----\n"); - size += sprintf(&buf[size], "Allocated : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) { - if (sizeof(long) > sizeof(u32)) - group_allocated = - dd->port->allocated[n/2] >> (32*(n&1)); - else - group_allocated = dd->port->allocated[n]; - size += sprintf(&buf[size], "%08X ", group_allocated); - } - size += sprintf(&buf[size], "]\n"); - - size += sprintf(&buf[size], "Commands in Q: [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) { - if (sizeof(long) > sizeof(u32)) - group_allocated = - dd->port->cmds_to_issue[n/2] >> (32*(n&1)); - else - group_allocated = dd->port->cmds_to_issue[n]; - size += sprintf(&buf[size], "%08X ", 
group_allocated); - } - size += sprintf(&buf[size], "]\n"); - - return size; -} - static ssize_t mtip_hw_show_status(struct device *dev, struct device_attribute *attr, char *buf) @@ -2637,24 +2572,7 @@ static ssize_t mtip_hw_show_status(struct device *dev, return size; } -static ssize_t mtip_hw_show_flags(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct driver_data *dd = dev_to_disk(dev)->private_data; - int size = 0; - - size += sprintf(&buf[size], "Flag in port struct : [ %08lX ]\n", - dd->port->flags); - size += sprintf(&buf[size], "Flag in dd struct : [ %08lX ]\n", - dd->dd_flag); - - return size; -} - -static DEVICE_ATTR(registers, S_IRUGO, mtip_hw_show_registers, NULL); static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); -static DEVICE_ATTR(flags, S_IRUGO, mtip_hw_show_flags, NULL); /* * Create the sysfs related attributes. @@ -2671,15 +2589,9 @@ static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) if (!kobj || !dd) return -EINVAL; - if (sysfs_create_file(kobj, &dev_attr_registers.attr)) - dev_warn(&dd->pdev->dev, - "Error creating 'registers' sysfs entry\n"); if (sysfs_create_file(kobj, &dev_attr_status.attr)) dev_warn(&dd->pdev->dev, "Error creating 'status' sysfs entry\n"); - if (sysfs_create_file(kobj, &dev_attr_flags.attr)) - dev_warn(&dd->pdev->dev, - "Error creating 'flags' sysfs entry\n"); return 0; } @@ -2698,9 +2610,7 @@ static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) if (!kobj || !dd) return -EINVAL; - sysfs_remove_file(kobj, &dev_attr_registers.attr); sysfs_remove_file(kobj, &dev_attr_status.attr); - sysfs_remove_file(kobj, &dev_attr_flags.attr); return 0; } -- cgit v0.10.2 From 7b421d24eac79800ee68905f732300a291f72f00 Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Mon, 4 Jun 2012 12:44:02 -0700 Subject: mtip32xx: Create debugfs entries for troubleshooting On module load, creates a debugfs parent 'rssd' in debugfs root. 
Then for each device, create a new node with corresponding disk name. Under the new node, two entries 'registers' and 'flags' are created. NOTE: These entries were removed from sysfs in the previous patch Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b6e95b9..a8fddeb 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -37,6 +37,7 @@ #include #include <../drivers/ata/ahci.h> #include +#include #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) @@ -85,6 +86,7 @@ static int instance; * allocated in mtip_init(). */ static int mtip_major; +static struct dentry *dfs_parent; static DEFINE_SPINLOCK(rssd_index_lock); static DEFINE_IDA(rssd_index_ida); @@ -2574,6 +2576,120 @@ static ssize_t mtip_hw_show_status(struct device *dev, static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); +static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char buf[MTIP_DFS_MAX_BUF_SIZE]; + u32 group_allocated; + int size = *offset; + int n; + + if (!len || size) + return 0; + + if (size < 0) + return -EINVAL; + + size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->s_active[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Command Issue : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->cmd_issue[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Completed : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->completed[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n", + 
readl(dd->port->mmio + PORT_IRQ_STAT)); + size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n", + readl(dd->mmio + HOST_IRQ_STAT)); + size += sprintf(&buf[size], "\n"); + + size += sprintf(&buf[size], "L/ Allocated : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->allocated[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->allocated[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + size += sprintf(&buf[size], "L/ Commands in Q : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->cmds_to_issue[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->cmds_to_issue[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + return -EFAULT; + + return *offset; +} + +static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char buf[MTIP_DFS_MAX_BUF_SIZE]; + int size = *offset; + + if (!len || size) + return 0; + + if (size < 0) + return -EINVAL; + + size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", + dd->port->flags); + size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", + dd->dd_flag); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + return -EFAULT; + + return *offset; +} + +static const struct file_operations mtip_regs_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_registers, + .llseek = no_llseek, +}; + +static const struct file_operations mtip_flags_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_flags, + .llseek = no_llseek, +}; + /* * Create the sysfs related attributes. 
* @@ -2615,6 +2731,34 @@ static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) return 0; } +static int mtip_hw_debugfs_init(struct driver_data *dd) +{ + if (!dfs_parent) + return -1; + + dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); + if (IS_ERR_OR_NULL(dd->dfs_node)) { + dev_warn(&dd->pdev->dev, + "Error creating node %s under debugfs\n", + dd->disk->disk_name); + dd->dfs_node = NULL; + return -1; + } + + debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd, + &mtip_flags_fops); + debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd, + &mtip_regs_fops); + + return 0; +} + +static void mtip_hw_debugfs_exit(struct driver_data *dd) +{ + debugfs_remove_recursive(dd->dfs_node); +} + + /* * Perform any init/resume time hardware setup * @@ -3640,6 +3784,7 @@ skip_create_disk: mtip_hw_sysfs_init(dd, kobj); kobject_put(kobj); } + mtip_hw_debugfs_init(dd); if (dd->mtip_svc_handler) { set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); @@ -3665,6 +3810,8 @@ start_service_thread: return rv; kthread_run_error: + mtip_hw_debugfs_exit(dd); + /* Delete our gendisk. This also removes the device from /dev */ del_gendisk(dd->disk); @@ -3715,6 +3862,7 @@ static int mtip_block_remove(struct driver_data *dd) kobject_put(kobj); } } + mtip_hw_debugfs_exit(dd); /* * Delete our gendisk structure. This also removes the device @@ -4062,10 +4210,20 @@ static int __init mtip_init(void) } mtip_major = error; + if (!dfs_parent) { + dfs_parent = debugfs_create_dir("rssd", NULL); + if (IS_ERR_OR_NULL(dfs_parent)) { + printk(KERN_WARNING "Error creating debugfs parent\n"); + dfs_parent = NULL; + } + } + /* Register our PCI operations. 
*/ error = pci_register_driver(&mtip_pci_driver); - if (error) + if (error) { + debugfs_remove(dfs_parent); unregister_blkdev(mtip_major, MTIP_DRV_NAME); + } return error; } @@ -4082,6 +4240,8 @@ static int __init mtip_init(void) */ static void __exit mtip_exit(void) { + debugfs_remove_recursive(dfs_parent); + /* Release the allocated major block device number. */ unregister_blkdev(mtip_major, MTIP_DRV_NAME); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index adb1aae..f51fc23 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -110,6 +110,8 @@ #define dbg_printk(format, arg...) #endif +#define MTIP_DFS_MAX_BUF_SIZE 1024 + #define __force_bit2int (unsigned int __force) enum { @@ -446,6 +448,8 @@ struct driver_data { unsigned long dd_flag; /* NOTE: use atomic bit operations on this */ struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ + + struct dentry *dfs_node; }; #endif -- cgit v0.10.2 From 27e1f9d1cc87be4e53c6eb7158cafc21c4b85a14 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Jun 2012 13:36:44 +0200 Subject: blkcg: drop local variable @q from blkg_destroy() blkg_destroy() caches @blkg->q in local variable @q. While there are two places which needs @blkg->q, only lockdep_assert_held() used the local variable leading to unused local variable warning if lockdep is configured out. Drop the local variable and just use @blkg->q directly. 
Signed-off-by: Tejun Heo Reported-by: Rakesh Iyer Signed-off-by: Jens Axboe diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4ab7420..e7dee61 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -241,10 +241,9 @@ EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { - struct request_queue *q = blkg->q; struct blkcg *blkcg = blkg->blkcg; - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* Something wrong if we are trying to remove same group twice */ -- cgit v0.10.2 From ae10ccdc3093486f8c2369d227583f9d79f628e5 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 4 Jun 2012 15:00:04 +0800 Subject: ACPI: Make acpi_skip_timer_override cover all source_irq==0 cases Currently when acpi_skip_timer_override is set, it only covers the (source_irq == 0 && global_irq == 2) case. However, there are also platforms that need this option but whose global_irq is not 2. This patch extends acpi_skip_timer_override to cover all timer override cases as long as the source irq is 0. 
This is the first part of a fix to kernel bug bugzilla 40002: "IRQ 0 assigned to VGA" https://bugzilla.kernel.org/show_bug.cgi?id=40002 Reported-and-tested-by: Szymon Kowalczyk Signed-off-by: Feng Tang Signed-off-by: Len Brown diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8afb693..e7c698e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, return 0; } - if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { + if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + printk(PREFIX "BIOS IRQ0 override ignored.\n"); return 0; } - if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } @@ -1334,7 +1336,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) } /* - * Force ignoring BIOS IRQ0 pin2 override + * Force ignoring BIOS IRQ0 override */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { @@ -1344,7 +1346,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) */ if (!acpi_skip_timer_override) { WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident); acpi_skip_timer_override = 1; } @@ -1438,7 +1440,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * is enabled. This input is incorrectly designated the * ISA IRQ 0 via an interrupt source override even though * it is wired to the output of the master 8259A and INTIN0 - * is not connected at all. 
Force ignoring BIOS IRQ0 pin2 + * is not connected at all. Force ignoring BIOS IRQ0 * override in that cases. */ { -- cgit v0.10.2 From 7f68b4c2e158019c2ec494b5cfbd9c83b4e5b253 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 4 Jun 2012 15:00:05 +0800 Subject: ACPI: Remove one board specific WARN when ignoring timer overriding Current WARN msg is only for the ati_ixp4x0 board, while this function is used by multiple platforms. So this one board specific warning is not appropriate any more. Signed-off-by: Feng Tang Signed-off-by: Len Brown diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e7c698e..3a6afba 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1340,12 +1340,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - /* - * The ati_ixp4x0_rev() early PCI quirk should have set - * the acpi_skip_timer_override flag already: - */ if (!acpi_skip_timer_override) { - WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident); acpi_skip_timer_override = 1; -- cgit v0.10.2 From f6b54f083cc66cf9b11d2120d8df3c2ad4e0836d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 4 Jun 2012 15:00:06 +0800 Subject: ACPI: Add a quirk for "AMILO PRO V2030" to ignore the timer overriding This is the 2nd part of fix for kernel bugzilla 40002: "IRQ 0 assigned to VGA" https://bugzilla.kernel.org/show_bug.cgi?id=40002 The root cause is the buggy FW, whose ACPI tables assign the GSI 16 to 2 irqs 0 and 16(VGA), and the VGA is the right owner of GSI 16. So adding a quirk to ignore the irq0 overriding GSI 16 for the FUJITSU SIEMENS AMILO PRO V2030 platform will solve this issue. 
Reported-and-tested-by: Szymon Kowalczyk Signed-off-by: Feng Tang Signed-off-by: Len Brown diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3a6afba..b2297e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1470,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), }, }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "FUJITSU SIEMENS", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), + }, + }, {} }; -- cgit v0.10.2 From 34ddeb035d704eafdcdb3cbc781894300136c3c4 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 12 Jun 2012 11:20:19 +0800 Subject: ACPI, APEI, Avoid too much error reporting in runtime This patch fixed the following bug. https://bugzilla.kernel.org/show_bug.cgi?id=43282 This is caused by a firmware bug checking (checking generic address register provided by firmware) in runtime. The checking should be done in address mapping time instead of runtime to avoid too much error reporting in runtime. 
Reported-by: Pawel Sikora Signed-off-by: Huang Ying Tested-by: Jean Delvare Cc: stable@vger.kernel.org Signed-off-by: Len Brown diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index 5577762..6686b1e 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -243,7 +243,7 @@ static int pre_map_gar_callback(struct apei_exec_context *ctx, u8 ins = entry->instruction; if (ctx->ins_table[ins].flags & APEI_EXEC_INS_ACCESS_REGISTER) - return acpi_os_map_generic_address(&entry->register_region); + return apei_map_generic_address(&entry->register_region); return 0; } @@ -276,7 +276,7 @@ static int post_unmap_gar_callback(struct apei_exec_context *ctx, u8 ins = entry->instruction; if (ctx->ins_table[ins].flags & APEI_EXEC_INS_ACCESS_REGISTER) - acpi_os_unmap_generic_address(&entry->register_region); + apei_unmap_generic_address(&entry->register_region); return 0; } @@ -606,6 +606,19 @@ static int apei_check_gar(struct acpi_generic_address *reg, u64 *paddr, return 0; } +int apei_map_generic_address(struct acpi_generic_address *reg) +{ + int rc; + u32 access_bit_width; + u64 address; + + rc = apei_check_gar(reg, &address, &access_bit_width); + if (rc) + return rc; + return acpi_os_map_generic_address(reg); +} +EXPORT_SYMBOL_GPL(apei_map_generic_address); + /* read GAR in interrupt (including NMI) or process context */ int apei_read(u64 *val, struct acpi_generic_address *reg) { diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index cca240a..f220d64 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -7,6 +7,8 @@ #define APEI_INTERNAL_H #include +#include +#include struct apei_exec_context; @@ -68,6 +70,13 @@ static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 actio /* IP has been set in instruction function */ #define APEI_EXEC_SET_IP 1 +int apei_map_generic_address(struct acpi_generic_address *reg); + +static inline void 
apei_unmap_generic_address(struct acpi_generic_address *reg) +{ + acpi_os_unmap_generic_address(reg); +} + int apei_read(u64 *val, struct acpi_generic_address *reg); int apei_write(u64 val, struct acpi_generic_address *reg); diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 9b3cac0..1599566 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -301,7 +301,7 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) if (!ghes) return ERR_PTR(-ENOMEM); ghes->generic = generic; - rc = acpi_os_map_generic_address(&generic->error_status_address); + rc = apei_map_generic_address(&generic->error_status_address); if (rc) goto err_free; error_block_length = generic->error_block_length; @@ -321,7 +321,7 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) return ghes; err_unmap: - acpi_os_unmap_generic_address(&generic->error_status_address); + apei_unmap_generic_address(&generic->error_status_address); err_free: kfree(ghes); return ERR_PTR(rc); @@ -330,7 +330,7 @@ err_free: static void ghes_fini(struct ghes *ghes) { kfree(ghes->estatus); - acpi_os_unmap_generic_address(&ghes->generic->error_status_address); + apei_unmap_generic_address(&ghes->generic->error_status_address); } enum { -- cgit v0.10.2 From 6878c32e5cc0e40980abe51d1f02fb453e27493e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 25 May 2012 17:34:51 -0400 Subject: xen/blkfront: Add WARN to deal with misbehaving backends. Part of the ring structure is the 'id' field which is under control of the frontend. The frontend stamps it with "some" value (this some in this implementation being a value less than BLK_RING_SIZE), and when it gets a response expects said value to be in the response structure. We have a check for the id field when spolling new requests but not when de-spolling responses. 
We also add an extra check in add_id_to_freelist to make sure that the 'struct request' was not NULL - as we cannot pass a NULL to __blk_end_request_all, otherwise that crashes (and all the operations that the response is dealing with end up with __blk_end_request_all). Lastly we also print the name of the operation that failed. [v1: s/BUG/WARN/ suggested by Stefano] [v2: Add extra check in add_id_to_freelist] [v3: Redid op_name per Jan's suggestion] [v4: add const * and add WARN on failure returns] Acked-by: Jan Beulich Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 60eed4b..e4fb337 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -141,14 +141,36 @@ static int get_id_from_freelist(struct blkfront_info *info) return free; } -static void add_id_to_freelist(struct blkfront_info *info, +static int add_id_to_freelist(struct blkfront_info *info, unsigned long id) { + if (info->shadow[id].req.u.rw.id != id) + return -EINVAL; + if (info->shadow[id].request == NULL) + return -EINVAL; info->shadow[id].req.u.rw.id = info->shadow_free; info->shadow[id].request = NULL; info->shadow_free = id; + return 0; } +static const char *op_name(int op) +{ + static const char *const names[] = { + [BLKIF_OP_READ] = "read", + [BLKIF_OP_WRITE] = "write", + [BLKIF_OP_WRITE_BARRIER] = "barrier", + [BLKIF_OP_FLUSH_DISKCACHE] = "flush", + [BLKIF_OP_DISCARD] = "discard" }; + + if (op < 0 || op >= ARRAY_SIZE(names)) + return "unknown"; + + if (!names[op]) + return "reserved"; + + return names[op]; +} static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) { unsigned int end = minor + nr; @@ -746,20 +768,36 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; + /* + * The backend has messed up and given us an id that we would + * never have given to it (we stamp it up to BLK_RING_SIZE - + * look 
in get_id_from_freelist. + */ + if (id >= BLK_RING_SIZE) { + WARN(1, "%s: response to %s has incorrect id (%ld)\n", + info->gd->disk_name, op_name(bret->operation), id); + /* We can't safely get the 'struct request' as + * the id is busted. */ + continue; + } req = info->shadow[id].request; if (bret->operation != BLKIF_OP_DISCARD) blkif_completion(&info->shadow[id]); - add_id_to_freelist(info, id); + if (add_id_to_freelist(info, id)) { + WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", + info->gd->disk_name, op_name(bret->operation), id); + continue; + } error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; - printk(KERN_WARNING "blkfront: %s: discard op failed\n", - info->gd->disk_name); + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; @@ -771,18 +809,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { - printk(KERN_WARNING "blkfront: %s: write %s op failed\n", - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : "flush disk cache", - info->gd->disk_name); + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.u.rw.nr_segments == 0)) { - printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n", - info->flush_op == BLKIF_OP_WRITE_BARRIER ? 
- "barrier" : "flush disk cache", - info->gd->disk_name); + printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; } if (unlikely(error)) { -- cgit v0.10.2 From 4eccc579795290a58e2262fa4e9d083d7672e699 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 8 Jun 2012 13:18:51 +0200 Subject: drbd: fix access of unallocated pages and kernel panic BUG: unable to handle kernel NULL pointer dereference at (null) ... [] ? _drbd_bm_set_bits+0x151/0x240 [drbd] [] ? receive_bitmap+0x4f8/0xbc0 [drbd] This fixes an off-by-one error in the receive_bitmap() path, if run-length encoded bitmap transfer is enabled. If the bitmap is an exact multiple of PAGE_SIZE, which means the visible capacity of the drbd device is an exact multiple of 128 MiB (for 4k page size), and bitmap compression (use-rle) is enabled (which became default with 8.4), and the very last bit is dirty and reported in an rle compressed bitmap packet, we ended up trying to kmap_atomic a page pointer that does not exist (bitmap->bm_pages[last index + 1]). bug introduced by: Date: Fri Jul 24 15:33:24 2009 +0200 set bits: optimize for complete last word, fix off-by-one-word corner case made effective by: Date: Thu Dec 16 00:32:38 2010 +0100 drbd: get rid of unused debug code Long time ago, we had paranoia code in the bitmap that allocated one extra word, assigned a magic value, and checked on every occasion that the magic value was still unchanged. That debug code is unused, the extra long word complicates code a bit. Get rid of it.
No-one triggered this bug in the last few years, because a large subset of our userbase is unaffected: * typically the last few blocks of a device are not modified frequently, and remain unset * use-rle was disabled by default in drbd < 8.4 * those with slightly "odd" device sizes, or * drbd internal meta data (which will skew the device size slightly, thus makes it harder to have a bug relevant device size) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index b5c5ff5..fcb956b 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1475,10 +1475,17 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi first_word = 0; spin_lock_irq(&b->bm_lock); } - /* last page (respectively only page, for first page == last page) */ last_word = MLPP(el >> LN2_BPL); - bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); + + /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples). + * ==> e = 32767, el = 32768, last_page = 2, + * and now last_word = 0. + * We do not want to touch last_page in this case, + * as we did not allocate it, it is not present in bitmap->bm_pages. + */ + if (last_word) + bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); /* possibly trailing bits. * example: (e & 63) == 63, el will be e+1. -- cgit v0.10.2 From 1ed25b269e3dd5ecc64f17beef9ea21745c39ca6 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 8 Jun 2012 14:09:54 +0200 Subject: drbd: fix list corruption by failing but already aborted reads If a read is aborted due to force-detach of a supposedly unresponsive local backing device, and retried on the peer, it can happen that the local request later still completes (hopefully with an error). As it may already have been completed to upper layers meanwhile, it must not be retried again now. 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 9c5c849..773f4e2 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -472,12 +472,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + if (req->rq_state & RQ_LOCAL_ABORTED) { + _req_may_be_done(req, m); + break; + } __drbd_chk_io_error(mdev, false); goto_queue_for_net_read: + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + /* no point in retrying if there is no good remote data, * or we have no connection. */ if (mdev->state.pdsk != D_UP_TO_DATE) { -- cgit v0.10.2 From 0d5934e3c258fc5decc4103600c597086fd95a52 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 8 Jun 2012 14:17:36 +0200 Subject: drbd: fix null pointer dereference with on-congestion policy when diskless We must not look at mdev->actlog, unless we have a get_ldev() reference. It also does not make much sense to try to disconnect or pull-ahead of the peer, if we don't have good local data. Only even consider congestion policies, if our local disk is D_UP_TO_DATE. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 773f4e2..8e93a6a 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -770,6 +770,40 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); } +static void maybe_pull_ahead(struct drbd_conf *mdev) +{ + int congested = 0; + + /* If I don't even have good local storage, we can not reasonably try + * to pull ahead of the peer. We also need the local reference to make + * sure mdev->act_log is there. + * Note: caller has to make sure that net_conf is there. 
+ */ + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) + return; + + if (mdev->net_conf->cong_fill && + atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { + dev_info(DEV, "Congestion-fill threshold reached\n"); + congested = 1; + } + + if (mdev->act_log->used >= mdev->net_conf->cong_extents) { + dev_info(DEV, "Congestion-extents threshold reached\n"); + congested = 1; + } + + if (congested) { + queue_barrier(mdev); /* last barrier, after mirrored writes */ + + if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); + else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); + } + put_ldev(mdev); +} + static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { const int rw = bio_rw(bio); @@ -977,29 +1011,8 @@ allocate_barrier: _req_mod(req, queue_for_send_oos); if (remote && - mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) { - int congested = 0; - - if (mdev->net_conf->cong_fill && - atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { - dev_info(DEV, "Congestion-fill threshold reached\n"); - congested = 1; - } - - if (mdev->act_log->used >= mdev->net_conf->cong_extents) { - dev_info(DEV, "Congestion-extents threshold reached\n"); - congested = 1; - } - - if (congested) { - queue_barrier(mdev); /* last barrier, after mirrored writes */ - - if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) - _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); - else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ - _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); - } - } + mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) + maybe_pull_ahead(mdev); spin_unlock_irq(&mdev->req_lock); kfree(b); /* if someone else has beaten us to it... 
*/ -- cgit v0.10.2 From 047fe3605235888f3ebcda0c728cb31937eadfe6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 15:24:40 +0200 Subject: splice: fix racy pipe->buffers uses Dave Jones reported a kernel BUG at mm/slub.c:3474! triggered by splice_shrink_spd() called from vmsplice_to_pipe() commit 35f3d14dbbc5 (pipe: add support for shrinking and growing pipes) added capability to adjust pipe->buffers. Problem is some paths don't hold pipe mutex and assume pipe->buffers doesn't change for their duration. Fix this by adding nr_pages_max field in struct splice_pipe_desc, and use it in place of pipe->buffers where appropriate. splice_shrink_spd() loses its struct pipe_inode_info argument. Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Alexander Viro Cc: Tom Herbert Cc: stable # 2.6.35 Tested-by: Dave Jones Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index c9f1318..7bf08fa 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -273,13 +273,16 @@ void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) * Check if we need to grow the arrays holding pages and partial page * descriptions. 
*/ -int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - if (pipe->buffers <= PIPE_DEF_BUFFERS) + unsigned int buffers = ACCESS_ONCE(pipe->buffers); + + spd->nr_pages_max = buffers; + if (buffers <= PIPE_DEF_BUFFERS) return 0; - spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); - spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) return 0; @@ -289,10 +292,9 @@ int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) return -ENOMEM; } -void splice_shrink_spd(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +void splice_shrink_spd(struct splice_pipe_desc *spd) { - if (pipe->buffers <= PIPE_DEF_BUFFERS) + if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); @@ -315,6 +317,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -326,7 +329,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + nr_pages = min(req_pages, spd.nr_pages_max); /* * Lookup the (hopefully) full range of pages we need. 
@@ -497,7 +500,7 @@ fill_it: if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return error; } @@ -598,6 +601,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &default_pipe_buf_ops, .spd_release = spd_release_page, @@ -608,8 +612,8 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, res = -ENOMEM; vec = __vec; - if (pipe->buffers > PIPE_DEF_BUFFERS) { - vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { + vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); if (!vec) goto shrink_ret; } @@ -617,7 +621,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { + for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -665,7 +669,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, shrink_ret: if (vec != __vec) kfree(vec); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return res; err: @@ -1614,6 +1618,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, @@ -1629,13 +1634,13 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, spd.partial, false, - pipe->buffers); + spd.nr_pages_max); if (spd.nr_pages <= 0) ret = spd.nr_pages; else ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + 
splice_shrink_spd(&spd); return ret; } diff --git a/include/linux/splice.h b/include/linux/splice.h index 26e5b61..09a545a 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -51,7 +51,8 @@ struct partial_page { struct splice_pipe_desc { struct page **pages; /* page map */ struct partial_page *partial; /* pages[] may not be contig */ - int nr_pages; /* number of pages in map */ + int nr_pages; /* number of populated pages in map */ + unsigned int nr_pages_max; /* pages[] & partial[] arrays size */ unsigned int flags; /* splice flags */ const struct pipe_buf_operations *ops;/* ops associated with output pipe */ void (*spd_release)(struct splice_pipe_desc *, unsigned int); @@ -85,9 +86,8 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, /* * for dynamic pipe sizing */ -extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); -extern void splice_shrink_spd(struct pipe_inode_info *, - struct splice_pipe_desc *); +extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *); +extern void splice_shrink_spd(struct splice_pipe_desc *); extern void spd_release_page(struct splice_pipe_desc *, unsigned int); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; diff --git a/kernel/relay.c b/kernel/relay.c index ab56a17..e8cd202 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, + .nr_pages_max = PIPE_DEF_BUFFERS, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, @@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, ret += padding; out: - splice_shrink_spd(pipe, &spd); - return ret; + splice_shrink_spd(&spd); + return ret; } static ssize_t relay_file_splice_read(struct file *in, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6..2884880 100644 --- a/kernel/trace/trace.c +++ 
b/kernel/trace/trace.c @@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, @@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = splice_to_pipe(pipe, &spd); out: - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return ret; out_err: @@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); out: return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 585bd220..c244e93 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1577,6 +1577,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -1665,7 +1666,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); if (error > 0) { *ppos += error; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 016694d..bac3c57 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1755,6 +1755,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = MAX_SKB_FRAGS, .flags = flags, .ops = &sock_pipe_buf_ops, .spd_release = sock_spd_release, 
-- cgit v0.10.2 From 32587371ad3db2f9d335de10dbd8cffd4fff5669 Mon Sep 17 00:00:00 2001 From: Tao Guo Date: Wed, 13 Jun 2012 21:17:21 +0200 Subject: umem: fix up unplugging Fix a regression introduced by 7eaceaccab5f40 ("block: remove per-queue plugging"). In that patch, Jens removed the whole mm_unplug_device() function, which used to be the trigger to make umem start to work. We need to implement unplugging to make umem start to work, or I/O will never be triggered. Signed-off-by: Tao Guo Cc: Neil Brown Cc: Jens Axboe Cc: Shaohua Li Cc: Acked-by: NeilBrown Signed-off-by: Jens Axboe diff --git a/drivers/block/umem.c b/drivers/block/umem.c index aa27120..9a72277 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -513,6 +513,44 @@ static void process_page(unsigned long data) } } +struct mm_plug_cb { + struct blk_plug_cb cb; + struct cardinfo *card; +}; + +static void mm_unplug(struct blk_plug_cb *cb) +{ + struct mm_plug_cb *mmcb = container_of(cb, struct mm_plug_cb, cb); + + spin_lock_irq(&mmcb->card->lock); + activate(mmcb->card); + spin_unlock_irq(&mmcb->card->lock); + kfree(mmcb); +} + +static int mm_check_plugged(struct cardinfo *card) +{ + struct blk_plug *plug = current->plug; + struct mm_plug_cb *mmcb; + + if (!plug) + return 0; + + list_for_each_entry(mmcb, &plug->cb_list, cb.list) { + if (mmcb->cb.callback == mm_unplug && mmcb->card == card) + return 1; + } + /* Not currently on the callback list */ + mmcb = kmalloc(sizeof(*mmcb), GFP_ATOMIC); + if (!mmcb) + return 0; + + mmcb->card = card; + mmcb->cb.callback = mm_unplug; + list_add(&mmcb->cb.list, &plug->cb_list); + return 1; +} + static void mm_make_request(struct request_queue *q, struct bio *bio) { struct cardinfo *card = q->queuedata; @@ -523,6 +561,8 @@ static void mm_make_request(struct request_queue *q, struct bio *bio) *card->biotail = bio; bio->bi_next = NULL; card->biotail = &bio->bi_next; + if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card)) + activate(card); 
spin_unlock_irq(&card->lock); return; -- cgit v0.10.2 From 0b91f45b23cb73ce11acdc3cf4c6efd4441e3b3e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 14 Jun 2012 18:07:15 -0700 Subject: x86, compat: Use test_thread_flag(TIF_IA32) in compat signal delivery Signal delivery compat path may not have the 'TS_COMPAT' flag (that flag indicates how we entered the kernel). So use test_thread_flag(TIF_IA32) instead of is_ia32_task(): one of the functions of TIF_IA32 is just what kind of signal frame we want. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1339722435.3475.57.camel@sbsiddha-desk.sc.intel.com Cc: stable@kernel.org # v3.4 Signed-off-by: H. Peter Anvin diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index daeca56..673ac9b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -38,7 +38,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err = 0; - bool ia32 = is_ia32_task(); + bool ia32 = test_thread_flag(TIF_IA32); if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; -- cgit v0.10.2 From 458f27a9823a0841acb4ca59e0e7f33e181f85e2 Mon Sep 17 00:00:00 2001 From: Asias He Date: Fri, 15 Jun 2012 08:45:25 +0200 Subject: block: Avoid missed wakeup in request waitqueue After hot-unplugging a stressed disk, I found that rl->wait[] is not empty while rl->count[] is empty and there are threads still sleeping on get_request after the queue cleanup. With simple debug code, I found there are exactly nr_sleep - nr_wakeup threads in D state. So there are missed wakeups. $ dmesg | grep nr_sleep [ 52.917115] ---> nr_sleep=1046, nr_wakeup=873, delta=173 $ vmstat 1 1 173 0 712640 24292 96172 0 0 0 0 419 757 0 0 0 100 0 To quote Tejun: Ah, okay, freed_request() wakes up single waiter with the assumption that after the wakeup there will at least be one successful allocation which in turn will continue the wakeup chain until the wait list is empty - ie.
waiter wakeup is dependent on successful request allocation happening after each wakeup. With queue marked dead, any woken up waiter fails the allocation path, so the wakeup chaining is lost and we're left with hung waiters. What we need is wake_up_all() after drain completion. This patch fixes the missed wakeup by waking up all the threads which are sleeping on the wait queue after queue drain. Changes in v2: Drop waitqueue_active() optimization Acked-by: Tejun Heo Signed-off-by: Asias He Fixed a bug by me, where stacked devices would oops on calling blk_drain_queue() since ->rq.wait[] do not get initialized unless it's a full queue setup. Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 3c923a7..ce7fbf8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -361,9 +361,10 @@ EXPORT_SYMBOL(blk_put_queue); */ void blk_drain_queue(struct request_queue *q, bool drain_all) { + int i; + while (true) { bool drain = false; - int i; spin_lock_irq(q->queue_lock); @@ -408,6 +409,18 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) break; msleep(10); } + + /* + * With queue marked dead, any woken up waiter will fail the + * allocation path, so the wakeup chaining is lost and we're + * left with hung waiters. We need to wake up those waiters. + */ + if (q->request_fn) { + spin_lock_irq(q->queue_lock); + for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) + wake_up_all(&q->rq.wait[i]); + spin_unlock_irq(q->queue_lock); + } } /** -- cgit v0.10.2 From 5e5cfac0c622d42eff4fa308e91b3c9c1884b4f0 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 24 May 2012 23:28:52 +0800 Subject: block: Mitigate lock unbalance caused by lock switching Commit 777eb1bf15b8532c396821774bf6451e563438f5 disconnects externally supplied queue_lock before blk_drain_queue(). Switching the lock would introduce lock unbalance because threads which have taken the external lock might unlock the internal lock during the queue drain.
This patch mitigate this by disconnecting the lock after the queue draining since queue draining makes a lot of request_queue users go away. However, please note, this patch only makes the problem less likely to happen. Anyone who still holds a ref might try to issue a new request on a dead queue after the blk_cleanup_queue() finishes draining, the lock unbalance might still happen in this case. ===================================== [ BUG: bad unlock balance detected! ] 3.4.0+ #288 Not tainted ------------------------------------- fio/17706 is trying to release lock (&(&q->__queue_lock)->rlock) at: [] blk_queue_bio+0x2a2/0x380 but there are no more locks to release! other info that might help us debug this: 1 lock held by fio/17706: #0: (&(&vblk->lock)->rlock){......}, at: [] get_request_wait+0x19a/0x250 stack backtrace: Pid: 17706, comm: fio Not tainted 3.4.0+ #288 Call Trace: [] ? blk_queue_bio+0x2a2/0x380 [] print_unlock_inbalance_bug+0xf9/0x100 [] lock_release_non_nested+0x1df/0x330 [] ? dio_bio_end_aio+0x34/0xc0 [] ? bio_check_pages_dirty+0x85/0xe0 [] ? dio_bio_end_aio+0xb1/0xc0 [] ? blk_queue_bio+0x2a2/0x380 [] ? blk_queue_bio+0x2a2/0x380 [] lock_release+0xd9/0x250 [] _raw_spin_unlock_irq+0x23/0x40 [] blk_queue_bio+0x2a2/0x380 [] generic_make_request+0xca/0x100 [] submit_bio+0x76/0xf0 [] ? set_page_dirty_lock+0x3c/0x60 [] ? bio_set_pages_dirty+0x51/0x70 [] do_blockdev_direct_IO+0xbf8/0xee0 [] ? blkdev_get_block+0x80/0x80 [] __blockdev_direct_IO+0x55/0x60 [] ? blkdev_get_block+0x80/0x80 [] blkdev_direct_IO+0x57/0x60 [] ? blkdev_get_block+0x80/0x80 [] generic_file_aio_read+0x70e/0x760 [] ? __lock_acquire+0x215/0x5a0 [] ? aio_run_iocb+0x54/0x1a0 [] ? grab_cache_page_nowait+0xc0/0xc0 [] aio_rw_vect_retry+0x7c/0x1e0 [] ? aio_fsync+0x30/0x30 [] aio_run_iocb+0x66/0x1a0 [] do_io_submit+0x6f0/0xb80 [] ? 
trace_hardirqs_on_thunk+0x3a/0x3f [] sys_io_submit+0x10/0x20 [] system_call_fastpath+0x16/0x1b Changes since v2: Update commit log to explain how the code is still broken even if we delay the lock switching after the drain. Changes since v1: Update commit log as Tejun suggested. Acked-by: Tejun Heo Signed-off-by: Asias He Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index ce7fbf8..93eb3e4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -480,7 +480,6 @@ void blk_cleanup_queue(struct request_queue *q) /* mark @q DEAD, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); - spin_lock_irq(lock); /* @@ -498,10 +497,6 @@ void blk_cleanup_queue(struct request_queue *q) queue_flag_set(QUEUE_FLAG_NOMERGES, q); queue_flag_set(QUEUE_FLAG_NOXMERGES, q); queue_flag_set(QUEUE_FLAG_DEAD, q); - - if (q->queue_lock != &q->__queue_lock) - q->queue_lock = &q->__queue_lock; - spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); @@ -512,6 +507,11 @@ void blk_cleanup_queue(struct request_queue *q) del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); + spin_lock_irq(lock); + if (q->queue_lock != &q->__queue_lock) + q->queue_lock = &q->__queue_lock; + spin_unlock_irq(lock); + /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } -- cgit v0.10.2 From 76aaa5101fffaef12b45b4c01ed0d0528f23dedf Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 14 Jun 2012 09:04:07 +0200 Subject: block: Drop dead function blk_abort_queue() This function was only used by btrfs code in btrfs_abort_devices() (seems in a wrong way). It was removed in commit d07eb9117050c9ed3f78296ebcc06128b52693be, So, Let's remove the dead code to avoid any confusion. Changes in v2: update commit log, btrfs_abort_devices() was removed already. 
Cc: Jens Axboe Cc: linux-kernel@vger.kernel.org Cc: Chris Mason Cc: linux-btrfs@vger.kernel.org Cc: David Sterba Signed-off-by: Asias He Signed-off-by: Jens Axboe diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 7803548..6e4744c 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -197,44 +197,3 @@ void blk_add_timer(struct request *req) mod_timer(&q->timeout, expiry); } -/** - * blk_abort_queue -- Abort all request on given queue - * @queue: pointer to queue - * - */ -void blk_abort_queue(struct request_queue *q) -{ - unsigned long flags; - struct request *rq, *tmp; - LIST_HEAD(list); - - /* - * Not a request based block device, nothing to abort - */ - if (!q->request_fn) - return; - - spin_lock_irqsave(q->queue_lock, flags); - - elv_abort_queue(q); - - /* - * Splice entries to local list, to avoid deadlocking if entries - * get readded to the timeout list by error handling - */ - list_splice_init(&q->timeout_list, &list); - - list_for_each_entry_safe(rq, tmp, &list, timeout_list) - blk_abort_request(rq); - - /* - * Occasionally, blk_abort_request() will return without - * deleting the element from the list. Make sure we add those back - * instead of leaving them on the local stack list. 
- */ - list_splice(&list, &q->timeout_list); - - spin_unlock_irqrestore(q->queue_lock, flags); - -} -EXPORT_SYMBOL_GPL(blk_abort_queue); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba43f40..07954b0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -827,7 +827,6 @@ extern bool __blk_end_request_err(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); -extern void blk_abort_queue(struct request_queue *); extern void blk_unprep_request(struct request *); /* -- cgit v0.10.2 From 6d9359280753d2955f86d6411047516a9431eb51 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 15 Jun 2012 12:52:46 +0200 Subject: scsi: Silence unnecessary warnings about ioctl to partition Sometimes, warnings about ioctls to partition happen often enough that they form majority of the warnings in the kernel log and users complain. In some cases warnings are about ioctls such as SG_IO so it's not good to get rid of the warnings completely as they can ease debugging of userspace problems when ioctl is refused. Since I have seen warnings from lots of commands, including some proprietary userspace applications, I don't think disallowing the ioctls for processes with CAP_SYS_RAWIO will happen in the near future if ever. So lets just stop warning for processes with CAP_SYS_RAWIO for which ioctl is allowed. CC: Paolo Bonzini CC: James Bottomley CC: linux-scsi@vger.kernel.org Acked-by: Paolo Bonzini Signed-off-by: Jan Kara Signed-off-by: Jens Axboe diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 260fa80..9a87daa 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -721,11 +721,14 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) break; } + if (capable(CAP_SYS_RAWIO)) + return 0; + /* In particular, rule out all resets and host-specific ioctls. 
*/ printk_ratelimited(KERN_WARNING "%s: sending ioctl %x to a partition!\n", current->comm, cmd); - return capable(CAP_SYS_RAWIO) ? 0 : -ENOIOCTLCMD; + return -ENOIOCTLCMD; } EXPORT_SYMBOL(scsi_verify_blk_ioctl); -- cgit v0.10.2 From c15acff337ca5c2f101fee99f36c89d47839d387 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Jun 2012 12:53:21 +0800 Subject: x86: Fix kernel-doc warnings Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Jason Wessel Cc: Jan Kiszka Cc: Gavin Shan Cc: Wanpeng Li Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8bfb614..3f61904 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags) /** * kgdb_arch_handle_exception - Handle architecture specific GDB packets. - * @vector: The error vector of the exception that happened. + * @e_vector: The error vector of the exception that happened. * @signo: The signal number of the exception that happened. * @err_code: The error code of the exception that happened. - * @remcom_in_buffer: The buffer of the packet we have read. - * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. - * @regs: The &struct pt_regs of the current process. + * @remcomInBuffer: The buffer of the packet we have read. + * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + * @linux_regs: The &struct pt_regs of the current process. * * This function MUST handle the 'c' and 's' command packets, * as well packets to set / remove a hardware breakpoint, if used. diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 459b58a..25b7ae8 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user); * @src: source address * @dst: destination address * @len: number of bytes to be copied. 
- * @isum: initial sum that is added into the result (32bit unfolded) + * @sum: initial sum that is added into the result (32bit unfolded) * * Returns an 32bit unfolded checksum of the buffer. */ -- cgit v0.10.2 From a5d8f4765f0e92ef027492a8cb979c5b8d45f2c3 Mon Sep 17 00:00:00 2001 From: Jonghwan Choi Date: Wed, 20 Jun 2012 17:05:37 +0900 Subject: ARM: SAMSUNG: Should check for IS_ERR(clk) instead of NULL On the error condition clk_get() returns ERR_PTR(). Signed-off-by: Jonghwan Choi Cc: Stable Signed-off-by: Kukjin Kim diff --git a/arch/arm/plat-samsung/include/plat/watchdog-reset.h b/arch/arm/plat-samsung/include/plat/watchdog-reset.h index f19aff1..bc4db9b 100644 --- a/arch/arm/plat-samsung/include/plat/watchdog-reset.h +++ b/arch/arm/plat-samsung/include/plat/watchdog-reset.h @@ -25,7 +25,7 @@ static inline void arch_wdt_reset(void) __raw_writel(0, S3C2410_WTCON); /* disable watchdog, to be safe */ - if (s3c2410_wdtclk) + if (!IS_ERR(s3c2410_wdtclk)) clk_enable(s3c2410_wdtclk); /* put initial values into count and data */ -- cgit v0.10.2 From 32103c7ba7d274bcb3ace48bc3366e1df37ebb56 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 20 Jun 2012 11:30:25 +0200 Subject: ARM: shmobile: sh73a0: bugfix: SY-DMAC number 681e1b3eeb3606e06a7c4984e8058df84296f8bb (ARM: mach-shmobile: sh73a0 DMA Engine support for SY-DMAC) adds SY-DMAC, but it is 218, not 318 This patch is based on v2.0 manual Signed-off-by: Kuninori Morimoto Acked-by: Magnus Damm Signed-off-by: Rafael J. 
Wysocki diff --git a/arch/arm/mach-shmobile/clock-sh73a0.c b/arch/arm/mach-shmobile/clock-sh73a0.c index 472d1f5..3946c4b 100644 --- a/arch/arm/mach-shmobile/clock-sh73a0.c +++ b/arch/arm/mach-shmobile/clock-sh73a0.c @@ -475,9 +475,9 @@ static struct clk *late_main_clks[] = { enum { MSTP001, MSTP129, MSTP128, MSTP127, MSTP126, MSTP125, MSTP118, MSTP116, MSTP100, - MSTP219, + MSTP219, MSTP218, MSTP207, MSTP206, MSTP204, MSTP203, MSTP202, MSTP201, MSTP200, - MSTP331, MSTP329, MSTP325, MSTP323, MSTP318, + MSTP331, MSTP329, MSTP325, MSTP323, MSTP314, MSTP313, MSTP312, MSTP311, MSTP303, MSTP302, MSTP301, MSTP300, MSTP411, MSTP410, MSTP403, @@ -497,6 +497,7 @@ static struct clk mstp_clks[MSTP_NR] = { [MSTP116] = MSTP(&div4_clks[DIV4_HP], SMSTPCR1, 16, 0), /* IIC0 */ [MSTP100] = MSTP(&div4_clks[DIV4_B], SMSTPCR1, 0, 0), /* LCDC0 */ [MSTP219] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR2, 19, 0), /* SCIFA7 */ + [MSTP218] = MSTP(&div4_clks[DIV4_HP], SMSTPCR2, 18, 0), /* SY-DMAC */ [MSTP207] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR2, 7, 0), /* SCIFA5 */ [MSTP206] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR2, 6, 0), /* SCIFB */ [MSTP204] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR2, 4, 0), /* SCIFA0 */ @@ -508,7 +509,6 @@ static struct clk mstp_clks[MSTP_NR] = { [MSTP329] = MSTP(&r_clk, SMSTPCR3, 29, 0), /* CMT10 */ [MSTP325] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR3, 25, 0), /* IrDA */ [MSTP323] = MSTP(&div4_clks[DIV4_HP], SMSTPCR3, 23, 0), /* IIC1 */ - [MSTP318] = MSTP(&div4_clks[DIV4_HP], SMSTPCR3, 18, 0), /* SY-DMAC */ [MSTP314] = MSTP(&div6_clks[DIV6_SDHI0], SMSTPCR3, 14, 0), /* SDHI0 */ [MSTP313] = MSTP(&div6_clks[DIV6_SDHI1], SMSTPCR3, 13, 0), /* SDHI1 */ [MSTP312] = MSTP(&div4_clks[DIV4_HP], SMSTPCR3, 12, 0), /* MMCIF0 */ @@ -552,6 +552,7 @@ static struct clk_lookup lookups[] = { CLKDEV_DEV_ID("i2c-sh_mobile.0", &mstp_clks[MSTP116]), /* I2C0 */ CLKDEV_DEV_ID("sh_mobile_lcdc_fb.0", &mstp_clks[MSTP100]), /* LCDC0 */ CLKDEV_DEV_ID("sh-sci.7", &mstp_clks[MSTP219]), /* SCIFA7 */ + 
CLKDEV_DEV_ID("sh-dma-engine.0", &mstp_clks[MSTP218]), /* SY-DMAC */ CLKDEV_DEV_ID("sh-sci.5", &mstp_clks[MSTP207]), /* SCIFA5 */ CLKDEV_DEV_ID("sh-sci.8", &mstp_clks[MSTP206]), /* SCIFB */ CLKDEV_DEV_ID("sh-sci.0", &mstp_clks[MSTP204]), /* SCIFA0 */ @@ -563,7 +564,6 @@ static struct clk_lookup lookups[] = { CLKDEV_DEV_ID("sh_cmt.10", &mstp_clks[MSTP329]), /* CMT10 */ CLKDEV_DEV_ID("sh_irda.0", &mstp_clks[MSTP325]), /* IrDA */ CLKDEV_DEV_ID("i2c-sh_mobile.1", &mstp_clks[MSTP323]), /* I2C1 */ - CLKDEV_DEV_ID("sh-dma-engine.0", &mstp_clks[MSTP318]), /* SY-DMAC */ CLKDEV_DEV_ID("sh_mobile_sdhi.0", &mstp_clks[MSTP314]), /* SDHI0 */ CLKDEV_DEV_ID("sh_mobile_sdhi.1", &mstp_clks[MSTP313]), /* SDHI1 */ CLKDEV_DEV_ID("sh_mmcif.0", &mstp_clks[MSTP312]), /* MMCIF0 */ -- cgit v0.10.2 From 4d6344f3c943a5ff1fd8dcbd7be61ebdef1d0285 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 20 Jun 2012 11:30:32 +0200 Subject: ARM: shmobile: sh7372: bugfix: chclr_offset base chclr_write() will use (chan_reg + chclr_offset). In sh7372 case, DMA1CHCLR is started from 0xfe008220, and chan_reg is started from 0xfe008020 (= sh7372_dmae0_resources). Thus, chclr_offset should be (0x220 - 0x20) instead of 0x220. Signed-off-by: Kuninori Morimoto Acked-by: Magnus Damm Signed-off-by: Rafael J. 
Wysocki diff --git a/arch/arm/mach-shmobile/setup-sh7372.c b/arch/arm/mach-shmobile/setup-sh7372.c index 6a4bd58..fafce9c 100644 --- a/arch/arm/mach-shmobile/setup-sh7372.c +++ b/arch/arm/mach-shmobile/setup-sh7372.c @@ -484,7 +484,7 @@ static const struct sh_dmae_slave_config sh7372_dmae_slaves[] = { }, }; -#define SH7372_CHCLR 0x220 +#define SH7372_CHCLR (0x220 - 0x20) static const struct sh_dmae_channel sh7372_dmae_channels[] = { { -- cgit v0.10.2 From 6ae42bb22b40254e6488bbfe47f970620ab6d433 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Wed, 20 Jun 2012 11:30:41 +0200 Subject: ARM: mach-shmobile: Fix build when SMP is enabled and EMEV2 is not enabled The build fails when SMP is enabled and EMEV2 is not enabled. arch/arm/mach-shmobile/built-in.o: In function `shmobile_platform_cpu_kill': /home/iwamatsu/work/kernel/sh-2.6-devel/arch/arm/mach-shmobile/platsmp.c:62: undefined reference to `emev2_platform_cpu_kill' arch/arm/mach-shmobile/built-in.o: In function `shmobile_smp_get_core_count': /home/iwamatsu/work/kernel/sh-2.6-devel/arch/arm/mach-shmobile/platsmp.c:39: undefined reference to `emev2_get_core_count' arch/arm/mach-shmobile/built-in.o: In function `shmobile_smp_prepare_cpus': /home/iwamatsu/work/kernel/sh-2.6-devel/arch/arm/mach-shmobile/platsmp.c:53: undefined reference to `emev2_smp_prepare_cpus' arch/arm/mach-shmobile/built-in.o: In function `platform_secondary_init': /home/iwamatsu/work/kernel/sh-2.6-devel/arch/arm/mach-shmobile/platsmp.c:78: undefined reference to `emev2_secondary_init' arch/arm/mach-shmobile/built-in.o: In function `boot_secondary': /home/iwamatsu/work/kernel/sh-2.6-devel/arch/arm/mach-shmobile/platsmp.c:90: undefined reference to `emev2_boot_secondary' This is caused by the OF-based check for EMEV2 being performed in platsmp.c even when EMEV2 is disabled. This patch revises the file so that the EMEV2 functions are not used when EMEV2 is not enabled.
Signed-off-by: Nobuhiro Iwamatsu Acked-by: Simon Horman Acked-by: Magnus Damm Signed-off-by: Rafael J. Wysocki diff --git a/arch/arm/mach-shmobile/platsmp.c b/arch/arm/mach-shmobile/platsmp.c index bacdd66..e859fcd 100644 --- a/arch/arm/mach-shmobile/platsmp.c +++ b/arch/arm/mach-shmobile/platsmp.c @@ -25,7 +25,12 @@ #define is_sh73a0() (machine_is_ag5evm() || machine_is_kota2() || \ of_machine_is_compatible("renesas,sh73a0")) #define is_r8a7779() machine_is_marzen() + +#ifdef CONFIG_ARCH_EMEV2 #define is_emev2() of_machine_is_compatible("renesas,emev2") +#else +#define is_emev2() (0) +#endif static unsigned int __init shmobile_smp_get_core_count(void) { -- cgit v0.10.2 From 56fb523f12665974bd249a39524b75b4f5e57388 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Wed, 20 Jun 2012 12:15:59 +0200 Subject: ARM: mach-shmobile: add missing GPIO IRQ configuration on mackerel SDHI0 card-detect GPIO IRQ on mackerel currently works, because it is the default configuration of IRQ26. However, we should not rely on this and should configure the function explicitly. Signed-off-by: Guennadi Liakhovetski Acked-by: Magnus Damm Signed-off-by: Rafael J.
Wysocki diff --git a/arch/arm/mach-shmobile/board-mackerel.c b/arch/arm/mach-shmobile/board-mackerel.c index b577f7c..150122a 100644 --- a/arch/arm/mach-shmobile/board-mackerel.c +++ b/arch/arm/mach-shmobile/board-mackerel.c @@ -1512,6 +1512,9 @@ static void __init mackerel_init(void) gpio_request(GPIO_FN_SDHID0_1, NULL); gpio_request(GPIO_FN_SDHID0_0, NULL); + /* SDHI0 PORT172 card-detect IRQ26 */ + gpio_request(GPIO_FN_IRQ26_172, NULL); + #if !defined(CONFIG_MMC_SH_MMCIF) && !defined(CONFIG_MMC_SH_MMCIF_MODULE) /* enable SDHI1 */ gpio_request(GPIO_FN_SDHICMD1, NULL); -- cgit v0.10.2 From 3dca938656c7b0ff6b0717a5dde0f5f45e592be5 Mon Sep 17 00:00:00 2001 From: Jose Miguel Goncalves Date: Sat, 12 May 2012 06:11:49 +0900 Subject: ARM: SAMSUNG: Fix for S3C2412 EBI memory mapping While upgrading the kernel on a S3C2412 based board I've noted that it was impossible to boot the board with a 2.6.32 or upper kernel. I've tracked down the problem to the EBI virtual memory mapping that is in conflict with the IO mapping definition in arch/arm/mach-s3c24xx/s3c2412.c. 
Signed-off-by: Jose Miguel Goncalves Cc: Stable Signed-off-by: Kukjin Kim diff --git a/arch/arm/plat-samsung/include/plat/map-s3c.h b/arch/arm/plat-samsung/include/plat/map-s3c.h index 7d04875..c0c70a8 100644 --- a/arch/arm/plat-samsung/include/plat/map-s3c.h +++ b/arch/arm/plat-samsung/include/plat/map-s3c.h @@ -22,7 +22,7 @@ #define S3C24XX_VA_WATCHDOG S3C_VA_WATCHDOG #define S3C2412_VA_SSMC S3C_ADDR_CPU(0x00000000) -#define S3C2412_VA_EBI S3C_ADDR_CPU(0x00010000) +#define S3C2412_VA_EBI S3C_ADDR_CPU(0x00100000) #define S3C2410_PA_UART (0x50000000) #define S3C24XX_PA_UART S3C2410_PA_UART -- cgit v0.10.2 From 37f971b68009b8fadd322130787d693137302925 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 20 Jun 2012 12:53:25 +0200 Subject: ARM: mach-shmobile: armadillo800eva: Use late init machine hook Since commit 21cc1b7ede3cf456cf1d51f8a906093261f7c111 ("ARM: shmobile: use machine specific hook for late init") suspend and CPU idle are not initialized automatically anymore. Set shmobile_init_late() as the machine late init hook to initialize them. Signed-off-by: Laurent Pinchart Acked-by: Magnus Damm Signed-off-by: Rafael J. 
Wysocki diff --git a/arch/arm/mach-shmobile/board-armadillo800eva.c b/arch/arm/mach-shmobile/board-armadillo800eva.c index 9e37026..9bd1355 100644 --- a/arch/arm/mach-shmobile/board-armadillo800eva.c +++ b/arch/arm/mach-shmobile/board-armadillo800eva.c @@ -779,6 +779,7 @@ DT_MACHINE_START(ARMADILLO800EVA_DT, "armadillo800eva") .init_irq = r8a7740_init_irq, .handle_irq = shmobile_handle_irq_intc, .init_machine = eva_init, + .init_late = shmobile_init_late, .timer = &shmobile_timer, .dt_compat = eva_boards_compat_dt, MACHINE_END -- cgit v0.10.2 From 14dd52f4389e206537488aeb2b85d6b21fc27196 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 20 Jun 2012 22:40:08 -0700 Subject: ARM: shmobile: kzm9g: use late init machine hook Since commit 21cc1b7ede3cf456cf1d51f8a906093261f7c111 ("ARM: shmobile: use machine specific hook for late init") suspend and CPU idle are not initialized automatically anymore. Set shmobile_init_late() as the machine late init hook to initialize them. Signed-off-by: Kuninori Morimoto Acked-by: Laurent Pinchart Acked-by: Magnus Damm Signed-off-by: Rafael J. Wysocki diff --git a/arch/arm/mach-shmobile/board-kzm9g.c b/arch/arm/mach-shmobile/board-kzm9g.c index d8e33b6..c0ae815 100644 --- a/arch/arm/mach-shmobile/board-kzm9g.c +++ b/arch/arm/mach-shmobile/board-kzm9g.c @@ -455,6 +455,7 @@ DT_MACHINE_START(KZM9G_DT, "kzm9g") .init_irq = sh73a0_init_irq, .handle_irq = gic_handle_irq, .init_machine = kzm_init, + .init_late = shmobile_init_late, .timer = &shmobile_timer, .dt_compat = kzm9g_boards_compat_dt, MACHINE_END -- cgit v0.10.2 From c207d2df1c1b2a160b7d52229e5a43feea6a7d26 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 20 Jun 2012 22:41:34 -0700 Subject: ARM: shmobile: kzm9d: use late init machine hook Since commit 21cc1b7ede3cf456cf1d51f8a906093261f7c111 ("ARM: shmobile: use machine specific hook for late init") suspend and CPU idle are not initialized automatically anymore. 
Set shmobile_init_late() as the machine late init hook to initialize them. Signed-off-by: Kuninori Morimoto Acked-by: Laurent Pinchart Acked-by: Magnus Damm Signed-off-by: Rafael J. Wysocki diff --git a/arch/arm/mach-shmobile/board-kzm9d.c b/arch/arm/mach-shmobile/board-kzm9d.c index 7bc5e7d..6a33cf3 100644 --- a/arch/arm/mach-shmobile/board-kzm9d.c +++ b/arch/arm/mach-shmobile/board-kzm9d.c @@ -80,6 +80,7 @@ DT_MACHINE_START(KZM9D_DT, "kzm9d") .init_irq = emev2_init_irq, .handle_irq = gic_handle_irq, .init_machine = kzm9d_add_standard_devices, + .init_late = shmobile_init_late, .timer = &shmobile_timer, .dt_compat = kzm9d_boards_compat_dt, MACHINE_END -- cgit v0.10.2 From e734568b675c985db2026848fefaac01c22977a5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Jun 2012 16:16:04 +0100 Subject: oprofile: perf: use NR_CPUS instead of nr_cpumask_bits for static array The OProfile perf backend uses a static array to keep track of the perf events on the system. When compiling with CONFIG_CPUMASK_OFFSTACK=y && SMP, nr_cpumask_bits is not a compile-time constant and the build will fail with: oprofile_perf.c:28: error: variably modified 'perf_events' at file scope This patch uses NR_CPUS instead of nr_cpumask_bits for the array initialisation. If this causes space problems in the future, we can always move to dynamic allocation for the events array.
Cc: Matt Fleming Reported-by: Russell King - ARM Linux Signed-off-by: Will Deacon Cc: # v2.6.37+ Signed-off-by: Robert Richter diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c index da14432..efc4b7f 100644 --- a/drivers/oprofile/oprofile_perf.c +++ b/drivers/oprofile/oprofile_perf.c @@ -25,7 +25,7 @@ static int oprofile_perf_enabled; static DEFINE_MUTEX(oprofile_perf_mutex); static struct op_counter_config *counter_config; -static struct perf_event **perf_events[nr_cpumask_bits]; +static struct perf_event **perf_events[NR_CPUS]; static int num_counters; /* -- cgit v0.10.2 From 9973290ce20ace7cac8ad06f753468c0b826fd0f Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 20 Jun 2012 22:52:24 +0100 Subject: ARM: 7428/1: Prevent KALLSYM size mismatch on ARM. ARM builds seem to be plagued by an occasional build error: Inconsistent kallsyms data This is a bug - please report about it Try "make KALLSYMS_EXTRA_PASS=1" as a workaround The problem has to do with alignment of some sections by the linker. The kallsyms data is built in two passes by first linking the kernel without it, and then linking the kernel again with the symbols included. Normally, this just shifts the symbols, without changing their order, and the compression used by the kallsyms gives the same result. On non SMP, the per CPU data is empty. 
Depending on where the alignment ends up, it can come out as either: +-------------------+ | last text segment | +-------------------+ /* padding */ +-------------------+ <- L1_CACHE_BYTES alignment | per cpu (empty) | +-------------------+ __per_cpu_end: /* padding */ __data_loc: +-------------------+ <- THREAD_SIZE alignment | data | +-------------------+ or +-------------------+ | last text segment | +-------------------+ /* padding */ +-------------------+ <- L1_CACHE_BYTES alignment | per cpu (empty) | +-------------------+ __per_cpu_end: /* no padding */ __data_loc: +-------------------+ <- THREAD_SIZE alignment | data | +-------------------+ if the alignment satisfies both. Because symbols that have the same address are sorted by 'nm -n', the second case will be in a different order than the first case. This changes the compression, changing the size of the kallsym data, causing the build failure. The KALLSYMS_EXTRA_PASS=1 workaround usually works, but it is still possible to have the alignment change between the second and third pass. It's probably even possible for it to never reach a fixed point. The problem only occurs on non-SMP, when the per-cpu data is empty, and when the data segment has alignment (and immediately follows the text segments). Fix this by only including the per_cpu section on SMP, when it is not empty.
Signed-off-by: David Brown Signed-off-by: Russell King diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 43a31fb..36ff15b 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -183,7 +183,9 @@ SECTIONS } #endif +#ifdef CONFIG_SMP PERCPU_SECTION(L1_CACHE_BYTES) +#endif #ifdef CONFIG_XIP_KERNEL __data_loc = ALIGN(4); /* location in binary */ -- cgit v0.10.2 From 86f887c105b909a2cea7b06f2136d66b3438b038 Mon Sep 17 00:00:00 2001 From: Phil Edworthy Date: Sat, 23 Jun 2012 01:12:09 +0200 Subject: ARM: shmobile: r8a7779: Route all interrupts to ARM Without this, the interrupts for I2C, VIN, GPIO, SDHC, HSCIF and HPB-DMAC are sent to the SH processor. Signed-off-by: Phil Edworthy Acked-by: Magnus Damm Signed-off-by: Rafael J. Wysocki diff --git a/arch/arm/mach-shmobile/intc-r8a7779.c b/arch/arm/mach-shmobile/intc-r8a7779.c index 550b23d..f04fad4 100644 --- a/arch/arm/mach-shmobile/intc-r8a7779.c +++ b/arch/arm/mach-shmobile/intc-r8a7779.c @@ -35,6 +35,9 @@ #define INT2SMSKCR3 0xfe7822ac #define INT2SMSKCR4 0xfe7822b0 +#define INT2NTSR0 0xfe700060 +#define INT2NTSR1 0xfe700064 + static int r8a7779_set_wake(struct irq_data *data, unsigned int on) { return 0; /* always allow wakeup */ @@ -49,6 +52,10 @@ void __init r8a7779_init_irq(void) gic_init(0, 29, gic_dist_base, gic_cpu_base); gic_arch_extn.irq_set_wake = r8a7779_set_wake; + /* route all interrupts to ARM */ + __raw_writel(0xffffffff, INT2NTSR0); + __raw_writel(0x3fffffff, INT2NTSR1); + /* unmask all known interrupts in INTCS2 */ __raw_writel(0xfffffff0, INT2SMSKCR0); __raw_writel(0xfff7ffff, INT2SMSKCR1); -- cgit v0.10.2 From 1f758b23177d588a71b96ad02990e715949bb82f Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sun, 24 Jun 2012 23:31:09 +0200 Subject: PM / Sleep: Prevent waiting forever on asynchronous suspend after abort __device_suspend() must always send a completion. Otherwise, parent devices will wait forever. 
Commit 1e2ef05b, "PM: Limit race conditions between runtime PM and system sleep (v2)", introduced a regression by short-circuiting the complete_all() for certain error cases. This patch fixes the bug by always signalling a completion. Addresses http://crosbug.com/31972 Tested by injecting an abort. Signed-off-by: Mandeep Singh Baines Cc: stable@vger.kernel.org Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e0fb5b0..9cb845e 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1031,7 +1031,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) dpm_wait_for_children(dev, async); if (async_error) - return 0; + goto Complete; pm_runtime_get_noresume(dev); if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) @@ -1040,7 +1040,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (pm_wakeup_pending()) { pm_runtime_put_sync(dev); async_error = -EBUSY; - return 0; + goto Complete; } device_lock(dev); @@ -1097,6 +1097,8 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) } device_unlock(dev); + + Complete: complete_all(&dev->power.completion); if (error) { -- cgit v0.10.2 From ef5b6e127761667f78d99b7510a3876077fe9abe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 17 Jun 2012 09:56:46 +0000 Subject: netfilter: ipset: fix interface comparison in hash-netiface sets ifname_compare() assumes that skb->dev is zero-padded, e.g. 'eth1\0\0\0\0\0...'. This isn't always the case. e1000 driver does strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1); in e1000_probe(), so once device is registered dev->name memory contains 'eth1\0:0:3\0\0\0' (or something like that), which makes eth1 compare fail. Use plain strcmp() instead.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index ee86394..d5d3607 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -38,30 +38,6 @@ struct iface_node { #define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) -static inline long -ifname_compare(const char *_a, const char *_b) -{ - const long *a = (const long *)_a; - const long *b = (const long *)_b; - - BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); - if (a[0] != b[0]) - return a[0] - b[0]; - if (IFNAMSIZ > sizeof(long)) { - if (a[1] != b[1]) - return a[1] - b[1]; - } - if (IFNAMSIZ > 2 * sizeof(long)) { - if (a[2] != b[2]) - return a[2] - b[2]; - } - if (IFNAMSIZ > 3 * sizeof(long)) { - if (a[3] != b[3]) - return a[3] - b[3]; - } - return 0; -} - static void rbtree_destroy(struct rb_root *root) { @@ -99,7 +75,7 @@ iface_test(struct rb_root *root, const char **iface) while (n) { const char *d = iface_data(n); - long res = ifname_compare(*iface, d); + int res = strcmp(*iface, d); if (res < 0) n = n->rb_left; @@ -121,7 +97,7 @@ iface_add(struct rb_root *root, const char **iface) while (*n) { char *ifname = iface_data(*n); - long res = ifname_compare(*iface, ifname); + int res = strcmp(*iface, ifname); p = *n; if (res < 0) @@ -366,7 +342,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netiface4_elem data = { .cidr = HOST_MASK }; u32 ip = 0, ip_to, last; u32 timeout = h->timeout; - char iface[IFNAMSIZ] = {}; + char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -663,7 +639,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem data = { .cidr = HOST_MASK }; u32 timeout = h->timeout; - char iface[IFNAMSIZ] = {}; + char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || -- cgit v0.10.2 From 
c24584c028a62900ea6b541b312030f0feac93b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Jun 2012 21:58:23 +0000 Subject: netfilter: ipvs: fix dst leak in __ip_vs_addr_is_local_v6 After call to ip6_route_output() we must release dst or we leak it. Also should test dst->error, as ip6_route_output() never returns NULL. Use boolean while we are at it. Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index dd811b8..d43e3c1 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -76,19 +76,19 @@ static void __ip_vs_del_service(struct ip_vs_service *svc); #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(struct net *net, - const struct in6_addr *addr) +static bool __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) { - struct rt6_info *rt; struct flowi6 fl6 = { .daddr = *addr, }; + struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); + bool is_local; - rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); - if (rt && rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) - return 1; + is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); - return 0; + dst_release(dst); + return is_local; } #endif -- cgit v0.10.2 From a2da399823ccb0f4ddf83700bf297803e5320f7e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Jun 2012 12:07:18 +0200 Subject: netfilter: update location of my trees Signed-off-by: Pablo Neira Ayuso diff --git a/MAINTAINERS b/MAINTAINERS index f6e62de..302aa00 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4654,8 +4654,8 @@ L: netfilter@vger.kernel.org L: coreteam@netfilter.org W: http://www.netfilter.org/ W: http://www.iptables.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-2.6.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next-2.6.git +T: 
git git://1984.lsi.us.es/nf +T: git git://1984.lsi.us.es/nf-next S: Supported F: include/linux/netfilter* F: include/linux/netfilter/ -- cgit v0.10.2 From 4ad33411308596f2f918603509729922a1ec4411 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 22 Jun 2012 10:58:06 -0700 Subject: x86, cpufeature: Rename X86_FEATURE_DTS to X86_FEATURE_DTHERM It makes sense to label "Digital Thermal Sensor" as "DTS", but unfortunately the string "dts" was already used for "Debug Store", and /proc/cpuinfo is a user space ABI. Therefore, rename this to "dtherm". This conflict went into mainline via the hwmon tree without any x86 maintainer ack, and without any kind of hint in the subject. a4659053 x86/hwmon: fix initialization of coretemp Reported-by: Jean Delvare Link: http://lkml.kernel.org/r/4FE34BCB.5050305@linux.intel.com Cc: Jan Beulich Cc: v2.6.36..v3.4 Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 340ee49..f91e80f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,7 +176,7 @@ #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ #define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ /* Virtualization flags: Linux defined, word 8 */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index addf9e8..ee8e9ab 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, + { X86_FEATURE_DTHERM, CR_EAX, 0, 
0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index b9d5123..0f52799 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -664,7 +664,7 @@ static void __cpuinit get_core_online(unsigned int cpu) * sensors. We check this bit only, all the early CPUs * without thermal sensors will be filtered out. */ - if (!cpu_has(c, X86_FEATURE_DTS)) + if (!cpu_has(c, X86_FEATURE_DTHERM)) return; if (!pdev) { @@ -765,7 +765,7 @@ static struct notifier_block coretemp_cpu_notifier __refdata = { }; static const struct x86_cpu_id coretemp_ids[] = { - { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_DTS }, + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_DTHERM }, {} }; MODULE_DEVICE_TABLE(x86cpu, coretemp_ids); -- cgit v0.10.2 From 55f6cb9d0b364e7e8cb65b51193f5e4743c44fde Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 22 Jun 2012 11:47:15 -0700 Subject: x86, cpufeature: Catch duplicate CPU feature strings We had a case of duplicate CPU feature strings, a user space ABI violation, for almost two years. Make it a build error so that doesn't happen again. 
Link: http://lkml.kernel.org/r/4FE34BCB.5050305@linux.intel.com Cc: Jan Beulich Cc: Jean Delvare diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index dfea390..0c5b549 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; print OUT "#include \n\n"; print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; +%features = (); +$err = 0; + while (defined($line = )) { if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { $macro = $1; - $feature = $2; + $feature = "\L$2"; $tail = $3; if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { - $feature = $1; + $feature = "\L$1"; } - if ($feature ne '') { - printf OUT "\t%-32s = \"%s\",\n", - "[$macro]", "\L$feature"; + next if ($feature eq ''); + + if ($features{$feature}++) { + print STDERR "$in: duplicate feature name: $feature\n"; + $err++; } + printf OUT "\t%-32s = \"%s\",%s\n", "[$macro]", $feature; } } print OUT "};\n"; close(IN); close(OUT); + +if ($err) { + unlink($out); + exit(1); +} + +exit(0); -- cgit v0.10.2 From eb3979f64d25120d60b9e761a4c58f70b1a02f86 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 21 Jun 2012 11:36:50 +0100 Subject: stable: Allow merging of backports for serious user-visible performance issues Distribution kernel maintainers routinely backport fixes for users that were deemed important but not "something critical" as defined by the rules. To users of these kernels they are very serious and failing to fix them reduces the value of -stable. The problem is that the patches fixing these issues are often subtle and prone to regressions in other ways and need greater care and attention. To combat this, these "serious" backports should have a higher barrier to entry. This patch relaxes the rules to allow a distribution maintainer to merge to -stable a backported patch or small series that fixes a "serious" user-visible performance issue. 
They should include additional information on the user-visible bug affected and a link to the bugzilla entry if available. The same rules about the patch being already in mainline still apply. Signed-off-by: Mel Gorman Cc: stable Signed-off-by: Greg Kroah-Hartman diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt index f0ab5cf..4a7b54b 100644 --- a/Documentation/stable_kernel_rules.txt +++ b/Documentation/stable_kernel_rules.txt @@ -12,6 +12,12 @@ Rules on what kind of patches are accepted, and which ones are not, into the marked CONFIG_BROKEN), an oops, a hang, data corruption, a real security issue, or some "oh, that's not good" issue. In short, something critical. + - Serious issues as reported by a user of a distribution kernel may also + be considered if they fix a notable performance or interactivity issue. + As these fixes are not as obvious and have a higher risk of a subtle + regression they should only be submitted by a distribution kernel + maintainer and include an addendum linking to a bugzilla entry if it + exists and additional information on the user-visible impact. - New device IDs and quirks are also accepted. - No "theoretical race condition" issues, unless an explanation of how the race can be exploited is also provided. -- cgit v0.10.2 From 4661e3568a7d14a93d4e428d246cdb86f4bac6e7 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 22 Jun 2012 17:12:19 -0400 Subject: printk: fix regression in SYSLOG_ACTION_CLEAR Commit 7ff9554bb578ba02166071d2d487b7fc7d860d62 (printk: convert byte-buffer to variable-length record buffer) introduced a regression by accidentally removing a "break" statement from inside the big switch in printk's do_syslog(). The symptom of this bug is that the "dmesg -C" command doesn't only clear the kernel's log buffer; it also disables console logging. This patch (as1561) fixes the regression by adding the missing "break". 
Signed-off-by: Alan Stern CC: Kay Sievers Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/printk.c b/kernel/printk.c index a2276b9..d6a1412 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1040,6 +1040,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_print_all(NULL, 0, true); + break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) -- cgit v0.10.2 From b41772abebc27c61dd578b76da99aa5240b4c99a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 20:50:42 -0700 Subject: rcu: Stop rcu_do_batch() from multiplexing the "count" variable Commit b1420f1c (Make rcu_barrier() less disruptive) rearranged the code in rcu_do_batch(), moving the ->qlen manipulation to follow the requeueing of the callbacks. Unfortunately, this rearrangement clobbered the value of the "count" local variable before the value of rdp->qlen was adjusted, resulting in the value of rdp->qlen being inaccurate. This commit therefore introduces an index variable "i", avoiding the inadvertent multiplexing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3b0f133..38ecdda 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1530,7 +1530,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy; + int bl, count, count_lazy, i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { @@ -1553,9 +1553,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; tail = rdp->nxttail[RCU_DONE_TAIL]; - for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) - if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[count] = &rdp->nxtlist; + for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = &rdp->nxtlist; local_irq_restore(flags); /* Invoke callbacks. */ @@ -1583,9 +1583,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; - for (count = 0; count < RCU_NEXT_SIZE; count++) - if (&rdp->nxtlist == rdp->nxttail[count]) - rdp->nxttail[count] = tail; + for (i = 0; i < RCU_NEXT_SIZE; i++) + if (&rdp->nxtlist == rdp->nxttail[i]) + rdp->nxttail[i] = tail; else break; } -- cgit v0.10.2 From bb44c30e53053c653302b53c8671c3c5ca62e881 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Tue, 26 Jun 2012 03:29:57 -0700 Subject: ARM: OMAP2+: nand: fix build error when CONFIG_MTD_ONENAND_OMAP2=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8259573b (ARM: OMAP2+: nand: Make board_onenand_init() visible to board code) broke the build for configs with OneNAND disabled. 
By removing the static in the header file, it created a duplicate definition in the .c and the .h files, resulting in a build error: /work/kernel/omap/dev/arch/arm/mach-omap2/board-flash.c:102:111: error: redefinition of 'board_onenand_init' /work/kernel/omap/dev/arch/arm/mach-omap2/board-flash.h:56:51: note: previous definition of 'board_onenand_init' was here make[2]: *** [arch/arm/mach-omap2/board-flash.o] Error 1 make[2]: *** Waiting for unfinished jobs.... make[1]: *** [arch/arm/mach-omap2] Error 2 make: *** [sub-make] Error 2 Fix this by removing the duplicate dummy entry from the C file. Cc: Enric Balletbò i Serra Cc: Javier Martinez Canillas Signed-off-by: Kevin Hilman Signed-off-by: Tony Lindgren diff --git a/arch/arm/mach-omap2/board-flash.c b/arch/arm/mach-omap2/board-flash.c index 70a81f9..53c39d2 100644 --- a/arch/arm/mach-omap2/board-flash.c +++ b/arch/arm/mach-omap2/board-flash.c @@ -97,11 +97,6 @@ __init board_onenand_init(struct mtd_partition *onenand_parts, gpmc_onenand_init(&board_onenand_data); } -#else -void -__init board_onenand_init(struct mtd_partition *nor_parts, u8 nr_parts, u8 cs) -{ -} #endif /* CONFIG_MTD_ONENAND_OMAP2 || CONFIG_MTD_ONENAND_OMAP2_MODULE */ #if defined(CONFIG_MTD_NAND_OMAP2) || \ -- cgit v0.10.2 From 1b6b7c9ff3514772958c075f8c89e42dddf6a4d8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 26 Jun 2012 07:58:23 -0700 Subject: x86, cpufeature: Remove stray %s, add -w to mkcapflags.pl There was a stray %s left from testing, remove it. Add -w to the #! line (which is parsed by Perl even if the Perl interpreter is invoked explicitly on the command line) to catch these kinds of errors in the future. Reported-by: Jean Delvare Link: http://lkml.kernel.org/r/20120626143246.0c9bf301@endymion.delvare Signed-off-by: H.
Peter Anvin diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index 0c5b549..c7b3fe2 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w # # Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h # @@ -29,7 +29,7 @@ while (defined($line = )) { print STDERR "$in: duplicate feature name: $feature\n"; $err++; } - printf OUT "\t%-32s = \"%s\",%s\n", "[$macro]", $feature; + printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature; } } print OUT "};\n"; -- cgit v0.10.2 From 6fda135c908d0f38a0167adcbd71094572e3059b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 26 Jun 2012 12:35:24 -0700 Subject: Revert "printk: return -EINVAL if the message len is bigger than the buf size" This reverts commit b56a39ac263e5b8cafedd551a49c2105e68b98c2. A better patch from Jan will follow this to resolve the issue. Acked-by: Kay Sievers Cc: Fengguang Wu Cc: Yuanhan Liu Cc: Jan Beulich Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/printk.c b/kernel/printk.c index d6a1412..ff05361 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -880,9 +880,7 @@ static int syslog_print(char __user *buf, int size) syslog_seq++; raw_spin_unlock_irq(&logbuf_lock); - if (len > size) - len = -EINVAL; - else if (len > 0 && copy_to_user(buf, text, len)) + if (len > 0 && copy_to_user(buf, text, len)) len = -EFAULT; kfree(text); -- cgit v0.10.2 From 116e90b23f74d303e8d607c7a7d54f60f14ab9f2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 22 Jun 2012 16:36:09 +0100 Subject: syslog: fill buffer with more than a single message for SYSLOG_ACTION_READ The recent changes to the printk buffer management resulted in SYSLOG_ACTION_READ to only return a single message, whereas previously the buffer would get filled as much as possible. 
As, when too small to fit everything, filling it to the last byte would be pretty ugly with the new code, the patch arranges for as many messages as possible to get returned in a single invocation. User space tools in at least all SLES versions depend on the old behavior. This at once addresses the issue attempted to get fixed with commit b56a39ac263e5b8cafedd551a49c2105e68b98c2 ("printk: return -EINVAL if the message len is bigger than the buf size"), and since that commit widened the possibility for losing a message altogether, the patch here assumes that this other commit would get reverted first (otherwise the patch here won't apply). Furthermore, this patch also addresses the problem dealt with in commit 4a77a5a06ec66ed05199b301e7c25f42f979afdc ("printk: use mutex lock to stop syslog_seq from going wild"), so I'd recommend reverting that one too (albeit there's no direct collision between the two). Signed-off-by: Jan Beulich Acked-by: Kay Sievers Cc: Yuanhan Liu Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/printk.c b/kernel/printk.c index ff05361..cdfba44 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -862,26 +862,49 @@ static int syslog_print(char __user *buf, int size) { char *text; struct log *msg; - int len; + int len = 0; text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - } - msg = log_from_idx(syslog_idx); - len = msg_print_text(msg, true, text, LOG_LINE_MAX); - syslog_idx = log_next(syslog_idx); - syslog_seq++; - raw_spin_unlock_irq(&logbuf_lock); + while (size > 0) { + size_t n; + + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + } + if (syslog_seq == log_next_seq) { + raw_spin_unlock_irq(&logbuf_lock); + break; + } + 
msg = log_from_idx(syslog_idx); + n = msg_print_text(msg, true, text, LOG_LINE_MAX); + if (n <= size) { + syslog_idx = log_next(syslog_idx); + syslog_seq++; + } else + n = 0; + raw_spin_unlock_irq(&logbuf_lock); + + if (!n) + break; - if (len > 0 && copy_to_user(buf, text, len)) - len = -EFAULT; + len += n; + size -= n; + buf += n; + n = copy_to_user(buf - n, text, n); + + if (n) { + len -= n; + if (!len) + len = -EFAULT; + break; + } + } kfree(text); return len; -- cgit v0.10.2 From 58c553d4d4637e1955a571350888530cd0073276 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 27 Jun 2012 10:03:00 +0900 Subject: ARM: EXYNOS: Fix EXYNOS_DEV_DMA Kconfig entry Commit 20ef9e08 ("ARM: EXYNOS: Support DMA for EXYNOS5250 SoC") renamed EXYNOS4_DEV_DMA to EXYNOS_DEV_DMA. But some machine entries still had EXYNOS4_DEV_DMA. Changed them to EXYNOS_DEV_DMA. Signed-off-by: Sachin Kamat Signed-off-by: Kukjin Kim diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig index 573be57..6f6d13f 100644 --- a/arch/arm/mach-exynos/Kconfig +++ b/arch/arm/mach-exynos/Kconfig @@ -212,7 +212,7 @@ config MACH_SMDKV310 select EXYNOS_DEV_SYSMMU select EXYNOS4_DEV_AHCI select SAMSUNG_DEV_KEYPAD - select EXYNOS4_DEV_DMA + select EXYNOS_DEV_DMA select SAMSUNG_DEV_PWM select EXYNOS4_DEV_USB_OHCI select EXYNOS4_SETUP_FIMD0 @@ -264,7 +264,7 @@ config MACH_UNIVERSAL_C210 select S5P_DEV_ONENAND select S5P_DEV_TV select EXYNOS_DEV_SYSMMU - select EXYNOS4_DEV_DMA + select EXYNOS_DEV_DMA select EXYNOS_DEV_DRM select EXYNOS4_SETUP_FIMD0 select EXYNOS4_SETUP_I2C1 @@ -303,7 +303,7 @@ config MACH_NURI select S5P_DEV_MFC select S5P_DEV_USB_EHCI select S5P_SETUP_MIPIPHY - select EXYNOS4_DEV_DMA + select EXYNOS_DEV_DMA select EXYNOS_DEV_DRM select EXYNOS4_SETUP_FIMC select EXYNOS4_SETUP_FIMD0 @@ -341,7 +341,7 @@ config MACH_ORIGEN select SAMSUNG_DEV_PWM select EXYNOS_DEV_DRM select EXYNOS_DEV_SYSMMU - select EXYNOS4_DEV_DMA + select EXYNOS_DEV_DMA select EXYNOS4_DEV_USB_OHCI select 
EXYNOS4_SETUP_FIMD0 select EXYNOS4_SETUP_SDHCI -- cgit v0.10.2 From e90b833ee1aae64b6fca2455001323ffe29e1698 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Mon, 25 Jun 2012 12:38:23 -0500 Subject: ARM: OMAP4470: Fix OMAP4470 boot failure OMAP4470 currently fails to boot, printing various messages such as ... omap_hwmod: mpu: cannot clk_get main_clk dpll_mpu_m2_ck omap_hwmod: mpu: cannot _init_clocks ------------[ cut here ]------------ WARNING: at arch/arm/mach-omap2/omap_hwmod.c:2062 _init+0x2a0/0x2e4() omap_hwmod: mpu: couldn't init clocks Modules linked in: [] (unwind_backtrace+0x0/0xf4) from [] (warn_slowpath_common+0x4c/0x64) [] (warn_slowpath_common+0x4c/0x64) from [] (warn_slowpath_fmt+0x30/0x40) [] (warn_slowpath_fmt+0x30/0x40) from [] (_init+0x2a0/0x2e4) [] (_init+0x2a0/0x2e4) from [] (omap_hwmod_setup_one+0x40/0x60) [] (omap_hwmod_setup_one+0x40/0x60) from [] (omap_hwmod_setup_one+0x34/0x60) [] (omap_hwmod_setup_one+0x34/0x60) from [] (omap_dm_timer_init_one+0x30/0x250) [] (omap_dm_timer_init_one+0x30/0x250) from [] (omap2_gp_clockevent_init+0x1c/0x108) [] (omap2_gp_clockevent_init+0x1c/0x108) from [] (omap4_timer_init+0x10/0x5c) [] (omap4_timer_init+0x10/0x5c) from [] (time_init+0x20/0x30) [] (time_init+0x20/0x30) from [] (start_kernel+0x1b0/0x304) [] (start_kernel+0x1b0/0x304) from [<80008044>] (0x80008044) ---[ end trace 1b75b31a2719ed1c ]--- The problem is that currently none of the clocks are being registered for OMAP4470 devices and so on boot-up no clocks can be found and the kernel panics. This fix allows the kernel to boot without failure using a simple RAMDISK file system on OMAP4470 blaze board. Per feedback from Paul and Benoit the 4470 clock data is incomplete for new modules such as the 2D graphics block that has been added to the 4470. Therefore add a warning to indicate that the clock data is incomplete. 
Cc: Paul Walmsley Cc: Benoit Cousson Signed-off-by: Jon Hunter [tony@atomide.com: updated comments] Signed-off-by: Tony Lindgren diff --git a/arch/arm/mach-omap2/clock44xx_data.c b/arch/arm/mach-omap2/clock44xx_data.c index e2b701e..ba6f9a0 100644 --- a/arch/arm/mach-omap2/clock44xx_data.c +++ b/arch/arm/mach-omap2/clock44xx_data.c @@ -3417,9 +3417,12 @@ int __init omap4xxx_clk_init(void) if (cpu_is_omap443x()) { cpu_mask = RATE_IN_4430; cpu_clkflg = CK_443X; - } else if (cpu_is_omap446x()) { + } else if (cpu_is_omap446x() || cpu_is_omap447x()) { cpu_mask = RATE_IN_4460 | RATE_IN_4430; cpu_clkflg = CK_446X | CK_443X; + + if (cpu_is_omap447x()) + pr_warn("WARNING: OMAP4470 clock data incomplete!\n"); } else { return 0; } -- cgit v0.10.2 From 75cc52358799bd6001e7d1a47847f997f5ae99f0 Mon Sep 17 00:00:00 2001 From: Deepthi Dharwar Date: Mon, 25 Jun 2012 23:59:54 +0200 Subject: PM / ACPI: Fix suspend/resume regression caused by cpuidle cleanup. Commit e978aa7d7d57d04eb5f88a7507c4fb98577def77 ( cpuidle: Move dev->last_residency update to driver enter routine; remove dev->last_state) was breaking suspend on laptops, as reported in the below link - https://lkml.org/lkml/2011/11/11/164 This was fixed in commit 3439a8da16bcad6b0982ece938c9f8299bb53584 (ACPI / cpuidle: Remove acpi_idle_suspend (to fix suspend regression)) by removing acpi_idle_suspend flag. - https://lkml.org/lkml/2011/11/14/74 But this fix did not work on all systems, as a suspend/resume regression was reported on Lenovo S10-3 recently by Dave. - https://lkml.org/lkml/2012/5/27/115 It looked like commit e978aa7d broke suspend, and with commit 3439a8da resume was not working with acpi_idle driver. This patch fixes the regression that caused this issue in the first place. acpi_idle_suspend flag is essential on some x86 systems to prevent the cpus from going to deeper C-states when suspend is triggered ( commit b04e7bdb984 ) So reverting the commit 3439a8da is essential.
By default, irqs are disabled in cpu_idle arch specific call and re-enabled in idle state return path. During suspend, the acpi_idle_suspend flag is set, which prevents the cpus from going to deeper idle states; it is essential to enable the irqs in this return path too. Regarding the suspend issue: we were not re-enabling the interrupts while returning from acpi_idle_enter_bm() routine if acpi_idle_suspend flag is set, and this caused suspend failure. In addition to the above, to improve the readability of the code, return of -EINVAL is replaced with -EBUSY in acpi_idle_suspend return path, implying that the system is currently busy when suspend is in progress, which prevents the cpus from entering deeper C-states. Reported-and-Tested-by: Dav Hansen Tested-by: Preeti Murthy Signed-off-by: Deepthi Dharwar Reviewed-by: Srivatsa S Bhat Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index f3decb3..47a8caa 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -224,6 +224,7 @@ static void lapic_timer_state_broadcast(struct acpi_processor *pr, /* * Suspend / resume control */ +static int acpi_idle_suspend; static u32 saved_bm_rld; static void acpi_idle_bm_rld_save(void) @@ -242,13 +243,21 @@ static void acpi_idle_bm_rld_restore(void) int acpi_processor_suspend(struct acpi_device * device, pm_message_t state) { + if (acpi_idle_suspend == 1) + return 0; + acpi_idle_bm_rld_save(); + acpi_idle_suspend = 1; return 0; } int acpi_processor_resume(struct acpi_device * device) { + if (acpi_idle_suspend == 0) + return 0; + acpi_idle_bm_rld_restore(); + acpi_idle_suspend = 0; return 0; } @@ -754,6 +763,12 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, local_irq_disable(); + if (acpi_idle_suspend) { + local_irq_enable(); + cpu_relax(); + return -EBUSY; + } + lapic_timer_state_broadcast(pr, cx, 1); kt1 = ktime_get_real(); acpi_idle_do_entry(cx); @@ -823,6 +838,12 @@ static int
acpi_idle_enter_simple(struct cpuidle_device *dev, local_irq_disable(); + if (acpi_idle_suspend) { + local_irq_enable(); + cpu_relax(); + return -EBUSY; + } + if (cx->entry_method != ACPI_CSTATE_FFH) { current_thread_info()->status &= ~TS_POLLING; /* @@ -907,14 +928,21 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, drv, drv->safe_state_index); } else { local_irq_disable(); - acpi_safe_halt(); + if (!acpi_idle_suspend) + acpi_safe_halt(); local_irq_enable(); - return -EINVAL; + return -EBUSY; } } local_irq_disable(); + if (acpi_idle_suspend) { + local_irq_enable(); + cpu_relax(); + return -EBUSY; + } + if (cx->entry_method != ACPI_CSTATE_FFH) { current_thread_info()->status &= ~TS_POLLING; /* -- cgit v0.10.2 From 7784655acc5a946aac49af423cc1099c5d593d73 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 21 Jun 2012 17:50:27 +0000 Subject: powerpc: Fix BPF_JIT code to link with multiple TOCs If the kernel is big enough (eg. allyesconfig), the linker may need to switch TOCs when calling from the BPF JIT code out to the external helpers (skb_copy_bits() & bpf_internal_load_pointer_neg_helper()). In order to do that we need to leave space after the bl for the linker to insert a reload of our TOC pointer. 
Signed-off-by: Michael Ellerman Acked-by: Matt Evans Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S index 55ba385..7d3a3b5 100644 --- a/arch/powerpc/net/bpf_jit_64.S +++ b/arch/powerpc/net/bpf_jit_64.S @@ -105,6 +105,7 @@ sk_load_byte_msh_positive_offset: mr r4, r_addr; \ li r6, SIZE; \ bl skb_copy_bits; \ + nop; \ /* R3 = 0 on success */ \ addi r1, r1, BPF_PPC_SLOWPATH_FRAME; \ ld r0, 16(r1); \ @@ -156,6 +157,7 @@ bpf_slow_path_byte_msh: mr r4, r_addr; \ li r5, SIZE; \ bl bpf_internal_load_pointer_neg_helper; \ + nop; \ /* R3 != 0 on success */ \ addi r1, r1, BPF_PPC_SLOWPATH_FRAME; \ ld r0, 16(r1); \ -- cgit v0.10.2 From 82b2521d257b5c0efd51821cf5fa306e53bbb6ba Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 19 Jun 2012 20:01:45 +0000 Subject: powerpc: Fix uninitialised error in numa.c chroma_defconfig currently gives me this with gcc 4.6: arch/powerpc/mm/numa.c:638:13: error: 'dm' may be used uninitialized in this function [-Werror=uninitialized] It's a bogus warning/error since of_get_drconf_memory() only writes it anyway. 
Signed-off-by: Michael Neuling cc: [v3.3+] Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b6edbb3..6e8f677 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -635,7 +635,7 @@ static inline int __init read_usm_ranges(const u32 **usm) */ static void __init parse_drconf_memory(struct device_node *memory) { - const u32 *dm, *usm; + const u32 *uninitialized_var(dm), *usm; unsigned int n, rc, ranges, is_kexec_kdump = 0; unsigned long lmb_size, base, size, sz; int nid; -- cgit v0.10.2 From c58ce2b1e3c75c55e7ebf751afce9f5a30f60b42 Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 6 Jun 2012 20:56:43 +0000 Subject: ppc64: fix missing to check all bits of _TIF_USER_WORK_MASK in preempt In entry_64.S version of ret_from_except_lite, you'll notice that in the !preempt case, after we've checked MSR_PR we test for any TIF flag in _TIF_USER_WORK_MASK to decide whether to go to do_work or not. However, in the preempt case, we do a convoluted trick to test SIGPENDING only if PR was set and always test NEED_RESCHED ... but we forget to test any other bit of _TIF_USER_WORK_MASK !!! So that means that with preempt, we completely fail to test for things like single step, syscall tracing, etc... This should be fixed as the following path: - Test PR. If not set, go to resume_kernel, else continue. - If go resume_kernel, to do that original do_work. - If else, then always test for _TIF_USER_WORK_MASK to decide to do that original user_work, else restore directly. 
Signed-off-by: Tiejun Chen Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index ed1718f..5971c85 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -558,27 +558,54 @@ _GLOBAL(ret_from_except_lite) mtmsrd r10,1 /* Update machine state */ #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ - li r0,_TIF_NEED_RESCHED /* bits to check */ ld r3,_MSR(r1) ld r4,TI_FLAGS(r9) - /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */ - rlwimi r0,r3,32+TIF_SIGPENDING-MSR_PR_LG,_TIF_SIGPENDING - and. r0,r4,r0 /* check NEED_RESCHED and maybe SIGPENDING */ - bne do_work - -#else /* !CONFIG_PREEMPT */ - ld r3,_MSR(r1) /* Returning to user mode? */ andi. r3,r3,MSR_PR - beq restore /* if not, just restore regs and return */ + beq resume_kernel /* Check current_thread_info()->flags */ + andi. r0,r4,_TIF_USER_WORK_MASK + beq restore + + andi. r0,r4,_TIF_NEED_RESCHED + beq 1f + bl .restore_interrupts + bl .schedule + b .ret_from_except_lite + +1: bl .save_nvgprs + bl .restore_interrupts + addi r3,r1,STACK_FRAME_OVERHEAD + bl .do_notify_resume + b .ret_from_except + +resume_kernel: +#ifdef CONFIG_PREEMPT + /* Check if we need to preempt */ + andi. r0,r4,_TIF_NEED_RESCHED + beq+ restore + /* Check that preempt_count() == 0 and interrupts are enabled */ + lwz r8,TI_PREEMPT(r9) + cmpwi cr1,r8,0 + ld r0,SOFTE(r1) + cmpdi r0,0 + crandc eq,cr1*4+eq,eq + bne restore + + /* + * Here we are preempting the current task. We want to make + * sure we are soft-disabled first + */ + SOFT_DISABLE_INTS(r3,r4) +1: bl .preempt_schedule_irq + + /* Re-test flags and eventually loop */ clrrdi r9,r1,THREAD_SHIFT ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_USER_WORK_MASK - bne do_work -#endif /* !CONFIG_PREEMPT */ + andi. 
r0,r4,_TIF_NEED_RESCHED + bne 1b +#endif /* CONFIG_PREEMPT */ .globl fast_exc_return_irq fast_exc_return_irq: @@ -759,50 +786,6 @@ restore_check_irq_replay: #endif /* CONFIG_PPC_BOOK3E */ 1: b .ret_from_except /* What else to do here ? */ - - -3: -do_work: -#ifdef CONFIG_PREEMPT - andi. r0,r3,MSR_PR /* Returning to user mode? */ - bne user_work - /* Check that preempt_count() == 0 and interrupts are enabled */ - lwz r8,TI_PREEMPT(r9) - cmpwi cr1,r8,0 - ld r0,SOFTE(r1) - cmpdi r0,0 - crandc eq,cr1*4+eq,eq - bne restore - - /* - * Here we are preempting the current task. We want to make - * sure we are soft-disabled first - */ - SOFT_DISABLE_INTS(r3,r4) -1: bl .preempt_schedule_irq - - /* Re-test flags and eventually loop */ - clrrdi r9,r1,THREAD_SHIFT - ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED - bne 1b - b restore - -user_work: -#endif /* CONFIG_PREEMPT */ - - andi. r0,r4,_TIF_NEED_RESCHED - beq 1f - bl .restore_interrupts - bl .schedule - b .ret_from_except_lite - -1: bl .save_nvgprs - bl .restore_interrupts - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_notify_resume - b .ret_from_except - unrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD bl .unrecoverable_exception -- cgit v0.10.2 From 2cb387ae758d97ee7396a82528c824b8dc510b8a Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Thu, 7 Jun 2012 17:44:23 +0000 Subject: powerpc: Fix Section mismatch warnings in prom_init.c This patch tries to fix a couple of Section mismatch warnings like the following one: WARNING: arch/powerpc/kernel/built-in.o(.text+0x2923c): Section mismatch in reference from the function .prom_query_opal() to the function .init.text:.call_prom() The function .prom_query_opal() references the function __init .call_prom(). This is often because .prom_query_opal lacks a __init annotation or the annotation of .call_prom is wrong.
Signed-off-by: Li Zhong Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 1b488e5..0794a30 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1312,7 +1312,7 @@ static struct opal_secondary_data { extern char opal_secondary_entry; -static void prom_query_opal(void) +static void __init prom_query_opal(void) { long rc; @@ -1436,7 +1436,7 @@ static void __init prom_opal_hold_cpus(void) prom_debug("prom_opal_hold_cpus: end...\n"); } -static void prom_opal_takeover(void) +static void __init prom_opal_takeover(void) { struct opal_secondary_data *data = &RELOC(opal_secondary_data); struct opal_takeover_args *args = &data->args; -- cgit v0.10.2 From 2d773aa4810d4a612d1c879faacc38594cc3f841 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 4 Jun 2012 16:27:54 +0000 Subject: powerpc/ftrace: Do not trace restore_interrupts() As I was adding code that affects all archs, I started testing function tracer against PPC64 and found that it currently locks up with 3.4 kernel. I figured it was due to tracing a function that shouldn't be, so I went through the following process to bisect to find the culprit: cat /debug/tracing/available_filter_functions > t num=`wc -l t` sed -ne "1,${num}p" t > t1 let num=num+1 sed -ne "${num},$p" t > t2 cat t1 > /debug/tracing/set_ftrace_filter echo function /debug/tracing/current_tracer It finally came down to this function: restore_interrupts() I'm not sure why this locks up the system. It just seems to prevent scheduling from occurring. Interrupts seem to still work, as I can ping the box. But all user processes freeze. When restore_interrupts() is not traced, function tracing works fine. 
Cc: stable@kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 7835a5e..1b41502 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -277,7 +277,7 @@ EXPORT_SYMBOL(arch_local_irq_restore); * NOTE: This is called with interrupts hard disabled but not marked * as such in paca->irq_happened, so we need to resync this. */ -void restore_interrupts(void) +void notrace restore_interrupts(void) { if (irqs_disabled()) { local_paca->irq_happened |= PACA_IRQ_HARD_DIS; -- cgit v0.10.2 From 0b17ba7258db83cd02da560884e053b85de371f2 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 27 Jun 2012 13:13:52 +0000 Subject: powerpc: check_and_cede_processor() never cedes Commit f948501b36c6 ("Make hard_irq_disable() actually hard-disable interrupts") caused check_and_cede_processor to stop working. ->irq_happened will never be zero right after a hard_irq_disable so the compiler removes the call to cede_processor completely. The bug was introduced back in the lazy interrupt handling rework of 3.4 but was hidden until recently because hard_irq_disable did nothing. This issue will eventually appear in 3.4 stable since the hard_irq_disable fix is marked stable, so mark this one for stable too. 
Signed-off-by: Anton Blanchard Cc: stable@vger.kernel.org Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 32b394f..6eb75b8 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -103,6 +103,11 @@ static inline void hard_irq_disable(void) /* include/linux/interrupt.h needs hard_irq_disable to be a macro */ #define hard_irq_disable hard_irq_disable +static inline bool lazy_irq_pending(void) +{ + return !!(get_paca()->irq_happened & ~PACA_IRQ_HARD_DIS); +} + /* * This is called by asynchronous interrupts to conditionally * re-enable hard interrupts when soft-disabled after having diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c index 41a34bc..e61483e 100644 --- a/arch/powerpc/platforms/pseries/processor_idle.c +++ b/arch/powerpc/platforms/pseries/processor_idle.c @@ -106,7 +106,7 @@ static void check_and_cede_processor(void) * we first hard disable then check. */ hard_irq_disable(); - if (get_paca()->irq_happened == 0) + if (!lazy_irq_pending()) cede_processor(); } -- cgit v0.10.2 From bc6dc752f35488160ffac07ae91bed1bddaea32a Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 26 Jun 2012 21:26:37 +0000 Subject: powerpc/pseries: Fix software invalidate TCE The following added support for powernv but broke pseries/BML: 1f1616e powerpc/powernv: Add TCE SW invalidation support TCE_PCI_SW_INVAL was split into FREE and CREATE flags but the tests in the pseries code were not updated to reflect this. 
Signed-off-by: Michael Neuling cc: stable@kernel.org [v3.3+] Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 0915b1a..2d311c0 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -106,7 +106,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, tcep++; } - if (tbl->it_type == TCE_PCI_SWINV_CREATE) + if (tbl->it_type & TCE_PCI_SWINV_CREATE) tce_invalidate_pSeries_sw(tbl, tces, tcep - 1); return 0; } @@ -121,7 +121,7 @@ static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) while (npages--) *(tcep++) = 0; - if (tbl->it_type == TCE_PCI_SWINV_FREE) + if (tbl->it_type & TCE_PCI_SWINV_FREE) tce_invalidate_pSeries_sw(tbl, tces, tcep - 1); } -- cgit v0.10.2 From b0dfa4541e48ac4cc5f017285432c89923ad0f58 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Jun 2012 14:16:57 +0100 Subject: ASoC: wm2200: Add missing BCLK rate Without this very high BCLKs will be configured incorrectly. Reported-by: Axel Lin Signed-off-by: Mark Brown Cc: stable@vger.kernel.org diff --git a/sound/soc/codecs/wm2200.c b/sound/soc/codecs/wm2200.c index acbdc5f..32682c1 100644 --- a/sound/soc/codecs/wm2200.c +++ b/sound/soc/codecs/wm2200.c @@ -1491,6 +1491,7 @@ static int wm2200_bclk_rates_dat[WM2200_NUM_BCLK_RATES] = { static int wm2200_bclk_rates_cd[WM2200_NUM_BCLK_RATES] = { 5644800, + 3763200, 2882400, 1881600, 1411200, -- cgit v0.10.2 From c9fe573a6584034670c1a55ee8162d623519cbbf Mon Sep 17 00:00:00 2001 From: "Hebbar, Gururaja" Date: Tue, 26 Jun 2012 19:25:11 +0530 Subject: ASoC: tlv320aic3x: Fix codec pll configure bug In sound/soc/codecs/tlv320aic3x.c data = snd_soc_read(codec, AIC3X_PLL_PROGA_REG); snd_soc_write(codec, AIC3X_PLL_PROGA_REG, data | (pll_p << PLLP_SHIFT)); In the above code, pll-p value is OR'ed with previous value without clearing it. 
Bug is not seen if pll-p value doesn't change across Sampling frequency. However on some platforms (like AM335x EVM-SK), pll-p may have different values across different sampling frequencies. In such case, above code configures the pll with a wrong value. Because of this bug, when an audio stream is played with pll value different from previous stream, audio is heard differently (like it's stretched). Signed-off-by: Hebbar, Gururaja Signed-off-by: Mark Brown Cc: stable@vger.kernel.org diff --git a/sound/soc/codecs/tlv320aic3x.c b/sound/soc/codecs/tlv320aic3x.c index 64d2a4f..e9b62b5 100644 --- a/sound/soc/codecs/tlv320aic3x.c +++ b/sound/soc/codecs/tlv320aic3x.c @@ -935,9 +935,7 @@ static int aic3x_hw_params(struct snd_pcm_substream *substream, } found: - data = snd_soc_read(codec, AIC3X_PLL_PROGA_REG); - snd_soc_write(codec, AIC3X_PLL_PROGA_REG, - data | (pll_p << PLLP_SHIFT)); + snd_soc_update_bits(codec, AIC3X_PLL_PROGA_REG, PLLP_MASK, pll_p); snd_soc_write(codec, AIC3X_OVRF_STATUS_AND_PLLR_REG, pll_r << PLLR_SHIFT); snd_soc_write(codec, AIC3X_PLL_PROGB_REG, pll_j << PLLJ_SHIFT); diff --git a/sound/soc/codecs/tlv320aic3x.h b/sound/soc/codecs/tlv320aic3x.h index 6f097fb..08c7f66 100644 --- a/sound/soc/codecs/tlv320aic3x.h +++ b/sound/soc/codecs/tlv320aic3x.h @@ -166,6 +166,7 @@ /* PLL registers bitfields */ #define PLLP_SHIFT 0 +#define PLLP_MASK 7 #define PLLQ_SHIFT 3 #define PLLR_SHIFT 0 #define PLLJ_SHIFT 2 -- cgit v0.10.2 From 8663ff75cdca0a66f808e124c5592735793926af Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 29 Jun 2012 09:35:52 +0200 Subject: ALSA: hda - Fix no sound from ALC662 after Windows reboot Windows uses a hidden register to control EAPD. Linux uses a verb to control EAPD. If Windows reboots to Linux, the EAPD control must be changed to verb control.
Signed-off-by: Kailang Yang Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 5ccf10a..aa4c25e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6688,6 +6688,31 @@ static const struct alc_model_fixup alc662_fixup_models[] = { {} }; +static void alc662_fill_coef(struct hda_codec *codec) +{ + int val, coef; + + coef = alc_get_coef0(codec); + + switch (codec->vendor_id) { + case 0x10ec0662: + if ((coef & 0x00f0) == 0x0030) { + val = alc_read_coef_idx(codec, 0x4); /* EAPD Ctrl */ + alc_write_coef_idx(codec, 0x4, val & ~(1<<10)); + } + break; + case 0x10ec0272: + case 0x10ec0273: + case 0x10ec0663: + case 0x10ec0665: + case 0x10ec0670: + case 0x10ec0671: + case 0x10ec0672: + val = alc_read_coef_idx(codec, 0xd); /* EAPD Ctrl */ + alc_write_coef_idx(codec, 0xd, val | (1<<14)); + break; + } +} /* */ @@ -6707,6 +6732,9 @@ static int patch_alc662(struct hda_codec *codec) alc_fix_pll_init(codec, 0x20, 0x04, 15); + spec->init_hook = alc662_fill_coef; + alc662_fill_coef(codec); + alc_pick_fixup(codec, alc662_fixup_models, alc662_fixup_tbl, alc662_fixups); alc_apply_fixup(codec, ALC_FIXUP_ACT_PRE_PROBE); -- cgit v0.10.2 From d31f4d448f7671dc3e6a7a1c92a4c085a36058bb Mon Sep 17 00:00:00 2001 From: Tomasz Bursztyka Date: Thu, 28 Jun 2012 02:57:48 +0000 Subject: netfilter: ipset: fix crash if IPSET_CMD_NONE command is sent This patch fixes a crash if that ipset command is sent over nfnetlink. 
Signed-off-by: Tomasz Bursztyka Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 819c342..9730882 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -640,6 +640,14 @@ find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set) } static int +ip_set_none(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + return -EOPNOTSUPP; +} + +static int ip_set_create(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) @@ -1539,6 +1547,10 @@ nlmsg_failure: } static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { + [IPSET_CMD_NONE] = { + .call = ip_set_none, + .attr_count = IPSET_ATTR_CMD_MAX, + }, [IPSET_CMD_CREATE] = { .call = ip_set_create, .attr_count = IPSET_ATTR_CMD_MAX, -- cgit v0.10.2 From 4009e18851ea555959c6017d848983b3d60bf667 Mon Sep 17 00:00:00 2001 From: Tomasz Bursztyka Date: Thu, 28 Jun 2012 02:57:49 +0000 Subject: netfilter: nfnetlink: fix missing rcu_read_unlock in nfnetlink_rcv_msg Bug added in commit 6b75e3e8d664a9a (netfilter: nfnetlink: add RCU in nfnetlink_rcv_msg()) Signed-off-by: Tomasz Bursztyka Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 3e797d1..791d56b 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -169,8 +169,10 @@ replay: err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy); - if (err < 0) + if (err < 0) { + rcu_read_unlock(); return err; + } if (nc->call_rcu) { err = nc->call_rcu(net->nfnl, skb, nlh, -- cgit v0.10.2 From c21b328ea8c7c71cd2daf50557db440bbaa7ef55 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 28 Jun 2012 17:53:07 -0400 Subject: drm/radeon: fix VM page table setup on SI Cayman and trinity allow for 
variable sized VM page tables, but SI requires that all page tables be the same size. The current code assumes variably sized VM page tables so SI may end up with part of each page table overlapping with other memory which could end up being interpreted by the VM hw as garbage. Change the code to better accommodate SI. Allocate enough space for at least 2 full page tables and always set last_pfn to max_pfn on SI so each VM is backed by a full page table. This limits us to only 2 VMs active at any given time on SI. This will be rectified and the code can be reunified once we move to two level page tables. Signed-off-by: Alex Deucher Reviewed-by: Jerome Glisse Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c index 59d4493..84b648a 100644 --- a/drivers/gpu/drm/radeon/radeon_gart.c +++ b/drivers/gpu/drm/radeon/radeon_gart.c @@ -289,8 +289,9 @@ int radeon_vm_manager_init(struct radeon_device *rdev) rdev->vm_manager.enabled = false; /* mark first vm as always in use, it's the system one */ + /* allocate enough for 2 full VM pts */ r = radeon_sa_bo_manager_init(rdev, &rdev->vm_manager.sa_manager, - rdev->vm_manager.max_pfn * 8, + rdev->vm_manager.max_pfn * 8 * 2, RADEON_GEM_DOMAIN_VRAM); if (r) { dev_err(rdev->dev, "failed to allocate vm bo (%dKB)\n", @@ -633,7 +634,15 @@ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) mutex_init(&vm->mutex); INIT_LIST_HEAD(&vm->list); INIT_LIST_HEAD(&vm->va); - vm->last_pfn = 0; + /* SI requires equal sized PTs for all VMs, so always set + * last_pfn to max_pfn. cayman allows variable sized + * pts so we can grow then as needed. Once we switch + * to two level pts we can unify this again. 
+ */ + if (rdev->family >= CHIP_TAHITI) + vm->last_pfn = rdev->vm_manager.max_pfn; + else + vm->last_pfn = 0; /* map the ib pool buffer at 0 in virtual address space, set * read only */ diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index c7b61f1..0b02792 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -2365,12 +2365,12 @@ int si_pcie_gart_enable(struct radeon_device *rdev) WREG32(0x15DC, 0); /* empty context1-15 */ - /* FIXME start with 1G, once using 2 level pt switch to full + /* FIXME start with 4G, once using 2 level pt switch to full * vm size space */ /* set vm size, must be a multiple of 4 */ WREG32(VM_CONTEXT1_PAGE_TABLE_START_ADDR, 0); - WREG32(VM_CONTEXT1_PAGE_TABLE_END_ADDR, (1 << 30) / RADEON_GPU_PAGE_SIZE); + WREG32(VM_CONTEXT1_PAGE_TABLE_END_ADDR, rdev->vm_manager.max_pfn); for (i = 1; i < 16; i++) { if (i < 8) WREG32(VM_CONTEXT0_PAGE_TABLE_BASE_ADDR + (i << 2), -- cgit v0.10.2 From 084681d14e429cb6192262ac7437f00e2c02f26a Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 28 Jun 2012 09:38:53 +0200 Subject: printk: flush continuation lines immediately to console Continuation lines are buffered internally, intended to merge the chunked printk()s into a single record, and to isolate potentially racy continuation users from usual terminated line users. This though, has the effect that partial lines are not printed to the console in the moment they are emitted. In case the kernel crashes in the meantime, the potentially interesting printed information would never reach the consoles. Here we share the continuation buffer with the console copy logic, and partial lines are always immediately flushed to the available consoles. They are still buffered internally to improve the readability and integrity of the messages and minimize the amount of needed record headers to store. 
Signed-off-by: Kay Sievers Tested-by: Steven Rostedt Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/printk.c b/kernel/printk.c index cdfba44..fbf4d0b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -193,12 +193,19 @@ static int console_may_schedule; * separated by ',', and find the message after the ';' character. */ +enum log_flags { + LOG_DEFAULT = 0, + LOG_NOCONS = 1, /* already flushed, do not print to console */ +}; + struct log { u64 ts_nsec; /* timestamp in nanoseconds */ u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ u16 dict_len; /* length of dictionary buffer */ - u16 level; /* syslog level + facility */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ }; /* @@ -286,6 +293,7 @@ static u32 log_next(u32 idx) /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) { @@ -329,8 +337,13 @@ static void log_store(int facility, int level, msg->text_len = text_len; memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; - msg->level = (facility << 3) | (level & 7); - msg->ts_nsec = local_clock(); + msg->facility = facility; + msg->level = level & 7; + msg->flags = flags & 0x1f; + if (ts_nsec > 0) + msg->ts_nsec = ts_nsec; + else + msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = sizeof(struct log) + text_len + dict_len + pad_len; @@ -446,7 +459,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); len = sprintf(user->buf, "%u,%llu,%llu;", - msg->level, user->seq, ts_usec); + (msg->facility << 3) | msg->level, user->seq, ts_usec); /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { @@ -787,6 +800,21 @@ static bool printk_time; #endif 
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static size_t print_time(u64 ts, char *buf) +{ + unsigned long rem_nsec; + + if (!printk_time) + return 0; + + if (!buf) + return 15; + + rem_nsec = do_div(ts, 1000000000); + return sprintf(buf, "[%5lu.%06lu] ", + (unsigned long)ts, rem_nsec / 1000); +} + static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; @@ -803,18 +831,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) } } - if (printk_time) { - if (buf) { - unsigned long long ts = msg->ts_nsec; - unsigned long rem_nsec = do_div(ts, 1000000000); - - len += sprintf(buf + len, "[%5lu.%06lu] ", - (unsigned long) ts, rem_nsec / 1000); - } else { - len += 15; - } - } - + len += print_time(msg->ts_nsec, buf ? buf + len : NULL); return len; } @@ -1294,15 +1311,92 @@ static inline void printk_delay(void) } } +/* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments + * though, are printed immediately to the consoles to ensure everything has + * reached the console in case of a kernel crash. 
+ */ +static struct cont { + char buf[LOG_LINE_MAX]; + size_t len; /* length == 0 means unused buffer */ + size_t cons; /* bytes written to console */ + struct task_struct *owner; /* task of first print*/ + u64 ts_nsec; /* time of first print */ + u8 level; /* log level of first message */ + u8 facility; /* log level of first message */ + bool flushed:1; /* buffer sealed and committed */ +} cont; + +static void cont_flush(void) +{ + if (cont.flushed) + return; + if (cont.len == 0) + return; + + log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + + cont.flushed = true; +} + +static bool cont_add(int facility, int level, const char *text, size_t len) +{ + if (cont.len && cont.flushed) + return false; + + if (cont.len + len > sizeof(cont.buf)) { + cont_flush(); + return false; + } + + if (!cont.len) { + cont.facility = facility; + cont.level = level; + cont.owner = current; + cont.ts_nsec = local_clock(); + cont.cons = 0; + cont.flushed = false; + } + + memcpy(cont.buf + cont.len, text, len); + cont.len += len; + return true; +} + +static size_t cont_print_text(char *text, size_t size) +{ + size_t textlen = 0; + size_t len; + + if (cont.cons == 0) { + textlen += print_time(cont.ts_nsec, text); + size -= textlen; + } + + len = cont.len - cont.cons; + if (len > 0) { + if (len+1 > size) + len = size-1; + memcpy(text + textlen, cont.buf + cont.cons, len); + textlen += len; + cont.cons = cont.len; + } + + if (cont.flushed) { + text[textlen++] = '\n'; + /* got everything, release buffer */ + cont.len = 0; + } + return textlen; +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { static int recursion_bug; - static char cont_buf[LOG_LINE_MAX]; - static size_t cont_len; - static int cont_level; - static struct task_struct *cont_task; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; @@ -1348,7 +1442,8 @@ asmlinkage int vprintk_emit(int 
facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, NULL, 0, recursion_msg, printed_len); + log_store(0, 2, LOG_DEFAULT, 0, + NULL, 0, recursion_msg, printed_len); } /* @@ -1386,55 +1481,38 @@ asmlinkage int vprintk_emit(int facility, int level, } if (!newline) { - if (cont_len && (prefix || cont_task != current)) { - /* - * Flush earlier buffer, which is either from a - * different thread, or when we got a new prefix. - */ - log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); - cont_len = 0; - } - - if (!cont_len) { - cont_level = level; - cont_task = current; - } + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or another task also prints continuation lines. + */ + if (cont.len && (prefix || cont.owner != current)) + cont_flush(); - /* buffer or append to earlier buffer from the same thread */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; + /* buffer line if possible, otherwise store it right away */ + if (!cont_add(facility, level, text, text_len)) + log_store(facility, level, LOG_DEFAULT, 0, + dict, dictlen, text, text_len); } else { - if (cont_len && cont_task == current) { - if (prefix) { - /* - * New prefix from the same thread; flush. We - * either got no earlier newline, or we race - * with an interrupt. 
- */ - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - } + bool stored = false; - /* append to the earlier buffer and flush */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - cont_task = NULL; - printed_len = cont_len; - } else { - /* ordinary single and terminated line */ - log_store(facility, level, - dict, dictlen, text, text_len); - printed_len = text_len; + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or we race with a continuation line from an interrupt. + */ + if (cont.len && prefix && cont.owner == current) + cont_flush(); + + /* Merge with our buffer if possible; flush it in any case */ + if (cont.len && cont.owner == current) { + stored = cont_add(facility, level, text, text_len); + cont_flush(); } + + if (!stored) + log_store(facility, level, LOG_DEFAULT, 0, + dict, dictlen, text, text_len); } + printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. 
@@ -1521,11 +1599,18 @@ EXPORT_SYMBOL(printk); #else #define LOG_LINE_MAX 0 +static struct cont { + size_t len; + size_t cons; + u8 level; + bool flushed:1; +} cont; static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} static size_t msg_print_text(const struct log *msg, bool syslog, char *buf, size_t size) { return 0; } +static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1817,6 +1902,7 @@ static u32 console_idx; */ void console_unlock(void) { + static char text[LOG_LINE_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1829,10 +1915,23 @@ void console_unlock(void) console_may_schedule = 0; + /* flush buffered message fragment immediately to console */ + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (cont.len && (cont.cons < cont.len || cont.flushed)) { + size_t len; + + len = cont_print_text(text, sizeof(text)); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + } else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + again: for (;;) { struct log *msg; - static char text[LOG_LINE_MAX]; size_t len; int level; @@ -1847,13 +1946,22 @@ again: console_seq = log_first_seq; console_idx = log_first_idx; } - +skip: if (console_seq == log_next_seq) break; msg = log_from_idx(console_idx); - level = msg->level & 7; + if (msg->flags & LOG_NOCONS) { + /* + * Skip record we have buffered and already printed + * directly to the console when we received it. 
+ */ + console_idx = log_next(console_idx); + console_seq++; + goto skip; + } + level = msg->level; len = msg_print_text(msg, false, text, sizeof(text)); console_idx = log_next(console_idx); -- cgit v0.10.2 From d36208227d03c44c0a74cd702cc94528162e1703 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Jun 2012 11:40:11 -0400 Subject: printk: Optimize if statement logic where newline exists In reviewing Kay's fix up patch: "printk: Have printk() never buffer its data", I found two if statements that could be combined and optimized. Put together the two 'cont.len && cont.owner == current' if statements into a single one, and check if we need to call cont_add(). This also removes the unneeded double cont_flush() calls. Link: http://lkml.kernel.org/r/1340869133.876.10.camel@mop Signed-off-by: Steven Rostedt Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/printk.c b/kernel/printk.c index fbf4d0b..5ae6b09 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1496,15 +1496,14 @@ asmlinkage int vprintk_emit(int facility, int level, bool stored = false; /* - * Flush the conflicting buffer. An earlier newline was missing, - * or we race with a continuation line from an interrupt. + * If an earlier newline was missing and it was the same task, + * either merge it with the current buffer and flush, or if + * there was a race with interrupts (prefix == true) then just + * flush it out and store this line separately. 
*/ - if (cont.len && prefix && cont.owner == current) - cont_flush(); - - /* Merge with our buffer if possible; flush it in any case */ if (cont.len && cont.owner == current) { - stored = cont_add(facility, level, text, text_len); + if (!prefix) + stored = cont_add(facility, level, text, text_len); cont_flush(); } -- cgit v0.10.2 From b03738430c7537d5f87948e0b35d8aaf2688c6b4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 20 Jun 2012 09:48:43 +0800 Subject: ACPI video: Still use ACPI backlight control if _DOS doesn't exist This fixes a regression in 3.4-rc1 caused by commit ea9f8856bd6d4ed45885b06a338f7362cd6c60e5 (ACPI video: Harden video bus adding.) Some platforms don't have _DOS control method, but the ACPI backlight still works. We should not invoke _DOS for these platforms. https://bugzilla.kernel.org/show_bug.cgi?id=43168 Cc: Igor Murzov Cc: stable@vger.kernel.org Signed-off-by: Zhang Rui Signed-off-by: Len Brown diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 9577b6f..4134b30 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -558,6 +558,8 @@ acpi_video_bus_DOS(struct acpi_video_bus *video, int bios_flag, int lcd_flag) union acpi_object arg0 = { ACPI_TYPE_INTEGER }; struct acpi_object_list args = { 1, &arg0 }; + if (!video->cap._DOS) + return 0; if (bios_flag < 0 || bios_flag > 3 || lcd_flag < 0 || lcd_flag > 1) return -EINVAL; -- cgit v0.10.2 From 5f1601261050251a5ca293378b492a69d590dacb Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Wed, 13 Jun 2012 16:10:45 -0500 Subject: acpi_pad: fix power_saving thread deadlock The acpi_pad driver can get stuck in destroy_power_saving_task() waiting for kthread_stop() to stop a power_saving thread. The problem is that the isolated_cpus_lock mutex is owned when destroy_power_saving_task() calls kthread_stop(), which waits for a power_saving thread to end, and the power_saving thread tries to acquire the isolated_cpus_lock when it calls round_robin_cpu(). 
This patch fixes the issue by making round_robin_cpu() use its own mutex. https://bugzilla.kernel.org/show_bug.cgi?id=42981 Cc: stable@vger.kernel.org Signed-off-by: Stuart Hayes Signed-off-by: Len Brown diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index a43fa1a..1502c502 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -36,6 +36,7 @@ #define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" #define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 static DEFINE_MUTEX(isolated_cpus_lock); +static DEFINE_MUTEX(round_robin_lock); static unsigned long power_saving_mwait_eax; @@ -107,7 +108,7 @@ static void round_robin_cpu(unsigned int tsk_index) if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) return; - mutex_lock(&isolated_cpus_lock); + mutex_lock(&round_robin_lock); cpumask_clear(tmp); for_each_cpu(cpu, pad_busy_cpus) cpumask_or(tmp, tmp, topology_thread_cpumask(cpu)); @@ -116,7 +117,7 @@ static void round_robin_cpu(unsigned int tsk_index) if (cpumask_empty(tmp)) cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus); if (cpumask_empty(tmp)) { - mutex_unlock(&isolated_cpus_lock); + mutex_unlock(&round_robin_lock); return; } for_each_cpu(cpu, tmp) { @@ -131,7 +132,7 @@ static void round_robin_cpu(unsigned int tsk_index) tsk_in_cpu[tsk_index] = preferred_cpu; cpumask_set_cpu(preferred_cpu, pad_busy_cpus); cpu_weight[preferred_cpu]++; - mutex_unlock(&isolated_cpus_lock); + mutex_unlock(&round_robin_lock); set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu)); } -- cgit v0.10.2 From 5ae95aefb73b32ec4e7c46554304042ba82230ca Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Tue, 5 Jun 2012 15:16:43 +0800 Subject: ARM: imx6q: fix suspend regression caused by common clk migration When moving to common clk framework, the imx6q clks rom and mmdc_ch1_axi get different on/off states than old clk driver, which breaks suspend function. 
There might be a better way to manage these clocks, but let's takes the old clk driver approach to fix the regression first. Signed-off-by: Shawn Guo Signed-off-by: Olof Johansson diff --git a/arch/arm/mach-imx/clk-imx6q.c b/arch/arm/mach-imx/clk-imx6q.c index 17dc66a..e1a17ac 100644 --- a/arch/arm/mach-imx/clk-imx6q.c +++ b/arch/arm/mach-imx/clk-imx6q.c @@ -152,13 +152,14 @@ enum mx6q_clks { ssi2, ssi3, uart_ipg, uart_serial, usboh3, usdhc1, usdhc2, usdhc3, usdhc4, vdo_axi, vpu_axi, cko1, pll1_sys, pll2_bus, pll3_usb_otg, pll4_audio, pll5_video, pll6_mlb, pll7_usb_host, pll8_enet, ssi1_ipg, - ssi2_ipg, ssi3_ipg, clk_max + ssi2_ipg, ssi3_ipg, rom, + clk_max }; static struct clk *clk[clk_max]; static enum mx6q_clks const clks_init_on[] __initconst = { - mmdc_ch0_axi, mmdc_ch1_axi, + mmdc_ch0_axi, rom, }; int __init mx6q_clocks_init(void) @@ -364,6 +365,7 @@ int __init mx6q_clocks_init(void) clk[gpmi_bch] = imx_clk_gate2("gpmi_bch", "usdhc4", base + 0x78, 26); clk[gpmi_io] = imx_clk_gate2("gpmi_io", "enfc", base + 0x78, 28); clk[gpmi_apb] = imx_clk_gate2("gpmi_apb", "usdhc3", base + 0x78, 30); + clk[rom] = imx_clk_gate2("rom", "ahb", base + 0x7c, 0); clk[sata] = imx_clk_gate2("sata", "ipg", base + 0x7c, 4); clk[sdma] = imx_clk_gate2("sdma", "ahb", base + 0x7c, 6); clk[spba] = imx_clk_gate2("spba", "ipg", base + 0x7c, 12); -- cgit v0.10.2 From 87fac288083db40b5d5ab845393be268357c8827 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Jun 2012 15:30:46 -0700 Subject: linux/irq.h: fix kernel-doc warning Fix kernel-doc warning. This struct member was removed in commit 875682648b89 ("irq: Remove irq_chip->release()") so remove its associated kernel-doc entry also. 
Warning(include/linux/irq.h:338): Excess struct/union/enum/typedef member 'release' description in 'irq_chip' Signed-off-by: Randy Dunlap Cc: Richard Weinberger Cc: Thomas Gleixner Signed-off-by: Linus Torvalds diff --git a/include/linux/irq.h b/include/linux/irq.h index 61f5cec..a5261e3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -301,8 +301,6 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_print_chip: optional to print special chip info in show_interrupts * @flags: chip specific flags - * - * @release: release function solely used by UML */ struct irq_chip { const char *name; -- cgit v0.10.2 From 4f0f4af59cb07bcf44d3c07a9e8c26df54d9fff8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Jun 2012 15:37:24 -0700 Subject: printk.c: fix kernel-doc warnings Fix kernel-doc warnings in printk.c: use correct parameter name. Warning(kernel/printk.c:2429): No description found for parameter 'buf' Warning(kernel/printk.c:2429): Excess function parameter 'line' description in 'kmsg_dump_get_buffer' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds diff --git a/kernel/printk.c b/kernel/printk.c index 5ae6b09..dba1821 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -2538,7 +2538,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * kmsg_dump_get_buffer - copy kmsg log lines * @dumper: registered kmsg dumper * @syslog: include the "<4>" prefixes - * @line: buffer to copy the line to + * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * -- cgit v0.10.2 From 6887a4131da3adaab011613776d865f4bcfb5678 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 30 Jun 2012 16:08:57 -0700 Subject: Linux 3.5-rc5 diff --git a/Makefile b/Makefile index 3fdfde2..81ea154 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = 
-rc5 NAME = Saber-toothed Squirrel # *DOCUMENTATION* -- cgit v0.10.2 From 0e90b49ca4b891f085b57559a3071a4feefb496c Mon Sep 17 00:00:00 2001 From: Mitch A Williams Date: Sat, 30 Jun 2012 00:23:19 +0000 Subject: igbvf: fix divide by zero Using ethtool -C ethX rx-usecs 0 crashes with a divide by zero. Refactor this function to fix this issue and make it more clear what the intent of each conditional is. Add comment regarding using a setting of zero. CC: stable [3.3+] CC: David Ahern Signed-off-by: Mitch Williams Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/intel/igbvf/ethtool.c b/drivers/net/ethernet/intel/igbvf/ethtool.c index 8ce6706..90eef07 100644 --- a/drivers/net/ethernet/intel/igbvf/ethtool.c +++ b/drivers/net/ethernet/intel/igbvf/ethtool.c @@ -357,21 +357,28 @@ static int igbvf_set_coalesce(struct net_device *netdev, struct igbvf_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; - if ((ec->rx_coalesce_usecs > IGBVF_MAX_ITR_USECS) || - ((ec->rx_coalesce_usecs > 3) && - (ec->rx_coalesce_usecs < IGBVF_MIN_ITR_USECS)) || - (ec->rx_coalesce_usecs == 2)) - return -EINVAL; - - /* convert to rate of irq's per second */ - if (ec->rx_coalesce_usecs && ec->rx_coalesce_usecs <= 3) { + if ((ec->rx_coalesce_usecs >= IGBVF_MIN_ITR_USECS) && + (ec->rx_coalesce_usecs <= IGBVF_MAX_ITR_USECS)) { + adapter->current_itr = ec->rx_coalesce_usecs << 2; + adapter->requested_itr = 1000000000 / + (adapter->current_itr * 256); + } else if ((ec->rx_coalesce_usecs == 3) || + (ec->rx_coalesce_usecs == 2)) { adapter->current_itr = IGBVF_START_ITR; adapter->requested_itr = ec->rx_coalesce_usecs; - } else { - adapter->current_itr = ec->rx_coalesce_usecs << 2; + } else if (ec->rx_coalesce_usecs == 0) { + /* + * The user's desire is to turn off interrupt throttling + * altogether, but due to HW limitations, we can't do that. 
+ * Instead we set a very small value in EITR, which would + * allow ~967k interrupts per second, but allow the adapter's + * internal clocking to still function properly. + */ + adapter->current_itr = 4; adapter->requested_itr = 1000000000 / (adapter->current_itr * 256); - } + } else + return -EINVAL; writel(adapter->current_itr, hw->hw_addr + adapter->rx_ring->itr_register); -- cgit v0.10.2 From 4244854d22bf8f782698c5224b9191c8d2d42610 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Sat, 30 Jun 2012 03:04:26 +0000 Subject: sctp: be more restrictive in transport selection on bundled sacks It was noticed recently that when we send data on a transport, it's possible that we might bundle a sack that arrived on a different transport. While this isn't a major problem, it does go against the SHOULD requirement in section 6.4 of RFC 2960: An endpoint SHOULD transmit reply chunks (e.g., SACK, HEARTBEAT ACK, etc.) to the same destination transport address from which it received the DATA or control chunk to which it is replying. This rule should also be followed if the endpoint is bundling DATA chunks together with the reply chunk. This patch seeks to correct that. It restricts the bundling of sack operations to only those transports which have moved the ctsn of the association forward since the last sack. By doing this we guarantee that we only bundle outbound sacks on a transport that has received a chunk since the last sack. This brings us into stricter compliance with the RFC. Vlad had initially suggested that we strictly allow only sack bundling on the transport that last moved the ctsn forward. While this makes sense, I was concerned that doing so prevented us from bundling in the case where we had received chunks that moved the ctsn on multiple transports. In those cases, the RFC allows us to select any of the transports having received chunks to bundle the sack on. 
So I've modified the approach to allow for that, by adding a state variable to each transport that tracks whether it has moved the ctsn since the last sack. This I think keeps our behavior (and performance), close enough to our current profile that I think we can do this without a sysctl knob to enable/disable it. Signed-off-by: Neil Horman CC: Vlad Yasevich CC: David S. Miller CC: linux-sctp@vger.kernel.org Reported-by: Michele Baldessari Reported-by: sorin serban Acked-by: Vlad Yasevich Signed-off-by: David S. Miller diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index e4652fe..fecdf31 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -912,6 +912,9 @@ struct sctp_transport { /* Is this structure kfree()able? */ malloced:1; + /* Has this transport moved the ctsn since we last sacked */ + __u32 sack_generation; + struct flowi fl; /* This is the peer's IP address and port. */ @@ -1584,6 +1587,7 @@ struct sctp_association { */ __u8 sack_needed; /* Do we need to sack the peer? */ __u32 sack_cnt; + __u32 sack_generation; /* These are capabilities which our peer advertised. */ __u8 ecn_capable:1, /* Can peer do ECN? */ diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h index e7728bc..2c5d2b4 100644 --- a/include/net/sctp/tsnmap.h +++ b/include/net/sctp/tsnmap.h @@ -117,7 +117,8 @@ void sctp_tsnmap_free(struct sctp_tsnmap *map); int sctp_tsnmap_check(const struct sctp_tsnmap *, __u32 tsn); /* Mark this TSN as seen. */ -int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn); +int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn, + struct sctp_transport *trans); /* Mark this TSN and all lower as seen. 
*/ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5bc9ab1..b16517e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -271,6 +271,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a */ asoc->peer.sack_needed = 1; asoc->peer.sack_cnt = 0; + asoc->peer.sack_generation = 1; /* Assume that the peer will tell us if he recognizes ASCONF * as part of INIT exchange. diff --git a/net/sctp/output.c b/net/sctp/output.c index f1b7d4b..6ae47ac 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -248,6 +248,11 @@ static sctp_xmit_t sctp_packet_bundle_sack(struct sctp_packet *pkt, /* If the SACK timer is running, we have a pending SACK */ if (timer_pending(timer)) { struct sctp_chunk *sack; + + if (pkt->transport->sack_generation != + pkt->transport->asoc->peer.sack_generation) + return retval; + asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (sack) { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index a85eeeb..b6de71e 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -734,8 +734,10 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) int len; __u32 ctsn; __u16 num_gabs, num_dup_tsns; + struct sctp_association *aptr = (struct sctp_association *)asoc; struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; + struct sctp_transport *trans; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); @@ -805,6 +807,20 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); + /* Once we have a sack generated, check to see what our sack + * generation is, if its 0, reset the transports to 0, and reset + * the association generation to 1 + * + * The idea is that zero is never used as a valid generation for the + * association 
so no transport will match after a wrap event like this, + * Until the next sack + */ + if (++aptr->peer.sack_generation == 0) { + list_for_each_entry(trans, &asoc->peer.transport_addr_list, + transports) + trans->sack_generation = 0; + aptr->peer.sack_generation = 1; + } nodata: return retval; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index c96d1a8..8716da1 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1268,7 +1268,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_REPORT_TSN: /* Record the arrival of a TSN. */ error = sctp_tsnmap_mark(&asoc->peer.tsn_map, - cmd->obj.u32); + cmd->obj.u32, NULL); break; case SCTP_CMD_REPORT_FWDTSN: diff --git a/net/sctp/transport.c b/net/sctp/transport.c index b026ba0..1dcceb6 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -68,6 +68,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); memset(&peer->saddr, 0, sizeof(union sctp_addr)); + peer->sack_generation = 0; + /* From 6.3.1 RTO Calculation: * * C1) Until an RTT measurement has been made for a packet sent to the diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c index f1e40ceb..b5fb7c4 100644 --- a/net/sctp/tsnmap.c +++ b/net/sctp/tsnmap.c @@ -114,7 +114,8 @@ int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn) /* Mark this TSN as seen. 
*/ -int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn) +int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn, + struct sctp_transport *trans) { u16 gap; @@ -133,6 +134,9 @@ int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn) */ map->max_tsn_seen++; map->cumulative_tsn_ack_point++; + if (trans) + trans->sack_generation = + trans->asoc->peer.sack_generation; map->base_tsn++; } else { /* Either we already have a gap, or about to record a gap, so diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 8a84017..33d8947 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -715,7 +715,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * can mark it as received so the tsn_map is updated correctly. */ if (sctp_tsnmap_mark(&asoc->peer.tsn_map, - ntohl(chunk->subh.data_hdr->tsn))) + ntohl(chunk->subh.data_hdr->tsn), + chunk->transport)) goto fail_mark; /* First calculate the padding, so we don't inadvertently diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index f2d1de7..f5a6a4f 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -1051,7 +1051,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, if (chunk && (freed >= needed)) { __u32 tsn; tsn = ntohl(chunk->subh.data_hdr->tsn); - sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn); + sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn, chunk->transport); sctp_ulpq_tail_data(ulpq, chunk, gfp); sctp_ulpq_partial_delivery(ulpq, chunk, gfp); -- cgit v0.10.2 From 2e1706f234f86ff71056ef69683d734fbf7e9e40 Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Sat, 30 Jun 2012 20:02:42 +0000 Subject: e1000e: remove use of IP payload checksum Currently only used when packet split mode is enabled with jumbo frames, IP payload checksum (for fragmented UDP packets) is mutually exclusive with receive hashing offload since the hardware uses the same space in the receive descriptor for the hardware-provided packet checksum and the RSS hash, respectively. 
Users currently must disable jumbos when receive hashing offload is enabled, or vice versa, because of this incompatibility. Since testing has shown that IP payload checksum does not provide any real benefit, just remove it so that there is no longer a choice between jumbos or receive hashing offload but not both as done in other Intel GbE drivers (e.g. e1000, igb). Also, add a missing check for IP checksum error reported by the hardware; let the stack verify the checksum when this happens. CC: stable [3.4] Signed-off-by: Bruce Allan Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index 351a409..76edbc1 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -103,6 +103,7 @@ #define E1000_RXD_ERR_SEQ 0x04 /* Sequence Error */ #define E1000_RXD_ERR_CXE 0x10 /* Carrier Extension Error */ #define E1000_RXD_ERR_TCPE 0x20 /* TCP/UDP Checksum Error */ +#define E1000_RXD_ERR_IPE 0x40 /* IP Checksum Error */ #define E1000_RXD_ERR_RXE 0x80 /* Rx Data Error */ #define E1000_RXD_SPC_VLAN_MASK 0x0FFF /* VLAN ID is in lower 12 bits */ diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 31d37a2..623e30b 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -496,7 +496,7 @@ static void e1000_receive_skb(struct e1000_adapter *adapter, * @sk_buff: socket buffer with received data **/ static void e1000_rx_checksum(struct e1000_adapter *adapter, u32 status_err, - __le16 csum, struct sk_buff *skb) + struct sk_buff *skb) { u16 status = (u16)status_err; u8 errors = (u8)(status_err >> 24); @@ -511,8 +511,8 @@ static void e1000_rx_checksum(struct e1000_adapter *adapter, u32 status_err, if (status & E1000_RXD_STAT_IXSM) return; - /* TCP/UDP checksum error bit is set */ - if (errors & 
E1000_RXD_ERR_TCPE) { + /* TCP/UDP checksum error bit or IP checksum error bit is set */ + if (errors & (E1000_RXD_ERR_TCPE | E1000_RXD_ERR_IPE)) { /* let the stack verify checksum errors */ adapter->hw_csum_err++; return; @@ -523,19 +523,7 @@ static void e1000_rx_checksum(struct e1000_adapter *adapter, u32 status_err, return; /* It must be a TCP or UDP packet with a valid checksum */ - if (status & E1000_RXD_STAT_TCPCS) { - /* TCP checksum is good */ - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - /* - * IP fragment with UDP payload - * Hardware complements the payload checksum, so we undo it - * and then put the value in host order for further stack use. - */ - __sum16 sum = (__force __sum16)swab16((__force u16)csum); - skb->csum = csum_unfold(~sum); - skb->ip_summed = CHECKSUM_COMPLETE; - } + skb->ip_summed = CHECKSUM_UNNECESSARY; adapter->hw_csum_good++; } @@ -954,8 +942,7 @@ static bool e1000_clean_rx_irq(struct e1000_ring *rx_ring, int *work_done, skb_put(skb, length); /* Receive Checksum Offload */ - e1000_rx_checksum(adapter, staterr, - rx_desc->wb.lower.hi_dword.csum_ip.csum, skb); + e1000_rx_checksum(adapter, staterr, skb); e1000_rx_hash(netdev, rx_desc->wb.lower.hi_dword.rss, skb); @@ -1341,8 +1328,7 @@ copydone: total_rx_bytes += skb->len; total_rx_packets++; - e1000_rx_checksum(adapter, staterr, - rx_desc->wb.lower.hi_dword.csum_ip.csum, skb); + e1000_rx_checksum(adapter, staterr, skb); e1000_rx_hash(netdev, rx_desc->wb.lower.hi_dword.rss, skb); @@ -1512,9 +1498,8 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_ring *rx_ring, int *work_done, } } - /* Receive Checksum Offload XXX recompute due to CRC strip? 
*/ - e1000_rx_checksum(adapter, staterr, - rx_desc->wb.lower.hi_dword.csum_ip.csum, skb); + /* Receive Checksum Offload */ + e1000_rx_checksum(adapter, staterr, skb); e1000_rx_hash(netdev, rx_desc->wb.lower.hi_dword.rss, skb); @@ -3098,19 +3083,10 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) /* Enable Receive Checksum Offload for TCP and UDP */ rxcsum = er32(RXCSUM); - if (adapter->netdev->features & NETIF_F_RXCSUM) { + if (adapter->netdev->features & NETIF_F_RXCSUM) rxcsum |= E1000_RXCSUM_TUOFL; - - /* - * IPv4 payload checksum for UDP fragments must be - * used in conjunction with packet-split. - */ - if (adapter->rx_ps_pages) - rxcsum |= E1000_RXCSUM_IPPCSE; - } else { + else rxcsum &= ~E1000_RXCSUM_TUOFL; - /* no need to clear IPPCSE as it defaults to 0 */ - } ew32(RXCSUM, rxcsum); if (adapter->hw.mac.type == e1000_pch2lan) { @@ -5241,22 +5217,10 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu) int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN; /* Jumbo frame support */ - if (max_frame > ETH_FRAME_LEN + ETH_FCS_LEN) { - if (!(adapter->flags & FLAG_HAS_JUMBO_FRAMES)) { - e_err("Jumbo Frames not supported.\n"); - return -EINVAL; - } - - /* - * IP payload checksum (enabled with jumbos/packet-split when - * Rx checksum is enabled) and generation of RSS hash is - * mutually exclusive in the hardware. - */ - if ((netdev->features & NETIF_F_RXCSUM) && - (netdev->features & NETIF_F_RXHASH)) { - e_err("Jumbo frames cannot be enabled when both receive checksum offload and receive hashing are enabled. 
Disable one of the receive offload features before enabling jumbos.\n"); - return -EINVAL; - } + if ((max_frame > ETH_FRAME_LEN + ETH_FCS_LEN) && + !(adapter->flags & FLAG_HAS_JUMBO_FRAMES)) { + e_err("Jumbo Frames not supported.\n"); + return -EINVAL; } /* Supported frame sizes */ @@ -6030,17 +5994,6 @@ static int e1000_set_features(struct net_device *netdev, NETIF_F_RXALL))) return 0; - /* - * IP payload checksum (enabled with jumbos/packet-split when Rx - * checksum is enabled) and generation of RSS hash is mutually - * exclusive in the hardware. - */ - if (adapter->rx_ps_pages && - (features & NETIF_F_RXCSUM) && (features & NETIF_F_RXHASH)) { - e_err("Enabling both receive checksum offload and receive hashing is not possible with jumbo frames. Disable jumbos or enable only one of the receive offload features.\n"); - return -EINVAL; - } - if (changed & NETIF_F_RXFCS) { if (features & NETIF_F_RXFCS) { adapter->flags2 &= ~FLAG2_CRC_STRIPPING; -- cgit v0.10.2 From 19b52abe3c5d759661500a1dc810924369b2ad46 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 27 Jun 2012 17:28:57 +0100 Subject: ARM: 7438/1: fill possible PMD empty section gaps On ARM with the 2-level page table format, a PMD entry is represented by two consecutive section entries covering 2MB of virtual space. However, static mappings always were allowed to use separate 1MB section entries. This means in practice that a static mapping may create half populated PMDs via create_mapping(). Since commit 0536bdf33f (ARM: move iotable mappings within the vmalloc region) those static mappings are located in the vmalloc area. We must ensure no such half populated PMDs are accessible once vmalloc() or ioremap() start looking at the vmalloc area for nearby free virtual address ranges, or various things leading to a kernel crash will happen. 
Signed-off-by: Nicolas Pitre Reported-by: Santosh Shilimkar Tested-by: "R, Sricharan" Reviewed-by: Catalin Marinas Cc: stable@vger.kernel.org Signed-off-by: Russell King diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index e5dad60..cf4528d 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -791,6 +791,79 @@ void __init iotable_init(struct map_desc *io_desc, int nr) } } +#ifndef CONFIG_ARM_LPAE + +/* + * The Linux PMD is made of two consecutive section entries covering 2MB + * (see definition in include/asm/pgtable-2level.h). However a call to + * create_mapping() may optimize static mappings by using individual + * 1MB section mappings. This leaves the actual PMD potentially half + * initialized if the top or bottom section entry isn't used, leaving it + * open to problems if a subsequent ioremap() or vmalloc() tries to use + * the virtual space left free by that unused section entry. + * + * Let's avoid the issue by inserting dummy vm entries covering the unused + * PMD halves once the static mappings are in place. + */ + +static void __init pmd_empty_section_gap(unsigned long addr) +{ + struct vm_struct *vm; + + vm = early_alloc_aligned(sizeof(*vm), __alignof__(*vm)); + vm->addr = (void *)addr; + vm->size = SECTION_SIZE; + vm->flags = VM_IOREMAP | VM_ARM_STATIC_MAPPING; + vm->caller = pmd_empty_section_gap; + vm_area_add_early(vm); +} + +static void __init fill_pmd_gaps(void) +{ + struct vm_struct *vm; + unsigned long addr, next = 0; + pmd_t *pmd; + + /* we're still single threaded hence no lock needed here */ + for (vm = vmlist; vm; vm = vm->next) { + if (!(vm->flags & VM_ARM_STATIC_MAPPING)) + continue; + addr = (unsigned long)vm->addr; + if (addr < next) + continue; + + /* + * Check if this vm starts on an odd section boundary. + * If so and the first section entry for this PMD is free + * then we block the corresponding virtual address. 
+ */ + if ((addr & ~PMD_MASK) == SECTION_SIZE) { + pmd = pmd_off_k(addr); + if (pmd_none(*pmd)) + pmd_empty_section_gap(addr & PMD_MASK); + } + + /* + * Then check if this vm ends on an odd section boundary. + * If so and the second section entry for this PMD is empty + * then we block the corresponding virtual address. + */ + addr += vm->size; + if ((addr & ~PMD_MASK) == SECTION_SIZE) { + pmd = pmd_off_k(addr) + 1; + if (pmd_none(*pmd)) + pmd_empty_section_gap(addr); + } + + /* no need to look at any vm entry until we hit the next PMD */ + next = (addr + PMD_SIZE - 1) & PMD_MASK; + } +} + +#else +#define fill_pmd_gaps() do { } while (0) +#endif + static void * __initdata vmalloc_min = (void *)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET); @@ -1072,6 +1145,7 @@ static void __init devicemaps_init(struct machine_desc *mdesc) */ if (mdesc->map_io) mdesc->map_io(); + fill_pmd_gaps(); /* * Finally flush the caches and tlb to ensure that we're in a -- cgit v0.10.2 From bc1d7702910c7c7e88eb60b58429dbfe293683ce Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 28 Jun 2012 19:28:57 +0000 Subject: powerpc/xmon: Use cpumask iterator to avoid warning We have a bug report where the kernel hits a warning in the cpumask code: WARNING: at include/linux/cpumask.h:107 Which is: WARN_ON_ONCE(cpu >= nr_cpumask_bits); The backtrace is: cpu_cmd cmds xmon_core xmon die xmon is iterating through 0 to NR_CPUS. I'm not sure why we are still open coding this but iterating above nr_cpu_ids is definitely a bug. This patch iterates through all possible cpus, in case we issue a system reset and CPUs in an offline state call in. Perhaps the old code was trying to handle CPUs that were in the partition but were never started (eg kexec into a kernel with an nr_cpus= boot option). They are going to die way before we get into xmon since we haven't set any kernel state up for them. 
Signed-off-by: Anton Blanchard CC: Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 0f3ab06..eab3492 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -971,7 +971,7 @@ static int cpu_cmd(void) /* print cpus waiting or in xmon */ printf("cpus stopped:"); count = 0; - for (cpu = 0; cpu < NR_CPUS; ++cpu) { + for_each_possible_cpu(cpu) { if (cpumask_test_cpu(cpu, &cpus_in_xmon)) { if (count == 0) printf(" %x", cpu); -- cgit v0.10.2 From 2f584a146a2965b82fce89b8d2f95dc5cfe468d0 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 25 Jun 2012 13:33:11 +0000 Subject: powerpc/kvm: sldi should be sld Since we are taking a register, this should never have been an sldi. Talking to paulus offline, this is the correct fix. Was introduced by: commit 19ccb76a1938ab364a412253daec64613acbf3df Author: Paul Mackerras Date: Sat Jul 23 17:42:46 2011 +1000 Talking to paulus, this shouldn't be a literal. Signed-off-by: Michael Neuling CC: [v3.2+] Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index a84aafc..a1044f4 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -810,7 +810,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) lwz r3,VCORE_NAPPING_THREADS(r5) lwz r4,VCPU_PTID(r9) li r0,1 - sldi r0,r0,r4 + sld r0,r0,r4 andc. r3,r3,r0 /* no sense IPI'ing ourselves */ beq 43f mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ -- cgit v0.10.2 From fc448a18ae6219af9a73257b1fbcd009efab4a81 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 10:37:30 +1000 Subject: md/raid10: Don't try to recovery unmatched (and unused) chunks. If a RAID10 has an odd number of chunks - as might happen when there are an odd number of devices - the last chunk has no pair and so is not mirrored.
We don't store data there, but when recovering the last device in an array we try to recover that last chunk from a non-existent location. This results in an error, and the recovery aborts. When we get to that last chunk we should just stop - there is nothing more to do anyway. This bug has been present since the introduction of RAID10, so the patch is appropriate for any -stable kernel. Cc: stable@vger.kernel.org Reported-by: Christian Balzer Tested-by: Christian Balzer Signed-off-by: NeilBrown diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 99ae606..bcf6ea8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2890,6 +2890,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); + if (sect >= mddev->resync_max_sectors) { + /* last stripe is not complete - don't + * try to recover this sector. + */ + continue; + } /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap -- cgit v0.10.2 From 5cfb22a1f83e4f04c0a4df89b60053a077222e2b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 11:46:53 +1000 Subject: md/raid5: prefer replacing failed devices over want-replacement devices. If a RAID5 has both a failed device and a device marked as 'WantReplacement', then we should preferentially replace the failed device. However the current code replaces whichever is found first. So split into 2 loops, check for failed/missing first, and only check for WantReplacement if nothing is failed or missing.
Reported-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d267672..95fcbbf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5465,10 +5465,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) - disk = rdev->saved_raid_disk; - else - disk = first; - for ( ; disk <= last ; disk++) { + first = rdev->saved_raid_disk; + + for (disk = first; disk <= last; disk++) { p = conf->disks + disk; if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5477,8 +5476,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); - break; + goto out; } + } + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; if (test_bit(WantReplacement, &p->rdev->flags) && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5490,6 +5492,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } +out: print_raid5_conf(conf); return err; } -- cgit v0.10.2 From 6c0544e255dd6582a9899572e120fb55d9f672a4 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 12 Jun 2012 08:31:10 +0800 Subject: md/raid5: Do not add data_offset before call to is_badblock In chunk_aligned_read() we are adding data_offset before calling is_badblock. But is_badblock also adds data_offset, so that is bad. So move the addition of data_offset to after the call to is_badblock. This bug was introduced by commit 31c176ecdf3563140e639 md/raid5: avoid reading from known bad blocks. which first appeared in 3.0. So that patch is suitable for any -stable kernel from 3.0.y onwards. However it will need minor revision for most of those (as the comment didn't appear until recently). 
Cc: stable@vger.kernel.org Signed-off-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 95fcbbf..9567a9c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3881,8 +3881,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += rdev->data_offset; if (!bio_fits_rdev(align_bi) || is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, @@ -3893,6 +3891,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) return 0; } + /* No reshape active, so we can trust rdev->data_offset */ + align_bi->bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, -- cgit v0.10.2 From 1850753d2e6d9ca7856581ca5d3cf09521e6a5d7 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 12:11:54 +1000 Subject: md/raid5: In ops_run_io, inc nr_pending before calling md_wait_for_blocked_rdev In ops_run_io(), the call to md_wait_for_blocked_rdev will decrement nr_pending so we lose the reference we hold on the rdev. So atomic_inc it first to maintain the reference. This bug was introduced by commit 73e92e51b7969ef5477d md/raid5. Don't write to known bad block on doubtful devices. which appeared in 3.0, so patch is suitable for stable kernels since then. Cc: stable@vger.kernel.org Signed-off-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9567a9c..befadb4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -606,6 +606,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) * a chance*/ md_check_recovery(conf->mddev); } + /* + * Because md_wait_for_blocked_rdev + * will dec nr_pending, we must + * increment it first. 
+ */ atomic_inc(&rdev->nr_pending); md_wait_for_blocked_rdev(rdev, conf->mddev); } else { /* Acknowledged bad block - skip the write */ -- cgit v0.10.2 From 7c2c57c9a98bf5961e438a376486f95346f6b0c5 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 12:12:26 +1000 Subject: md:Add blk_plug in sync_thread. Adding blk_plug in sync_thread will increase the performance of sync. Because sync_thread did not use blk_plug, bios were not merged well during a raid sync. Testing environment: SATA controller: Intel Corporation 82801JI (ICH10 Family) SATA AHCI Controller. OS:Linux xxx 3.5.0-rc2+ #340 SMP Tue Jun 12 09:00:25 CST 2012 x86_64 x86_64 x86_64 GNU/Linux. RAID5: four ST31000524NS disks. Without blk_plug:recovery speed about 63M/Sec; Add blk_plug:recovery speed about 120M/Sec. Using blktrace: blktrace -d /dev/sdb -w 60 -o -|blkparse -i - without blk_plug: Total (8,16): Reads Queued: 309811, 1239MiB Writes Queued: 0, 0KiB Read Dispatches: 283583, 1189MiB Write Dispatches: 0, 0KiB Reads Requeued: 0 Writes Requeued: 0 Reads Completed: 273351, 1149MiB Writes Completed: 0, 0KiB Read Merges: 23533, 94132KiB Write Merges: 0, 0KiB IO unplugs: 0 Timer unplugs: 0 add blk_plug: Total (8,16): Reads Queued: 428697, 1714MiB Writes Queued: 0, 0KiB Read Dispatches: 3954, 1714MiB Write Dispatches: 0, 0KiB Reads Requeued: 0 Writes Requeued: 0 Reads Completed: 3956, 1715MiB Writes Completed: 0, 0KiB Read Merges: 424743, 1698MiB Write Merges: 0, 0KiB IO unplugs: 0 Timer unplugs: 3384 The merge ratio is markedly increased. Signed-off-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 1c2f904..973aa84 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7298,6 +7298,7 @@ void md_do_sync(struct mddev *mddev) int skipped = 0; struct md_rdev *rdev; char *desc; + struct blk_plug plug; /* just incase thread restarts...
*/ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7447,6 +7448,7 @@ void md_do_sync(struct mddev *mddev) } mddev->curr_resync_completed = j; + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7552,6 +7554,7 @@ void md_do_sync(struct mddev *mddev) * this also signals 'finished resyncing' to md_stop */ out: + blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ -- cgit v0.10.2 From 5f066c632fcfd2a33f2eb7077c15c630e9f5ea5b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 12:13:29 +1000 Subject: md/raid5: fix refcount problem when blocked_rdev is set. commit 43220aa0f22cd3ce5b30246d50ccd696d119edea md/raid5: fix a hang on device failure. fixed a hang, but introduced a refcounting imbalance so that if the presence of bad-blocks ever caused an rdev to be 'blocked' we would increment the refcount on the rdev and never decrement it. So add the needed rdev_dec_pending when md_wait_for_blocked_rdev is not called. Reported-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index befadb4..62b6b3a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3588,8 +3588,18 @@ static void handle_stripe(struct stripe_head *sh) finish: /* wait for this device to become unblocked */ - if (conf->mddev->external && unlikely(s.blocked_rdev)) - md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); + if (unlikely(s.blocked_rdev)) { + if (conf->mddev->external) + md_wait_for_blocked_rdev(s.blocked_rdev, + conf->mddev); + else + /* Internal metadata will immediately + * be written by raid5d, so we don't + * need to wait here.
+ */ + rdev_dec_pending(s.blocked_rdev, + conf->mddev); + } if (s.handle_bad_blocks) for (i = disks; i--; ) { -- cgit v0.10.2 From 09b243577be319ef55310b45c65737008f3ebf12 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 2 Jul 2012 14:03:58 -0700 Subject: security: document no_new_privs Document no_new_privs. Signed-off-by: Andy Lutomirski Acked-by: Kees Cook Signed-off-by: James Morris diff --git a/Documentation/prctl/no_new_privs.txt b/Documentation/prctl/no_new_privs.txt new file mode 100644 index 0000000..cb705ec --- /dev/null +++ b/Documentation/prctl/no_new_privs.txt @@ -0,0 +1,50 @@ +The execve system call can grant a newly-started program privileges that +its parent did not have. The most obvious examples are setuid/setgid +programs and file capabilities. To prevent the parent program from +gaining these privileges as well, the kernel and user code must be +careful to prevent the parent from doing anything that could subvert the +child. For example: + + - The dynamic loader handles LD_* environment variables differently if + a program is setuid. + + - chroot is disallowed to unprivileged processes, since it would allow + /etc/passwd to be replaced from the point of view of a process that + inherited chroot. + + - The exec code has special handling for ptrace. + +These are all ad-hoc fixes. The no_new_privs bit (since Linux 3.5) is a +new, generic mechanism to make it safe for a process to modify its +execution environment in a manner that persists across execve. Any task +can set no_new_privs. Once the bit is set, it is inherited across fork, +clone, and execve and cannot be unset. With no_new_privs set, execve +promises not to grant the privilege to do anything that could not have +been done without the execve call. For example, the setuid and setgid +bits will no longer change the uid or gid; file capabilities will not +add to the permitted set, and LSMs will not relax constraints after +execve. 
+ +Note that no_new_privs does not prevent privilege changes that do not +involve execve. An appropriately privileged task can still call +setuid(2) and receive SCM_RIGHTS datagrams. + +There are two main use cases for no_new_privs so far: + + - Filters installed for the seccomp mode 2 sandbox persist across + execve and can change the behavior of newly-executed programs. + Unprivileged users are therefore only allowed to install such filters + if no_new_privs is set. + + - By itself, no_new_privs can be used to reduce the attack surface + available to an unprivileged user. If everything running with a + given uid has no_new_privs set, then that uid will be unable to + escalate its privileges by directly attacking setuid, setgid, and + fcap-using binaries; it will need to compromise something without the + no_new_privs bit set first. + +In the future, other potentially dangerous kernel features could become +available to unprivileged tasks if no_new_privs is set. In principle, +several options to unshare(2) and clone(2) would be safe when +no_new_privs is set, and no_new_privs + chroot is considerable less +dangerous than chroot by itself. -- cgit v0.10.2 From 055d3747dbf00ce85c6872ecca4d466638e80c22 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:55:33 +1000 Subject: md/raid10: fix failure when trying to repair a read error. commit 58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf md/raid10: handle further errors during fix_read_error better. in 3.1 added "r10_sync_page_io" which takes an IO size in sectors. But we were passing the IO size in bytes!!! This resulting in bio_add_page failing, and empty request being sent down, and a consequent BUG_ON in scsi_lib. [fix missing space in error message at same time] This fix is suitable for 3.1.y and later. 
Cc: stable@vger.kernel.org Reported-by: Christian Balzer Signed-off-by: NeilBrown diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index bcf6ea8..ae73e29 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2310,7 +2310,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s<<9, conf->tmppage, WRITE) + s, conf->tmppage, WRITE) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -2349,7 +2349,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 switch (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s<<9, conf->tmppage, + s, conf->tmppage, READ)) { case 0: /* Well, this device is dead */ @@ -2512,7 +2512,7 @@ read_more: slot = r10_bio->read_slot; printk_ratelimited( KERN_ERR - "md/raid10:%s: %s: redirecting" + "md/raid10:%s: %s: redirecting " "sector %llu to another mirror\n", mdname(mddev), bdevname(rdev->bdev, b), -- cgit v0.10.2 From 0232605d987d8230b254aa139805bbb56a7ca30c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:56:52 +1000 Subject: md: make 'name' arg to md_register_thread non-optional. Having the 'name' arg optional and defaulting to the current personality name is not necessary and leads to errors, as when changing the level of an array we can end up using the name of the old level instead of the new one. So make it non-optional and always explicitly pass the name of the level that the array will be.
Reported-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 973aa84..c601c4b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6751,7 +6751,7 @@ struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev thread->tsk = kthread_run(md_thread, thread, "%s_%s", mdname(thread->mddev), - name ?: mddev->pers->name); + name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 9339e67..61a1833 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -474,7 +474,8 @@ static int multipath_run (struct mddev *mddev) } { - mddev->thread = md_register_thread(multipathd, mddev, NULL); + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); if (!mddev->thread) { printk(KERN_ERR "multipath: couldn't allocate thread" " for %s\n", mdname(mddev)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a9c7981..39b2a8a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2621,7 +2621,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, NULL); + conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { printk(KERN_ERR "md/raid1:%s: couldn't allocate thread\n", diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae73e29..edc1088 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3427,7 +3427,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); - conf->thread = md_register_thread(raid10d, mddev, NULL); + conf->thread = md_register_thread(raid10d, mddev, "raid10"); if (!conf->thread) goto out; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 62b6b3a..a5135e5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4840,6 +4840,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) int 
raid_disk, memory, max_disks; struct md_rdev *rdev; struct disk_info *disk; + char pers_name[6]; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -4963,7 +4964,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) printk(KERN_INFO "md/raid:%s: allocated %dkB\n", mdname(mddev), memory); - conf->thread = md_register_thread(raid5d, mddev, NULL); + sprintf(pers_name, "raid%d", mddev->new_level); + conf->thread = md_register_thread(raid5d, mddev, pers_name); if (!conf->thread) { printk(KERN_ERR "md/raid:%s: couldn't allocate thread.\n", -- cgit v0.10.2 From 2e8ac30312973dd20e6807365349ecb1c7e0ea45 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 15:57:02 +1000 Subject: md/raid456: When read error cannot be recovered, record bad block We may not be able to fix a bad block if: - the array is degraded - the over-write fails. In these cases we currently eject the device, but we should record a bad block if possible. Signed-off-by: majianpeng Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5135e5..51169ec 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1743,6 +1743,7 @@ static void raid5_end_read_request(struct bio * bi, int error) } else { const char *bdn = bdevname(rdev->bdev, b); int retry = 0; + int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); @@ -1754,7 +1755,8 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (conf->mddev->degraded >= conf->max_degraded) + else if (conf->mddev->degraded >= conf->max_degraded) { + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error not correctable " @@ -1762,8 +1764,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! 
*/ + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error NOT corrected!! " @@ -1771,7 +1774,7 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (atomic_read(&rdev->read_errors) + } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING "md/raid:%s: Too many read errors, failing device %s.\n", @@ -1783,7 +1786,11 @@ static void raid5_end_read_request(struct bio * bi, int error) else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, rdev); + if (!(set_bad + && test_bit(In_sync, &rdev->flags) + && rdev_set_badblocks( + rdev, sh->sector, STRIPE_SECTORS, 0))) + md_error(conf->mddev, rdev); } } rdev_dec_pending(rdev, conf->mddev); -- cgit v0.10.2 From fab363b5ff502d1b39ddcfec04271f5858d9f26e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Jul 2012 15:57:19 +1000 Subject: raid5: delayed stripe fix There isn't locking setting STRIPE_DELAYED and STRIPE_PREREAD_ACTIVE bits, but the two bits have relationship. A delayed stripe can be moved to hold list only when preread active stripe count is below IO_THRESHOLD. If a stripe has both the bits set, such stripe will be in delayed list and preread count not 0, which will make such stripe never leave delayed list. 
Signed-off-by: Shaohua Li Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 51169ec..7245a9d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -196,12 +196,14 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state)) + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { + clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); } -- cgit v0.10.2 From 32644afd8975d19174bcb9ba34687c32dd810a09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:58:42 +1000 Subject: md/raid1: fix bug in read_balance introduced by hot-replace When we added hot_replace we doubled the number of devices that could be in a RAID1 array. So we doubled how far read_balance would search. Unfortunately we didn't double the point at which it looped back to the beginning - so it effectively loops over all non-replacement disks twice. This doesn't cause bad behaviour, but it is pointless and means we never read from replacement devices.
Signed-off-by: NeilBrown diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 39b2a8a..34b4665 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -517,8 +517,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect int bad_sectors; int disk = start_disk + i; - if (disk >= conf->raid_disks) - disk -= conf->raid_disks; + if (disk >= conf->raid_disks * 2) + disk -= conf->raid_disks * 2; rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED -- cgit v0.10.2 From f456309106e9657645c81bce1a6bb3230393564e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:59:06 +1000 Subject: md: support re-add of recovering devices. We currently only allow a device to be re-added if it appears to be in-sync. This is overly restrictive as it may be desirable to re-add a device that is in the middle of recovery. So remove the test for "InSync" - the test on rdev->raid_disk is sufficient to ensure that the re-add will succeed. Reported-by: Alexander Lyakas Tested-by: Alexander Lyakas Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index c601c4b..a4c219e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5784,8 +5784,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) super_types[mddev->major_version]. validate_super(mddev, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && - (!test_bit(In_sync, &rdev->flags) || - rdev->raid_disk != info->raid_disk)) { + rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't * match, so reject it. */ -- cgit v0.10.2 From b357f04a67c2aeee828b240863cd3f21d6cb3179 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 17:45:31 +1000 Subject: md: fix up plugging (again). The value returned by "mddev_check_plugged" is only valid until the next 'schedule' as that will unplug things. This could happen at any call to mempool_alloc. So just calling mddev_check_plugged at the start doesn't really make sense.
So call it just before, or just after, queuing things for the thread. As the action that happens at unplug is to wake the thread, this makes lots of sense. If we cannot add a plug (which requires a small GFP_ATOMIC alloc) we wake the thread immediately. RAID5 is a bit different. Requests are queued for the thread and the thread is woken by release_stripe. So we don't need to wake the thread on failure. However the thread doesn't perform certain actions when there is any active plug, so it is important to install a plug before waking the thread. So for RAID5 we install the plug *before* queuing the request and waking the thread. Without this patch it is possible for raid1 or raid10 to queue a request without then waking the thread, resulting in the array locking up. Also change raid10 to only call flush_pending_writes when there are no active plugs, just like raid1. This patch is suitable for 3.0 or later. I plan to submit it to -stable, but I'd like to let it spend a few weeks in mainline first to be sure it is completely safe. Signed-off-by: NeilBrown diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34b4665..8c2754f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; - int plugged; int first_clone; int sectors_handled; int max_sectors; @@ -1034,7 +1033,6 @@ read_again: * the bad blocks. Each set of writes gets it's own r1bio * with a set of bios attached.
*/ - plugged = mddev_check_plugged(mddev); disks = conf->raid_disks * 2; retry_write: @@ -1191,6 +1189,8 @@ read_again: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, * as it could result in the bio being freed. @@ -1213,9 +1213,6 @@ read_again: /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index edc1088..acf5a82 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1039,7 +1039,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; - int plugged; int sectors_handled; int max_sectors; int sectors; @@ -1239,7 +1238,6 @@ read_again: * of r10_bios is recored in bio->bi_phys_segments just as with * the read case. 
*/ - plugged = mddev_check_plugged(mddev); r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); @@ -1396,6 +1394,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) continue; @@ -1423,6 +1423,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until @@ -1448,9 +1450,6 @@ retry_write: /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !mddev->bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) @@ -2661,7 +2660,8 @@ static void raid10d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - flush_pending_writes(conf); + if (atomic_read(&mddev->plug_cnt) == 0) + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7245a9d..04348d7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3997,7 +3997,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; - int plugged; if (unlikely(bi->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bi); @@ -4016,7 +4015,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - plugged = mddev_check_plugged(mddev); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int previous; @@ -4118,6 +4116,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); + mddev_check_plugged(mddev); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -4125,10 +4124,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); break; } - } - if (!plugged) - md_wakeup_thread(mddev->thread); spin_lock_irq(&conf->device_lock); remaining = raid5_dec_bi_phys_segments(bi); -- cgit v0.10.2 From 1ef5325b238676c7a16bcd374250b07e77682736 Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Mon, 2 Jul 2012 12:40:54 -0400 Subject: drm/radeon: fix rare segfault MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the gem idle/busy ioctls the radeon object was dereferenced after drm_gem_object_unreference_unlocked which, in case the object has been destroyed, leads to use of a possibly freed pointer with possibly wrong data. Signed-off-by: Jerome Glisse Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index f28bd4b..21ec9f5 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -292,6 +292,7 @@ int radeon_gem_mmap_ioctl(struct drm_device *dev, void *data, int radeon_gem_busy_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { + struct radeon_device *rdev = dev->dev_private; struct drm_radeon_gem_busy *args = data; struct drm_gem_object *gobj; struct radeon_bo *robj; @@ -317,13 +318,14 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data, break; } drm_gem_object_unreference_unlocked(gobj); - r = radeon_gem_handle_lockup(robj->rdev, r); + r = radeon_gem_handle_lockup(rdev, r); return r; } int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { + struct radeon_device *rdev = dev->dev_private; struct
drm_radeon_gem_wait_idle *args = data; struct drm_gem_object *gobj; struct radeon_bo *robj; @@ -336,10 +338,10 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data, robj = gem_to_radeon_bo(gobj); r = radeon_bo_wait(robj, NULL, false); /* callback hw specific functions if any */ - if (robj->rdev->asic->ioctl_wait_idle) - robj->rdev->asic->ioctl_wait_idle(robj->rdev, robj); + if (rdev->asic->ioctl_wait_idle) + robj->rdev->asic->ioctl_wait_idle(rdev, robj); drm_gem_object_unreference_unlocked(gobj); - r = radeon_gem_handle_lockup(robj->rdev, r); + r = radeon_gem_handle_lockup(rdev, r); return r; } -- cgit v0.10.2 From 7b668ebe2fce517873b0c28dd70c10fef1d3dc2f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 3 Jul 2012 11:22:11 +0200 Subject: drm: edid: Don't add inferred modes with higher resolution When a monitor EDID doesn't give the preferred bit, the driver assumes that the mode with the highest resolution and rate is the preferred mode. Meanwhile the recent changes for allowing more modes in the GTF/CVT ranges actually give more modes, and some modes may be over the native size. Thus such a mode would be picked up as the preferred mode although it's not the native resolution. To avoid such a problem, this patch limits the addition of inferred modes by checking that they are not greater than other modes. It also checks for duplicated mode entries at the same time.
Reviewed-by: Adam Jackson Signed-off-by: Takashi Iwai Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 5873e48..a8743c3 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -1039,6 +1039,24 @@ mode_in_range(const struct drm_display_mode *mode, struct edid *edid, return true; } +static bool valid_inferred_mode(const struct drm_connector *connector, + const struct drm_display_mode *mode) +{ + struct drm_display_mode *m; + bool ok = false; + + list_for_each_entry(m, &connector->probed_modes, head) { + if (mode->hdisplay == m->hdisplay && + mode->vdisplay == m->vdisplay && + drm_mode_vrefresh(mode) == drm_mode_vrefresh(m)) + return false; /* duplicated */ + if (mode->hdisplay <= m->hdisplay && + mode->vdisplay <= m->vdisplay) + ok = true; + } + return ok; +} + static int drm_dmt_modes_for_range(struct drm_connector *connector, struct edid *edid, struct detailed_timing *timing) @@ -1048,7 +1066,8 @@ drm_dmt_modes_for_range(struct drm_connector *connector, struct edid *edid, struct drm_device *dev = connector->dev; for (i = 0; i < drm_num_dmt_modes; i++) { - if (mode_in_range(drm_dmt_modes + i, edid, timing)) { + if (mode_in_range(drm_dmt_modes + i, edid, timing) && + valid_inferred_mode(connector, drm_dmt_modes + i)) { newmode = drm_mode_duplicate(dev, &drm_dmt_modes[i]); if (newmode) { drm_mode_probed_add(connector, newmode); @@ -1088,7 +1107,8 @@ drm_gtf_modes_for_range(struct drm_connector *connector, struct edid *edid, return modes; fixup_mode_1366x768(newmode); - if (!mode_in_range(newmode, edid, timing)) { + if (!mode_in_range(newmode, edid, timing) || + !valid_inferred_mode(connector, newmode)) { drm_mode_destroy(dev, newmode); continue; } @@ -1116,7 +1136,8 @@ drm_cvt_modes_for_range(struct drm_connector *connector, struct edid *edid, return modes; fixup_mode_1366x768(newmode); - if (!mode_in_range(newmode, edid, timing)) { + if (!mode_in_range(newmode, edid, timing) || + 
!valid_inferred_mode(connector, newmode)) { drm_mode_destroy(dev, newmode); continue; } -- cgit v0.10.2 From 9f846a16d213523fbe6daea17e20df6b8ac5a1e5 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sun, 1 Jul 2012 17:09:42 +0200 Subject: drm/i915: kick any firmware framebuffers before claiming the gtt Especially vesafb likes to map everything as uc- (yikes), and if that mapping still hangs around while we try to map the gtt as wc the kernel will downgrade our request to uc-, resulting in abysmal performance. Unfortunately we can't do this as early as radeon does (i.e. as the first thing we do when initializing the hw) because our fb/mmio space region moves around on a per-gen basis. So I've had to move it below the gtt initialization, but that seems to work, too. The important thing is that we do this before we set up the gtt wc mapping. Now an altogether different question is why people compile their kernels with vesafb enabled, but I guess making things just work isn't bad per se ... v2: - s/radeondrmfb/inteldrmfb/ - fix up error handling v3: Kill #ifdef X86, this is Intel after all. Noticed by Ben Widawsky. v4: Jani Nikula complained about the pointless bool primary initialization. v5: Don't oops if we can't allocate, noticed by Chris Wilson. v6: Resolve conflicts with agp rework and fixup whitespace. This is commit e188719a2891f01b3100d in drm-next. Backport to 3.5 -fixes queue requested by Dave Airlie - due to grub using vesa on fedora their initrd seems to load vesafb before loading the real kms driver. So tons more people actually experience a dead-slow gpu. Hence also the Cc: stable.
Cc: stable@vger.kernel.org Reported-and-tested-by: "Kilarski, Bernard R" Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index f947926..36822b9 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1401,6 +1401,27 @@ i915_mtrr_setup(struct drm_i915_private *dev_priv, unsigned long base, } } +static void i915_kick_out_firmware_fb(struct drm_i915_private *dev_priv) +{ + struct apertures_struct *ap; + struct pci_dev *pdev = dev_priv->dev->pdev; + bool primary; + + ap = alloc_apertures(1); + if (!ap) + return; + + ap->ranges[0].base = dev_priv->dev->agp->base; + ap->ranges[0].size = + dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT; + primary = + pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW; + + remove_conflicting_framebuffers(ap, "inteldrmfb", primary); + + kfree(ap); +} + /** * i915_driver_load - setup chip and create an initial config * @dev: DRM device @@ -1446,6 +1467,15 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) goto free_priv; } + dev_priv->mm.gtt = intel_gtt_get(); + if (!dev_priv->mm.gtt) { + DRM_ERROR("Failed to initialize GTT\n"); + ret = -ENODEV; + goto put_bridge; + } + + i915_kick_out_firmware_fb(dev_priv); + pci_set_master(dev->pdev); /* overlay on gen2 is broken and can't address above 1G */ @@ -1471,13 +1501,6 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) goto put_bridge; } - dev_priv->mm.gtt = intel_gtt_get(); - if (!dev_priv->mm.gtt) { - DRM_ERROR("Failed to initialize GTT\n"); - ret = -ENODEV; - goto out_rmmap; - } - aperture_size = dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT; dev_priv->mm.gtt_mapping = -- cgit v0.10.2 From 75331a597cf4cde51d9b0bb22cbd03b9837ef9e4 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 2 Jul 2012 14:34:11 +0900 Subject: security: Fix nommu build. 
The security + nommu configuration presently blows up with an undefined reference to BDI_CAP_EXEC_MAP: security/security.c: In function 'mmap_prot': security/security.c:687:36: error: dereferencing pointer to incomplete type security/security.c:688:16: error: 'BDI_CAP_EXEC_MAP' undeclared (first use in this function) security/security.c:688:16: note: each undeclared identifier is reported only once for each function it appears in Include backing-dev.h directly to fix it up. Signed-off-by: Paul Mundt Signed-off-by: James Morris diff --git a/security/security.c b/security/security.c index 3efc9b1..860aeb3 100644 --- a/security/security.c +++ b/security/security.c @@ -23,6 +23,7 @@ #include #include #include +#include <linux/backing-dev.h> #include #define MAX_LSM_EVM_XATTR 2 -- cgit v0.10.2 From 0d200aefd4ac51787b6b80de1bb7ce93bccd59f6 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 3 Jul 2012 12:55:31 +0100 Subject: dm thin: commit metadata before creating metadata snapshot Userland sometimes sees a corrupt metadata block if metadata is changing rapidly when a metadata snapshot is reserved for userland. To make the problem go away, commit before we take the metadata snapshot (which is a sensible thing to do anyway). The checksums mean userland spots this corruption immediately so there's no risk of acting on incorrect data. No corruption exists from the kernel's point of view, and thin_check passes after pool shutdown. I believe this is to do with shared blocks at the first level of the {device, mapping} btree. Prior to the metadata-snap support no sharing at this level was possible, so this patch is only required after commit cc8394d86f045b86ff303d3c9e4ce47d97148951 ("dm thin: provide userspace access to pool metadata").
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 37fdaf8..ce59824 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2292,6 +2292,13 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct if (r) return r; + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + return r; + } + r = dm_pool_reserve_metadata_snap(pool->pmd); if (r) DMWARN("reserve_metadata_snap message failed."); -- cgit v0.10.2 From 25d7cd6faa7ae6ed2565617c3ee2500ccb8a9f7f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Jul 2012 12:55:33 +0100 Subject: dm persistent data: fix shadow_info_leak on dm_tm_destroy Cleanup the shadow table before destroying the transaction manager. Reference: leak was identified with kmemleak when running test_discard_random_sectors in the thinp-test-suite. Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org Signed-off-by: Alasdair G Kergon diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 400fe14..02bf78e 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -138,6 +138,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); void dm_tm_destroy(struct dm_transaction_manager *tm) { + if (!tm->is_clone) + wipe_shadow_table(tm); + kfree(tm); } EXPORT_SYMBOL_GPL(dm_tm_destroy); -- cgit v0.10.2 From 62662303e7f590fdfbb0070ab820a0ad4267c119 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Jul 2012 12:55:35 +0100 Subject: dm persistent data: handle space map checker creation failure If CONFIG_DM_DEBUG_SPACE_MAPS is enabled and dm_sm_checker_create() fails, dm_tm_create_internal() would still return success even though it cleaned up all resources it was supposed to have created. 
This will lead to a kernel crash: general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC ... RIP: 0010:[] [] dm_bufio_get_block_size+0x9/0x20 Call Trace: [] dm_bm_block_size+0xe/0x10 [] sm_ll_init+0x78/0xd0 [] sm_ll_new_disk+0x16/0xa0 [] dm_sm_disk_create+0xfe/0x160 [] dm_pool_metadata_open+0x16e/0x6a0 [] pool_ctr+0x3f0/0x900 [] dm_table_add_target+0x195/0x450 [] table_load+0xe4/0x330 [] ctl_ioctl+0x15a/0x2c0 [] dm_ctl_ioctl+0x13/0x20 [] do_vfs_ioctl+0x98/0x560 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b Fix the space map checker code to return an appropriate ERR_PTR and have dm_sm_disk_create() and dm_tm_create_internal() check for it with IS_ERR. Reported-by: Vivek Goyal Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org Signed-off-by: Alasdair G Kergon diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c index 50ed53b..6d7c832 100644 --- a/drivers/md/persistent-data/dm-space-map-checker.c +++ b/drivers/md/persistent-data/dm-space-map-checker.c @@ -343,25 +343,25 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) int r; struct sm_checker *smc; - if (!sm) - return NULL; + if (IS_ERR_OR_NULL(sm)) + return ERR_PTR(-EINVAL); smc = kmalloc(sizeof(*smc), GFP_KERNEL); if (!smc) - return NULL; + return ERR_PTR(-ENOMEM); memcpy(&smc->sm, &ops_, sizeof(smc->sm)); r = ca_create(&smc->old_counts, sm); if (r) { kfree(smc); - return NULL; + return ERR_PTR(r); } r = ca_create(&smc->counts, sm); if (r) { ca_destroy(&smc->old_counts); kfree(smc); - return NULL; + return ERR_PTR(r); } smc->real_sm = sm; @@ -371,7 +371,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) ca_destroy(&smc->counts); ca_destroy(&smc->old_counts); kfree(smc); - return NULL; + return ERR_PTR(r); } r = ca_commit(&smc->old_counts, &smc->counts); @@ -379,7 +379,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) ca_destroy(&smc->counts); ca_destroy(&smc->old_counts); 
kfree(smc); - return NULL; + return ERR_PTR(r); } return &smc->sm; @@ -391,25 +391,25 @@ struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) int r; struct sm_checker *smc; - if (!sm) - return NULL; + if (IS_ERR_OR_NULL(sm)) + return ERR_PTR(-EINVAL); smc = kmalloc(sizeof(*smc), GFP_KERNEL); if (!smc) - return NULL; + return ERR_PTR(-ENOMEM); memcpy(&smc->sm, &ops_, sizeof(smc->sm)); r = ca_create(&smc->old_counts, sm); if (r) { kfree(smc); - return NULL; + return ERR_PTR(r); } r = ca_create(&smc->counts, sm); if (r) { ca_destroy(&smc->old_counts); kfree(smc); - return NULL; + return ERR_PTR(r); } smc->real_sm = sm; diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index fc469ba..3d0ed53 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -290,7 +290,16 @@ struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, dm_block_t nr_blocks) { struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); - return dm_sm_checker_create_fresh(sm); + struct dm_space_map *smc; + + if (IS_ERR_OR_NULL(sm)) + return sm; + + smc = dm_sm_checker_create_fresh(sm); + if (IS_ERR(smc)) + dm_sm_destroy(sm); + + return smc; } EXPORT_SYMBOL_GPL(dm_sm_disk_create); diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 02bf78e..e5604b3 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -347,8 +347,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, } *sm = dm_sm_checker_create(inner); - if (!*sm) + if (IS_ERR(*sm)) { + r = PTR_ERR(*sm); goto bad2; + } } else { r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, @@ -367,8 +369,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, } *sm = dm_sm_checker_create(inner); - if (!*sm) + if (IS_ERR(*sm)) { + r = 
PTR_ERR(*sm); goto bad2; + } } return 0; -- cgit v0.10.2 From b0239faaf87c38bb419c9264bf20817438ddc3a9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Jul 2012 12:55:37 +0100 Subject: dm persistent data: fix allocation failure in space map checker init If CONFIG_DM_DEBUG_SPACE_MAPS is enabled and memory is fragmented and a sufficiently-large metadata device is used in a thin pool then the space map checker will fail to allocate the memory it requires. Switch from kmalloc to vmalloc to allow larger virtually contiguous allocations for the space map checker's internal count arrays. Reported-by: Vivek Goyal Cc: stable@kernel.org Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c index 6d7c832..fc90c11 100644 --- a/drivers/md/persistent-data/dm-space-map-checker.c +++ b/drivers/md/persistent-data/dm-space-map-checker.c @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_DM_DEBUG_SPACE_MAPS @@ -89,13 +90,23 @@ static int ca_create(struct count_array *ca, struct dm_space_map *sm) ca->nr = nr_blocks; ca->nr_free = nr_blocks; - ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL); - if (!ca->counts) - return -ENOMEM; + + if (!nr_blocks) + ca->counts = NULL; + else { + ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks); + if (!ca->counts) + return -ENOMEM; + } return 0; } +static void ca_destroy(struct count_array *ca) +{ + vfree(ca->counts); +} + static int ca_load(struct count_array *ca, struct dm_space_map *sm) { int r; @@ -126,12 +137,14 @@ static int ca_load(struct count_array *ca, struct dm_space_map *sm) static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) { dm_block_t nr_blocks = ca->nr + extra_blocks; - uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL); + uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks); if (!counts) return -ENOMEM; - memcpy(counts, ca->counts, 
sizeof(*counts) * ca->nr); - kfree(ca->counts); + if (ca->counts) { + memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); + ca_destroy(ca); + } ca->nr = nr_blocks; ca->nr_free += extra_blocks; ca->counts = counts; @@ -151,11 +164,6 @@ static int ca_commit(struct count_array *old, struct count_array *new) return 0; } -static void ca_destroy(struct count_array *ca) -{ - kfree(ca->counts); -} - /*----------------------------------------------------------------*/ struct sm_checker { -- cgit v0.10.2 From 18068bdd5f59229623b2fa518a6389e346642b0d Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 3 Jul 2012 12:55:41 +0100 Subject: dm: verity fix documentation Veritysetup is now part of the cryptsetup package. Remove the on-disk header description (which is not parsed in the kernel) and point users to cryptsetup, where the format is documented. Mention units for block size parameters. Fix target line specification and dmsetup parameters. Signed-off-by: Milan Broz Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index 32e4879..9884681 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -7,39 +7,39 @@ This target is read-only. Construction Parameters ======================= - + - This is the version number of the on-disk format. + This is the type of the on-disk hash format. 0 is the original format used in the Chromium OS. - The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeros. + The salt is appended when hashing, digests are stored continuously and + the rest of the block is padded with zeros. 1 is the current format that should be used for new devices. - The salt is prepended when hashing and each digest is - padded with zeros to the power of two.
- This is the device containing the data the integrity of which needs to be + This is the device containing data, the integrity of which needs to be checked. It may be specified as a path, like /dev/sdaX, or a device number, :. - This is the device that that supplies the hash tree data. It may be + This is the device that supplies the hash tree data. It may be specified similarly to the device path and may be the same device. If the - same device is used, the hash_start should be outside of the dm-verity - configured device size. + same device is used, the hash_start should be outside the configured + dm-verity device. - The block size on a data device. Each block corresponds to one digest on - the hash device. + The block size on a data device in bytes. + Each block corresponds to one digest on the hash device. - The size of a hash block. + The size of a hash block in bytes. The number of data blocks on the data device. Additional blocks are @@ -65,7 +65,7 @@ Construction Parameters Theory of operation =================== -dm-verity is meant to be setup as part of a verified boot path. This +dm-verity is meant to be set up as part of a verified boot path. This may be anything ranging from a boot using tboot or trustedgrub to just booting from a known-good device (like a USB drive or CD). @@ -73,20 +73,20 @@ When a dm-verity device is configured, it is expected that the caller has been authenticated in some way (cryptographic signatures, etc). After instantiation, all hashes will be verified on-demand during disk access. If they cannot be verified up to the root node of the -tree, the root hash, then the I/O will fail. This should identify +tree, the root hash, then the I/O will fail. This should detect tampering with any data on the device and the hash data. Cryptographic hashes are used to assert the integrity of the device on a -per-block basis. This allows for a lightweight hash computation on first read -into the page cache. 
Block hashes are stored linearly-aligned to the nearest -block the size of a page. +per-block basis. This allows for a lightweight hash computation on first read +into the page cache. Block hashes are stored linearly, aligned to the nearest +block size. Hash Tree --------- Each node in the tree is a cryptographic hash. If it is a leaf node, the hash -is of some block data on disk. If it is an intermediary node, then the hash is -of a number of child nodes. +of some data block on disk is calculated. If it is an intermediary node, +the hash of a number of child nodes is calculated. Each entry in the tree is a collection of neighboring nodes that fit in one block. The number is determined based on block_size and the size of the @@ -110,63 +110,23 @@ alg = sha256, num_blocks = 32768, block_size = 4096 On-disk format ============== -Below is the recommended on-disk format. The verity kernel code does not -read the on-disk header. It only reads the hash blocks which directly -follow the header. It is expected that a user-space tool will verify the -integrity of the verity_header and then call dmsetup with the correct -parameters. Alternatively, the header can be omitted and the dmsetup -parameters can be passed via the kernel command-line in a rooted chain -of trust where the command-line is verified. +The verity kernel code does not read the verity metadata on-disk header. +It only reads the hash blocks which directly follow the header. +It is expected that a user-space tool will verify the integrity of the +verity header. -The on-disk format is especially useful in cases where the hash blocks -are on a separate partition. The magic number allows easy identification -of the partition contents. Alternatively, the hash blocks can be stored -in the same partition as the data to be verified. In such a configuration -the filesystem on the partition would be sized a little smaller than -the full-partition, leaving room for the hash blocks. 
- -struct superblock { - uint8_t signature[8] - "verity\0\0"; - - uint8_t version; - 1 - current format - - uint8_t data_block_bits; - log2(data block size) - - uint8_t hash_block_bits; - log2(hash block size) - - uint8_t pad1[1]; - zero padding - - uint16_t salt_size; - big-endian salt size - - uint8_t pad2[2]; - zero padding - - uint32_t data_blocks_hi; - big-endian high 32 bits of the 64-bit number of data blocks - - uint32_t data_blocks_lo; - big-endian low 32 bits of the 64-bit number of data blocks - - uint8_t algorithm[16]; - cryptographic algorithm - - uint8_t salt[384]; - salt (the salt size is specified above) - - uint8_t pad3[88]; - zero padding to 512-byte boundary -} +Alternatively, the header can be omitted and the dmsetup parameters can +be passed via the kernel command-line in a rooted chain of trust where +the command-line is verified. Directly following the header (and with sector number padded to the next hash block boundary) are the hash blocks which are stored a depth at a time (starting from the root), sorted in order of increasing index. +The full specification of kernel parameters and on-disk metadata format +is available at the cryptsetup project's wiki page + http://code.google.com/p/cryptsetup/wiki/DMVerity + Status ====== V (for Valid) is returned if every check performed so far was valid. @@ -174,21 +134,22 @@ If any check failed, C (for Corruption) is returned. Example ======= - -Setup a device: - dmsetup create vroot --table \ - "0 2097152 "\ - "verity 1 /dev/sda1 /dev/sda2 4096 4096 2097152 1 "\ +Set up a device: + # dmsetup create vroot --readonly --table \ + "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ "1234000000000000000000000000000000000000000000000000000000000000" A command line tool veritysetup is available to compute or verify -the hash tree or activate the kernel driver. 
This is available from -the LVM2 upstream repository and may be supplied as a package called -device-mapper-verity-tools: - git://sources.redhat.com/git/lvm2 - http://sourceware.org/git/?p=lvm2.git - http://sourceware.org/cgi-bin/cvsweb.cgi/LVM2/verity?cvsroot=lvm2 - -veritysetup -a vroot /dev/sda1 /dev/sda2 \ - 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 +the hash tree or activate the kernel device. This is available from +the cryptsetup upstream repository http://code.google.com/p/cryptsetup/ +(as a libcryptsetup extension). + +Create hash on the device: + # veritysetup format /dev/sda1 /dev/sda2 + ... + Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 + +Activate the device: + # veritysetup create vroot /dev/sda1 /dev/sda2 \ + 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 -- cgit v0.10.2 From 863b13271f1608ab3af6f7a371047d9a66693e38 Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Tue, 3 Jul 2012 12:11:41 +0530 Subject: clk: fix parent validation in __clk_set_parent() The below commit introduced a bug in __clk_set_parent() which could cause it to *skip* the parent validation which makes sure the parent passed to the api is a valid one. commit 7975059db572eb47f0fb272a62afeae272a4b209 Author: Rajendra Nayak Date: Wed Jun 6 14:41:31 2012 +0530 clk: Allow late cache allocation for clk->parents This was identified by the following compiler warning.. drivers/clk/clk.c: In function '__clk_set_parent': drivers/clk/clk.c:1083:5: warning: 'i' may be used uninitialized in this function [-Wuninitialized] .. as reported by Marc Kleine-Budde. There were various options discussed on how to fix this, one being initing 'i' to clk->num_parents, but the below approach was found to be more appropriate as it also makes the 'parent validation' code simpler to read. 
Reported-by: Marc Kleine-Budde Signed-off-by: Rajendra Nayak Signed-off-by: Mike Turquette Cc: stable@kernel.org diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index dcbe056..9a1eb0c 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1067,26 +1067,24 @@ static int __clk_set_parent(struct clk *clk, struct clk *parent) old_parent = clk->parent; - /* find index of new parent clock using cached parent ptrs */ - if (clk->parents) - for (i = 0; i < clk->num_parents; i++) - if (clk->parents[i] == parent) - break; - else + if (!clk->parents) clk->parents = kzalloc((sizeof(struct clk*) * clk->num_parents), GFP_KERNEL); /* - * find index of new parent clock using string name comparison - * also try to cache the parent to avoid future calls to __clk_lookup + * find index of new parent clock using cached parent ptrs, + * or if not yet cached, use string name comparison and cache + * them now to avoid future calls to __clk_lookup. */ - if (i == clk->num_parents) - for (i = 0; i < clk->num_parents; i++) - if (!strcmp(clk->parent_names[i], parent->name)) { - if (clk->parents) - clk->parents[i] = __clk_lookup(parent->name); - break; - } + for (i = 0; i < clk->num_parents; i++) { + if (clk->parents && clk->parents[i] == parent) + break; + else if (!strcmp(clk->parent_names[i], parent->name)) { + if (clk->parents) + clk->parents[i] = __clk_lookup(parent->name); + break; + } + } if (i == clk->num_parents) { pr_debug("%s: clock %s is not a possible parent of clock %s\n", -- cgit v0.10.2 From dab058fd5ff834cb3b9de1d930ce731a605eb0c6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 3 Jul 2012 15:51:22 -0700 Subject: floppy: cancel any pending fd_timeouts before adding a new one In commit 070ad7e793dc ("floppy: convert to delayed work and single-thread wq") the 'fd_timeout' timer was converted to a delayed work. However, the "del_timer(&fd_timeout)" was lost in the process, and any previous pending timeouts would stay active when we then re-queued the timeout. 
This resulted in the floppy probe sequence having a (stale) 20s timeout rather than the intended 3s timeout, and thus made booting with the floppy driver (but no actual floppy controller) take much longer than it should. Of course, there's little reason for most people to compile the floppy driver into the kernel at all, which is why most people never noticed. Canceling the delayed work where we used to do the del_timer() fixes the issue, and makes the floppy probing use the proper new timeout instead. The three second timeout is still very wasteful, but better than the 20s one. Reported-and-tested-by: Andi Kleen Reported-and-tested-by: Calvin Walton Cc: Jiri Kosina Signed-off-by: Linus Torvalds diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index cce7df3..553f43a 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -671,6 +671,7 @@ static void __reschedule_timeout(int drive, const char *message) if (drive == current_reqD) drive = current_drive; + __cancel_delayed_work(&fd_timeout); if (drive < 0 || drive >= N_DRIVE) { delay = 20UL * HZ; -- cgit v0.10.2 From 10684112c9d154172ac34e48a2ab68649e8f63ac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 4 Jul 2012 09:35:35 +1000 Subject: md/raid10: fix careless build error build error introduced by commit b357f04a67c2aeee8 That function doesn't get extra args until a later patch. Bother. Reported-by: Fengguang Wu Reported-by: Simon Kirby Reported-by: Tobias Klausmann Signed-off-by: NeilBrown diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index acf5a82..8da6282 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1394,7 +1394,7 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev, 0, 0)) + if (!mddev_check_plugged(mddev)) md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) -- cgit v0.10.2