From 6ae0516b8a50ece5d766be608a305707e0450060 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 16 Dec 2011 14:04:23 +0100 Subject: block, cfq: fix empty queue crash caused by request merge All requests of a queue could be merged into requests of another queue. Such a queue will then have no requests in it, but it is still on the service tree. This will cause a kernel oops. I encountered a BUG_ON() in cfq_dispatch_request() with the next patch, but the issue should exist even without that patch. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4c12869..3548705 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1655,6 +1655,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { struct cfq_queue *cfqq = RQ_CFQQ(rq); + struct cfq_data *cfqd = q->elevator->elevator_data; + /* * reposition in fifo if next is older than rq */ @@ -1669,6 +1671,16 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, cfq_remove_request(next); cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), rq_is_sync(next)); + + cfqq = RQ_CFQQ(next); + /* + * all requests of this queue are merged to other queues, delete it + * from the service tree. If it's the active_queue, + * cfq_dispatch_requests() will choose to expire it or do idle + */ + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && + cfqq != cfqd->active_queue) + cfq_del_cfqq_rr(cfqd, cfqq); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, -- cgit v0.10.2 From 8bd6960c6ae65d7f92bfb708154ee813417d7b26 Mon Sep 17 00:00:00 2001 From: KyongHo Cho Date: Fri, 16 Dec 2011 21:38:25 +0900 Subject: iommu: Initialize domain->handler in iommu_domain_alloc() Since it is not guaranteed that an iommu driver initializes domain->handler in its domain_init() function, it must be initialized to NULL to prevent calling a function at an arbitrary location when an iommu fault occurs. Signed-off-by: KyongHo Cho Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2fb2963..5b5fa5c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -90,7 +90,7 @@ struct iommu_domain *iommu_domain_alloc(struct bus_type *bus) if (bus == NULL || bus->iommu_ops == NULL) return NULL; - domain = kmalloc(sizeof(*domain), GFP_KERNEL); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; -- cgit v0.10.2 From b3bba872ddb0320a7ecb54decae53c13ceb2ed4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 8 Dec 2011 16:53:54 -0600 Subject: writeback: show writeback reason with __print_symbolic This makes the binary trace understandable by trace-cmd.
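The fix works because __print_symbolic() embeds the {value, "string"} pairs in the event's exported format description, so a userspace decoder such as trace-cmd can translate the raw binary value without access to a kernel-internal array like wb_reason_name. A minimal sketch of the pattern, reusing the WB_WORK_REASON table added below (illustrative only, not the actual writeback event definition):

    TRACE_EVENT(sample_writeback_work,
            TP_PROTO(int reason),
            TP_ARGS(reason),
            TP_STRUCT__entry(
                    __field(int, reason)
            ),
            TP_fast_assign(
                    __entry->reason = reason;
            ),
            /* The symbol table travels with the trace format, so the
             * binary record can be decoded outside the kernel. */
            TP_printk("reason=%s",
                      __print_symbolic(__entry->reason, WB_WORK_REASON))
    );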
CC: Dave Chinner CC: Curt Wohlgemuth CC: Steven Rostedt Signed-off-by: Wu Fengguang diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ac86f8b..517f211 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -47,17 +47,6 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; -const char *wb_reason_name[] = { - [WB_REASON_BACKGROUND] = "background", - [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", - [WB_REASON_SYNC] = "sync", - [WB_REASON_PERIODIC] = "periodic", - [WB_REASON_LAPTOP_TIMER] = "laptop_timer", - [WB_REASON_FREE_MORE_MEM] = "free_more_memory", - [WB_REASON_FS_FREE_SPACE] = "fs_free_space", - [WB_REASON_FORKER_THREAD] = "forker_thread" -}; - /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b99caa8..99d1d0d 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -21,6 +21,16 @@ {I_REFERENCED, "I_REFERENCED"} \ ) +#define WB_WORK_REASON \ + {WB_REASON_BACKGROUND, "background"}, \ + {WB_REASON_TRY_TO_FREE_PAGES, "try_to_free_pages"}, \ + {WB_REASON_SYNC, "sync"}, \ + {WB_REASON_PERIODIC, "periodic"}, \ + {WB_REASON_LAPTOP_TIMER, "laptop_timer"}, \ + {WB_REASON_FREE_MORE_MEM, "free_more_memory"}, \ + {WB_REASON_FS_FREE_SPACE, "fs_free_space"}, \ + {WB_REASON_FORKER_THREAD, "forker_thread"} + struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_work_class, @@ -55,7 +65,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, - wb_reason_name[__entry->reason] + __print_symbolic(__entry->reason, WB_WORK_REASON) ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -184,7 +194,8 @@ TRACE_EVENT(writeback_queue_io, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, - wb_reason_name[__entry->reason]) + __print_symbolic(__entry->reason, WB_WORK_REASON) + ) ); TRACE_EVENT(global_dirty_state, -- cgit v0.10.2 From fa0ad6575f6d459e215dded90b10cc455a889145 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 5 Nov 2011 12:21:30 +0100 Subject: kconfig: adapt update-po-config to new UML layout Commit 5c48b108 ("um: take arch/um/sys-x86 to arch/x86/um") broke the make target update-po-config, as its symlink trick (again) fails. (Previous breakage was fixed with commit bdc69ca4 ("kconfig: change update-po-config to reflect new layout of arch/um").) The new UML layout makes it possible to drop the symlink trick entirely. And if, one day, another architecture supports UML too, that should now work without breaking this make target again.
Signed-off-by: Paul Bolle Signed-off-by: Michal Marek diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index ba573fe..914833d 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -60,8 +60,8 @@ update-po-config: $(obj)/kxgettext $(obj)/gconf.glade.h --directory=$(srctree) --directory=$(objtree) \ --output $(obj)/config.pot $(Q)sed -i s/CHARSET/UTF-8/ $(obj)/config.pot - $(Q)ln -fs Kconfig.x86 arch/um/Kconfig - $(Q)(for i in `ls $(srctree)/arch/*/Kconfig`; \ + $(Q)(for i in `ls $(srctree)/arch/*/Kconfig \ + $(srctree)/arch/*/um/Kconfig`; \ do \ echo " GEN $$i"; \ $(obj)/kxgettext $$i \ @@ -69,7 +69,6 @@ update-po-config: $(obj)/kxgettext $(obj)/gconf.glade.h done ) $(Q)msguniq --sort-by-file --to-code=UTF-8 $(obj)/config.pot \ --output $(obj)/linux.pot - $(Q)rm -f $(srctree)/arch/um/Kconfig $(Q)rm -f $(obj)/config.pot PHONY += allnoconfig allyesconfig allmodconfig alldefconfig randconfig -- cgit v0.10.2 From c070e38e4ee005f55895df177a9e14d90d6204b3 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 Nov 2011 10:54:02 -0300 Subject: [media] omap3isp: Fix crash caused by subdevs now having a pointer to devnodes Commit 3e0ec41c5c5ee14e27f65e28d4a616de34f59a97 ("V4L: dynamically allocate video_device nodes in subdevices") makes the subdev devnode field a pointer to a dynamically allocated video_device instead of embedding the video_device directly. Fix accesses to the devnode accordingly. Signed-off-by: Laurent Pinchart Acked-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/video/omap3isp/ispccdc.c b/drivers/media/video/omap3isp/ispccdc.c index b0b0fa5..54a4a3f 100644 --- a/drivers/media/video/omap3isp/ispccdc.c +++ b/drivers/media/video/omap3isp/ispccdc.c @@ -1408,7 +1408,7 @@ static void ccdc_hs_vs_isr(struct isp_ccdc_device *ccdc) { struct isp_pipeline *pipe = to_isp_pipeline(&ccdc->video_out.video.entity); - struct video_device *vdev = &ccdc->subdev.devnode; + struct video_device *vdev = ccdc->subdev.devnode; struct v4l2_event event; memset(&event, 0, sizeof(event)); diff --git a/drivers/media/video/omap3isp/ispstat.c b/drivers/media/video/omap3isp/ispstat.c index 68d5394..bc0b2c7 100644 --- a/drivers/media/video/omap3isp/ispstat.c +++ b/drivers/media/video/omap3isp/ispstat.c @@ -496,7 +496,7 @@ static int isp_stat_bufs_alloc(struct ispstat *stat, u32 size) static void isp_stat_queue_event(struct ispstat *stat, int err) { - struct video_device *vdev = &stat->subdev.devnode; + struct video_device *vdev = stat->subdev.devnode; struct v4l2_event event; struct omap3isp_stat_event_status *status = (void *)event.u.data; -- cgit v0.10.2 From 609f6ea1c9cdfe0c43a927e13205a57d0c266d5a Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 21 Dec 2011 15:27:24 +0100 Subject: block: re-use existing 'reading' variable instead of checking direction again Signed-off-by: majianpeng Signed-off-by: Jens Axboe diff --git a/block/blk-map.c b/block/blk-map.c index 164cd00..623e1cd 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (IS_ERR(bio)) return PTR_ERR(bio); - if (rq_data_dir(rq) == WRITE) + if (!reading) bio->bi_rw |= REQ_WRITE; if (do_copy) -- cgit v0.10.2 From e30e2fdfe56288576ee9e04dbb06b4bd5f282203 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 22 Dec 2011 02:45:29 +0530 Subject: VFS: Fix race between CPU hotplug and lglocks Currently, the *_global_[un]lock_online() routines are not at all synchronized with CPU hotplug.
Soft-lockups detected as a consequence of this race was reported earlier at https://lkml.org/lkml/2011/8/24/185. (Thanks to Cong Meng for finding out that the root-cause of this issue is the race condition between br_write_[un]lock() and CPU hotplug, which results in the lock states getting messed up). Fixing this race by just adding {get,put}_online_cpus() at appropriate places in *_global_[un]lock_online() is not a good option, because, then suddenly br_write_[un]lock() would become blocking, whereas they have been kept as non-blocking all this time, and we would want to keep them that way. So, overall, we want to ensure 3 things: 1. br_write_lock() and br_write_unlock() must remain as non-blocking. 2. The corresponding lock and unlock of the per-cpu spinlocks must not happen for different sets of CPUs. 3. Either prevent any new CPU online operation in between this lock-unlock, or ensure that the newly onlined CPU does not proceed with its corresponding per-cpu spinlock unlocked. To achieve all this: (a) We introduce a new spinlock that is taken by the *_global_lock_online() routine and released by the *_global_unlock_online() routine. (b) We register a callback for CPU hotplug notifications, and this callback takes the same spinlock as above. (c) We maintain a bitmap which is close to the cpu_online_mask, and once it is initialized in the lock_init() code, all future updates to it are done in the callback, under the above spinlock. (d) The above bitmap is used (instead of cpu_online_mask) while locking and unlocking the per-cpu locks. The callback takes the spinlock upon the CPU_UP_PREPARE event. So, if the br_write_lock-unlock sequence is in progress, the callback keeps spinning, thus preventing the CPU online operation till the lock-unlock sequence is complete. This takes care of requirement (3). The bitmap that we maintain remains unmodified throughout the lock-unlock sequence, since all updates to it are managed by the callback, which takes the same spinlock as the one taken by the lock code and released only by the unlock routine. Combining this with (d) above, satisfies requirement (2). Overall, since we use a spinlock (mentioned in (a)) to prevent CPU hotplug operations from racing with br_write_lock-unlock, requirement (1) is also taken care of. By the way, it is to be noted that a CPU offline operation can actually run in parallel with our lock-unlock sequence, because our callback doesn't react to notifications earlier than CPU_DEAD (in order to maintain our bitmap properly). And this means, since we use our own bitmap (which is stale, on purpose) during the lock-unlock sequence, we could end up unlocking the per-cpu lock of an offline CPU (because we had locked it earlier, when the CPU was online), in order to satisfy requirement (2). But this is harmless, though it looks a bit awkward. Debugged-by: Cong Meng Signed-off-by: Srivatsa S. 
Bhat Signed-off-by: Al Viro Cc: stable@vger.kernel.org diff --git a/include/linux/lglock.h b/include/linux/lglock.h index f549056..87f402c 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -22,6 +22,7 @@ #include #include #include +#include /* can make br locks by using local lock for read side, global lock for write */ #define br_lock_init(name) name##_lock_init() @@ -72,9 +73,31 @@ #define DEFINE_LGLOCK(name) \ \ + DEFINE_SPINLOCK(name##_cpu_lock); \ + cpumask_t name##_cpus __read_mostly; \ DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ DEFINE_LGLOCK_LOCKDEP(name); \ \ + static int \ + name##_lg_cpu_callback(struct notifier_block *nb, \ + unsigned long action, void *hcpu) \ + { \ + switch (action & ~CPU_TASKS_FROZEN) { \ + case CPU_UP_PREPARE: \ + spin_lock(&name##_cpu_lock); \ + cpu_set((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + break; \ + case CPU_UP_CANCELED: case CPU_DEAD: \ + spin_lock(&name##_cpu_lock); \ + cpu_clear((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + } \ + return NOTIFY_OK; \ + } \ + static struct notifier_block name##_lg_cpu_notifier = { \ + .notifier_call = name##_lg_cpu_callback, \ + }; \ void name##_lock_init(void) { \ int i; \ LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ @@ -83,6 +106,11 @@ lock = &per_cpu(name##_lock, i); \ *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ } \ + register_hotcpu_notifier(&name##_lg_cpu_notifier); \ + get_online_cpus(); \ + for_each_online_cpu(i) \ + cpu_set(i, name##_cpus); \ + put_online_cpus(); \ } \ EXPORT_SYMBOL(name##_lock_init); \ \ @@ -124,9 +152,9 @@ \ void name##_global_lock_online(void) { \ int i; \ - preempt_disable(); \ + spin_lock(&name##_cpu_lock); \ rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_lock(lock); \ @@ -137,12 +165,12 @@ void name##_global_unlock_online(void) { \ int i; \ rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_unlock(lock); \ } \ - preempt_enable(); \ + spin_unlock(&name##_cpu_lock); \ } \ EXPORT_SYMBOL(name##_global_unlock_online); \ \ -- cgit v0.10.2 From 77e00f2ea94abee1ad13bdfde19cf7aa25992b0e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 21 Dec 2011 11:58:17 -0500 Subject: drm/radeon/kms: bail on BTC parts if MC ucode is missing We already do this for cayman, need to also do it for BTC parts. The default memory and voltage setup is not adequate for advanced operation. Continuing will result in an unusable display. Signed-off-by: Alex Deucher Cc: stable@kernel.org Cc: Jean Delvare Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index 5e00d16..92c9628 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -3276,6 +3276,18 @@ int evergreen_init(struct radeon_device *rdev) rdev->accel_working = false; } } + + /* Don't start up if the MC ucode is missing on BTC parts. + * The default clocks and voltages before the MC ucode + * is loaded are not suffient for advanced operations. 
+ */ + if (ASIC_IS_DCE5(rdev)) { + if (!rdev->mc_fw && !(rdev->flags & RADEON_IS_IGP)) { + DRM_ERROR("radeon: MC ucode required for NI+.\n"); + return -EINVAL; + } + } + return 0; } -- cgit v0.10.2 From 8a78389651b3e411ec5a7df61404734f52d6f4eb Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Wed, 21 Dec 2011 05:18:33 -0500 Subject: vmwgfx: fix incorrect VRAM size check in vmw_kms_fb_create() Commit e133e737 didn't correctly fix the integer overflow issue. - unsigned int required_size; + u64 required_size; ... required_size = mode_cmd->pitch * mode_cmd->height; - if (unlikely(required_size > dev_priv->vram_size)) { + if (unlikely(required_size > (u64) dev_priv->vram_size)) { Note that both pitch and height are u32. Their product is still u32 and would overflow before being assigned to required_size. A correct way is to convert pitch and height to u64 before the multiplication. required_size = (u64)mode_cmd->pitch * (u64)mode_cmd->height; This patch calls the existing vmw_kms_validate_mode_vram() for validation. Signed-off-by: Xi Wang Reviewed-and-tested-by: Thomas Hellstrom Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 8aa1dbb..f94b33a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1093,7 +1093,6 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, struct vmw_surface *surface = NULL; struct vmw_dma_buffer *bo = NULL; struct ttm_base_object *user_obj; - u64 required_size; int ret; /** @@ -1102,8 +1101,9 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, * requested framebuffer. */ - required_size = mode_cmd->pitch * mode_cmd->height; - if (unlikely(required_size > (u64) dev_priv->vram_size)) { + if (!vmw_kms_validate_mode_vram(dev_priv, + mode_cmd->pitch, + mode_cmd->height)) { DRM_ERROR("VRAM size is too small for requested mode.\n"); return ERR_PTR(-ENOMEM); } -- cgit v0.10.2 From 7cc8583372a21d98a23b703ad96cab03180b5030 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 22 Dec 2011 13:23:59 -0800 Subject: sparc64: Fix MSIQ HV call ordering in pci_sun4v_msiq_build_irq(). This was silently working for many years and stopped working on Niagara-T3 machines. We need to set the MSIQ to VALID before we can set its state to IDLE. On Niagara-T3, setting the state to IDLE first was causing HV_EINVAL errors. The hypervisor documentation says, rather ambiguously, that the MSIQ must be "initialized" before one can set the state. I previously understood this to mean merely that a successful setconf() operation has been performed on the MSIQ, which we have done at this point. But it seems to also mean that it has been set VALID too. Signed-off-by: David S. Miller diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index b272cda..af5755d 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -849,10 +849,10 @@ static int pci_sun4v_msiq_build_irq(struct pci_pbm_info *pbm, if (!irq) return -ENOMEM; - if (pci_sun4v_msiq_setstate(pbm->devhandle, msiqid, HV_MSIQSTATE_IDLE)) - return -EINVAL; if (pci_sun4v_msiq_setvalid(pbm->devhandle, msiqid, HV_MSIQ_VALID)) return -EINVAL; + if (pci_sun4v_msiq_setstate(pbm->devhandle, msiqid, HV_MSIQSTATE_IDLE)) + return -EINVAL; return irq; } -- cgit v0.10.2 From 09cd9270ea52e0f9851528e8ed028073f96b3c34 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:56:55 +1100 Subject: md/linear: fix hot-add of devices to linear arrays.
commit d70ed2e4fafdbef0800e73942482bb075c21578b broke hot-add to a linear array. After that commit, metadata is not written to devices until they have been fully integrated into the array as determined by saved_raid_disk. That patch arranged to clear that field after a recovery completed. However for linear arrays, there is no recovery - the integration is instantaneous. So we need to explicitly clear the saved_raid_disk field. Signed-off-by: NeilBrown diff --git a/drivers/md/linear.c b/drivers/md/linear.c index c3273ef..6274565 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -230,6 +230,7 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) return -EINVAL; rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; newconf = linear_conf(mddev,mddev->raid_disks+1); -- cgit v0.10.2 From 30d7a4836847bdb10b32c78a4879d4aebe0f193b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:57:00 +1100 Subject: md/raid5: ensure correct assessment of drives during degraded reshape. While reshaping a degraded array (as when reshaping a RAID0 by first converting it to a degraded RAID4) we currently get confused about which devices are in_sync. In most cases we get it right, but in the region that is being reshaped we need to treat non-failed devices as in-sync when we have the data but haven't actually written it out yet. Reported-by: Adam Kwolek Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 31670f8..858fdbb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3065,11 +3065,17 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else { + else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) /* in sync if before recovery_offset */ - if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) - set_bit(R5_Insync, &dev->flags); - } + set_bit(R5_Insync, &dev->flags); + else if (test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Expanded, &dev->flags)) + /* If we've reshaped into here, we assume it is Insync. + * We will shortly update recovery_offset to make + * it official. + */ + set_bit(R5_Insync, &dev->flags); + if (rdev && test_bit(R5_WriteError, &dev->flags)) { clear_bit(R5_Insync, &dev->flags); if (!test_bit(Faulty, &rdev->flags)) { -- cgit v0.10.2 From 60fc13702a1b35118c1548e9c257fa038cecb658 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:57:19 +1100 Subject: md: don't give up looking for spares on first failure-to-add Before performing a recovery we try to remove any spares that might not be working, then add any that might have become relevant. Currently we abort on the first spare that cannot be added. This is a false optimisation. It is conceivable that - depending on rules in the personality - a subsequent spare might be accepted. Also the loop does other things like count the available spares and reset the 'recovery_offset' value. If we abort early these might not happen properly. So remove the early abort. In particular, if you have an array that is undergoing recovery and which has extra spares, the recovery may not restart after a reboot, as the count of 'spares' might end up as zero.
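A reduced sketch of the control-flow change (the predicate and helper names are hypothetical; the real logic lives in remove_and_add_spares() in the diff below):

    struct md_rdev *rdev;
    int spares = 0;

    list_for_each_entry(rdev, &mddev->disks, same_set) {
            if (!looks_like_usable_spare(rdev))     /* hypothetical */
                    continue;
            if (try_hot_add(mddev, rdev) == 0) {    /* hypothetical */
                    spares++;
                    continue;
            }
            /*
             * The old code did "break" here, so the first spare the
             * personality rejected ended the whole scan: later spares
             * were never counted or added, and recovery_offset values
             * were left unreset.
             */
    }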
Reported-by: Anssi Hannula Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index ee98173..f47f1f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7360,8 +7360,7 @@ static int remove_and_add_spares(struct mddev *mddev) spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); - } else - break; + } } } } -- cgit v0.10.2 From 961902c0f8240175729274cd14198872f42072b7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:57:48 +1100 Subject: md/bitmap: It is OK to clear bits during recovery. commit d0a4bb492772ce5c4bdfba3744a99ed6f6fb238f introduced a regression which is annoying but fairly harmless. When writing to an array that is undergoing recovery (a spare is being integrated into the array), writing to the array will set bits in the bitmap, but they will not be cleared when the write completes. For bits covering areas that have not been recovered yet this is not a problem as the recovery will clear the bits. However bits set in already-recovered regions will stay set and never be cleared. This doesn't risk data integrity. The only negatives are: - next time there is a crash, more resyncing than necessary will be done. - the bitmap doesn't look clean, which is confusing. While an array is recovering we don't want to update the 'events_cleared' setting in the bitmap but we do still want to clear bits that have very recently been set - providing they were written to the recovering device. So split those two needs (which previously both depended on 'success') and always clear the bit if the write went to all devices. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index b690711..6d03774 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1393,9 +1393,6 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto atomic_read(&bitmap->behind_writes), bitmap->mddev->bitmap_info.max_write_behind); } - if (bitmap->mddev->degraded) - /* Never clear bits or update events_cleared when degraded */ - success = 0; while (sectors) { sector_t blocks; @@ -1409,7 +1406,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto return; } - if (success && + if (success && !bitmap->mddev->degraded && bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; -- cgit v0.10.2 From 55205c916e179e09773d98d290334d319f45ac6b Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 22 Dec 2011 16:15:40 +0100 Subject: oprofile, arm/sh: Fix oprofile_arch_exit() linkage issue This change fixes a linking problem, which happens if oprofile is selected to be compiled as built-in: `oprofile_arch_exit' referenced in section `.init.text' of arch/arm/oprofile/built-in.o: defined in discarded section `.exit.text' of arch/arm/oprofile/built-in.o The problem appeared after commit 87121ca504, which introduced oprofile_arch_exit() calls from an __init function. Note that the aforementioned commit has been backported to stable branches, and the problem is known to be reproduced at least with 3.0.13 and 3.1.5 kernels.
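The section mechanics behind the link error, in miniature (placeholder names, not the actual oprofile code): when the code is built in, .exit.text is discarded at link time, so any reference to it from kept sections is unresolved.

    /* __exit places the function in .exit.text, which only survives
     * for modules; built-in kernels throw the section away. */
    void __exit my_arch_exit(void)
    {
    }

    int __init my_arch_init(void)
    {
            if (my_setup() < 0) {   /* hypothetical failure path */
                    my_arch_exit(); /* reference from kept code into a
                                     * discarded section: link error
                                     * when built in */
                    return -ENODEV;
            }
            return 0;
    }

Dropping the __exit annotation, as the patch below does, keeps the function in regular .text so such error paths can call it safely.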
Signed-off-by: Vladimir Zapolskiy Signed-off-by: Robert Richter Cc: Will Deacon Cc: oprofile-list Cc: Link: http://lkml.kernel.org/r/20111222151540.GB16765@erda.amd.com Signed-off-by: Ingo Molnar diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index c074e66..4e0a371 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -116,7 +116,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) return oprofile_perf_init(ops); } -void __exit oprofile_arch_exit(void) +void oprofile_arch_exit(void) { oprofile_perf_exit(); } diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c index b4c2d2b..e4dd5d5 100644 --- a/arch/sh/oprofile/common.c +++ b/arch/sh/oprofile/common.c @@ -49,7 +49,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) return oprofile_perf_init(ops); } -void __exit oprofile_arch_exit(void) +void oprofile_arch_exit(void) { oprofile_perf_exit(); kfree(sh_pmu_op_name); @@ -60,5 +60,5 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) ops->backtrace = sh_backtrace; return -ENODEV; } -void __exit oprofile_arch_exit(void) {} +void oprofile_arch_exit(void) {} #endif /* CONFIG_HW_PERF_EVENTS */ -- cgit v0.10.2 From 8d532b2afb2eacc84588db709ec280a3d1219be3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 23 Dec 2011 07:53:00 -0500 Subject: Btrfs: fix worker lock misuse in find_worker Dan Carpenter noticed that we were doing a double unlock on the worker lock, and sometimes picking a worker thread without the lock held. This fixes both errors. Signed-off-by: Chris Mason Reported-by: Dan Carpenter diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index cb97174..0b39458 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -563,8 +563,8 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) struct list_head *fallback; int ret; -again: spin_lock_irqsave(&workers->lock, flags); +again: worker = next_worker(workers); if (!worker) { @@ -579,6 +579,7 @@ again: spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ ret = __btrfs_start_workers(workers); + spin_lock_irqsave(&workers->lock, flags); if (ret) goto fallback; goto again; -- cgit v0.10.2 From 08c422c27f855d27b0b3d9fa30ebd938d4ae6f1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 23 Dec 2011 07:58:13 -0500 Subject: Btrfs: call d_instantiate after all ops are setup This closes races where btrfs is calling d_instantiate too soon during inode creation. All of the callers of btrfs_add_nondir are updated to instantiate after the inode is fully setup in memory. 
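The shape of the fix, condensed (a sketch, not the literal btrfs code; the real call sites are in the diff below):

    /* Racy: d_instantiate() publishes the dentry, so a concurrent
     * lookup can observe the inode before it is fully initialized. */
    d_instantiate(dentry, inode);
    inode->i_mapping->a_ops = &btrfs_aops;  /* too late */

    /* Fixed ordering: complete all in-memory setup first, and make
     * the dentry visible only as the final step. */
    inode->i_mapping->a_ops = &btrfs_aops;
    d_instantiate(dentry, inode);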
Signed-off-by: Al Viro Signed-off-by: Chris Mason diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 740e67b..13b0542 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4590,10 +4590,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, backref, index); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } if (err > 0) err = -EEXIST; return err; @@ -4655,6 +4651,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, else { init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4722,6 +4719,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4779,6 +4777,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); BUG_ON(err); + d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -7245,6 +7244,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, drop_inode = 1; out_unlock: + if (!err) + d_instantiate(dentry, inode); nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); if (drop_inode) { -- cgit v0.10.2 From 2e64694de21a812d637dcbea4471ad1f7897b049 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 23 Dec 2011 14:24:25 +0100 Subject: perf/x86: Fix raw_spin_unlock_irqrestore() usage Use raw_spin_unlock_irqrestore() as the counterpart to raw_spin_lock_irqsave(). Signed-off-by: Robert Richter Cc: Stephane Eranian Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1324646665-13334-1-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8d601b1..121f1be 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1169,7 +1169,7 @@ again: */ c = &unconstrained; } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); -- cgit v0.10.2 From 0b8fd3033c308e4088760aa1d38ce77197b4e074 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Dec 2011 15:49:55 +0000 Subject: xfs: log the inode in ->write_inode calls for kupdate If the writeback code writes back an inode because it has expired we currently use the non-blocking ->write_inode path. This means any inode that is pinned is skipped. With delayed logging and a workload that otherwise has very little log traffic, it is very likely that an inode that gets constantly written to is always pinned, and thus we keep refusing to write it. The VM writeback code at that point redirties it and doesn't try to write it again for another 30 seconds. This means that under certain scenarios time-based metadata writeback never happens. Fix this by calling into xfs_log_inode for kupdate in addition to data integrity syncs, and thus transferring the inode to the log ASAP.
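In ->write_inode() terms, kupdate writeback now takes the same logging path as data integrity sync, roughly as follows (a sketch with assumed helper names; the actual patch is below):

    static int
    sketch_write_inode(struct inode *inode, struct writeback_control *wbc)
    {
            /* Periodic (kupdate) writeback now forces the inode into
             * the log just like WB_SYNC_ALL, instead of taking the
             * non-blocking path that skips pinned inodes forever. */
            if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate)
                    return log_inode(inode);           /* hypothetical */
            return flush_inode_nonblocking(inode);     /* hypothetical */
    }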
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Tested-by: Mark Tinguely Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3eca58f..1add17c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -905,7 +905,7 @@ xfs_fs_write_inode( if (!ip->i_update_core) return 0; - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* * Make sure the inode has made it into the log. Instead * of forcing it all the way to stable storage using a -- cgit v0.10.2 From be4f1ac828776bbc7868a68b465cd8eedb733cfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Dec 2011 20:08:41 +0000 Subject: xfs: log all dirty inodes in xfs_fs_sync_fs Since Linux 2.6.36 the writeback code has introduced various measures for live lock prevention during sync(). Unfortunately some of these are actively harmful for the XFS model, where the inode gets marked dirty for metadata from the data I/O handler. The older_than_this checks are now more strictly enforced since commit "writeback: avoid livelocking WB_SYNC_ALL writeback", which only calls into __writeback_inodes_sb once and thus samples the current cut-off time only once. But on slow enough devices the previous asynchronous sync pass might not have fully completed yet, and thus XFS might mark metadata dirty only after that sampling of the cut-off time for the blocking pass already happened. I have not reproduced this myself on a real system, but by introducing artificial delay into the XFS I/O completion workqueues it can be reproduced easily. Fix this by iterating over all XFS inodes in ->sync_fs and logging all that are dirty. This might log inodes that only got redirtied after the previous pass, but given how cheap delayed logging of inodes is it isn't a major concern for performance. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Tested-by: Mark Tinguely Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1add17c..8a89949 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -869,27 +869,6 @@ xfs_fs_dirty_inode( } STATIC int -xfs_log_inode( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - -STATIC int xfs_fs_write_inode( struct inode *inode, struct writeback_control *wbc) @@ -902,8 +881,6 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (!ip->i_update_core) - return 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* * Make sure the inode has made it into the log. Instead * of forcing it all the way to stable storage using a * synchronous transaction we let the log force inside the * ->sync_fs call do that for us, which reduces the number * of synchronous log forces dramatically. */ - error = xfs_log_inode(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); if (error) goto out; return 0; } else { + if (!ip->i_update_core) + return 0; + /* * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed.
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index be5c51d..f0994aedc 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -336,6 +336,32 @@ xfs_sync_fsdata( return error; } +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -359,6 +385,16 @@ xfs_quiesce_data( { int error, error2 = 0; + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); + xfs_qm_sync(mp, SYNC_TRYLOCK); xfs_qm_sync(mp, SYNC_WAIT); diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e..fa96547 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); -- cgit v0.10.2 From 5f0a6e2d503896062f641639dacfe5055c2f593b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Dec 2011 21:51:06 -0800 Subject: Linux 3.2-rc7 diff --git a/Makefile b/Makefile index a43733d..ea51081 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 2 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Saber-toothed Squirrel # *DOCUMENTATION* -- cgit v0.10.2 From 81378f728fe560e175fb2e8fd33206793567e896 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 24 Dec 2011 19:03:46 +0100 Subject: netfilter: ctnetlink: fix return value of ctnetlink_get_expect() This fixes one bogus error that is returned to user-space: libnetfilter_conntrack/utils# ./expect_get TEST: get expectation (-1)(Unknown error 18446744073709551504) This patch includes the correct handling for EAGAIN (nfnetlink uses this error value to restart the operation after module auto-loading). 
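Both fixes follow a common netlink-handler pattern: drop the object reference on every exit path, and never let -EAGAIN escape to user space, since nfnetlink reserves that value for its retry-after-module-load logic. A compressed sketch (the wrapper name is hypothetical; the real code is in the diff below):

    exp = nf_ct_expect_find_get(net, zone, &tuple); /* takes a ref */
    if (!exp)
            return -ENOENT;

    err = build_and_unicast_reply(skb, exp);        /* hypothetical */
    nf_ct_expect_put(exp);                          /* put on every path */

    /* Returning -EAGAIN would make nfnetlink restart the request in
     * a loop; translate it into a real error for the caller. */
    return err == -EAGAIN ? -ENOBUFS : err;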
Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ef21b22..3d7ea7a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1869,25 +1869,30 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (skb2 == NULL) { + nf_ct_expect_put(exp); goto out; + } rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); + nf_ct_expect_put(exp); if (err <= 0) goto free; - nf_ct_expect_put(exp); + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; - return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + return 0; free: kfree_skb(skb2); out: - nf_ct_expect_put(exp); - return err; + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; } static int -- cgit v0.10.2 From 1a31a4a8388a90e9240fb4e5e5c9c909fcfdfd0e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 24 Dec 2011 19:28:47 +0100 Subject: netfilter: ctnetlink: fix scheduling while atomic if helper is autoloaded This patch fixes one scheduling while atomic error: [ 385.565186] ctnetlink v0.93: registering with nfnetlink. [ 385.565349] BUG: scheduling while atomic: lt-expect_creat/16163/0x00000200 It can be triggered with utils/expect_create included in libnetfilter_conntrack if the FTP helper is not loaded. Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3d7ea7a..b697777 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1358,12 +1358,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); + spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { + spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } + spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), -- cgit v0.10.2 From bb52c7acf871537a468433775151339f783d2e8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Dec 2011 19:28:51 +0000 Subject: netem: dont call vfree() under spinlock and BH disabled commit 6373a9a286 (netem: use vmalloc for distribution table) added a regression, since vfree() is called while holding a spinlock and BH being disabled. Fix this by doing the pointers swap in critical section, and freeing after spinlock release. Also add __GFP_NOWARN to the kmalloc() try, since we fallback to vmalloc(). Signed-off-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. 
Miller diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eb3b9a8..a4ab207 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -488,7 +488,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return -EINVAL; s = sizeof(struct disttable) + n * sizeof(s16); - d = kmalloc(s, GFP_KERNEL); + d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); if (!d) d = vmalloc(s); if (!d) @@ -501,9 +501,10 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - dist_free(q->delay_dist); - q->delay_dist = d; + swap(q->delay_dist, d); spin_unlock_bh(root_lock); + + dist_free(d); return 0; } -- cgit v0.10.2 From 2ca526bf4953380abfe5dff455e356967b239c70 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 20 Dec 2011 21:23:28 +0100 Subject: MAINTAINERS: firewire git URL update Signed-off-by: Stefan Richter diff --git a/MAINTAINERS b/MAINTAINERS index 28f65c2..37deb5e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2624,7 +2624,7 @@ FIREWIRE SUBSYSTEM M: Stefan Richter L: linux1394-devel@lists.sourceforge.net W: http://ieee1394.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394-2.6.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git S: Maintained F: drivers/firewire/ F: include/linux/firewire*.h -- cgit v0.10.2 From 0924ab2cfa98b1ece26c033d696651fd62896c69 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Dec 2011 19:25:13 +0100 Subject: KVM: x86: Prevent starting PIT timers in the absence of irqchip support User space may create the PIT and forgets about setting up the irqchips. In that case, firing PIT IRQs will crash the host: BUG: unable to handle kernel NULL pointer dereference at 0000000000000128 IP: [] kvm_set_irq+0x30/0x170 [kvm] ... Call Trace: [] pit_do_work+0x51/0xd0 [kvm] [] process_one_work+0x111/0x4d0 [] worker_thread+0x152/0x340 [] kthread+0x7e/0x90 [] kernel_thread_helper+0x4/0x10 Prevent this by checking the irqchip mode before starting a timer. We can't deny creating the PIT if the irqchips aren't set up yet as current user land expects this order to work. 
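For context, a hedged userspace sketch of the sequence that could bring down the host before this fix (error handling omitted; actually triggering the oops additionally requires the guest to program a PIT channel):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm = ioctl(kvm, KVM_CREATE_VM, 0);

            /* Note: no KVM_CREATE_IRQCHIP. The PIT is still created,
             * since current userland relies on this ordering, but with
             * the fix its timers are simply never started, so
             * pit_do_work() can no longer call kvm_set_irq() against a
             * nonexistent irqchip. */
            ioctl(vm, KVM_CREATE_PIT, 0);
            return 0;
    }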
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 76e3f1c..405f262 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -338,11 +338,15 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) +static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; struct kvm_timer *pt = &ps->pit_timer; s64 interval; + if (!irqchip_in_kernel(kvm)) + return; + interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); @@ -394,13 +398,13 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) /* FIXME: enhance mode 4 precision */ case 4: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(ps, val, 0); + create_pit_timer(kvm, val, 0); } break; case 2: case 3: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(ps, val, 1); + create_pit_timer(kvm, val, 1); } break; default: -- cgit v0.10.2 From 423873736b78f549fbfa2f715f2e4de7e6c5e1e9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 20 Dec 2011 21:59:03 -0700 Subject: KVM: Remove ability to assign a device without iommu support This option has no users and it exposes a security hole that we can allow devices to be assigned without iommu protection. Make KVM_DEV_ASSIGN_ENABLE_IOMMU a mandatory option. Signed-off-by: Alex Williamson Signed-off-by: Marcelo Tosatti diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7945b0b..ee2c96b 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1151,6 +1151,9 @@ following flags are specified: /* Depends on KVM_CAP_IOMMU */ #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure +isolation of the device. Usages not specifying this flag are deprecated. 
+ 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 3ad0925..a251a28 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -487,6 +487,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) + return -EINVAL; + mutex_lock(&kvm->lock); idx = srcu_read_lock(&kvm->srcu); @@ -544,16 +547,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, list_add(&match->list, &kvm->arch.assigned_dev_head); - if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match); + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); if (r) goto out_list_del; } + r = kvm_assign_device(kvm, match); + if (r) + goto out_list_del; out: srcu_read_unlock(&kvm->srcu, idx); @@ -593,8 +594,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, goto out; } - if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) - kvm_deassign_device(kvm, match); + kvm_deassign_device(kvm, match); kvm_free_assigned_device(kvm, match); -- cgit v0.10.2 From 3d27e23b17010c668db311140b17bbbb70c78fb9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 20 Dec 2011 21:59:09 -0700 Subject: KVM: Device assignment permission checks Only allow KVM device assignment to attach to devices which: - Are not bridges - Have BAR resources (assume others are special devices) - The user has permissions to use Assigning a bridge is a configuration error, it's not supported, and typically doesn't result in the behavior the user is expecting anyway. Devices without BAR resources are typically chipset components that also don't have host drivers. We don't want users to hold such devices captive or cause system problems by fencing them off into an iommu domain. We determine "permission to use" by testing whether the user has access to the PCI sysfs resource files. By default a normal user will not have access to these files, so it provides a good indication that an administration agent has granted the user access to the device. [Yang Bai: add missing #include] [avi: fix comment style] Signed-off-by: Alex Williamson Signed-off-by: Yang Bai Signed-off-by: Marcelo Tosatti diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ee2c96b..4df9af4 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1154,6 +1154,10 @@ following flags are specified: The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure isolation of the device. Usages not specifying this flag are deprecated. +Only PCI header type 0 devices with PCI BAR resources are supported by +device assignment. The user requesting this ioctl must have read/write +access to the PCI sysfs resource files associated with the device. + 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index a251a28..758e3b3 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include "irq.h" static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, @@ -480,12 +482,73 @@ out: return r; } +/* + * We want to test whether the caller has been granted permissions to + * use this device. 
To be able to configure and control the device, + * the user needs access to PCI configuration space and BAR resources. + * These are accessed through PCI sysfs. PCI config space is often + * passed to the process calling this ioctl via file descriptor, so we + * can't rely on access to that file. We can check for permissions + * on each of the BAR resource files, which is a pretty clear + * indicator that the user has been granted access to the device. + */ +static int probe_sysfs_permissions(struct pci_dev *dev) +{ +#ifdef CONFIG_SYSFS + int i; + bool bar_found = false; + + for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { + char *kpath, *syspath; + struct path path; + struct inode *inode; + int r; + + if (!pci_resource_len(dev, i)) + continue; + + kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); + if (!kpath) + return -ENOMEM; + + /* Per sysfs-rules, sysfs is always at /sys */ + syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); + kfree(kpath); + if (!syspath) + return -ENOMEM; + + r = kern_path(syspath, LOOKUP_FOLLOW, &path); + kfree(syspath); + if (r) + return r; + + inode = path.dentry->d_inode; + + r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); + path_put(&path); + if (r) + return r; + + bar_found = true; + } + + /* If no resources, probably something special */ + if (!bar_found) + return -EPERM; + + return 0; +#else + return -EINVAL; /* No way to control the device without sysfs */ +#endif +} + static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) { int r = 0, idx; struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + u8 header_type; if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) return -EINVAL; @@ -516,6 +579,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, r = -EINVAL; goto out_free; } + + /* Don't allow bridges to be assigned */ + pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type); + if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) { + r = -EPERM; + goto out_put; + } + + r = probe_sysfs_permissions(dev); + if (r) + goto out_put; + if (pci_enable_device(dev)) { printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); r = -EBUSY; -- cgit v0.10.2 From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 21 Dec 2011 12:28:29 +0100 Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid Unlike all of the other cpuid bits, the TSC deadline timer bit is set unconditionally, regardless of what userspace wants. This is broken in several ways: - if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the TSC deadline timer feature, a guest that uses the feature will break - live migration to older host kernels that don't support the TSC deadline timer will cause the feature to be pulled from under the guest's feet; breaking it - guests that are broken wrt the feature will fail. Fix by not enabling the feature automatically; instead report it to userspace. Because the feature depends on KVM_CREATE_IRQCHIP, which we cannot guarantee will be called, we expose it via a KVM_CAP_TSC_DEADLINE_TIMER and not KVM_GET_SUPPORTED_CPUID. Fixes the Illumos guest kernel, which uses the TSC deadline timer feature. 
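Usage from the VMM side, per the documentation hunk below (a sketch; error handling omitted):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Returns nonzero if the host can emulate the TSC deadline timer.
     * Only then, and only when KVM_CREATE_IRQCHIP is used or the
     * feature is emulated in userspace, should the VMM set CPUID
     * leaf 1 ecx bit 24 in the data passed to KVM_SET_CPUID2. */
    static int tsc_deadline_timer_supported(int kvm_fd)
    {
            return ioctl(kvm_fd, KVM_CHECK_EXTENSION,
                         KVM_CAP_TSC_DEADLINE_TIMER) > 0;
    }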
[avi: add the KVM_CAP + documentation] Reported-by: Alexey Zaytsev Tested-by: Alexey Zaytsev Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4df9af4..e2a4b52 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1100,6 +1100,15 @@ emulate them efficiently. The fields in each entry are defined as follows: eax, ebx, ecx, edx: the values returned by the cpuid instruction for this function/index combination +The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned +as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC +support. Instead it is reported via + + ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) + +if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the +feature in userspace, then you can enable the feature for KVM_SET_CPUID2. + 4.47 KVM_PPC_GET_PVINFO Capability: KVM_CAP_PPC_GET_PVINFO diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c38efd7..4c938da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; - u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= bit(X86_FEATURE_OSXSAVE); } - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - best->function == 0x1) { - best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); - timer_mode_mask = 3 << 17; - } else - timer_mode_mask = 1 << 17; - - if (apic) - apic->lapic_timer.timer_mode_mask = timer_mode_mask; + if (apic) { + if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + } } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_TSC_DEADLINE_TIMER: + r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); + break; default: r = 0; break; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c3892fc..68e67e5 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_S390_GMAP 71 +#define KVM_CAP_TSC_DEADLINE_TIMER 72 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v0.10.2 From 36cc66d638d3ffbc635b0d48b29c1128fdad38f4 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 8 Nov 2011 07:08:52 +0000 Subject: KVM: PPC: move compute_tlbie_rb to book3s_64 common header compute_tlbie_rb is only used on ppc64 and cannot be compiled on ppc32. 
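The failure mode being avoided, in miniature (placeholder names; compute_tlbie_rb() references 64-bit HPTE constants that do not exist on 32-bit Book3S builds):

    /* In a header shared by 32-bit and 64-bit builds, the body of a
     * static inline is compiled by every translation unit that
     * includes it, even if the function is never called. */
    static inline unsigned long mmu_helper(unsigned long v)
    {
            return v & HPTE_V_LARGE;    /* undefined on ppc32: compile
                                         * error for every 32-bit
                                         * includer */
    }

Moving such helpers into the 64-bit-only header keeps 32-bit builds from ever seeing them.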
Signed-off-by: Andreas Schwab Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d4df013..69c7377 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -381,39 +381,6 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) } #endif -static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, - unsigned long pte_index) -{ - unsigned long rb, va_low; - - rb = (v & ~0x7fUL) << 16; /* AVA field */ - va_low = pte_index >> 3; - if (v & HPTE_V_SECONDARY) - va_low = ~va_low; - /* xor vsid from AVA */ - if (!(v & HPTE_V_1TB_SEG)) - va_low ^= v >> 12; - else - va_low ^= v >> 24; - va_low &= 0x7ff; - if (v & HPTE_V_LARGE) { - rb |= 1; /* L field */ - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (r & 0xff000)) { - /* non-16MB large page, must be 64k */ - /* (masks depend on page size) */ - rb |= 0x1000; /* page encoding in LP field */ - rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ - rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ - } - } else { - /* 4kB page */ - rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ - } - rb |= (v >> 54) & 0x300; /* B field */ - return rb; -} - /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index e43fe42..d0ac94f 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -29,4 +29,37 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) #define SPAPR_TCE_SHIFT 12 +static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, + unsigned long pte_index) +{ + unsigned long rb, va_low; + + rb = (v & ~0x7fUL) << 16; /* AVA field */ + va_low = pte_index >> 3; + if (v & HPTE_V_SECONDARY) + va_low = ~va_low; + /* xor vsid from AVA */ + if (!(v & HPTE_V_1TB_SEG)) + va_low ^= v >> 12; + else + va_low ^= v >> 24; + va_low &= 0x7ff; + if (v & HPTE_V_LARGE) { + rb |= 1; /* L field */ + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (r & 0xff000)) { + /* non-16MB large page, must be 64k */ + /* (masks depend on page size) */ + rb |= 0x1000; /* page encoding in LP field */ + rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ + rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ + } + } else { + /* 4kB page */ + rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ + } + rb |= (v >> 54) & 0x300; /* B field */ + return rb; +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ -- cgit v0.10.2 From 96f38d72867bc54c312decaf8463f1e9607136da Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 8 Nov 2011 07:17:39 +0000 Subject: KVM: PPC: protect use of kvmppc_h_pr kvmppc_h_pr is only available if CONFIG_KVM_BOOK3S_64_PR. 
Signed-off-by: Andreas Schwab
Signed-off-by: Alexander Graf

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 3c791e1..e2cfb9e 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -658,10 +658,12 @@ program_interrupt:
         ulong cmd = kvmppc_get_gpr(vcpu, 3);
         int i;
 
+#ifdef CONFIG_KVM_BOOK3S_64_PR
         if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
             r = RESUME_GUEST;
             break;
         }
+#endif
 
         run->papr_hcall.nr = cmd;
         for (i = 0; i < 9; ++i) {
--
cgit v0.10.2

From 251da03897b383904901620835044e298061875f Mon Sep 17 00:00:00 2001
From: Michael Neuling
Date: Thu, 10 Nov 2011 16:03:20 +0000
Subject: KVM: PPC: fix kvmppc_start_thread() for CONFIG_SMP=N

Currently kvmppc_start_thread() tries to wake other SMT threads via
xics_wake_cpu().  Unfortunately xics_wake_cpu() only exists when
CONFIG_SMP=Y, so when compiling with CONFIG_SMP=N we get:

  arch/powerpc/kvm/built-in.o: In function `.kvmppc_start_thread':
  book3s_hv.c:(.text+0xa1e0): undefined reference to `.xics_wake_cpu'

The following should be fine, since kvmppc_start_thread() shouldn't be
called to start non-zero threads when SMP=N, because threads_per_core=1
in that case.

Signed-off-by: Michael Neuling
Signed-off-by: Alexander Graf

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0cb137a..336983d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -538,7 +538,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
     tpaca->kvm_hstate.napping = 0;
     vcpu->cpu = vc->pcpu;
     smp_wmb();
-#ifdef CONFIG_PPC_ICP_NATIVE
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
     if (vcpu->arch.ptid) {
         tpaca->cpu_start = 0x80;
         wmb();
--
cgit v0.10.2

From fae9dbb4b462d2c908186a47464c7a5299ee27a9 Mon Sep 17 00:00:00 2001
From: Scott Wood
Date: Tue, 20 Dec 2011 14:43:45 +0000
Subject: KVM: PPC: e500: include linux/export.h

This is required for THIS_MODULE.  We recently stopped acquiring it via
some other header.

Signed-off-by: Scott Wood
Signed-off-by: Alexander Graf

diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 26d2090..8c0d45a 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -15,6 +15,7 @@
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/export.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
--
cgit v0.10.2

From 6d4b9e38d3980826abccfbd90e95bf4bd41b8dd2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 26 Dec 2011 10:25:26 -0800
Subject: vfs: fix handling of lock allocation failure in lease-break case

Bruce Fields notes that commit 778fc546f749 ("locks: fix tracking of
inprogress lease breaks") introduced a possible error pointer dereference
on failure to allocate memory.  locks_conflict() will dereference the
passed-in new lease lock structure that may be an error pointer.

This means an open (without O_NONBLOCK set) on a file with a lease applied
(generally only done when Samba or nfsd (with v4) is running) could crash
if a kmalloc() fails.

So instead of playing games with IS_ERR() all over the place, just check
the allocation failure early.  That makes the code more straightforward,
and avoids this possible bad pointer dereference.

Based-on-patch-by: J. Bruce Fields
Cc: Al Viro
Signed-off-by: Linus Torvalds

diff --git a/fs/locks.c b/fs/locks.c
index 3b0d05d..637694b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1205,6 +1205,8 @@ int __break_lease(struct inode *inode, unsigned int mode)
     int want_write = (mode & O_ACCMODE) != O_RDONLY;
 
     new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
+    if (IS_ERR(new_fl))
+        return PTR_ERR(new_fl);
 
     lock_flocks();
 
@@ -1221,12 +1223,6 @@ int __break_lease(struct inode *inode, unsigned int mode)
         if (fl->fl_owner == current->files)
             i_have_this_lease = 1;
 
-    if (IS_ERR(new_fl) && !i_have_this_lease
-            && ((mode & O_NONBLOCK) == 0)) {
-        error = PTR_ERR(new_fl);
-        goto out;
-    }
-
     break_time = 0;
     if (lease_break_time > 0) {
         break_time = jiffies + lease_break_time * HZ;
@@ -1284,8 +1280,7 @@ restart:
 
 out:
     unlock_flocks();
-    if (!IS_ERR(new_fl))
-        locks_free_lock(new_fl);
+    locks_free_lock(new_fl);
     return error;
 }
--
cgit v0.10.2

From ebbd857e6b9a92c0aff4aacd1b1d2361d888633e Mon Sep 17 00:00:00 2001
From: Keith Packard
Date: Mon, 26 Dec 2011 17:02:10 -0800
Subject: drm/i915: Disable semaphores by default on SNB

Semaphores still cause problems on some machines:

> From Udo Steinberg:
>
> With Linux-3.2-rc6 I'm frequently seeing GPU hangs when large amounts of
> text scroll in an xterm, such as when extracting a tar archive. Such as this
> one (note the timestamps):
>
> I can reproduce it fairly easily with something as simple as:
>
>   while true; do dmesg; done

This patch turns them off on SNB while leaving them on for IVB.

Reported-by: Udo Steinberg
Cc: Daniel Vetter
Cc: Eugeni Dodonov
Signed-off-by: Keith Packard
Signed-off-by: Linus Torvalds

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c681dc1..b9da890 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -756,9 +756,9 @@ intel_enable_semaphores(struct drm_device *dev)
     if (i915_semaphores >= 0)
         return i915_semaphores;
 
-    /* Enable semaphores on SNB when IO remapping is off */
+    /* Disable semaphores on SNB */
     if (INTEL_INFO(dev)->gen == 6)
-        return !intel_iommu_enabled;
+        return 0;
 
     return 1;
 }
--
cgit v0.10.2

From 371de6e4e0042adf4f9b54c414154f57414ddd37 Mon Sep 17 00:00:00 2001
From: Keith Packard
Date: Mon, 26 Dec 2011 17:02:11 -0800
Subject: drm/i915: Disable RC6 on Sandybridge by default

RC6 fails again.

> I found my system freeze mostly during starting up X and KDE. Sometimes it
> works for some minutes, sometimes it freezes immediatly. When the freeze
> happens, everything is dead (even the reset button does not work, I need to
> power cycle).
> I disabled RC6, and my system runs wonderfully.
> The system is a Z68 Pro board with Sandybridge i5-2500K processor, 8
> GB of RAM and UEFI firmware.

Reported-by: Kai Krakow
Signed-off-by: Keith Packard
Signed-off-by: Linus Torvalds

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d809b03..daa5743 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -7922,13 +7922,11 @@ static bool intel_enable_rc6(struct drm_device *dev)
         return 0;
 
     /*
-     * Enable rc6 on Sandybridge if DMA remapping is disabled
+     * Disable rc6 on Sandybridge
     */
     if (INTEL_INFO(dev)->gen == 6) {
-        DRM_DEBUG_DRIVER("Sandybridge: intel_iommu_enabled %s -- RC6 %sabled\n",
-                 intel_iommu_enabled ? "true" : "false",
-                 !intel_iommu_enabled ? "en" : "dis");
"en" : "dis"); - return !intel_iommu_enabled; + DRM_DEBUG_DRIVER("Sandybridge: RC6 disabled\n"); + return 0; } DRM_DEBUG_DRIVER("RC6 enabled\n"); return 1; -- cgit v0.10.2 From aef950b4ba3196622a5bd5e21ab1d63f30658285 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 27 Dec 2011 22:32:41 -0500 Subject: packet: fix possible dev refcnt leak when bind fail If bind is fail when bind is called after set PACKET_FANOUT sock option, the dev refcnt will leak. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3891702..d9d4970 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2448,8 +2448,12 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc { struct packet_sock *po = pkt_sk(sk); - if (po->fanout) + if (po->fanout) { + if (dev) + dev_put(dev); + return -EINVAL; + } lock_sock(sk); -- cgit v0.10.2 From f2b20d436534f22ccc3f5ad172499fcb013bb315 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 29 Dec 2011 09:16:28 +0100 Subject: block: fix blk_queue_end_tag() Commit 5e081591 "block: warn if tag is greater than real_max_depth" cleaned up blk_queue_end_tag() to warn when the tag is truly invalid (greater than real_max_depth). However, it changed behavior in the tag < max_depth case to not end the request. Leading to triggering of BUG_ON(blk_queued_rq(rq)) in the request completion path: http://marc.info/?l=linux-kernel&m=132204370518629&w=2 In order to allow blk_queue_resize_tags() to shrink the tag space blk_queue_end_tag() must always complete tags with a value less than real_max_depth regardless of the current max_depth. The comment about "handling the shrink case" seems to be what prompted changes in this space, so remove it and BUG on all invalid tags (made even simpler by Matthew's suggestion to use an unsigned compare). Signed-off-by: Dan Williams Cc: Tao Ma Cc: Matthew Wilcox Reported-by: Meelis Roos Reported-by: Ed Nadolski Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/block/blk-tag.c b/block/blk-tag.c index e74d6d1..4af6f5c 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -282,18 +282,9 @@ EXPORT_SYMBOL(blk_queue_resize_tags); void blk_queue_end_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - int tag = rq->tag; + unsigned tag = rq->tag; /* negative tags invalid */ - BUG_ON(tag == -1); - - if (unlikely(tag >= bqt->max_depth)) { - /* - * This can happen after tag depth has been reduced. - * But tag shouldn't be larger than real_max_depth. - */ - WARN_ON(tag >= bqt->real_max_depth); - return; - } + BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); rq->cmd_flags &= ~REQ_QUEUED; -- cgit v0.10.2 From 757e55c23dc62eb5adf45368a72f6b26d6a71ae5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 29 Dec 2011 19:09:21 -0200 Subject: gspca: Fix bulk mode cameras no longer working (regression fix) The new iso bandwidth calculation code accidentally has broken support for bulk mode cameras. This has broken the following drivers: finepix, jeilinj, ovfx2, ov534, ov534_9, se401, sq905, sq905c, sq930x, stv0680, vicam. Thix patch fixes this. Fix tested with: se401, sq905, sq905c, stv0680 & vicam cams. 
Signed-off-by: Hans de Goede
Signed-off-by: Mauro Carvalho Chehab
Signed-off-by: Linus Torvalds

diff --git a/drivers/media/video/gspca/gspca.c b/drivers/media/video/gspca/gspca.c
index 881e04c..512f32f 100644
--- a/drivers/media/video/gspca/gspca.c
+++ b/drivers/media/video/gspca/gspca.c
@@ -838,13 +838,13 @@ static int gspca_init_transfer(struct gspca_dev *gspca_dev)
     gspca_dev->usb_err = 0;
 
     /* do the specific subdriver stuff before endpoint selection */
-    gspca_dev->alt = 0;
+    intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface);
+    gspca_dev->alt = gspca_dev->cam.bulk ? intf->num_altsetting : 0;
     if (gspca_dev->sd_desc->isoc_init) {
         ret = gspca_dev->sd_desc->isoc_init(gspca_dev);
         if (ret < 0)
             goto unlock;
     }
-    intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface);
     xfer = gspca_dev->cam.bulk ? USB_ENDPOINT_XFER_BULK
                                : USB_ENDPOINT_XFER_ISOC;
--
cgit v0.10.2

From e26a51148f3ebd859bca8bf2e0f212839b447f62 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Wed, 28 Dec 2011 15:57:11 -0800
Subject: mm/mempolicy.c: refix mbind_range() vma issue

commit 8aacc9f550 ("mm/mempolicy.c: fix pgoff in mbind vma merge") is a
slightly incorrect fix.

Why? Consider the following case.

1. map 4 pages of a file at offset 0

   [0123]

2. map 2 pages just after the first mapping of the same file but with
   page offset 2

   [0123][23]

3. mbind() 2 pages from the first mapping at offset 2.
   mbind_range() should treat the new vma as

   [0123][23]
     |23|
   mbind vma

   but it does

   [0123][23]
   |01|
   mbind vma

   Oops.  It then makes a wrong vma merge and split ([01][0123] or
   similar).  This patch fixes it.

[testcase]
  test result - before the patch

    case4: 126: test failed. expect '2,4', actual '2,2,2'
    case5: passed
    case6: passed
    case7: passed
    case8: passed
    case_n: 246: test failed. expect '4,2', actual '1,4'

    ------------[ cut here ]------------
    kernel BUG at mm/filemap.c:135!
    invalid opcode: 0000 [#4] SMP DEBUG_PAGEALLOC

    (snip long BUG_ON messages)

  test result - after the patch

    case4: passed
    case5: passed
    case6: passed
    case7: passed
    case8: passed
    case_n: passed

  source: mbind_vma_test.c
============================================================
 #include <numa.h>
 #include <numaif.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <sys/mman.h>

 static unsigned long pagesize;
 void* mmap_addr;
 struct bitmask *nmask;
 char buf[1024];
 FILE *file;
 char retbuf[10240] = "";
 int mapped_fd;

 char *rubysrc = "ruby -e '\
   pid = %d; \
   vstart = 0x%llx; \
   vend = 0x%llx; \
   s = `pmap -q #{pid}`; \
   rary = []; \
   s.each_line {|line|; \
     ary=line.split(\" \"); \
     addr = ary[0].to_i(16); \
     if(vstart <= addr && addr < vend) then \
       rary.push(ary[1].to_i()/4); \
     end; \
   }; \
   print rary.join(\",\"); \
 '";

 void init(void)
 {
     void* addr;
     char buf[128];

     nmask = numa_allocate_nodemask();
     numa_bitmask_setbit(nmask, 0);
     pagesize = getpagesize();

     sprintf(buf, "%s", "mbind_vma_XXXXXX");
     mapped_fd = mkstemp(buf);
     if (mapped_fd == -1)
         perror("mkstemp "), exit(1);
     unlink(buf);

     if (lseek(mapped_fd, pagesize*8, SEEK_SET) < 0)
         perror("lseek "), exit(1);
     if (write(mapped_fd, "\0", 1) < 0)
         perror("write "), exit(1);

     addr = mmap(NULL, pagesize*8, PROT_NONE, MAP_SHARED, mapped_fd, 0);
     if (addr == MAP_FAILED)
         perror("mmap "), exit(1);

     if (mprotect(addr+pagesize, pagesize*6, PROT_READ|PROT_WRITE) < 0)
         perror("mprotect "), exit(1);

     mmap_addr = addr + pagesize;

     /* make page populate */
     memset(mmap_addr, 0, pagesize*6);
 }

 void fin(void)
 {
     void* addr = mmap_addr - pagesize;
     munmap(addr, pagesize*8);

     memset(buf, 0, sizeof(buf));
     memset(retbuf, 0, sizeof(retbuf));
 }

 void mem_bind(int index, int len)
 {
     int err;

     err = mbind(mmap_addr+pagesize*index, pagesize*len,
                 MPOL_BIND, nmask->maskp, nmask->size, 0);
     if (err)
         perror("mbind "), exit(err);
 }

 void mem_interleave(int index, int len)
 {
     int err;

     err = mbind(mmap_addr+pagesize*index, pagesize*len,
                 MPOL_INTERLEAVE, nmask->maskp, nmask->size, 0);
     if (err)
         perror("mbind "), exit(err);
 }

 void mem_unbind(int index, int len)
 {
     int err;

     err = mbind(mmap_addr+pagesize*index, pagesize*len,
                 MPOL_DEFAULT, NULL, 0, 0);
     if (err)
         perror("mbind "), exit(err);
 }

 void Assert(char *expected, char *value, char *name, int line)
 {
     if (strcmp(expected, value) == 0) {
         fprintf(stderr, "%s: passed\n", name);
         return;
     }
     else {
         fprintf(stderr, "%s: %d: test failed. expect '%s', actual '%s'\n",
                 name, line, expected, value);
//       exit(1);
     }
 }

 /* AAAA PPPPPPNNNNNN might become PPNNNNNNNNNN case 4 below */
 void case4(void)
 {
     init();
     sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

     mem_bind(0, 4);
     mem_unbind(2, 2);

     file = popen(buf, "r");
     fread(retbuf, sizeof(retbuf), 1, file);
     Assert("2,4", retbuf, "case4", __LINE__);

     fin();
 }

 /* AAAA PPPPPPNNNNNN might become PPPPPPPPPPNN case 5 below */
 void case5(void)
 {
     init();
     sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

     mem_bind(0, 2);
     mem_bind(2, 2);

     file = popen(buf, "r");
     fread(retbuf, sizeof(retbuf), 1, file);
     Assert("4,2", retbuf, "case5", __LINE__);

     fin();
 }

 /* AAAA PPPPNNNNXXXX might become PPPPPPPPPPPP 6 */
 void case6(void)
 {
     init();
     sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

     mem_bind(0, 2);
     mem_bind(4, 2);
     mem_bind(2, 2);

     file = popen(buf, "r");
     fread(retbuf, sizeof(retbuf), 1, file);
     Assert("6", retbuf, "case6", __LINE__);

     fin();
 }

 /* AAAA PPPPNNNNXXXX might become PPPPPPPPXXXX 7 */
 void case7(void)
 {
     init();
     sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

     mem_bind(0, 2);
     mem_interleave(4, 2);
     mem_bind(2, 2);

     file = popen(buf, "r");
     fread(retbuf, sizeof(retbuf), 1, file);
     Assert("4,2", retbuf, "case7", __LINE__);

     fin();
 }

 /* AAAA PPPPNNNNXXXX might become PPPPNNNNNNNN 8 */
 void case8(void)
 {
     init();
     sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

     mem_bind(0, 2);
     mem_interleave(4, 2);
     mem_interleave(2, 2);

     file = popen(buf, "r");
     fread(retbuf, sizeof(retbuf), 1, file);
     Assert("2,4", retbuf, "case8", __LINE__);

     fin();
 }

 void case_n(void)
 {
     init();
     sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6);

     /* make redundant mappings [0][1234][34][7] */
     mmap(mmap_addr + pagesize*4, pagesize*2, PROT_READ|PROT_WRITE,
          MAP_FIXED|MAP_SHARED, mapped_fd, pagesize*3);

     /* Expect to do nothing. */
     mem_unbind(2, 2);

     file = popen(buf, "r");
     fread(retbuf, sizeof(retbuf), 1, file);
     Assert("4,2", retbuf, "case_n", __LINE__);

     fin();
 }

 int main(int argc, char** argv)
 {
     case4();
     case5();
     case6();
     case7();
     case8();
     case_n();

     return 0;
 }
=============================================================

Signed-off-by: KOSAKI Motohiro
Acked-by: Johannes Weiner
Cc: Minchan Kim
Cc: Caspar Zhang
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: Hugh Dickins
Cc: Mel Gorman
Cc: Lee Schermerhorn
Cc: <stable@vger.kernel.org> [3.1.x]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index adc3954..c3fdbcb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -636,6 +636,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
     struct vm_area_struct *prev;
     struct vm_area_struct *vma;
     int err = 0;
+    pgoff_t pgoff;
     unsigned long vmstart;
     unsigned long vmend;
 
@@ -643,13 +644,21 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
     if (!vma || vma->vm_start > start)
         return -EFAULT;
 
+    if (start > vma->vm_start)
+        prev = vma;
+
     for (; vma && vma->vm_start < end; prev = vma, vma = next) {
         next = vma->vm_next;
         vmstart = max(start, vma->vm_start);
         vmend = min(end, vma->vm_end);
 
+        if (mpol_equal(vma_policy(vma), new_pol))
+            continue;
+
+        pgoff = vma->vm_pgoff +
+                ((vmstart - vma->vm_start) >> PAGE_SHIFT);
         prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
-                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+                 vma->anon_vma, vma->vm_file, pgoff,
                  new_pol);
         if (prev) {
             vma = prev;
--
cgit v0.10.2

From 34845636a184f3be91a531098192592cbe6db587 Mon Sep 17 00:00:00 2001
From: Andreas Schwab
Date: Wed, 28 Dec 2011 15:57:15 -0800
Subject: procfs: do not confuse jiffies with cputime64_t

Commit 2a95ea6c0d129b4 ("procfs: do not overflow get_{idle,iowait}_time
for nohz") did not take into account that on some architectures jiffies
and cputime use different units.

This causes get_idle_time() to return numbers in the wrong units, making
the idle time fields in /proc/stat wrong.

Instead of converting the usec value returned by
get_cpu_{idle,iowait}_time_us to units of jiffies, use the new function
usecs_to_cputime64 to convert it to the correct unit of cputime64_t.

Signed-off-by: Andreas Schwab
Acked-by: Michal Hocko
Cc: Arnd Bergmann
Cc: "Artem S. Tashkinov"
Tashkinov" Cc: Dave Jones Cc: Alexey Dobriyan Cc: Thomas Gleixner Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index 6073b18..5a274af 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -60,6 +60,7 @@ typedef u64 cputime64_t; */ #define cputime_to_usecs(__ct) ((__ct) / NSEC_PER_USEC) #define usecs_to_cputime(__usecs) ((__usecs) * NSEC_PER_USEC) +#define usecs_to_cputime64(__usecs) usecs_to_cputime(__usecs) /* * Convert cputime <-> seconds diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 1cf20bd..98b7c4b 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -150,6 +150,8 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) return ct; } +#define usecs_to_cputime64(us) usecs_to_cputime(us) + /* * Convert cputime <-> seconds */ diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 0814348..b9acaaa 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -87,6 +87,8 @@ usecs_to_cputime(const unsigned int m) return (cputime_t) m * 4096; } +#define usecs_to_cputime64(m) usecs_to_cputime(m) + /* * Convert cputime to milliseconds and back. */ diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 2a30d67..0855e6f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu) idle = kstat_cpu(cpu).cpustat.idle; idle = cputime64_add(idle, arch_idle_time(cpu)); } else - idle = nsecs_to_jiffies64(1000 * idle_time); + idle = usecs_to_cputime64(idle_time); return idle; } @@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu) /* !NO_HZ so we can rely on cpustat.iowait */ iowait = kstat_cpu(cpu).cpustat.iowait; else - iowait = nsecs_to_jiffies64(1000 * iowait_time); + iowait = usecs_to_cputime64(iowait_time); return iowait; } diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 62ce682..12a1764f 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -40,6 +40,7 @@ typedef u64 cputime64_t; */ #define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) #define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) +#define usecs_to_cputime64(__msecs) nsecs_to_jiffies64((__msecs) * 1000) /* * Convert cputime to seconds and back. -- cgit v0.10.2 From b0365c8d0cb6e79eb5f21418ae61ab511f31b575 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 28 Dec 2011 15:57:16 -0800 Subject: mm: hugetlb: fix non-atomic enqueue of huge page If a huge page is enqueued under the protection of hugetlb_lock, then the operation is atomic and safe. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 73f17c0..2316840 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -901,7 +901,6 @@ retry: h->resv_huge_pages += delta; ret = 0; - spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -915,6 +914,7 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } + spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: -- cgit v0.10.2