From 50fb4f7fc907efff65eadb0b74387a9ffed6e849 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:22 -0800 Subject: freezer: fix current->state restoration race in refrigerator() refrigerator() saves current->state before entering frozen state and restores it before returning using __set_current_state(); however, this is racy, for example, please consider the following sequence. set_current_state(TASK_INTERRUPTIBLE); try_to_freeze(); if (kthread_should_stop()) break; schedule(); If kthread_stop() races with ->state restoration, the restoration can restore ->state to TASK_INTERRUPTIBLE after kthread_stop() sets it to TASK_RUNNING but kthread_should_stop() may still see zero ->should_stop because there's no memory barrier between restoring TASK_INTERRUPTIBLE and kthread_should_stop() test. This isn't restricted to kthread_should_stop(). current->state is often used in memory barrier based synchronization and silently restoring it w/o mb breaks them. Use set_current_state() instead. Signed-off-by: Tejun Heo diff --git a/kernel/freezer.c b/kernel/freezer.c index 7be56c5..3f46010 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -58,7 +58,13 @@ void refrigerator(void) current->flags &= ~PF_FREEZING; pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); + + /* + * Restore saved task state before returning. The mb'd version + * needs to be used; otherwise, it might silently break + * synchronization which depends on ordered task state change. + */ + set_current_state(save); } EXPORT_SYMBOL(refrigerator); -- cgit v0.10.2 From 3a7cbd50f74907580eb47a8d08e1f29741b81abf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:22 -0800 Subject: freezer: don't unnecessarily set PF_NOFREEZE explicitly Some drivers set PF_NOFREEZE in their kthread functions which is completely unnecessary and racy - some part of freezer code doesn't consider cases where PF_NOFREEZE is set asynchronous to freezer operations. In general, there's no reason to allow setting PF_NOFREEZE explicitly. Remove them and change the documentation to note that setting PF_NOFREEZE directly isn't allowed. -v2: Dropped change to twl4030-irq.c as it no longer uses PF_NOFREEZE. Signed-off-by: Tejun Heo Acked-by: "Gustavo F. Padovan" Acked-by: Samuel Ortiz Cc: Marcel Holtmann Cc: wwang diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt index 316c2ba..587e082 100644 --- a/Documentation/power/freezing-of-tasks.txt +++ b/Documentation/power/freezing-of-tasks.txt @@ -67,7 +67,7 @@ III. Which kernel threads are freezable? Kernel threads are not freezable by default. However, a kernel thread may clear PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE -directly is strongly discouraged). From this point it is regarded as freezable +directly is not allowed). From this point it is regarded as freezable and must call try_to_freeze() in a suitable place. IV. Why do we do that? diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c index a88a78c..6c3defa 100644 --- a/drivers/bluetooth/btmrvl_main.c +++ b/drivers/bluetooth/btmrvl_main.c @@ -475,8 +475,6 @@ static int btmrvl_service_main_thread(void *data) init_waitqueue_entry(&wait, current); - current->flags |= PF_NOFREEZE; - for (;;) { add_wait_queue(&thread->wait_q, &wait); diff --git a/drivers/mfd/twl6030-irq.c b/drivers/mfd/twl6030-irq.c index 3eee45f..c6b456a 100644 --- a/drivers/mfd/twl6030-irq.c +++ b/drivers/mfd/twl6030-irq.c @@ -138,8 +138,6 @@ static int twl6030_irq_thread(void *data) static const unsigned max_i2c_errors = 100; int ret; - current->flags |= PF_NOFREEZE; - while (!kthread_should_stop()) { int i; union { diff --git a/drivers/staging/rts_pstor/rtsx.c b/drivers/staging/rts_pstor/rtsx.c index 480b0ed..8a7803c 100644 --- a/drivers/staging/rts_pstor/rtsx.c +++ b/drivers/staging/rts_pstor/rtsx.c @@ -466,8 +466,6 @@ static int rtsx_control_thread(void *__dev) struct rtsx_chip *chip = dev->chip; struct Scsi_Host *host = rtsx_to_host(dev); - current->flags |= PF_NOFREEZE; - for (;;) { if (wait_for_completion_interruptible(&dev->cmnd_ready)) break; -- cgit v0.10.2 From a0acae0e886d44bd5ce6d2f173c1ace0fcf0d9f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:22 -0800 Subject: freezer: unexport refrigerator() and update try_to_freeze() slightly There is no reason to export two functions for entering the refrigerator. Calling refrigerator() instead of try_to_freeze() doesn't save anything noticeable or removes any race condition. * Rename refrigerator() to __refrigerator() and make it return bool indicating whether it scheduled out for freezing. * Update try_to_freeze() to return bool and relay the return value of __refrigerator() if freezing(). * Convert all refrigerator() users to try_to_freeze(). * Update documentation accordingly. * While at it, add might_sleep() to try_to_freeze(). Signed-off-by: Tejun Heo Cc: Samuel Ortiz Cc: Chris Mason Cc: "Theodore Ts'o" Cc: Steven Whitehouse Cc: Andrew Morton Cc: Jan Kara Cc: KONISHI Ryusuke Cc: Christoph Hellwig diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt index 587e082..3ab9fbd 100644 --- a/Documentation/power/freezing-of-tasks.txt +++ b/Documentation/power/freezing-of-tasks.txt @@ -21,7 +21,7 @@ freeze_processes() (defined in kernel/power/process.c) is called. It executes try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and either wakes them up, if they are kernel threads, or sends fake signals to them, if they are user space processes. A task that has TIF_FREEZE set, should react -to it by calling the function called refrigerator() (defined in +to it by calling the function called __refrigerator() (defined in kernel/freezer.c), which sets the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it. Then, we say that the task is 'frozen' and therefore the set of functions @@ -29,10 +29,10 @@ handling this mechanism is referred to as 'the freezer' (these functions are defined in kernel/power/process.c, kernel/freezer.c & include/linux/freezer.h). User space processes are generally frozen before kernel threads. -It is not recommended to call refrigerator() directly. Instead, it is -recommended to use the try_to_freeze() function (defined in -include/linux/freezer.h), that checks the task's TIF_FREEZE flag and makes the -task enter refrigerator() if the flag is set. +__refrigerator() must not be called directly. Instead, use the +try_to_freeze() function (defined in include/linux/freezer.h), that checks +the task's TIF_FREEZE flag and makes the task enter __refrigerator() if the +flag is set. For user space processes try_to_freeze() is called automatically from the signal-handling code, but the freezable kernel threads need to call it @@ -61,7 +61,7 @@ wait_event_freezable() and wait_event_freezable_timeout() macros. After the system memory state has been restored from a hibernation image and devices have been reinitialized, the function thaw_processes() is called in order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that -have been frozen leave refrigerator() and continue running. +have been frozen leave __refrigerator() and continue running. III. Which kernel threads are freezable? diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c index 41c96b3..e880c79 100644 --- a/drivers/net/irda/stir4200.c +++ b/drivers/net/irda/stir4200.c @@ -750,7 +750,7 @@ static int stir_transmit_thread(void *arg) write_reg(stir, REG_CTRL1, CTRL1_TXPWD|CTRL1_RXPWD); - refrigerator(); + try_to_freeze(); if (change_speed(stir, stir->speed)) break; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7ec1409..98ab240 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -340,7 +340,7 @@ again: if (freezing(current)) { worker->working = 0; spin_unlock_irq(&worker->lock); - refrigerator(); + try_to_freeze(); } else { spin_unlock_irq(&worker->lock); if (!kthread_should_stop()) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62afe5c..622654f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1579,9 +1579,7 @@ static int cleaner_kthread(void *arg) btrfs_run_defrag_inodes(root->fs_info); } - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1635,9 +1633,7 @@ sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && !btrfs_transaction_blocked(root->fs_info)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9953d80..877350e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2882,8 +2882,7 @@ cont_thread: } mutex_unlock(&eli->li_list_mtx); - if (freezing(current)) - refrigerator(); + try_to_freeze(); cur = jiffies; if ((time_after_eq(cur, next_wakeup)) || diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5986464..8154d42 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -951,8 +951,8 @@ int gfs2_logd(void *data) wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; - if (freezing(current)) - refrigerator(); + + try_to_freeze(); do { prepare_to_wait(&sdp->sd_logd_waitq, &wait, diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7e528dc..d49669e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1427,8 +1427,8 @@ int gfs2_quotad(void *data) /* Check for & recover partially truncated inodes */ quotad_check_trunc_list(sdp); - if (freezing(current)) - refrigerator(); + try_to_freeze(); + t = min(quotad_timeo, statfs_timeo); prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index fea8dd6..a96cff0 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -166,7 +166,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald\n"); spin_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&journal->j_state_lock); } else { /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0fa0123..c0a5f9f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -173,7 +173,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald2\n"); write_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); write_lock(&journal->j_state_lock); } else { /* diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cc5f811..2eb952c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2349,7 +2349,7 @@ int jfsIOWait(void *arg) if (freezing(current)) { spin_unlock_irq(&log_redrive_lock); - refrigerator(); + try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index af96060..bb8b661 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2800,7 +2800,7 @@ int jfs_lazycommit(void *arg) if (freezing(current)) { LAZY_UNLOCK(flags); - refrigerator(); + try_to_freeze(); } else { DECLARE_WAITQUEUE(wq, current); @@ -2994,7 +2994,7 @@ int jfs_sync(void *arg) if (freezing(current)) { TXN_UNLOCK(); - refrigerator(); + try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index bb24ab6..0e72ad6 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2470,7 +2470,7 @@ static int nilfs_segctor_thread(void *arg) if (freezing(current)) { spin_unlock(&sci->sc_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&sci->sc_state_lock); } else { DEFINE_WAIT(wait); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cf0ac05..0188299 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1703,7 +1703,7 @@ xfsbufd( if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); - refrigerator(); + try_to_freeze(); } else { clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); } diff --git a/include/linux/freezer.h b/include/linux/freezer.h index a5386e3..7a9427e 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -47,18 +47,17 @@ static inline bool should_send_signal(struct task_struct *p) /* Takes and releases task alloc lock using task_lock() */ extern int thaw_process(struct task_struct *p); -extern void refrigerator(void); +extern bool __refrigerator(void); extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); -static inline int try_to_freeze(void) +static inline bool try_to_freeze(void) { - if (freezing(current)) { - refrigerator(); - return 1; - } else - return 0; + might_sleep(); + if (likely(!freezing(current))) + return false; + return __refrigerator(); } extern bool freeze_task(struct task_struct *p, bool sig_only); @@ -181,12 +180,12 @@ static inline void set_freeze_flag(struct task_struct *p) {} static inline void clear_freeze_flag(struct task_struct *p) {} static inline int thaw_process(struct task_struct *p) { return 1; } -static inline void refrigerator(void) {} +static inline bool __refrigerator(void) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} -static inline int try_to_freeze(void) { return 0; } +static inline bool try_to_freeze(void) { return false; } static inline void freezer_do_not_count(void) {} static inline void freezer_count(void) {} diff --git a/kernel/freezer.c b/kernel/freezer.c index 3f46010..732f14f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -23,10 +23,11 @@ static inline void frozen_process(void) } /* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) +bool __refrigerator(void) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ + bool was_frozen = false; long save; task_lock(current); @@ -35,7 +36,7 @@ void refrigerator(void) task_unlock(current); } else { task_unlock(current); - return; + return was_frozen; } save = current->state; pr_debug("%s entered refrigerator\n", current->comm); @@ -51,6 +52,7 @@ void refrigerator(void) set_current_state(TASK_UNINTERRUPTIBLE); if (!frozen(current)) break; + was_frozen = true; schedule(); } @@ -65,8 +67,10 @@ void refrigerator(void) * synchronization which depends on ordered task state change. */ set_current_state(save); + + return was_frozen; } -EXPORT_SYMBOL(refrigerator); +EXPORT_SYMBOL(__refrigerator); static void fake_signal_wake_up(struct task_struct *p) { -- cgit v0.10.2 From 8a32c441c1609f80e55df75422324a1151208f40 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: implement and use kthread_freezable_should_stop() Writeback and thinkpad_acpi have been using thaw_process() to prevent deadlock between the freezer and kthread_stop(); unfortunately, this is inherently racy - nothing prevents freezing from happening between thaw_process() and kthread_stop(). This patch implements kthread_freezable_should_stop() which enters refrigerator if necessary but is guaranteed to return if kthread_stop() is invoked. Both thaw_process() users are converted to use the new function. Note that this deadlock condition exists for many of freezable kthreads. They need to be converted to use the new should_stop or freezable workqueue. Tested with synthetic test case. Signed-off-by: Tejun Heo Acked-by: Henrique de Moraes Holschuh Cc: Jens Axboe Cc: Oleg Nesterov diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 7b82868..4b11fc9 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -2456,8 +2456,9 @@ static int hotkey_kthread(void *data) u32 poll_mask, event_mask; unsigned int si, so; unsigned long t; - unsigned int change_detector, must_reset; + unsigned int change_detector; unsigned int poll_freq; + bool was_frozen; mutex_lock(&hotkey_thread_mutex); @@ -2488,14 +2489,14 @@ static int hotkey_kthread(void *data) t = 100; /* should never happen... */ } t = msleep_interruptible(t); - if (unlikely(kthread_should_stop())) + if (unlikely(kthread_freezable_should_stop(&was_frozen))) break; - must_reset = try_to_freeze(); - if (t > 0 && !must_reset) + + if (t > 0 && !was_frozen) continue; mutex_lock(&hotkey_thread_data_mutex); - if (must_reset || hotkey_config_change != change_detector) { + if (was_frozen || hotkey_config_change != change_detector) { /* forget old state on thaw or config change */ si = so; t = 0; @@ -2528,10 +2529,6 @@ exit: static void hotkey_poll_stop_sync(void) { if (tpacpi_hotkey_task) { - if (frozen(tpacpi_hotkey_task) || - freezing(tpacpi_hotkey_task)) - thaw_process(tpacpi_hotkey_task); - kthread_stop(tpacpi_hotkey_task); tpacpi_hotkey_task = NULL; mutex_lock(&hotkey_thread_mutex); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 73c3992..271fde5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -947,7 +947,7 @@ int bdi_writeback_thread(void *data) trace_writeback_thread_start(bdi); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { /* * Remove own delayed wake-up timer, since we are already awake * and we'll take care of the preriodic write-back. @@ -977,8 +977,6 @@ int bdi_writeback_thread(void *data) */ schedule(); } - - try_to_freeze(); } /* Flush any work that raced with us exiting */ diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 7a9427e..d02b784 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -47,7 +47,7 @@ static inline bool should_send_signal(struct task_struct *p) /* Takes and releases task alloc lock using task_lock() */ extern int thaw_process(struct task_struct *p); -extern bool __refrigerator(void); +extern bool __refrigerator(bool check_kthr_stop); extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); @@ -57,7 +57,7 @@ static inline bool try_to_freeze(void) might_sleep(); if (likely(!freezing(current))) return false; - return __refrigerator(); + return __refrigerator(false); } extern bool freeze_task(struct task_struct *p, bool sig_only); @@ -180,7 +180,7 @@ static inline void set_freeze_flag(struct task_struct *p) {} static inline void clear_freeze_flag(struct task_struct *p) {} static inline int thaw_process(struct task_struct *p) { return 1; } -static inline bool __refrigerator(void) { return false; } +static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 5cac19b..0714b24 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -35,6 +35,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void kthread_bind(struct task_struct *k, unsigned int cpu); int kthread_stop(struct task_struct *k); int kthread_should_stop(void); +bool kthread_freezable_should_stop(bool *was_frozen); void *kthread_data(struct task_struct *k); int kthreadd(void *unused); diff --git a/kernel/freezer.c b/kernel/freezer.c index 732f14f..b83c30e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * freezing is complete, mark current process as frozen @@ -23,7 +24,7 @@ static inline void frozen_process(void) } /* Refrigerator is place where frozen processes are stored :-). */ -bool __refrigerator(void) +bool __refrigerator(bool check_kthr_stop) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ @@ -50,7 +51,8 @@ bool __refrigerator(void) for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) + if (!frozen(current) || + (check_kthr_stop && kthread_should_stop())) break; was_frozen = true; schedule(); diff --git a/kernel/kthread.c b/kernel/kthread.c index b6d216a..1c36dea 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -59,6 +59,31 @@ int kthread_should_stop(void) EXPORT_SYMBOL(kthread_should_stop); /** + * kthread_freezable_should_stop - should this freezable kthread return now? + * @was_frozen: optional out parameter, indicates whether %current was frozen + * + * kthread_should_stop() for freezable kthreads, which will enter + * refrigerator if necessary. This function is safe from kthread_stop() / + * freezer deadlock and freezable kthreads should use this function instead + * of calling try_to_freeze() directly. + */ +bool kthread_freezable_should_stop(bool *was_frozen) +{ + bool frozen = false; + + might_sleep(); + + if (unlikely(freezing(current))) + frozen = __refrigerator(true); + + if (was_frozen) + *was_frozen = frozen; + + return kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); + +/** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 71034f4..7ba8fea 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -600,14 +600,10 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) /* * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. Force - * unfreeze of the thread before calling kthread_stop(), otherwise - * it would never exet if it is currently stuck in the refrigerator. + * safe anymore, since the bdi is gone from visibility. */ - if (bdi->wb.task) { - thaw_process(bdi->wb.task); + if (bdi->wb.task) kthread_stop(bdi->wb.task); - } } /* -- cgit v0.10.2 From a5be2d0d1a8746e7be5210e3d6b904455000443c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: rename thaw_process() to __thaw_task() and simplify the implementation thaw_process() now has only internal users - system and cgroup freezers. Remove the unnecessary return value, rename, unexport and collapse __thaw_process() into it. This will help further updates to the freezer code. -v3: oom_kill grew a use of thaw_process() while this patch was pending. Convert it to use __thaw_task() for now. In the longer term, this should be handled by allowing tasks to die if killed even if it's frozen. -v2: minor style update as suggested by Matt. Signed-off-by: Tejun Heo Cc: Paul Menage Cc: Matt Helsley diff --git a/include/linux/freezer.h b/include/linux/freezer.h index d02b784..ba4f512 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -45,7 +45,7 @@ static inline bool should_send_signal(struct task_struct *p) } /* Takes and releases task alloc lock using task_lock() */ -extern int thaw_process(struct task_struct *p); +extern void __thaw_task(struct task_struct *t); extern bool __refrigerator(bool check_kthr_stop); extern int freeze_processes(void); @@ -178,7 +178,6 @@ static inline int frozen(struct task_struct *p) { return 0; } static inline int freezing(struct task_struct *p) { return 0; } static inline void set_freeze_flag(struct task_struct *p) {} static inline void clear_freeze_flag(struct task_struct *p) {} -static inline int thaw_process(struct task_struct *p) { return 1; } static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5e828a2..a6d405a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -130,7 +130,7 @@ struct cgroup_subsys freezer_subsys; * write_lock css_set_lock (cgroup iterator start) * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) + * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -300,9 +300,8 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - thaw_process(task); - } + while ((task = cgroup_iter_next(cgroup, &it))) + __thaw_task(task); cgroup_iter_end(cgroup, &it); freezer->state = CGROUP_THAWED; diff --git a/kernel/freezer.c b/kernel/freezer.c index b83c30e..c851d58 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -145,18 +145,8 @@ void cancel_freezing(struct task_struct *p) } } -static int __thaw_process(struct task_struct *p) -{ - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - return 1; - } - clear_freeze_flag(p); - return 0; -} - /* - * Wake up a frozen process + * Wake up a frozen task * * task_lock() is needed to prevent the race with refrigerator() which may * occur if the freezing of tasks fails. Namely, without the lock, if the @@ -164,15 +154,18 @@ static int __thaw_process(struct task_struct *p) * refrigerator() could call frozen_process(), in which case the task would be * frozen and no one would thaw it. */ -int thaw_process(struct task_struct *p) +void __thaw_task(struct task_struct *p) { + bool was_frozen; + task_lock(p); - if (__thaw_process(p) == 1) { - task_unlock(p); - wake_up_process(p); - return 1; - } + was_frozen = frozen(p); + if (was_frozen) + p->flags &= ~PF_FROZEN; + else + clear_freeze_flag(p); task_unlock(p); - return 0; + + if (was_frozen) + wake_up_process(p); } -EXPORT_SYMBOL(thaw_process); diff --git a/kernel/power/process.c b/kernel/power/process.c index addbbe5..fe27872 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -186,7 +186,7 @@ static void thaw_tasks(bool nosig_only) if (cgroup_freezing_or_frozen(p)) continue; - thaw_process(p); + __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 76f2c5a..3134ee2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -328,7 +328,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, */ if (test_tsk_thread_flag(p, TIF_MEMDIE)) { if (unlikely(frozen(p))) - thaw_process(p); + __thaw_task(p); return ERR_PTR(-1UL); } if (!p->mm) -- cgit v0.10.2 From a585042f7b933539a0b6bc63650c2d49ffb2e55d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: remove racy clear_freeze_flag() and set PF_NOFREEZE on dead tasks clear_freeze_flag() in exit_mm() is racy. Freezing can start afterwards. Remove it. Skipping freezer for exiting task will be properly implemented later. Also, freezable() was testing exit_state directly to make system freezer ignore dead tasks. Let the exiting task set PF_NOFREEZE after entering TASK_DEAD instead. Signed-off-by: Tejun Heo Cc: Oleg Nesterov diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d98..95a4141 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -679,8 +679,6 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - /* We don't want this task to be frozen prematurely */ - clear_freeze_flag(tsk); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -1040,6 +1038,7 @@ NORET_TYPE void do_exit(long code) exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; + tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/power/process.c b/kernel/power/process.c index fe27872..23822dc 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -25,8 +25,7 @@ static inline int freezable(struct task_struct * p) { if ((p == current) || - (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) + (p->flags & PF_NOFREEZE)) return 0; return 1; } -- cgit v0.10.2 From 6cd8dedcdd8e8de01391a7cf25f0b2afeb24f8f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: don't distinguish nosig tasks on thaw There's no point in thawing nosig tasks before others. There's no ordering requirement between the two groups on thaw, which the staged thawing can't guarantee anyway. Simplify thaw_processes() by removing the distinction and collapsing thaw_tasks() into thaw_processes(). This will help further updates to freezer. Signed-off-by: Tejun Heo diff --git a/kernel/power/process.c b/kernel/power/process.c index 23822dc..9db048f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -170,34 +170,28 @@ int freeze_kernel_threads(void) return error; } -static void thaw_tasks(bool nosig_only) +void thaw_processes(void) { struct task_struct *g, *p; + oom_killer_enable(); + + printk("Restarting tasks ... "); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezable(p)) continue; - if (nosig_only && should_send_signal(p)) - continue; - if (cgroup_freezing_or_frozen(p)) continue; __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -} - -void thaw_processes(void) -{ - oom_killer_enable(); - printk("Restarting tasks ... "); - thaw_workqueues(); - thaw_tasks(true); - thaw_tasks(false); schedule(); printk("done.\n"); } -- cgit v0.10.2 From 0c9af09262864a2744091ee94c98c4a8fd60c98b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: use dedicated lock instead of task_lock() + memory barrier Freezer synchronization is needlessly complicated - it's by no means a hot path and the priority is staying unintrusive and safe. This patch makes it simply use a dedicated lock instead of piggy-backing on task_lock() and playing with memory barriers. On the failure path of try_to_freeze_tasks(), locking is moved from it to cancel_freezing(). This makes the frozen() test racy but the race here is a non-issue as the warning is printed for tasks which failed to enter frozen for 20 seconds and race on PF_FROZEN at the last moment doesn't change anything. This simplifies freezer implementation and eases further changes including some race fixes. Signed-off-by: Tejun Heo diff --git a/kernel/freezer.c b/kernel/freezer.c index c851d58..4130e48 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -11,17 +11,8 @@ #include #include -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - smp_wmb(); - } - clear_freeze_flag(current); -} +/* protects freezing and frozen transitions */ +static DEFINE_SPINLOCK(freezer_lock); /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) @@ -31,14 +22,16 @@ bool __refrigerator(bool check_kthr_stop) bool was_frozen = false; long save; - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); + spin_lock_irq(&freezer_lock); + if (!freezing(current)) { + spin_unlock_irq(&freezer_lock); return was_frozen; } + if (!(current->flags & PF_NOFREEZE)) + current->flags |= PF_FROZEN; + clear_freeze_flag(current); + spin_unlock_irq(&freezer_lock); + save = current->state; pr_debug("%s entered refrigerator\n", current->comm); @@ -99,21 +92,18 @@ static void fake_signal_wake_up(struct task_struct *p) */ bool freeze_task(struct task_struct *p, bool sig_only) { - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - smp_rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; - } + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&freezer_lock, flags); + + if (sig_only && !should_send_signal(p)) + goto out_unlock; + + if (frozen(p)) + goto out_unlock; + + set_freeze_flag(p); if (should_send_signal(p)) { fake_signal_wake_up(p); @@ -123,26 +113,28 @@ bool freeze_task(struct task_struct *p, bool sig_only) * TASK_RUNNING transition can't race with task state * testing in try_to_freeze_tasks(). */ - } else if (sig_only) { - return false; } else { wake_up_state(p, TASK_INTERRUPTIBLE); } - - return true; + ret = true; +out_unlock: + spin_unlock_irqrestore(&freezer_lock, flags); + return ret; } void cancel_freezing(struct task_struct *p) { unsigned long flags; + spin_lock_irqsave(&freezer_lock, flags); if (freezing(p)) { pr_debug(" clean up: %s\n", p->comm); clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); + spin_lock(&p->sighand->siglock); recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock(&p->sighand->siglock); } + spin_unlock_irqrestore(&freezer_lock, flags); } /* @@ -156,16 +148,14 @@ void cancel_freezing(struct task_struct *p) */ void __thaw_task(struct task_struct *p) { - bool was_frozen; + unsigned long flags; - task_lock(p); - was_frozen = frozen(p); - if (was_frozen) + spin_lock_irqsave(&freezer_lock, flags); + if (frozen(p)) { p->flags &= ~PF_FROZEN; - else - clear_freeze_flag(p); - task_unlock(p); - - if (was_frozen) wake_up_process(p); + } else { + clear_freeze_flag(p); + } + spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/power/process.c b/kernel/power/process.c index 9db048f..bd420ca 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -118,11 +118,9 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { - task_lock(p); if (!wakeup && freezing(p) && !freezer_should_skip(p)) sched_show_task(p); cancel_freezing(p); - task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } else { -- cgit v0.10.2 From 6907483b4e803a20f0b48cc9afa3817420ce61c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: make freezing indicate freeze condition in effect Currently freezing (TIF_FREEZE) and frozen (PF_FROZEN) states are interlocked - freezing is set to request freeze and when the task actually freezes, it clears freezing and sets frozen. This interlocking makes things more complex than necessary - freezing doesn't mean there's freezing condition in effect and frozen doesn't match the task actually entering and leaving frozen state (it's cleared by the thawing task). This patch makes freezing indicate that freeze condition is in effect. A task enters and stays frozen if freezing. This makes PF_FROZEN manipulation done only by the task itself and prevents wakeup from __thaw_task() leaking outside of refrigerator. The only place which needs to tell freezing && !frozen is try_to_freeze_task() to whine about tasks which don't enter frozen. It's updated to test the condition explicitly. With the change, frozen() state my linger after __thaw_task() until the task wakes up and exits fridge. This can trigger BUG_ON() in update_if_frozen(). Work it around by testing freezing() && frozen() instead of frozen(). -v2: Oleg pointed out missing re-check of freezing() when trying to clear FROZEN and possible spurious BUG_ON() trigger in update_if_frozen(). Both fixed. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Paul Menage diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index a6d405a..cd27b08 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -231,7 +231,7 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (frozen(task)) + if (freezing(task) && frozen(task)) nfrozen++; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 4130e48..a8822be 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -22,14 +22,19 @@ bool __refrigerator(bool check_kthr_stop) bool was_frozen = false; long save; + /* + * Enter FROZEN. If NOFREEZE, schedule immediate thawing by + * clearing freezing. + */ spin_lock_irq(&freezer_lock); +repeat: if (!freezing(current)) { spin_unlock_irq(&freezer_lock); return was_frozen; } - if (!(current->flags & PF_NOFREEZE)) - current->flags |= PF_FROZEN; - clear_freeze_flag(current); + if (current->flags & PF_NOFREEZE) + clear_freeze_flag(current); + current->flags |= PF_FROZEN; spin_unlock_irq(&freezer_lock); save = current->state; @@ -44,7 +49,7 @@ bool __refrigerator(bool check_kthr_stop) for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current) || + if (!freezing(current) || (check_kthr_stop && kthread_should_stop())) break; was_frozen = true; @@ -54,6 +59,13 @@ bool __refrigerator(bool check_kthr_stop) /* Remove the accounting blocker */ current->flags &= ~PF_FREEZING; + /* leave FROZEN */ + spin_lock_irq(&freezer_lock); + if (freezing(current)) + goto repeat; + current->flags &= ~PF_FROZEN; + spin_unlock_irq(&freezer_lock); + pr_debug("%s left refrigerator\n", current->comm); /* @@ -137,25 +149,19 @@ void cancel_freezing(struct task_struct *p) spin_unlock_irqrestore(&freezer_lock, flags); } -/* - * Wake up a frozen task - * - * task_lock() is needed to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. - */ void __thaw_task(struct task_struct *p) { unsigned long flags; + /* + * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to + * be visible to @p as waking up implies wmb. Waking up inside + * freezer_lock also prevents wakeups from leaking outside + * refrigerator. + */ spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) { - p->flags &= ~PF_FROZEN; + clear_freeze_flag(p); + if (frozen(p)) wake_up_process(p); - } else { - clear_freeze_flag(p); - } spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/power/process.c b/kernel/power/process.c index bd420ca..e6e2739 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -118,7 +118,8 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!wakeup && freezing(p) && !freezer_should_skip(p)) + if (!wakeup && !freezer_should_skip(p) && + freezing(p) && !frozen(p)) sched_show_task(p); cancel_freezing(p); } while_each_thread(g, p); -- cgit v0.10.2 From 85f1d476653f52c97ca75466b2494e67c1cbd25d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: test freezable conditions while holding freezer_lock try_to_freeze_tasks() and thaw_processes() use freezable() and frozen() as preliminary tests before initiating operations on a task. These are done without any synchronization and hinder with synchronization cleanup without any real performance benefits. In try_to_freeze_tasks(), open code self test and move PF_NOFREEZE and frozen() tests inside freezer_lock in freeze_task(). thaw_processes() can simply drop freezable() test as frozen() test in __thaw_task() is enough. Note: This used to be a part of larger patch to fix set_freezable() race. Separated out to satisfy ordering among dependent fixes. Signed-off-by: Tejun Heo Cc: Oleg Nesterov diff --git a/kernel/freezer.c b/kernel/freezer.c index a8822be..a257ecd 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -109,7 +109,8 @@ bool freeze_task(struct task_struct *p, bool sig_only) spin_lock_irqsave(&freezer_lock, flags); - if (sig_only && !should_send_signal(p)) + if ((p->flags & PF_NOFREEZE) || + (sig_only && !should_send_signal(p))) goto out_unlock; if (frozen(p)) diff --git a/kernel/power/process.c b/kernel/power/process.c index e6e2739..e59676f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,14 +22,6 @@ */ #define TIMEOUT (20 * HZ) -static inline int freezable(struct task_struct * p) -{ - if ((p == current) || - (p->flags & PF_NOFREEZE)) - return 0; - return 1; -} - static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; @@ -52,10 +44,7 @@ static int try_to_freeze_tasks(bool sig_only) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezable(p)) - continue; - - if (!freeze_task(p, sig_only)) + if (p == current || !freeze_task(p, sig_only)) continue; /* @@ -181,9 +170,6 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezable(p)) - continue; - if (cgroup_freezing_or_frozen(p)) continue; -- cgit v0.10.2 From 376fede80e74d98b49d1ba9ac18f23c9fd026ddd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: kill PF_FREEZING With the previous changes, there's no meaningful difference between PF_FREEZING and PF_FROZEN. Remove PF_FREEZING and use PF_FROZEN instead in task_contributes_to_load(). Signed-off-by: Tejun Heo diff --git a/include/linux/sched.h b/include/linux/sched.h index 68daf4f..d12bd03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -220,7 +220,7 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FREEZING) == 0) + (task->flags & PF_FROZEN) == 0) #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -1773,7 +1773,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ -#define PF_FREEZING 0x00004000 /* freeze in progress. do not account to load */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ diff --git a/kernel/freezer.c b/kernel/freezer.c index a257ecd..b8b5621 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,9 +44,6 @@ repeat: recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - /* prevent accounting of that task to load */ - current->flags |= PF_FREEZING; - for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!freezing(current) || @@ -56,9 +53,6 @@ repeat: schedule(); } - /* Remove the accounting blocker */ - current->flags &= ~PF_FREEZING; - /* leave FROZEN */ spin_lock_irq(&freezer_lock); if (freezing(current)) -- cgit v0.10.2 From 03afed8bc296fa70186ba832c1126228bb992465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: clean up freeze_processes() failure path freeze_processes() failure path is rather messy. Freezing is canceled for workqueues and tasks which aren't frozen yet but frozen tasks are left alone and should be thawed by the caller and of course some callers (xen and kexec) didn't do it. This patch updates __thaw_task() to handle cancelation correctly and makes freeze_processes() and freeze_kernel_threads() call thaw_processes() on failure instead so that the system is fully thawed on failure. Unnecessary [suspend_]thaw_processes() calls are removed from kernel/power/hibernate.c, suspend.c and user.c. While at it, restructure error checking if clause in suspend_prepare() to be less weird. -v2: Srivatsa spotted missing removal of suspend_thaw_processes() in suspend_prepare() and error in commit message. Updated. Signed-off-by: Tejun Heo Acked-by: Srivatsa S. Bhat diff --git a/include/linux/freezer.h b/include/linux/freezer.h index ba4f512..93f411a 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -61,7 +61,6 @@ static inline bool try_to_freeze(void) } extern bool freeze_task(struct task_struct *p, bool sig_only); -extern void cancel_freezing(struct task_struct *p); #ifdef CONFIG_CGROUP_FREEZER extern int cgroup_freezing_or_frozen(struct task_struct *task); diff --git a/kernel/freezer.c b/kernel/freezer.c index b8b5621..11e32d4 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -129,21 +129,6 @@ out_unlock: return ret; } -void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&freezer_lock, flags); - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock(&p->sighand->siglock); - recalc_sigpending_and_wake(p); - spin_unlock(&p->sighand->siglock); - } - spin_unlock_irqrestore(&freezer_lock, flags); -} - void __thaw_task(struct task_struct *p) { unsigned long flags; @@ -153,10 +138,18 @@ void __thaw_task(struct task_struct *p) * be visible to @p as waking up implies wmb. Waking up inside * freezer_lock also prevents wakeups from leaking outside * refrigerator. + * + * If !FROZEN, @p hasn't reached refrigerator, recalc sigpending to + * avoid leaving dangling TIF_SIGPENDING behind. */ spin_lock_irqsave(&freezer_lock, flags); clear_freeze_flag(p); - if (frozen(p)) + if (frozen(p)) { wake_up_process(p); + } else { + spin_lock(&p->sighand->siglock); + recalc_sigpending_and_wake(p); + spin_unlock(&p->sighand->siglock); + } spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 196c0126..ba2319f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -607,17 +607,6 @@ static void power_down(void) while(1); } -static int prepare_processes(void) -{ - int error = 0; - - if (freeze_processes()) { - error = -EBUSY; - thaw_processes(); - } - return error; -} - /** * hibernate - Carry out system hibernation, including saving the image. */ @@ -650,7 +639,7 @@ int hibernate(void) sys_sync(); printk("done.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) goto Finish; @@ -811,7 +800,7 @@ static int software_resume(void) goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) { swsusp_close(FMODE_READ); goto Done; diff --git a/kernel/power/process.c b/kernel/power/process.c index e59676f..ce64383 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -91,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs = elapsed_csecs64; if (todo) { - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ printk("\n"); printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", @@ -103,14 +98,11 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - thaw_workqueues(); - read_lock(&tasklist_lock); do_each_thread(g, p) { if (!wakeup && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - cancel_freezing(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } else { @@ -123,6 +115,8 @@ static int try_to_freeze_tasks(bool sig_only) /** * freeze_processes - Signal user space processes to enter the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_processes(void) { @@ -137,11 +131,15 @@ int freeze_processes(void) printk("\n"); BUG_ON(in_atomic()); + if (error) + thaw_processes(); return error; } /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_kernel_threads(void) { @@ -155,6 +153,8 @@ int freeze_kernel_threads(void) printk("\n"); BUG_ON(in_atomic()); + if (error) + thaw_processes(); return error; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4953dc0..d336b27 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -106,13 +106,11 @@ static int suspend_prepare(void) goto Finish; error = suspend_freeze_processes(); - if (error) { - suspend_stats.failed_freeze++; - dpm_save_failed_step(SUSPEND_FREEZE); - } else + if (!error) return 0; - suspend_thaw_processes(); + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); diff --git a/kernel/power/user.c b/kernel/power/user.c index 6d8f535..7cc3f5b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -257,10 +257,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; error = freeze_processes(); - if (error) { - thaw_processes(); + if (error) usermodehelper_enable(); - } if (!error) data->frozen = 1; break; -- cgit v0.10.2 From 22b4e111fa01a1147aa562ceaf18a752a928ef4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: cgroup_freezer: prepare for removal of TIF_FREEZE TIF_FREEZE will be removed soon and freezing() will directly test whether any freezing condition is in effect. Make the following changes in preparation. * Rename cgroup_freezing_or_frozen() to cgroup_freezing() and make it return bool. * Make cgroup_freezing() access task_freezer() under rcu read lock instead of task_lock(). This makes the state dereferencing racy against task moving to another cgroup; however, it was already racy without this change as ->state dereference wasn't synchronized. This will be later dealt with using attach hooks. * freezer->state is now set before trying to push tasks into the target state. -v2: Oleg pointed out that freeze_change_state() was setting freeze->state incorrectly to CGROUP_FROZEN instead of CGROUP_FREEZING. Fixed. -v3: Matt pointed out that setting CGROUP_FROZEN used to always invoke try_to_freeze_cgroup() regardless of the current state. Patch updated such that the actual freeze/thaw operations are always performed on invocation. This shouldn't make any difference unless something is broken. Signed-off-by: Tejun Heo Acked-by: Paul Menage Cc: Li Zefan Cc: Oleg Nesterov diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 93f411a..b2b4abc 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -63,11 +63,11 @@ static inline bool try_to_freeze(void) extern bool freeze_task(struct task_struct *p, bool sig_only); #ifdef CONFIG_CGROUP_FREEZER -extern int cgroup_freezing_or_frozen(struct task_struct *task); +extern bool cgroup_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline int cgroup_freezing_or_frozen(struct task_struct *task) +static inline bool cgroup_freezing(struct task_struct *task) { - return 0; + return false; } #endif /* !CONFIG_CGROUP_FREEZER */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index cd27b08..e6a1b8d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -static inline int __cgroup_freezing_or_frozen(struct task_struct *task) +bool cgroup_freezing(struct task_struct *task) { - enum freezer_state state = task_freezer(task)->state; - return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); -} + enum freezer_state state; + bool ret; -int cgroup_freezing_or_frozen(struct task_struct *task) -{ - int result; - task_lock(task); - result = __cgroup_freezing_or_frozen(task); - task_unlock(task); - return result; + rcu_read_lock(); + state = task_freezer(task)->state; + ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; + rcu_read_unlock(); + + return ret; } /* @@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys; * freezer_can_attach(): * cgroup_mutex (held by caller of can_attach) * - * cgroup_freezing_or_frozen(): - * task->alloc_lock (to get task's cgroup) - * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): * freezer->lock * sighand->siglock (if the cgroup is freezing) @@ -177,13 +172,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - rcu_read_lock(); - if (__cgroup_freezing_or_frozen(tsk)) { - rcu_read_unlock(); - return -EBUSY; - } - rcu_read_unlock(); - return 0; + return cgroup_freezing(tsk) ? -EBUSY : 0; } static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) @@ -279,7 +268,6 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; unsigned int num_cant_freeze_now = 0; - freezer->state = CGROUP_FREEZING; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) @@ -303,8 +291,6 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) __thaw_task(task); cgroup_iter_end(cgroup, &it); - - freezer->state = CGROUP_THAWED; } static int freezer_change_state(struct cgroup *cgroup, @@ -318,20 +304,20 @@ static int freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); update_if_frozen(cgroup, freezer); - if (goal_state == freezer->state) - goto out; switch (goal_state) { case CGROUP_THAWED: + freezer->state = CGROUP_THAWED; unfreeze_cgroup(cgroup, freezer); break; case CGROUP_FROZEN: + freezer->state = CGROUP_FREEZING; retval = try_to_freeze_cgroup(cgroup, freezer); break; default: BUG(); } -out: + spin_unlock_irq(&freezer->lock); return retval; diff --git a/kernel/power/process.c b/kernel/power/process.c index ce64383..9f6f5c7 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -170,7 +170,7 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (cgroup_freezing_or_frozen(p)) + if (cgroup_freezing(p)) continue; __thaw_task(p); -- cgit v0.10.2 From a3201227f803ad7fd43180c5195dbe5a2bf998aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE Using TIF_FREEZE for freezing worked when there was only single freezing condition (the PM one); however, now there is also the cgroup_freezer and single bit flag is getting clumsy. thaw_processes() is already testing whether cgroup freezing in in effect to avoid thawing tasks which were frozen by both PM and cgroup freezers. This is racy (nothing prevents race against cgroup freezing) and fragile. A much simpler way is to test actual freeze conditions from freezing() - ie. directly test whether PM or cgroup freezing is in effect. This patch adds variables to indicate whether and what type of freezing conditions are in effect and reimplements freezing() such that it directly tests whether any of the two freezing conditions is active and the task should freeze. On fast path, freezing() is still very cheap - it only tests system_freezing_cnt. This makes the clumsy dancing aroung TIF_FREEZE unnecessary and freeze/thaw operations more usual - updating state variables for the new state and nudging target tasks so that they notice the new state and comply. As long as the nudging happens after state update, it's race-free. * This allows use of freezing() in freeze_task(). Replace the open coded tests with freezing(). * p != current test is added to warning printing conditions in try_to_freeze_tasks() failure path. This is necessary as freezing() is now true for the task which initiated freezing too. -v2: Oleg pointed out that re-freezing FROZEN cgroup could increment system_freezing_cnt. Fixed. Signed-off-by: Tejun Heo Acked-by: Paul Menage (for the cgroup portions) diff --git a/include/linux/freezer.h b/include/linux/freezer.h index b2b4abc..8e29f2b 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -5,8 +5,13 @@ #include #include +#include #ifdef CONFIG_FREEZER +extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */ +extern bool pm_freezing; /* PM freezing in effect */ +extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ + /* * Check if a process has been frozen */ @@ -15,28 +20,16 @@ static inline int frozen(struct task_struct *p) return p->flags & PF_FROZEN; } -/* - * Check if there is a request to freeze a process - */ -static inline int freezing(struct task_struct *p) -{ - return test_tsk_thread_flag(p, TIF_FREEZE); -} +extern bool freezing_slow_path(struct task_struct *p); /* - * Request that a process be frozen - */ -static inline void set_freeze_flag(struct task_struct *p) -{ - set_tsk_thread_flag(p, TIF_FREEZE); -} - -/* - * Sometimes we may need to cancel the previous 'freeze' request + * Check if there is a request to freeze a process */ -static inline void clear_freeze_flag(struct task_struct *p) +static inline bool freezing(struct task_struct *p) { - clear_tsk_thread_flag(p, TIF_FREEZE); + if (likely(!atomic_read(&system_freezing_cnt))) + return false; + return freezing_slow_path(p); } static inline bool should_send_signal(struct task_struct *p) @@ -174,9 +167,7 @@ static inline void set_freezable_with_signal(void) }) #else /* !CONFIG_FREEZER */ static inline int frozen(struct task_struct *p) { return 0; } -static inline int freezing(struct task_struct *p) { return 0; } -static inline void set_freeze_flag(struct task_struct *p) {} -static inline void clear_freeze_flag(struct task_struct *p) {} +static inline bool freezing(struct task_struct *p) { return false; } static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e6a1b8d..2327ad1 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -145,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, static void freezer_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup) { - kfree(cgroup_freezer(cgroup)); + struct freezer *freezer = cgroup_freezer(cgroup); + + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + kfree(freezer); } /* @@ -307,10 +311,14 @@ static int freezer_change_state(struct cgroup *cgroup, switch (goal_state) { case CGROUP_THAWED: + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); freezer->state = CGROUP_THAWED; unfreeze_cgroup(cgroup, freezer); break; case CGROUP_FROZEN: + if (freezer->state == CGROUP_THAWED) + atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; retval = try_to_freeze_cgroup(cgroup, freezer); break; diff --git a/kernel/fork.c b/kernel/fork.c index ba0d172..d53316e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -997,7 +997,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; - clear_freeze_flag(p); } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) diff --git a/kernel/freezer.c b/kernel/freezer.c index 11e32d4..f53cd5a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -11,9 +11,41 @@ #include #include +/* total number of freezing conditions in effect */ +atomic_t system_freezing_cnt = ATOMIC_INIT(0); +EXPORT_SYMBOL(system_freezing_cnt); + +/* indicate whether PM freezing is in effect, protected by pm_mutex */ +bool pm_freezing; +bool pm_nosig_freezing; + /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); +/** + * freezing_slow_path - slow path for testing whether a task needs to be frozen + * @p: task to be tested + * + * This function is called by freezing() if system_freezing_cnt isn't zero + * and tests whether @p needs to enter and stay in frozen state. Can be + * called under any context. The freezers are responsible for ensuring the + * target tasks see the updated state. + */ +bool freezing_slow_path(struct task_struct *p) +{ + if (p->flags & PF_NOFREEZE) + return false; + + if (pm_nosig_freezing || cgroup_freezing(p)) + return true; + + if (pm_freezing && !(p->flags & PF_FREEZER_NOSIG)) + return true; + + return false; +} +EXPORT_SYMBOL(freezing_slow_path); + /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { @@ -23,17 +55,11 @@ bool __refrigerator(bool check_kthr_stop) long save; /* - * Enter FROZEN. If NOFREEZE, schedule immediate thawing by - * clearing freezing. + * No point in checking freezing() again - the caller already did. + * Proceed to enter FROZEN. */ spin_lock_irq(&freezer_lock); repeat: - if (!freezing(current)) { - spin_unlock_irq(&freezer_lock); - return was_frozen; - } - if (current->flags & PF_NOFREEZE) - clear_freeze_flag(current); current->flags |= PF_FROZEN; spin_unlock_irq(&freezer_lock); @@ -99,18 +125,12 @@ static void fake_signal_wake_up(struct task_struct *p) bool freeze_task(struct task_struct *p, bool sig_only) { unsigned long flags; - bool ret = false; spin_lock_irqsave(&freezer_lock, flags); - - if ((p->flags & PF_NOFREEZE) || - (sig_only && !should_send_signal(p))) - goto out_unlock; - - if (frozen(p)) - goto out_unlock; - - set_freeze_flag(p); + if (!freezing(p) || frozen(p)) { + spin_unlock_irqrestore(&freezer_lock, flags); + return false; + } if (should_send_signal(p)) { fake_signal_wake_up(p); @@ -123,10 +143,9 @@ bool freeze_task(struct task_struct *p, bool sig_only) } else { wake_up_state(p, TASK_INTERRUPTIBLE); } - ret = true; -out_unlock: + spin_unlock_irqrestore(&freezer_lock, flags); - return ret; + return true; } void __thaw_task(struct task_struct *p) @@ -143,7 +162,6 @@ void __thaw_task(struct task_struct *p) * avoid leaving dangling TIF_SIGPENDING behind. */ spin_lock_irqsave(&freezer_lock, flags); - clear_freeze_flag(p); if (frozen(p)) { wake_up_process(p); } else { diff --git a/kernel/power/process.c b/kernel/power/process.c index 9f6f5c7..0beb51e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -101,7 +101,7 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { if (!wakeup && !freezer_should_skip(p) && - freezing(p) && !frozen(p)) + p != current && freezing(p) && !frozen(p)) sched_show_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -122,7 +122,11 @@ int freeze_processes(void) { int error; + if (!pm_freezing) + atomic_inc(&system_freezing_cnt); + printk("Freezing user space processes ... "); + pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) { printk("done."); @@ -146,6 +150,7 @@ int freeze_kernel_threads(void) int error; printk("Freezing remaining freezable tasks ... "); + pm_nosig_freezing = true; error = try_to_freeze_tasks(false); if (!error) printk("done."); @@ -162,6 +167,11 @@ void thaw_processes(void) { struct task_struct *g, *p; + if (pm_freezing) + atomic_dec(&system_freezing_cnt); + pm_freezing = false; + pm_nosig_freezing = false; + oom_killer_enable(); printk("Restarting tasks ... "); @@ -170,9 +180,6 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (cgroup_freezing(p)) - continue; - __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -- cgit v0.10.2 From d88e4cb67197d007fb778d62fe17360e970d5bfa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: remove now unused TIF_FREEZE Signed-off-by: Tejun Heo Cc: Arnd Bergmann Cc: linux-arch@vger.kernel.org diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index ff73db0..28335bd 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -79,7 +79,6 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TIF_UAC_SIGBUS 12 /* ! userspace part of 'osf_sysinfo' */ #define TIF_MEMDIE 13 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 14 /* restore signal mask in do_signal */ -#define TIF_FREEZE 16 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1< Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: remove should_send_signal() and update frozen() should_send_signal() is only used in freezer.c. Exporting them only increases chance of abuse. Open code the two users and remove it. Update frozen() to return bool. Signed-off-by: Tejun Heo diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 8e29f2b..3d50913 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -15,7 +15,7 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ /* * Check if a process has been frozen */ -static inline int frozen(struct task_struct *p) +static inline bool frozen(struct task_struct *p) { return p->flags & PF_FROZEN; } @@ -32,11 +32,6 @@ static inline bool freezing(struct task_struct *p) return freezing_slow_path(p); } -static inline bool should_send_signal(struct task_struct *p) -{ - return !(p->flags & PF_FREEZER_NOSIG); -} - /* Takes and releases task alloc lock using task_lock() */ extern void __thaw_task(struct task_struct *t); @@ -166,7 +161,7 @@ static inline void set_freezable_with_signal(void) __retval; \ }) #else /* !CONFIG_FREEZER */ -static inline int frozen(struct task_struct *p) { return 0; } +static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } static inline bool __refrigerator(bool check_kthr_stop) { return false; } diff --git a/kernel/freezer.c b/kernel/freezer.c index f53cd5a..95a1238 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -132,7 +132,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) return false; } - if (should_send_signal(p)) { + if (!(p->flags & PF_FREEZER_NOSIG)) { fake_signal_wake_up(p); /* * fake_signal_wake_up() goes through p's scheduler -- cgit v0.10.2 From 96ee6d8539c9fc6742908d85eb9723abb5c91854 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: fix set_freezable[_with_signal]() race A kthread doing set_freezable*() may race with on-going PM freeze and the freezer might think all tasks are frozen while the new freezable kthread is merrily proceeding to execute code paths which aren't supposed to be executing during PM freeze. Reimplement set_freezable[_with_signal]() using __set_freezable() such that freezable PF flags are modified under freezer_lock and try_to_freeze() is called afterwards. This eliminates race condition against freezing. Note: Separated out from larger patch to resolve fix order dependency Oleg pointed out. Signed-off-by: Tejun Heo Cc: Oleg Nesterov diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 3d50913..a0f1b3a 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -49,6 +49,7 @@ static inline bool try_to_freeze(void) } extern bool freeze_task(struct task_struct *p, bool sig_only); +extern bool __set_freezable(bool with_signal); #ifdef CONFIG_CGROUP_FREEZER extern bool cgroup_freezing(struct task_struct *task); @@ -106,18 +107,18 @@ static inline int freezer_should_skip(struct task_struct *p) /* * Tell the freezer that the current task should be frozen by it */ -static inline void set_freezable(void) +static inline bool set_freezable(void) { - current->flags &= ~PF_NOFREEZE; + return __set_freezable(false); } /* * Tell the freezer that the current task should be frozen by it and that it * should send a fake signal to the task to freeze it. */ -static inline void set_freezable_with_signal(void) +static inline bool set_freezable_with_signal(void) { - current->flags &= ~(PF_NOFREEZE | PF_FREEZER_NOSIG); + return __set_freezable(true); } /* diff --git a/kernel/freezer.c b/kernel/freezer.c index 95a1238..b1e7a7b 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -171,3 +171,28 @@ void __thaw_task(struct task_struct *p) } spin_unlock_irqrestore(&freezer_lock, flags); } + +/** + * __set_freezable - make %current freezable + * @with_signal: do we want %TIF_SIGPENDING for notification too? + * + * Mark %current freezable and enter refrigerator if necessary. + */ +bool __set_freezable(bool with_signal) +{ + might_sleep(); + + /* + * Modify flags while holding freezer_lock. This ensures the + * freezer notices that we aren't frozen yet or the freezing + * condition is visible to try_to_freeze() below. + */ + spin_lock_irq(&freezer_lock); + current->flags &= ~PF_NOFREEZE; + if (with_signal) + current->flags &= ~PF_FREEZER_NOSIG; + spin_unlock_irq(&freezer_lock); + + return try_to_freeze(); +} +EXPORT_SYMBOL(__set_freezable); -- cgit v0.10.2 From 5ece3eae4cdb968c269e0bc7e2c0e2b223552025 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:26 -0800 Subject: freezer: restructure __refrigerator() If another freeze happens before all tasks leave FROZEN state after being thawed, the freezer can see the existing FROZEN and consider the tasks to be frozen but they can clear FROZEN without checking the new freezing(). Oleg suggested restructuring __refrigerator() such that there's single condition check section inside freezer_lock and sigpending is cleared afterwards, which fixes the problem and simplifies the code. Restructure accordingly. -v2: Frozen loop exited without releasing freezer_lock. Fixed. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: "Rafael J. Wysocki" diff --git a/kernel/freezer.c b/kernel/freezer.c index b1e7a7b..c3496c6 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -52,39 +52,29 @@ bool __refrigerator(bool check_kthr_stop) /* Hmm, should we be allowed to suspend when there are realtime processes around? */ bool was_frozen = false; - long save; + long save = current->state; - /* - * No point in checking freezing() again - the caller already did. - * Proceed to enter FROZEN. - */ - spin_lock_irq(&freezer_lock); -repeat: - current->flags |= PF_FROZEN; - spin_unlock_irq(&freezer_lock); - - save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock_irq(&freezer_lock); + current->flags |= PF_FROZEN; if (!freezing(current) || (check_kthr_stop && kthread_should_stop())) + current->flags &= ~PF_FROZEN; + spin_unlock_irq(&freezer_lock); + + if (!(current->flags & PF_FROZEN)) break; was_frozen = true; schedule(); } - /* leave FROZEN */ - spin_lock_irq(&freezer_lock); - if (freezing(current)) - goto repeat; - current->flags &= ~PF_FROZEN; - spin_unlock_irq(&freezer_lock); + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); pr_debug("%s left refrigerator\n", current->comm); -- cgit v0.10.2 From 37ad8aca94a1da2112a7c56151390914e80d1113 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:26 -0800 Subject: freezer: use lock_task_sighand() in fake_signal_wake_up() cgroup_freezer calls freeze_task() without holding tasklist_lock and, if the task is exiting, its ->sighand may be gone by the time fake_signal_wake_up() is called. Use lock_task_sighand() instead of accessing ->sighand directly. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: "Rafael J. Wysocki" Cc: Paul Menage diff --git a/kernel/freezer.c b/kernel/freezer.c index c3496c6..389549f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -93,9 +93,10 @@ static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + if (lock_task_sighand(p, &flags)) { + signal_wake_up(p, 0); + unlock_task_sighand(p, &flags); + } } /** -- cgit v0.10.2 From 839e3407d90a810318d17c17ceb3d5928a910704 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:26 -0800 Subject: freezer: remove unused @sig_only from freeze_task() After "freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE", freezing() returns authoritative answer on whether the current task should freeze or not and freeze_task() doesn't need or use @sig_only. Remove it. While at it, rewrite function comment for freeze_task() and rename @sig_only to @user_only in try_to_freeze_tasks(). This patch doesn't cause any functional change. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov diff --git a/include/linux/freezer.h b/include/linux/freezer.h index a0f1b3a..a28842e 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -48,7 +48,7 @@ static inline bool try_to_freeze(void) return __refrigerator(false); } -extern bool freeze_task(struct task_struct *p, bool sig_only); +extern bool freeze_task(struct task_struct *p); extern bool __set_freezable(bool with_signal); #ifdef CONFIG_CGROUP_FREEZER diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2327ad1..e411a60 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -206,7 +206,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* Locking avoids race with FREEZING -> THAWED transitions. */ if (freezer->state == CGROUP_FREEZING) - freeze_task(task, true); + freeze_task(task); spin_unlock_irq(&freezer->lock); } @@ -274,7 +274,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task, true)) + if (!freeze_task(task)) continue; if (frozen(task)) continue; diff --git a/kernel/freezer.c b/kernel/freezer.c index 389549f..2589a61 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -100,20 +100,17 @@ static void fake_signal_wake_up(struct task_struct *p) } /** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * freeze_task - send a freeze request to given task + * @p: task to send the request to * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. + * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE + * flag and either sending a fake signal to it or waking it up, depending + * on whether it has %PF_FREEZER_NOSIG set. + * + * RETURNS: + * %false, if @p is not freezing or already frozen; %true, otherwise */ -bool freeze_task(struct task_struct *p, bool sig_only) +bool freeze_task(struct task_struct *p) { unsigned long flags; diff --git a/kernel/power/process.c b/kernel/power/process.c index 0beb51e..77274c9 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,7 +22,7 @@ */ #define TIMEOUT (20 * HZ) -static int try_to_freeze_tasks(bool sig_only) +static int try_to_freeze_tasks(bool user_only) { struct task_struct *g, *p; unsigned long end_time; @@ -37,14 +37,14 @@ static int try_to_freeze_tasks(bool sig_only) end_time = jiffies + TIMEOUT; - if (!sig_only) + if (!user_only) freeze_workqueues_begin(); while (true) { todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (p == current || !freeze_task(p, sig_only)) + if (p == current || !freeze_task(p)) continue; /* @@ -65,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only) } while_each_thread(g, p); read_unlock(&tasklist_lock); - if (!sig_only) { + if (!user_only) { wq_busy = freeze_workqueues_busy(); todo += wq_busy; } -- cgit v0.10.2 From ec012476af73a1a8a82565a915e9b48c2e337878 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:26 -0800 Subject: usb_storage: don't use set_freezable_with_signal() The current implementation of set_freezable_with_signal() is buggy and tricky to get right. usb-storage is the only user and its use can be avoided trivially. All usb-storage wants is to be able to sleep with timeout and get woken up if freezing() becomes true. This can be trivially implemented by doing interruptible wait w/ freezing() included in the wait condition. There's no reason to use set_freezable_with_signal(). Perform interruptible wait on freezing() instead of using set_freezable_with_signal(), which is scheduled for removal. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: "Rafael J. Wysocki" Cc: Seth Forshee Cc: Alan Stern Cc: Greg Kroah-Hartman diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c index c325e69..aa84b3d 100644 --- a/drivers/usb/storage/usb.c +++ b/drivers/usb/storage/usb.c @@ -831,7 +831,8 @@ static int usb_stor_scan_thread(void * __us) dev_dbg(dev, "device found\n"); - set_freezable_with_signal(); + set_freezable(); + /* * Wait for the timeout to expire or for a disconnect * @@ -839,16 +840,16 @@ static int usb_stor_scan_thread(void * __us) * fail to freeze, but we can't be non-freezable either. Nor can * khubd freeze while waiting for scanning to complete as it may * hold the device lock, causing a hang when suspending devices. - * So we request a fake signal when freezing and use - * interruptible sleep to kick us out of our wait early when - * freezing happens. + * So instead of using wait_event_freezable(), explicitly test + * for (DONT_SCAN || freezing) in interruptible wait and proceed + * if any of DONT_SCAN, freezing or timeout has happened. */ if (delay_use > 0) { dev_dbg(dev, "waiting for device to settle " "before scanning\n"); wait_event_interruptible_timeout(us->delay_wait, - test_bit(US_FLIDX_DONT_SCAN, &us->dflags), - delay_use * HZ); + test_bit(US_FLIDX_DONT_SCAN, &us->dflags) || + freezing(current), delay_use * HZ); } /* If the device is still connected, perform the scanning */ -- cgit v0.10.2 From adfa543e7314b36ac55a40019977de6e47946dd7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Nov 2011 09:28:16 -0800 Subject: dmatest: don't use set_freezable_with_signal() Commit 981ed70d8e (dmatest: make dmatest threads freezable) made dmatest kthread use set_freezable_with_signal(); however, the interface is scheduled to be removed in the next merge window. The problem is that unlike userland tasks there's no default place which handles signal pending state and it isn't clear who owns and/or is responsible for clearing TIF_SIGPENDING. For example, in the current code, try_to_freeze() clears TIF_SIGPENDING but it isn't sure whether it actually owns the TIF_SIGPENDING nor is it race-free - ie. the task may continue to run with TIF_SIGPENDING set after the freezable section. Unfortunately, we don't have wait_for_completion_freezable_timeout(). This patch open codes it and uses wait_event_freezable_timeout() instead and removes timeout reloading - wait_event_freezable_timeout() won't return across freezing events (currently racy but fix scheduled) and timer doesn't decrement while the task is in freezer. Although this does lose timer-reset-over-freezing, given that timeout is supposed to be long enough and failure to finish inside is considered irrecoverable, I don't think this is worth the complexity. While at it, move completion to outer scope and explain that we're ignoring dangling pointer problem after timeout. This should give slightly better chance at avoiding oops after timeout. Signed-off-by: Tejun Heo Acked-by: Dan Williams Cc: Guennadi Liakhovetski Cc: Nicolas Ferre diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index eb1d864..2b8661b 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -214,9 +214,18 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start, return error_count; } -static void dmatest_callback(void *completion) +/* poor man's completion - we want to use wait_event_freezable() on it */ +struct dmatest_done { + bool done; + wait_queue_head_t *wait; +}; + +static void dmatest_callback(void *arg) { - complete(completion); + struct dmatest_done *done = arg; + + done->done = true; + wake_up_all(done->wait); } /* @@ -235,7 +244,9 @@ static void dmatest_callback(void *completion) */ static int dmatest_func(void *data) { + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(done_wait); struct dmatest_thread *thread = data; + struct dmatest_done done = { .wait = &done_wait }; struct dma_chan *chan; const char *thread_name; unsigned int src_off, dst_off, len; @@ -252,7 +263,7 @@ static int dmatest_func(void *data) int i; thread_name = current->comm; - set_freezable_with_signal(); + set_freezable(); ret = -ENOMEM; @@ -306,9 +317,6 @@ static int dmatest_func(void *data) struct dma_async_tx_descriptor *tx = NULL; dma_addr_t dma_srcs[src_cnt]; dma_addr_t dma_dsts[dst_cnt]; - struct completion cmp; - unsigned long start, tmo, end = 0 /* compiler... */; - bool reload = true; u8 align = 0; total_tests++; @@ -391,9 +399,9 @@ static int dmatest_func(void *data) continue; } - init_completion(&cmp); + done.done = false; tx->callback = dmatest_callback; - tx->callback_param = &cmp; + tx->callback_param = &done; cookie = tx->tx_submit(tx); if (dma_submit_error(cookie)) { @@ -407,20 +415,20 @@ static int dmatest_func(void *data) } dma_async_issue_pending(chan); - do { - start = jiffies; - if (reload) - end = start + msecs_to_jiffies(timeout); - else if (end <= start) - end = start + 1; - tmo = wait_for_completion_interruptible_timeout(&cmp, - end - start); - reload = try_to_freeze(); - } while (tmo == -ERESTARTSYS); + wait_event_freezable_timeout(done_wait, done.done, + msecs_to_jiffies(timeout)); status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); - if (tmo == 0) { + if (!done.done) { + /* + * We're leaving the timed out dma operation with + * dangling pointer to done_wait. To make this + * correct, we'll need to allocate wait_done for + * each test iteration and perform "who's gonna + * free it this time?" dancing. For now, just + * leave it dangling. + */ pr_warning("%s: #%u: test timed out\n", thread_name, total_tests - 1); failed_tests++; -- cgit v0.10.2 From 34b087e48367c252e343c2f8de65676a78af1e4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Nov 2011 09:28:17 -0800 Subject: freezer: kill unused set_freezable_with_signal() There's no in-kernel user of set_freezable_with_signal() left. Mixing TIF_SIGPENDING with kernel threads can lead to nasty corner cases as kernel threads never travel signal delivery path on their own. e.g. the current implementation is buggy in the cancelation path of __thaw_task(). It calls recalc_sigpending_and_wake() in an attempt to clear TIF_SIGPENDING but the function never clears it regardless of sigpending state. This means that signallable freezable kthreads may continue executing with !freezing() && stuck TIF_SIGPENDING, which can be troublesome. This patch removes set_freezable_with_signal() along with PF_FREEZER_NOSIG and recalc_sigpending*() calls in freezer. User tasks get TIF_SIGPENDING, kernel tasks get woken up and the spurious sigpending is dealt with in the usual signal delivery path. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov diff --git a/include/linux/freezer.h b/include/linux/freezer.h index a28842e..a33550f 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -49,7 +49,7 @@ static inline bool try_to_freeze(void) } extern bool freeze_task(struct task_struct *p); -extern bool __set_freezable(bool with_signal); +extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER extern bool cgroup_freezing(struct task_struct *task); @@ -105,23 +105,6 @@ static inline int freezer_should_skip(struct task_struct *p) } /* - * Tell the freezer that the current task should be frozen by it - */ -static inline bool set_freezable(void) -{ - return __set_freezable(false); -} - -/* - * Tell the freezer that the current task should be frozen by it and that it - * should send a fake signal to the task to freeze it. - */ -static inline bool set_freezable_with_signal(void) -{ - return __set_freezable(true); -} - -/* * Freezer-friendly wrappers around wait_event_interruptible(), * wait_event_killable() and wait_event_interruptible_timeout(), originally * defined in @@ -176,7 +159,6 @@ static inline void freezer_do_not_count(void) {} static inline void freezer_count(void) {} static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} -static inline void set_freezable_with_signal(void) {} #define wait_event_freezable(wq, condition) \ wait_event_interruptible(wq, condition) diff --git a/include/linux/sched.h b/include/linux/sched.h index d12bd03..2f90470 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1788,7 +1788,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ -#define PF_FREEZER_NOSIG 0x80000000 /* Freezer won't send signals to it */ /* * Only the _current_ task can read/write to tsk->flags, but other diff --git a/kernel/freezer.c b/kernel/freezer.c index 2589a61..9815b8d 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -39,7 +39,7 @@ bool freezing_slow_path(struct task_struct *p) if (pm_nosig_freezing || cgroup_freezing(p)) return true; - if (pm_freezing && !(p->flags & PF_FREEZER_NOSIG)) + if (pm_freezing && !(p->flags & PF_KTHREAD)) return true; return false; @@ -72,10 +72,6 @@ bool __refrigerator(bool check_kthr_stop) schedule(); } - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - pr_debug("%s left refrigerator\n", current->comm); /* @@ -120,7 +116,7 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_FREEZER_NOSIG)) { + if (!(p->flags & PF_KTHREAD)) { fake_signal_wake_up(p); /* * fake_signal_wake_up() goes through p's scheduler @@ -145,28 +141,19 @@ void __thaw_task(struct task_struct *p) * be visible to @p as waking up implies wmb. Waking up inside * freezer_lock also prevents wakeups from leaking outside * refrigerator. - * - * If !FROZEN, @p hasn't reached refrigerator, recalc sigpending to - * avoid leaving dangling TIF_SIGPENDING behind. */ spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) { + if (frozen(p)) wake_up_process(p); - } else { - spin_lock(&p->sighand->siglock); - recalc_sigpending_and_wake(p); - spin_unlock(&p->sighand->siglock); - } spin_unlock_irqrestore(&freezer_lock, flags); } /** - * __set_freezable - make %current freezable - * @with_signal: do we want %TIF_SIGPENDING for notification too? + * set_freezable - make %current freezable * * Mark %current freezable and enter refrigerator if necessary. */ -bool __set_freezable(bool with_signal) +bool set_freezable(void) { might_sleep(); @@ -177,10 +164,8 @@ bool __set_freezable(bool with_signal) */ spin_lock_irq(&freezer_lock); current->flags &= ~PF_NOFREEZE; - if (with_signal) - current->flags &= ~PF_FREEZER_NOSIG; spin_unlock_irq(&freezer_lock); return try_to_freeze(); } -EXPORT_SYMBOL(__set_freezable); +EXPORT_SYMBOL(set_freezable); diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c36dea..3d3de633 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -282,7 +282,7 @@ int kthreadd(void *unused) set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_states[N_HIGH_MEMORY]); - current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; + current->flags |= PF_NOFREEZE; for (;;) { set_current_state(TASK_INTERRUPTIBLE); -- cgit v0.10.2 From 24b7ead3fb0bae267c2ee50898eb4c13aedd1e9f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Nov 2011 09:28:17 -0800 Subject: freezer: fix wait_event_freezable/__thaw_task races wait_event_freezable() and friends stop the waiting if try_to_freeze() fails. This is not right, we can race with __thaw_task() and in this case - wait_event_freezable() returns the wrong ERESTARTSYS - wait_event_freezable_timeout() can return the positive value while condition == F Change the code to always check __retval/condition before return. Note: with or without this patch the timeout logic looks strange, probably we should recalc timeout if try_to_freeze() returns T. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo diff --git a/include/linux/freezer.h b/include/linux/freezer.h index a33550f..09570ac 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -122,28 +122,30 @@ static inline int freezer_should_skip(struct task_struct *p) #define wait_event_freezable(wq, condition) \ ({ \ int __retval; \ - do { \ + for (;;) { \ __retval = wait_event_interruptible(wq, \ (condition) || freezing(current)); \ - if (__retval && !freezing(current)) \ + if (__retval || (condition)) \ break; \ - else if (!(condition)) \ - __retval = -ERESTARTSYS; \ - } while (try_to_freeze()); \ + try_to_freeze(); \ + } \ __retval; \ }) - #define wait_event_freezable_timeout(wq, condition, timeout) \ ({ \ long __retval = timeout; \ - do { \ + for (;;) { \ __retval = wait_event_interruptible_timeout(wq, \ (condition) || freezing(current), \ __retval); \ - } while (try_to_freeze()); \ + if (__retval <= 0 || (condition)) \ + break; \ + try_to_freeze(); \ + } \ __retval; \ }) + #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } -- cgit v0.10.2 From 341d4166175e9b7911444f5a33b1c9efb8f15c85 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 19 Nov 2011 14:39:01 +0100 Subject: PM: Fix indentation and remove extraneous whitespaces in kernel/power/main.c Lack of proper indentation of the goto statement decreases the readability of code significantly. In fact, this made me look twice at the code to check whether it really does what it should be doing. Fix this. And in the same file, there are some extra whitespaces. Get rid of them too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/kernel/power/main.c b/kernel/power/main.c index 36e0f09..7d36fb3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * + * * This file is released under the GPLv2 * */ @@ -240,7 +240,7 @@ struct kobject *power_kobj; * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and * 'disk' (Suspend-to-Disk). * - * store() accepts one of those strings, translates it into the + * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, /* First, check if we are requested to hibernate */ if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); - goto Exit; + goto Exit; } #ifdef CONFIG_SUSPEND -- cgit v0.10.2 From 6a76b7a9cc93dec6ae58d70f1257d234291908e0 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 21 Nov 2011 23:32:35 +0100 Subject: PM / Memory-hotplug: Avoid task freezing failures The lock_system_sleep() function is used in the memory hotplug code at several places in order to implement mutual exclusion with hibernation. However, this function tries to acquire the 'pm_mutex' lock using mutex_lock() and hence blocks in TASK_UNINTERRUPTIBLE state if it doesn't get the lock. This would lead to task freezing failures and hence hibernation failure as a consequence, even though the hibernation call path successfully acquired the lock. But it is to be noted that, since this task tries to acquire pm_mutex, if it blocks due to this, we are *100% sure* that this task is not going to run as long as hibernation sequence is in progress, since hibernation releases 'pm_mutex' only at the very end, when everything is done. And this means, this task is going to be anyway blocked for much more longer than what the freezer intends to achieve; which means, freezing and thawing doesn't really make any difference to this task! So, to fix freezing failures, we just ask the freezer to skip freezing this task, since it is already "frozen enough". But instead of calling freezer_do_not_count() and freezer_count() as it is, we use only the relevant parts of those functions, because restrictions such as 'the task should be a userspace one' etc., might not be relevant in this scenario. Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Signed-off-by: Rafael J. Wysocki diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 57a6924..1f7fff4 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -380,12 +380,16 @@ static inline void unlock_system_sleep(void) {} static inline void lock_system_sleep(void) { + /* simplified freezer_do_not_count() */ + current->flags |= PF_FREEZER_SKIP; mutex_lock(&pm_mutex); } static inline void unlock_system_sleep(void) { mutex_unlock(&pm_mutex); + /* simplified freezer_count() */ + current->flags &= ~PF_FREEZER_SKIP; } #endif -- cgit v0.10.2 From d74e278aaf3b0fe4b02af67055aa71babcc0cebe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Nov 2011 23:33:28 +0100 Subject: PM / Sleep: Remove unnecessary label and jumps to it form PM core code The "End" label in device_prepare() in drivers/base/power/main.c is not necessary and the jumps to it have no real effect, so remove them all. Signed-off-by: Rafael J. Wysocki Reviewed-by: Srivatsa S. Bhat diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index c3d2dfc..1172aea 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1033,22 +1033,16 @@ static int device_prepare(struct device *dev, pm_message_t state) if (dev->pm_domain->ops.prepare) error = dev->pm_domain->ops.prepare(dev); suspend_report_result(dev->pm_domain->ops.prepare, error); - if (error) - goto End; } else if (dev->type && dev->type->pm) { pm_dev_dbg(dev, state, "preparing type "); if (dev->type->pm->prepare) error = dev->type->pm->prepare(dev); suspend_report_result(dev->type->pm->prepare, error); - if (error) - goto End; } else if (dev->class && dev->class->pm) { pm_dev_dbg(dev, state, "preparing class "); if (dev->class->pm->prepare) error = dev->class->pm->prepare(dev); suspend_report_result(dev->class->pm->prepare, error); - if (error) - goto End; } else if (dev->bus && dev->bus->pm) { pm_dev_dbg(dev, state, "preparing "); if (dev->bus->pm->prepare) @@ -1056,7 +1050,6 @@ static int device_prepare(struct device *dev, pm_message_t state) suspend_report_result(dev->bus->pm->prepare, error); } - End: device_unlock(dev); return error; -- cgit v0.10.2 From 64e94aafb6a5c4f419e9b8f93950914b5ac162a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Nov 2011 23:33:55 +0100 Subject: PM / Sleep: Simplify device_suspend_noirq() Remove a few if () and return statements in device_suspend_noirq() that aren't really necessary. Signed-off-by: Rafael J. Wysocki Reviewed-by: Srivatsa S. Bhat diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 1172aea..406f82c 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -763,31 +763,23 @@ static pm_message_t resume_event(pm_message_t sleep_state) */ static int device_suspend_noirq(struct device *dev, pm_message_t state) { - int error; + int error = 0; if (dev->pm_domain) { pm_dev_dbg(dev, state, "LATE power domain "); error = pm_noirq_op(dev, &dev->pm_domain->ops, state); - if (error) - return error; } else if (dev->type && dev->type->pm) { pm_dev_dbg(dev, state, "LATE type "); error = pm_noirq_op(dev, dev->type->pm, state); - if (error) - return error; } else if (dev->class && dev->class->pm) { pm_dev_dbg(dev, state, "LATE class "); error = pm_noirq_op(dev, dev->class->pm, state); - if (error) - return error; } else if (dev->bus && dev->bus->pm) { pm_dev_dbg(dev, state, "LATE "); error = pm_noirq_op(dev, dev->bus->pm, state); - if (error) - return error; } - return 0; + return error; } /** -- cgit v0.10.2 From 953a206393b1533ceb0e7d725cc5a8c8d7ed97dd Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 22 Nov 2011 23:20:31 +0100 Subject: PM / Hibernate: Refactor and simplify hibernation_snapshot() code The goto statements in hibernation_snapshot() are a bit complex. Refactor the code to remove some of them, thereby simplifying the implementation. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a6b0503..ebf62c3 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -333,7 +333,7 @@ static int create_image(int platform_mode) */ int hibernation_snapshot(int platform_mode) { - pm_message_t msg = PMSG_RECOVER; + pm_message_t msg; int error; error = platform_begin(platform_mode); @@ -362,26 +362,26 @@ int hibernation_snapshot(int platform_mode) error = dpm_prepare(PMSG_FREEZE); if (error) { - dpm_complete(msg); + dpm_complete(PMSG_RECOVER); goto Cleanup; } suspend_console(); pm_restrict_gfp_mask(); + error = dpm_suspend(PMSG_FREEZE); - if (error) - goto Recover_platform; - if (hibernation_test(TEST_DEVICES)) - goto Recover_platform; + if (error || hibernation_test(TEST_DEVICES)) + platform_recover(platform_mode); + else + error = create_image(platform_mode); - error = create_image(platform_mode); /* - * Control returns here (1) after the image has been created or the + * In the case that we call create_image() above, the control + * returns here (1) after the image has been created or the * image creation has failed and (2) after a successful restore. */ - Resume_devices: /* We may need to release the preallocated image pages here. */ if (error || !in_suspend) swsusp_free(); @@ -399,10 +399,6 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; - Recover_platform: - platform_recover(platform_mode); - goto Resume_devices; - Cleanup: swsusp_free(); goto Close; -- cgit v0.10.2 From 62c9ea6b120688d800b4d892eaf737c20a73e86b Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 25 Nov 2011 00:44:55 +0100 Subject: Freezer: fix more fallout from the thaw_process rename Commit 944e192db53c "freezer: rename thaw_process() to __thaw_task() and simplify the implementation" did not create a !CONFIG_FREEZER version of __thaw_task(). Signed-off-by: Stephen Rothwell Signed-off-by: Rafael J. Wysocki diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 09570ac..c1ee283 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -149,6 +149,7 @@ static inline int freezer_should_skip(struct task_struct *p) #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } +static inline void __thaw_task(struct task_struct *t) {} static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } -- cgit v0.10.2 From 0118521cc7acb3ccbc1a01d6144ac32be9d56a4c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 1 Dec 2011 22:32:43 +0100 Subject: PM / Hibernate: Enable usermodehelpers in software_resume() error path In the software_resume() function defined in kernel/power/hibernate.c, if the call to create_basic_memory_bitmaps() fails, the usermodehelpers are not enabled (which had been disabled in the previous step). Fix it. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index ebf62c3..1fcf9de 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -807,8 +807,10 @@ static int software_resume(void) goto close_finish; error = create_basic_memory_bitmaps(); - if (error) + if (error) { + usermodehelper_enable(); goto close_finish; + } pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); -- cgit v0.10.2 From 97819a26224f019e73d88bb2fd4eb5a614860461 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 1 Dec 2011 22:33:10 +0100 Subject: PM / Hibernate: Thaw processes in SNAPSHOT_CREATE_IMAGE ioctl test path Commit 2aede851ddf08666f68ffc17be446420e9d2a056 (PM / Hibernate: Freeze kernel threads after preallocating memory) moved the freezing of kernel threads to hibernation_snapshot() function. So now, if the call to hibernation_snapshot() returns early due to a successful hibernation test, the caller has to thaw processes to ensure that the system gets back to its original state. But in SNAPSHOT_CREATE_IMAGE hibernation ioctl, the caller does not thaw processes in case hibernation_snapshot() returned due to a successful freezer test. Fix this issue. But note we still send the value of 'in_suspend' (which is now 0) to userspace, because we are not in an error path per-se, and moreover, the value of in_suspend correctly depicts the situation here. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1fcf9de..c10cb0f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -55,7 +55,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -static bool freezer_test_done; +bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; diff --git a/kernel/power/power.h b/kernel/power/power.h index 23a2db1..0c4defe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) /* kernel/power/hibernate.c */ +extern bool freezer_test_done; + extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); diff --git a/kernel/power/user.c b/kernel/power/user.c index 6d8f535..e2aff0f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -283,10 +283,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) + if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error) - data->ready = 1; + if (!error && !freezer_test_done) + data->ready = 1; + if (freezer_test_done) { + freezer_test_done = false; + thaw_processes(); + } + } break; case SNAPSHOT_ATOMIC_RESTORE: -- cgit v0.10.2 From 48580ab8729865c81e148d59159fbe2aa7865511 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 1 Dec 2011 22:33:20 +0100 Subject: PM / Hibernate: Remove deprecated hibernation test modes The hibernation test modes 'test' and 'testproc' are deprecated, because the 'pm_test' framework offers much more fine-grained control for debugging suspend and hibernation related problems. So, remove the deprecated 'test' and 'testproc' hibernation test modes. Suggested-by: Rafael J. Wysocki Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c10cb0f..5314a94 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -43,8 +43,6 @@ int in_suspend __nosavedata; enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, - HIBERNATION_TEST, - HIBERNATION_TESTPROC, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, /* keep last */ @@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void) mdelay(5000); } -static int hibernation_testmode(int mode) -{ - if (hibernation_mode == mode) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} - static int hibernation_test(int level) { if (pm_test_level == level) { @@ -114,7 +103,6 @@ static int hibernation_test(int level) return 0; } #else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; } static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ @@ -278,8 +266,7 @@ static int create_image(int platform_mode) goto Platform_finish; error = disable_nonboot_cpus(); - if (error || hibernation_test(TEST_CPUS) - || hibernation_testmode(HIBERNATION_TEST)) + if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); @@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode) if (error) goto Cleanup; - if (hibernation_test(TEST_FREEZER) || - hibernation_testmode(HIBERNATION_TESTPROC)) { + if (hibernation_test(TEST_FREEZER)) { /* * Indicate to the caller that we are returning due to a @@ -586,9 +572,6 @@ int hibernation_platform_enter(void) static void power_down(void) { switch (hibernation_mode) { - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; case HIBERNATION_REBOOT: kernel_restart(NULL); break; @@ -853,8 +836,6 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", - [HIBERNATION_TEST] = "test", - [HIBERNATION_TESTPROC] = "testproc", }; /* @@ -863,17 +844,15 @@ static const char * const hibernation_modes[] = { * Hibernation can be handled in several ways. There are a few different ways * to put the system into the sleep state: using the platform driver (e.g. ACPI * or other hibernation_ops), powering it off or rebooting it (for testing - * mostly), or using one of the two available test modes. + * mostly). * * The sysfs file /sys/power/disk provides an interface for selecting the * hibernation mode to use. Reading from this file causes the available modes - * to be printed. There are 5 modes that can be supported: + * to be printed. There are 3 modes that can be supported: * * 'platform' * 'shutdown' * 'reboot' - * 'test' - * 'testproc' * * If a platform hibernation driver is in use, 'platform' will be supported * and will be used by default. Otherwise, 'shutdown' will be used by default. @@ -897,8 +876,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -939,8 +916,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: -- cgit v0.10.2 From 5a50a7c32d630d6cdb13d69afabb0cc81b2f379c Mon Sep 17 00:00:00 2001 From: Keng-Yu Lin Date: Fri, 2 Dec 2011 00:04:23 +0100 Subject: ACPI / PM: Do not save/restore NVS on Asus K54C/K54HR The models do not resume correctly without acpi_sleep=nonvs. Signed-off-by: Keng-Yu Lin Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 6d9a3ab..0a7ed69 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -476,6 +476,22 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "VGN-FW520F"), }, }, + { + .callback = init_nvs_nosave, + .ident = "Asus K54C", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "K54C"), + }, + }, + { + .callback = init_nvs_nosave, + .ident = "Asus K54HR", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "K54HR"), + }, + }, {}, }; #endif /* CONFIG_SUSPEND */ -- cgit v0.10.2 From 0c6aebe31861c470c8cfbfdfdfd72d1369a6440b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 3 Dec 2011 00:23:43 +0100 Subject: PM / Sleep: Unify diagnostic messages from device suspend/resume Make pm_op() and pm_noirq_op() use the same helper function for running callbacks, which will cause them to use the same format of diagnostic messages. This also reduces the complexity and size of the code quite a bit. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 406f82c..b570189 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -164,8 +164,9 @@ static ktime_t initcall_debug_start(struct device *dev) ktime_t calltime = ktime_set(0, 0); if (initcall_debug) { - pr_info("calling %s+ @ %i\n", - dev_name(dev), task_pid_nr(current)); + pr_info("calling %s+ @ %i, parent: %s\n", + dev_name(dev), task_pid_nr(current), + dev->parent ? dev_name(dev->parent) : "none"); calltime = ktime_get(); } @@ -210,6 +211,24 @@ static void dpm_wait_for_children(struct device *dev, bool async) device_for_each_child(dev, &async, dpm_wait_fn); } +static int dpm_run_callback(struct device *dev, int (*cb)(struct device *)) +{ + ktime_t calltime; + int error; + + if (!cb) + return 0; + + calltime = initcall_debug_start(dev); + + error = cb(dev); + suspend_report_result(cb, error); + + initcall_debug_report(dev, calltime, error); + + return error; +} + /** * pm_op - Execute the PM operation appropriate for given PM event. * @dev: Device to handle. @@ -221,59 +240,36 @@ static int pm_op(struct device *dev, pm_message_t state) { int error = 0; - ktime_t calltime; - - calltime = initcall_debug_start(dev); switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: - if (ops->suspend) { - error = ops->suspend(dev); - suspend_report_result(ops->suspend, error); - } + error = dpm_run_callback(dev, ops->suspend); break; case PM_EVENT_RESUME: - if (ops->resume) { - error = ops->resume(dev); - suspend_report_result(ops->resume, error); - } + error = dpm_run_callback(dev, ops->resume); break; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: - if (ops->freeze) { - error = ops->freeze(dev); - suspend_report_result(ops->freeze, error); - } + error = dpm_run_callback(dev, ops->freeze); break; case PM_EVENT_HIBERNATE: - if (ops->poweroff) { - error = ops->poweroff(dev); - suspend_report_result(ops->poweroff, error); - } + error = dpm_run_callback(dev, ops->poweroff); break; case PM_EVENT_THAW: case PM_EVENT_RECOVER: - if (ops->thaw) { - error = ops->thaw(dev); - suspend_report_result(ops->thaw, error); - } + error = dpm_run_callback(dev, ops->thaw); break; case PM_EVENT_RESTORE: - if (ops->restore) { - error = ops->restore(dev); - suspend_report_result(ops->restore, error); - } + error = dpm_run_callback(dev, ops->restore); break; #endif /* CONFIG_HIBERNATE_CALLBACKS */ default: error = -EINVAL; } - initcall_debug_report(dev, calltime, error); - return error; } @@ -291,70 +287,36 @@ static int pm_noirq_op(struct device *dev, pm_message_t state) { int error = 0; - ktime_t calltime = ktime_set(0, 0), delta, rettime; - - if (initcall_debug) { - pr_info("calling %s+ @ %i, parent: %s\n", - dev_name(dev), task_pid_nr(current), - dev->parent ? dev_name(dev->parent) : "none"); - calltime = ktime_get(); - } switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: - if (ops->suspend_noirq) { - error = ops->suspend_noirq(dev); - suspend_report_result(ops->suspend_noirq, error); - } + error = dpm_run_callback(dev, ops->suspend_noirq); break; case PM_EVENT_RESUME: - if (ops->resume_noirq) { - error = ops->resume_noirq(dev); - suspend_report_result(ops->resume_noirq, error); - } + error = dpm_run_callback(dev, ops->resume_noirq); break; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: - if (ops->freeze_noirq) { - error = ops->freeze_noirq(dev); - suspend_report_result(ops->freeze_noirq, error); - } + error = dpm_run_callback(dev, ops->freeze_noirq); break; case PM_EVENT_HIBERNATE: - if (ops->poweroff_noirq) { - error = ops->poweroff_noirq(dev); - suspend_report_result(ops->poweroff_noirq, error); - } + error = dpm_run_callback(dev, ops->poweroff_noirq); break; case PM_EVENT_THAW: case PM_EVENT_RECOVER: - if (ops->thaw_noirq) { - error = ops->thaw_noirq(dev); - suspend_report_result(ops->thaw_noirq, error); - } + error = dpm_run_callback(dev, ops->thaw_noirq); break; case PM_EVENT_RESTORE: - if (ops->restore_noirq) { - error = ops->restore_noirq(dev); - suspend_report_result(ops->restore_noirq, error); - } + error = dpm_run_callback(dev, ops->restore_noirq); break; #endif /* CONFIG_HIBERNATE_CALLBACKS */ default: error = -EINVAL; } - if (initcall_debug) { - rettime = ktime_get(); - delta = ktime_sub(rettime, calltime); - printk("initcall %s_i+ returned %d after %Ld usecs\n", - dev_name(dev), error, - (unsigned long long)ktime_to_ns(delta) >> 10); - } - return error; } @@ -486,26 +448,6 @@ void dpm_resume_noirq(pm_message_t state) EXPORT_SYMBOL_GPL(dpm_resume_noirq); /** - * legacy_resume - Execute a legacy (bus or class) resume callback for device. - * @dev: Device to resume. - * @cb: Resume callback to execute. - */ -static int legacy_resume(struct device *dev, int (*cb)(struct device *dev)) -{ - int error; - ktime_t calltime; - - calltime = initcall_debug_start(dev); - - error = cb(dev); - suspend_report_result(cb, error); - - initcall_debug_report(dev, calltime, error); - - return error; -} - -/** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. @@ -553,7 +495,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) goto End; } else if (dev->class->resume) { pm_dev_dbg(dev, state, "legacy class "); - error = legacy_resume(dev, dev->class->resume); + error = dpm_run_callback(dev, dev->class->resume); goto End; } } @@ -564,7 +506,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) error = pm_op(dev, dev->bus->pm, state); } else if (dev->bus->resume) { pm_dev_dbg(dev, state, "legacy "); - error = legacy_resume(dev, dev->bus->resume); + error = dpm_run_callback(dev, dev->bus->resume); } } -- cgit v0.10.2 From d310310cbff18ec385c6ab4d58f33b100192a96a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 1 Dec 2011 22:44:39 +0100 Subject: Freezer / sunrpc / NFS: don't allow TASK_KILLABLE sleeps to block the freezer Allow the freezer to skip wait_on_bit_killable sleeps in the sunrpc layer. This should allow suspend and hibernate events to proceed, even when there are RPC's pending on the wire. Also, wrap the TASK_KILLABLE sleeps in NFS layer in freezer_do_not_count and freezer_count calls. This allows the freezer to skip tasks that are sleeping while looping on EJUKEBOX or NFS4ERR_DELAY sorts of errors. Signed-off-by: Jeff Layton Signed-off-by: Rafael J. Wysocki diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 50a15fa..bf3a57b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -77,7 +78,7 @@ int nfs_wait_bit_killable(void *word) { if (fatal_signal_pending(current)) return -ERESTARTSYS; - schedule(); + freezable_schedule(); return 0; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d4bc9ed9..91943953 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "iostat.h" #include "internal.h" @@ -32,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX && res != -EKEYEXPIRED) break; - schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index be2bbac..b28bb19 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "nfs4_fs.h" #include "delegation.h" @@ -241,7 +242,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) *timeout = NFS4_POLL_RETRY_MIN; if (*timeout > NFS4_POLL_RETRY_MAX) *timeout = NFS4_POLL_RETRY_MAX; - schedule_timeout_killable(*timeout); + freezable_schedule_timeout_killable(*timeout); if (fatal_signal_pending(current)) res = -ERESTARTSYS; *timeout <<= 1; @@ -3950,7 +3951,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 static unsigned long nfs4_set_lock_task_retry(unsigned long timeout) { - schedule_timeout_killable(timeout); + freezable_schedule_timeout_killable(timeout); timeout <<= 1; if (timeout > NFS4_LOCK_MAXTIMEOUT) return NFS4_LOCK_MAXTIMEOUT; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f48125d..0c672588 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -59,7 +60,7 @@ nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EKEYEXPIRED) break; - schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index c1ee283..30f06c2 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -105,6 +105,29 @@ static inline int freezer_should_skip(struct task_struct *p) } /* + * These macros are intended to be used whenever you want allow a task that's + * sleeping in TASK_UNINTERRUPTIBLE or TASK_KILLABLE state to be frozen. Note + * that neither return any clear indication of whether a freeze event happened + * while in this function. + */ + +/* Like schedule(), but should not block the freezer. */ +#define freezable_schedule() \ +({ \ + freezer_do_not_count(); \ + schedule(); \ + freezer_count(); \ +}) + +/* Like schedule_timeout_killable(), but should not block the freezer. */ +#define freezable_schedule_timeout_killable(timeout) \ +({ \ + freezer_do_not_count(); \ + schedule_timeout_killable(timeout); \ + freezer_count(); \ +}) + +/* * Freezer-friendly wrappers around wait_event_interruptible(), * wait_event_killable() and wait_event_interruptible_timeout(), originally * defined in @@ -163,6 +186,11 @@ static inline void freezer_count(void) {} static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} +#define freezable_schedule() schedule() + +#define freezable_schedule_timeout_killable(timeout) \ + schedule_timeout_killable(timeout) + #define wait_event_freezable(wq, condition) \ wait_event_interruptible(wq, condition) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index d12ffa5..5317b93 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -231,7 +232,7 @@ static int rpc_wait_bit_killable(void *word) { if (fatal_signal_pending(current)) return -ERESTARTSYS; - schedule(); + freezable_schedule(); return 0; } -- cgit v0.10.2 From e5b16746f0f2d6883c226af52d90904ce0f7eee8 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 3 Dec 2011 00:20:30 +0100 Subject: PM / Hibernate: Replace unintuitive 'if' condition in kernel/power/user.c with 'else' In the snapshot_ioctl() function, under SNAPSHOT_FREEZE, the code below freeze_processes() is a bit unintuitive. Improve it by replacing the second 'if' condition with an 'else' clause. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/kernel/power/user.c b/kernel/power/user.c index c202e2e..06ea33d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -259,7 +259,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = freeze_processes(); if (error) usermodehelper_enable(); - if (!error) + else data->frozen = 1; break; -- cgit v0.10.2 From 467de1fc67d1bd2954eaac7019c564f28fa2b6a5 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 6 Dec 2011 23:17:51 +0100 Subject: PM / Freezer: Remove the "userspace only" constraint from freezer[_do_not]_count() At present, the functions freezer_count() and freezer_do_not_count() impose the restriction that they are effective only for userspace processes. However, now, these functions have found more utility than originally intended by the commit which introduced it: ba96a0c8 (freezer: fix vfork problem). And moreover, even the vfork issue actually does not need the above restriction in these functions. So, modify these functions to make them work even for kernel threads, so that they can be used at other places in the kernel, where the userspace restriction doesn't apply. Suggested-by: Oleg Nesterov Suggested-by: Tejun Heo Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 30f06c2..7bcfe73 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -67,33 +67,27 @@ static inline bool cgroup_freezing(struct task_struct *task) * appropriately in case the child has exited before the freezing of tasks is * complete. However, we don't want kernel threads to be frozen in unexpected * places, so we allow them to block freeze_processes() instead or to set - * PF_NOFREEZE if needed and PF_FREEZER_SKIP is only set for userland vfork - * parents. Fortunately, in the ____call_usermodehelper() case the parent won't - * really block freeze_processes(), since ____call_usermodehelper() (the child) - * does a little before exec/exit and it can't be frozen before waking up the - * parent. + * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the + * parent won't really block freeze_processes(), since ____call_usermodehelper() + * (the child) does a little before exec/exit and it can't be frozen before + * waking up the parent. */ -/* - * If the current task is a user space one, tell the freezer not to count it as - * freezable. - */ + +/* Tell the freezer not to count the current task as freezable. */ static inline void freezer_do_not_count(void) { - if (current->mm) - current->flags |= PF_FREEZER_SKIP; + current->flags |= PF_FREEZER_SKIP; } /* - * If the current task is a user space one, tell the freezer to count it as - * freezable again and try to freeze it. + * Tell the freezer to count the current task as freezable again and try to + * freeze it. */ static inline void freezer_count(void) { - if (current->mm) { - current->flags &= ~PF_FREEZER_SKIP; - try_to_freeze(); - } + current->flags &= ~PF_FREEZER_SKIP; + try_to_freeze(); } /* -- cgit v0.10.2 From 33e638b9070ba5e8812836e20390da6a6af13900 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 6 Dec 2011 23:18:12 +0100 Subject: PM / Sleep: Use the freezer_count() functions in [un]lock_system_sleep() APIs Now that freezer_count() and freezer_do_not_count() don't have the restriction that they are effective only when called by userspace processes, use them in lock_system_sleep() and unlock_system_sleep() instead of open-coding their parts. Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Signed-off-by: Rafael J. Wysocki diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 1f7fff4..906d62c 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifdef CONFIG_VT @@ -380,16 +381,14 @@ static inline void unlock_system_sleep(void) {} static inline void lock_system_sleep(void) { - /* simplified freezer_do_not_count() */ - current->flags |= PF_FREEZER_SKIP; + freezer_do_not_count(); mutex_lock(&pm_mutex); } static inline void unlock_system_sleep(void) { mutex_unlock(&pm_mutex); - /* simplified freezer_count() */ - current->flags &= ~PF_FREEZER_SKIP; + freezer_count(); } #endif -- cgit v0.10.2 From 9b6fc5dc879bc90f765db0e95eefcf123d0d06dd Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 6 Dec 2011 23:24:38 +0100 Subject: PM / Sleep: Make [un]lock_system_sleep() generic The [un]lock_system_sleep() APIs were originally introduced to mutually exclude memory hotplug and hibernation. Directly using mutex_lock(&pm_mutex) to achieve mutual exclusion with suspend or hibernation code can lead to freezing failures. However, the APIs [un]lock_system_sleep() can be safely used to achieve the same, without causing freezing failures. So, since it would be beneficial to modify all the existing users of mutex_lock(&pm_mutex) (in all parts of the kernel), so that they use these safe APIs intead, make these APIs generic by removing the restriction that they work only when CONFIG_HIBERNATE_CALLBACKS is set. Moreover, that restriction didn't buy us anything anyway. Suggested-by: Tejun Heo Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 906d62c..95040cc 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -332,6 +332,8 @@ static inline bool system_entering_hibernation(void) { return false; } #define PM_RESTORE_PREPARE 0x0005 /* Going to restore a saved image */ #define PM_POST_RESTORE 0x0006 /* Restore failed */ +extern struct mutex pm_mutex; + #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); @@ -352,6 +354,19 @@ extern bool events_check_enabled; extern bool pm_wakeup_pending(void); extern bool pm_get_wakeup_count(unsigned int *count); extern bool pm_save_wakeup_count(unsigned int count); + +static inline void lock_system_sleep(void) +{ + freezer_do_not_count(); + mutex_lock(&pm_mutex); +} + +static inline void unlock_system_sleep(void) +{ + mutex_unlock(&pm_mutex); + freezer_count(); +} + #else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) @@ -367,30 +382,11 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_wakeup_pending(void) { return false; } -#endif /* !CONFIG_PM_SLEEP */ - -extern struct mutex pm_mutex; -#ifndef CONFIG_HIBERNATE_CALLBACKS static inline void lock_system_sleep(void) {} static inline void unlock_system_sleep(void) {} -#else - -/* Let some subsystems like memory hotadd exclude hibernation */ - -static inline void lock_system_sleep(void) -{ - freezer_do_not_count(); - mutex_lock(&pm_mutex); -} - -static inline void unlock_system_sleep(void) -{ - mutex_unlock(&pm_mutex); - freezer_count(); -} -#endif +#endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_ARCH_SAVE_PAGE_KEYS /* -- cgit v0.10.2 From bcda53faf5814c0c6025a0bd47108adfcbe9f199 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 7 Dec 2011 22:29:54 +0100 Subject: PM / Sleep: Replace mutex_[un]lock(&pm_mutex) with [un]lock_system_sleep() Using [un]lock_system_sleep() is safer than directly using mutex_[un]lock() on 'pm_mutex', since the latter could lead to freezing failures. Hence convert all the present users of mutex_[un]lock(&pm_mutex) to use these safe APIs instead. Suggested-by: Tejun Heo Signed-off-by: Srivatsa S. Bhat Reviewed-by: Simon Horman Signed-off-by: Rafael J. Wysocki diff --git a/kernel/kexec.c b/kernel/kexec.c index dc7bc08..090ee10 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1523,7 +1523,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - mutex_lock(&pm_mutex); + lock_system_sleep(); pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1576,7 +1576,7 @@ int kernel_kexec(void) thaw_processes(); Restore_console: pm_restore_console(); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 605149a..6d6d288 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -69,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) WARN_ON(1); return; } - mutex_lock(&pm_mutex); + lock_system_sleep(); hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } static bool entering_platform_hibernation; @@ -597,7 +597,7 @@ int hibernate(void) { int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -665,7 +665,7 @@ int hibernate(void) pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -893,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -919,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pr_debug("PM: Hibernation mode set to '%s'\n", hibernation_modes[mode]); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } @@ -946,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (maj != MAJOR(res) || min != MINOR(res)) goto out; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_resume_device = res; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 7d36fb3..9824b41e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d336b27..4fd51be 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops; */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - mutex_lock(&pm_mutex); + lock_system_sleep(); suspend_ops = ops; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } EXPORT_SYMBOL_GPL(suspend_set_ops); diff --git a/kernel/power/user.c b/kernel/power/user.c index 06ea33d..98ade21 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -71,7 +71,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -123,7 +123,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->platform_support = 0; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -132,7 +132,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_free(); free_basic_memory_bitmaps(); @@ -146,7 +146,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return 0; } @@ -158,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -179,7 +179,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } @@ -191,7 +191,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; @@ -208,7 +208,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } -- cgit v0.10.2 From cba3176e88fa134ece3ae1cf7e35dab9972d7853 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 7 Dec 2011 22:30:09 +0100 Subject: PM / Sleep: Recommend [un]lock_system_sleep() over using pm_mutex directly Update the documentation to explain the perils of directly using mutex_[un]lock(&pm_mutex) and recommend the usage of the safe APIs [un]lock_system_sleep() instead. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt index 3ab9fbd..6ccb68f 100644 --- a/Documentation/power/freezing-of-tasks.txt +++ b/Documentation/power/freezing-of-tasks.txt @@ -176,3 +176,28 @@ tasks, since it generally exists anyway. A driver must have all firmwares it may need in RAM before suspend() is called. If keeping them is not practical, for example due to their size, they must be requested early enough using the suspend notifier API described in notifiers.txt. + +VI. Are there any precautions to be taken to prevent freezing failures? + +Yes, there are. + +First of all, grabbing the 'pm_mutex' lock to mutually exclude a piece of code +from system-wide sleep such as suspend/hibernation is not encouraged. +If possible, that piece of code must instead hook onto the suspend/hibernation +notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code +(kernel/cpu.c) for an example. + +However, if that is not feasible, and grabbing 'pm_mutex' is deemed necessary, +it is strongly discouraged to directly call mutex_[un]lock(&pm_mutex) since +that could lead to freezing failures, because if the suspend/hibernate code +successfully acquired the 'pm_mutex' lock, and hence that other entity failed +to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE +state. As a consequence, the freezer would not be able to freeze that task, +leading to freezing failure. + +However, the [un]lock_system_sleep() APIs are safe to use in this scenario, +since they ask the freezer to skip freezing this task, since it is anyway +"frozen enough" as it is blocked on 'pm_mutex', which will be released +only after the entire suspend/hibernation sequence is complete. +So, to summarize, use [un]lock_system_sleep() instead of directly using +mutex_[un]lock(&pm_mutex). That would prevent freezing failures. -- cgit v0.10.2 From b298d289c79211508f11cb50749b0d1d54eb244a Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 9 Dec 2011 23:36:36 +0100 Subject: PM / Sleep: Fix freezer failures due to racy usermodehelper_is_disabled() Commit a144c6a (PM: Print a warning if firmware is requested when tasks are frozen) introduced usermodehelper_is_disabled() to warn and exit immediately if firmware is requested when usermodehelpers are disabled. However, it is racy. Consider the following scenario, currently used in drivers/base/firmware_class.c: ... if (usermodehelper_is_disabled()) goto out; /* Do actual work */ ... out: return err; Nothing prevents someone from disabling usermodehelpers just after the check in the 'if' condition, which means that it is quite possible to try doing the "actual work" with usermodehelpers disabled, leading to undesirable consequences. In particular, this race condition in _request_firmware() causes task freezing failures whenever suspend/hibernation is in progress because, it wrongly waits to get the firmware/microcode image from userspace when actually the usermodehelpers are disabled or userspace has been frozen. Some of the example scenarios that cause freezing failures due to this race are those that depend on userspace via request_firmware(), such as x86 microcode module initialization and microcode image reload. Previous discussions about this issue can be found at: http://thread.gmane.org/gmane.linux.kernel/1198291/focus=1200591 This patch adds proper synchronization to fix this issue. It is to be noted that this patchset fixes the freezing failures but doesn't remove the warnings. IOW, it does not attempt to add explicit synchronization to x86 microcode driver to avoid requesting microcode image at inopportune moments. Because, the warnings were introduced to highlight such cases, in the first place. And we need not silence the warnings, since we take care of the *real* problem (freezing failure) and hence, after that, the warnings are pretty harmless anyway. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 06ed6b4..d5585da 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -534,6 +534,8 @@ static int _request_firmware(const struct firmware **firmware_p, return 0; } + read_lock_usermodehelper(); + if (WARN_ON(usermodehelper_is_disabled())) { dev_err(device, "firmware: %s will not be loaded\n", name); retval = -EBUSY; @@ -572,6 +574,8 @@ static int _request_firmware(const struct firmware **firmware_p, fw_destroy_instance(fw_priv); out: + read_unlock_usermodehelper(); + if (retval) { release_firmware(firmware); *firmware_p = NULL; diff --git a/include/linux/kmod.h b/include/linux/kmod.h index b16f653..722f477 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -117,5 +117,7 @@ extern void usermodehelper_init(void); extern int usermodehelper_disable(void); extern void usermodehelper_enable(void); extern bool usermodehelper_is_disabled(void); +extern void read_lock_usermodehelper(void); +extern void read_unlock_usermodehelper(void); #endif /* __LINUX_KMOD_H__ */ diff --git a/kernel/kmod.c b/kernel/kmod.c index a4bea97..81b4a27 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq; static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES @@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work) * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. */ static int usermodehelper_disabled = 1; @@ -293,6 +296,18 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) +void read_lock_usermodehelper(void) +{ + down_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_lock_usermodehelper); + +void read_unlock_usermodehelper(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); + /** * usermodehelper_disable - prevent new helpers from being started */ @@ -300,8 +315,10 @@ int usermodehelper_disable(void) { long retval; + down_write(&umhelper_sem); usermodehelper_disabled = 1; - smp_mb(); + up_write(&umhelper_sem); + /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to @@ -314,7 +331,9 @@ int usermodehelper_disable(void) if (retval) return 0; + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); return -EAGAIN; } @@ -323,7 +342,9 @@ int usermodehelper_disable(void) */ void usermodehelper_enable(void) { + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); } /** -- cgit v0.10.2 From cf007e3526a785a95a738d5a8fba44f1f4fe33e0 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 8 Dec 2011 23:42:53 +0100 Subject: PM / Hibernate: Remove deprecated hibernation snapshot ioctls Several snapshot ioctls were marked for removal quite some time ago, since they were deprecated. Remove them. Suggested-by: Rafael J. Wysocki Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 3d84912..9f51fc4 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -85,17 +85,6 @@ Who: Robin Getz & Matt Mackall --------------------------- -What: Deprecated snapshot ioctls -When: 2.6.36 - -Why: The ioctls in kernel/power/user.c were marked as deprecated long time - ago. Now they notify users about that so that they need to replace - their userspace. After some more time, remove them completely. - -Who: Jiri Slaby - ---------------------------- - What: The ieee80211_regdom module parameter When: March 2010 / desktop catchup diff --git a/kernel/power/user.c b/kernel/power/user.c index 98ade21..78bdb44 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -30,28 +30,6 @@ #include "power.h" -/* - * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and - * will be removed in the future. They are only preserved here for - * compatibility with existing userland utilities. - */ -#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) -#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) - -#define PMOPS_PREPARE 1 -#define PMOPS_ENTER 2 -#define PMOPS_FINISH 3 - -/* - * NOTE: The following ioctl definitions are wrong and have been replaced with - * correct ones. They are only preserved here for compatibility with existing - * userland utilities and will be removed in the future. - */ -#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) -#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) -#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) -#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) - #define SNAPSHOT_MINOR 231 @@ -213,15 +191,6 @@ unlock: return res; } -static void snapshot_deprecated_ioctl(unsigned int cmd) -{ - if (printk_ratelimit()) - printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " - "be removed soon, update your suspend-to-disk " - "utilities\n", - __builtin_return_address(0), cmd); -} - static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -272,8 +241,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->frozen = 0; break; - case SNAPSHOT_ATOMIC_SNAPSHOT: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; @@ -308,8 +275,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->ready = 0; break; - case SNAPSHOT_SET_IMAGE_SIZE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; @@ -324,16 +289,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_AVAIL_SWAP: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_GET_SWAP_PAGE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; @@ -356,27 +317,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_all_swap_pages(data->swap); break; - case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - if (!swsusp_swap_in_use()) { - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg), - 0, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } else { - error = -EPERM; - } - break; - case SNAPSHOT_S2RAM: if (!data->frozen) { error = -EPERM; @@ -399,33 +339,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_platform_enter(); break; - case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - error = -EINVAL; - - switch (arg) { - - case PMOPS_PREPARE: - data->platform_support = 1; - error = 0; - break; - - case PMOPS_ENTER: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case PMOPS_FINISH: - if (data->platform_support) - error = 0; - break; - - default: - printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - - } - break; - case SNAPSHOT_SET_SWAP_AREA: if (swsusp_swap_in_use()) { error = -EPERM; -- cgit v0.10.2 From 8ca6d9bcc8d33c592c0855b4b1481bc723ac7e85 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 15 Dec 2011 20:59:23 +0100 Subject: PM / Sleep: Simplify generic system suspend callbacks The pm_runtime_suspended() check in __pm_generic_call() doesn't really help and may cause problems to happen, because in some cases the system suspend callbacks need to be called even if the given device has been suspended by runtime PM. For example, if the device generally supports remote wakeup and is not enabled to wake up the system from sleep, it should be prevented from generating wakeup signals during system suspend and that has to be done by the suspend callbacks that the pm_runtime_suspended() check prevents from being executed. Similarly, it may not be a good idea to unconditionally change the runtime PM status of the device to 'active' in __pm_generic_resume(), because the driver may want to leave the device in the 'suspended' state, depending on what happened to it before the system suspend and whether or not it is enabled to wake up the system. For the above reasons, remove the pm_runtime_suspended() check from __pm_generic_call() and remove the code changing the device's runtime PM status from __pm_generic_resume(). Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 265a0ee..1b878b955 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -97,16 +97,16 @@ int pm_generic_prepare(struct device *dev) * @event: PM transition of the system under way. * @bool: Whether or not this is the "noirq" stage. * - * If the device has not been suspended at run time, execute the - * suspend/freeze/poweroff/thaw callback provided by its driver, if defined, and - * return its error code. Otherwise, return zero. + * Execute the suspend/freeze/poweroff/thaw callback provided by the driver of + * @dev, if defined, and return its error code. Return 0 if the callback is + * not present. */ static int __pm_generic_call(struct device *dev, int event, bool noirq) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int (*callback)(struct device *); - if (!pm || pm_runtime_suspended(dev)) + if (!pm) return 0; switch (event) { @@ -217,14 +217,12 @@ EXPORT_SYMBOL_GPL(pm_generic_thaw); * @bool: Whether or not this is the "noirq" stage. * * Execute the resume/resotre callback provided by the @dev's driver, if - * defined. If it returns 0, change the device's runtime PM status to 'active'. - * Return the callback's error code. + * defined, and return its error code. Return 0 if the callback is not present. */ static int __pm_generic_resume(struct device *dev, int event, bool noirq) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int (*callback)(struct device *); - int ret; if (!pm) return 0; @@ -241,17 +239,7 @@ static int __pm_generic_resume(struct device *dev, int event, bool noirq) break; } - if (!callback) - return 0; - - ret = callback(dev); - if (!ret && !noirq && pm_runtime_enabled(dev)) { - pm_runtime_disable(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - } - - return ret; + return callback ? callback(dev) : 0; } /** -- cgit v0.10.2 From 1eac8111e0763853266a171ce11214da3a347a0a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 15 Dec 2011 20:59:30 +0100 Subject: PM / Sleep: Merge internal functions in generic_ops.c After the change that removed the code related to runtime PM from __pm_generic_call() and __pm_generic_resume() these two functions need not be separate any more, so merge them. Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 1b878b955..5a5b154 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -97,7 +97,7 @@ int pm_generic_prepare(struct device *dev) * @event: PM transition of the system under way. * @bool: Whether or not this is the "noirq" stage. * - * Execute the suspend/freeze/poweroff/thaw callback provided by the driver of + * Execute the PM callback corresponding to @event provided by the driver of * @dev, if defined, and return its error code. Return 0 if the callback is * not present. */ @@ -119,9 +119,15 @@ static int __pm_generic_call(struct device *dev, int event, bool noirq) case PM_EVENT_HIBERNATE: callback = noirq ? pm->poweroff_noirq : pm->poweroff; break; + case PM_EVENT_RESUME: + callback = noirq ? pm->resume_noirq : pm->resume; + break; case PM_EVENT_THAW: callback = noirq ? pm->thaw_noirq : pm->thaw; break; + case PM_EVENT_RESTORE: + callback = noirq ? pm->restore_noirq : pm->restore; + break; default: callback = NULL; break; @@ -211,44 +217,12 @@ int pm_generic_thaw(struct device *dev) EXPORT_SYMBOL_GPL(pm_generic_thaw); /** - * __pm_generic_resume - Generic resume/restore callback for subsystems. - * @dev: Device to handle. - * @event: PM transition of the system under way. - * @bool: Whether or not this is the "noirq" stage. - * - * Execute the resume/resotre callback provided by the @dev's driver, if - * defined, and return its error code. Return 0 if the callback is not present. - */ -static int __pm_generic_resume(struct device *dev, int event, bool noirq) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int (*callback)(struct device *); - - if (!pm) - return 0; - - switch (event) { - case PM_EVENT_RESUME: - callback = noirq ? pm->resume_noirq : pm->resume; - break; - case PM_EVENT_RESTORE: - callback = noirq ? pm->restore_noirq : pm->restore; - break; - default: - callback = NULL; - break; - } - - return callback ? callback(dev) : 0; -} - -/** * pm_generic_resume_noirq - Generic resume_noirq callback for subsystems. * @dev: Device to resume. */ int pm_generic_resume_noirq(struct device *dev) { - return __pm_generic_resume(dev, PM_EVENT_RESUME, true); + return __pm_generic_call(dev, PM_EVENT_RESUME, true); } EXPORT_SYMBOL_GPL(pm_generic_resume_noirq); @@ -258,7 +232,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume_noirq); */ int pm_generic_resume(struct device *dev) { - return __pm_generic_resume(dev, PM_EVENT_RESUME, false); + return __pm_generic_call(dev, PM_EVENT_RESUME, false); } EXPORT_SYMBOL_GPL(pm_generic_resume); @@ -268,7 +242,7 @@ EXPORT_SYMBOL_GPL(pm_generic_resume); */ int pm_generic_restore_noirq(struct device *dev) { - return __pm_generic_resume(dev, PM_EVENT_RESTORE, true); + return __pm_generic_call(dev, PM_EVENT_RESTORE, true); } EXPORT_SYMBOL_GPL(pm_generic_restore_noirq); @@ -278,7 +252,7 @@ EXPORT_SYMBOL_GPL(pm_generic_restore_noirq); */ int pm_generic_restore(struct device *dev) { - return __pm_generic_resume(dev, PM_EVENT_RESTORE, false); + return __pm_generic_call(dev, PM_EVENT_RESTORE, false); } EXPORT_SYMBOL_GPL(pm_generic_restore); -- cgit v0.10.2 From 9cf519d1c15fa05a538c2b3963c5f3903daf765a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 18 Dec 2011 00:34:01 +0100 Subject: PM / Sleep: Make pm_op() and pm_noirq_op() return callback pointers Make the pm_op() and pm_noirq_op() functions return pointers to appropriate callbacks instead of executing those callbacks and returning their results. This change is required for a subsequent modification that will execute the corresponding driver callback if the subsystem callback returned by either pm_op(), or pm_noirq_op() is NULL. Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index b570189..b5cef7e 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -32,6 +32,8 @@ #include "../base.h" #include "power.h" +typedef int (*pm_callback_t)(struct device *); + /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and @@ -211,113 +213,70 @@ static void dpm_wait_for_children(struct device *dev, bool async) device_for_each_child(dev, &async, dpm_wait_fn); } -static int dpm_run_callback(struct device *dev, int (*cb)(struct device *)) -{ - ktime_t calltime; - int error; - - if (!cb) - return 0; - - calltime = initcall_debug_start(dev); - - error = cb(dev); - suspend_report_result(cb, error); - - initcall_debug_report(dev, calltime, error); - - return error; -} - /** - * pm_op - Execute the PM operation appropriate for given PM event. - * @dev: Device to handle. + * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ -static int pm_op(struct device *dev, - const struct dev_pm_ops *ops, - pm_message_t state) +static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { - int error = 0; - switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: - error = dpm_run_callback(dev, ops->suspend); - break; + return ops->suspend; case PM_EVENT_RESUME: - error = dpm_run_callback(dev, ops->resume); - break; + return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: - error = dpm_run_callback(dev, ops->freeze); - break; + return ops->freeze; case PM_EVENT_HIBERNATE: - error = dpm_run_callback(dev, ops->poweroff); - break; + return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: - error = dpm_run_callback(dev, ops->thaw); + return ops->thaw; break; case PM_EVENT_RESTORE: - error = dpm_run_callback(dev, ops->restore); - break; + return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ - default: - error = -EINVAL; } - return error; + return NULL; } /** - * pm_noirq_op - Execute the PM operation appropriate for given PM event. - * @dev: Device to handle. + * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ -static int pm_noirq_op(struct device *dev, - const struct dev_pm_ops *ops, - pm_message_t state) +static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { - int error = 0; - switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: - error = dpm_run_callback(dev, ops->suspend_noirq); - break; + return ops->suspend_noirq; case PM_EVENT_RESUME: - error = dpm_run_callback(dev, ops->resume_noirq); - break; + return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: - error = dpm_run_callback(dev, ops->freeze_noirq); - break; + return ops->freeze_noirq; case PM_EVENT_HIBERNATE: - error = dpm_run_callback(dev, ops->poweroff_noirq); - break; + return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: - error = dpm_run_callback(dev, ops->thaw_noirq); - break; + return ops->thaw_noirq; case PM_EVENT_RESTORE: - error = dpm_run_callback(dev, ops->restore_noirq); - break; + return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ - default: - error = -EINVAL; } - return error; + return NULL; } static char *pm_verb(int event) @@ -375,6 +334,26 @@ static void dpm_show_time(ktime_t starttime, pm_message_t state, char *info) usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } +static int dpm_run_callback(pm_callback_t cb, struct device *dev, + pm_message_t state, char *info) +{ + ktime_t calltime; + int error; + + if (!cb) + return 0; + + calltime = initcall_debug_start(dev); + + pm_dev_dbg(dev, state, info); + error = cb(dev); + suspend_report_result(cb, error); + + initcall_debug_report(dev, calltime, error); + + return error; +} + /*------------------------- Resume routines -------------------------*/ /** @@ -387,25 +366,29 @@ static void dpm_show_time(ktime_t starttime, pm_message_t state, char *info) */ static int device_resume_noirq(struct device *dev, pm_message_t state) { + pm_callback_t callback = NULL; + char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->pm_domain) { - pm_dev_dbg(dev, state, "EARLY power domain "); - error = pm_noirq_op(dev, &dev->pm_domain->ops, state); + info = "EARLY power domain "; + callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - pm_dev_dbg(dev, state, "EARLY type "); - error = pm_noirq_op(dev, dev->type->pm, state); + info = "EARLY type "; + callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - pm_dev_dbg(dev, state, "EARLY class "); - error = pm_noirq_op(dev, dev->class->pm, state); + info = "EARLY class "; + callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - pm_dev_dbg(dev, state, "EARLY "); - error = pm_noirq_op(dev, dev->bus->pm, state); + info = "EARLY "; + callback = pm_noirq_op(dev->bus->pm, state); } + error = dpm_run_callback(callback, dev, state, info); + TRACE_RESUME(error); return error; } @@ -455,6 +438,8 @@ EXPORT_SYMBOL_GPL(dpm_resume_noirq); */ static int device_resume(struct device *dev, pm_message_t state, bool async) { + pm_callback_t callback = NULL; + char *info = NULL; int error = 0; bool put = false; @@ -477,40 +462,41 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) put = true; if (dev->pm_domain) { - pm_dev_dbg(dev, state, "power domain "); - error = pm_op(dev, &dev->pm_domain->ops, state); + info = "power domain "; + callback = pm_op(&dev->pm_domain->ops, state); goto End; } if (dev->type && dev->type->pm) { - pm_dev_dbg(dev, state, "type "); - error = pm_op(dev, dev->type->pm, state); + info = "type "; + callback = pm_op(dev->type->pm, state); goto End; } if (dev->class) { if (dev->class->pm) { - pm_dev_dbg(dev, state, "class "); - error = pm_op(dev, dev->class->pm, state); + info = "class "; + callback = pm_op(dev->class->pm, state); goto End; } else if (dev->class->resume) { - pm_dev_dbg(dev, state, "legacy class "); - error = dpm_run_callback(dev, dev->class->resume); + info = "legacy class "; + callback = dev->class->resume; goto End; } } if (dev->bus) { if (dev->bus->pm) { - pm_dev_dbg(dev, state, ""); - error = pm_op(dev, dev->bus->pm, state); + info = ""; + callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { - pm_dev_dbg(dev, state, "legacy "); - error = dpm_run_callback(dev, dev->bus->resume); + info = "legacy "; + callback = dev->bus->resume; } } End: + error = dpm_run_callback(callback, dev, state, info); dev->power.is_suspended = false; Unlock: @@ -705,23 +691,24 @@ static pm_message_t resume_event(pm_message_t sleep_state) */ static int device_suspend_noirq(struct device *dev, pm_message_t state) { - int error = 0; + pm_callback_t callback = NULL; + char *info = NULL; if (dev->pm_domain) { - pm_dev_dbg(dev, state, "LATE power domain "); - error = pm_noirq_op(dev, &dev->pm_domain->ops, state); + info = "LATE power domain "; + callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - pm_dev_dbg(dev, state, "LATE type "); - error = pm_noirq_op(dev, dev->type->pm, state); + info = "LATE type "; + callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - pm_dev_dbg(dev, state, "LATE class "); - error = pm_noirq_op(dev, dev->class->pm, state); + info = "LATE class "; + callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - pm_dev_dbg(dev, state, "LATE "); - error = pm_noirq_op(dev, dev->bus->pm, state); + info = "LATE "; + callback = pm_noirq_op(dev->bus->pm, state); } - return error; + return dpm_run_callback(callback, dev, state, info); } /** @@ -798,6 +785,8 @@ static int legacy_suspend(struct device *dev, pm_message_t state, */ static int __device_suspend(struct device *dev, pm_message_t state, bool async) { + pm_callback_t callback = NULL; + char *info = NULL; int error = 0; dpm_wait_for_children(dev, async); @@ -818,22 +807,22 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) device_lock(dev); if (dev->pm_domain) { - pm_dev_dbg(dev, state, "power domain "); - error = pm_op(dev, &dev->pm_domain->ops, state); - goto End; + info = "power domain "; + callback = pm_op(&dev->pm_domain->ops, state); + goto Run; } if (dev->type && dev->type->pm) { - pm_dev_dbg(dev, state, "type "); - error = pm_op(dev, dev->type->pm, state); - goto End; + info = "type "; + callback = pm_op(dev->type->pm, state); + goto Run; } if (dev->class) { if (dev->class->pm) { - pm_dev_dbg(dev, state, "class "); - error = pm_op(dev, dev->class->pm, state); - goto End; + info = "class "; + callback = pm_op(dev->class->pm, state); + goto Run; } else if (dev->class->suspend) { pm_dev_dbg(dev, state, "legacy class "); error = legacy_suspend(dev, state, dev->class->suspend); @@ -843,14 +832,18 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (dev->bus) { if (dev->bus->pm) { - pm_dev_dbg(dev, state, ""); - error = pm_op(dev, dev->bus->pm, state); + info = ""; + callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy "); error = legacy_suspend(dev, state, dev->bus->suspend); + goto End; } } + Run: + error = dpm_run_callback(callback, dev, state, info); + End: if (!error) { dev->power.is_suspended = true; -- cgit v0.10.2 From 35cd133c6130c1eb52806808abee9d62e6854a27 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 18 Dec 2011 00:34:13 +0100 Subject: PM: Run the driver callback directly if the subsystem one is not there Make the PM core execute driver PM callbacks directly if the corresponding subsystem callbacks are not present. There are three reasons for doing that. First, it reflects the behavior of drivers/base/dd.c:really_probe() that runs the driver's .probe() callback directly if the bus type's one is not defined, so this change will remove one arbitrary difference between the PM core and the remaining parts of the driver core. Second, it will allow some subsystems, whose PM callbacks don't do anything except for executing driver callbacks, to be simplified quite a bit by removing those "forward-only" callbacks. Finally, it will allow us to remove one level of indirection in the system suspend and resume code paths where it is not necessary, which is going to lead to less debug noise with initcall_debug passed in the kernel command line (messages won't be printed for driverless devices whose subsystems don't provide PM callbacks among other things). Signed-off-by: Rafael J. Wysocki diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 3139fb5..20af7de 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -126,7 +126,9 @@ The core methods to suspend and resume devices reside in struct dev_pm_ops pointed to by the ops member of struct dev_pm_domain, or by the pm member of struct bus_type, struct device_type and struct class. They are mostly of interest to the people writing infrastructure for platforms and buses, like PCI -or USB, or device type and device class drivers. +or USB, or device type and device class drivers. They also are relevant to the +writers of device drivers whose subsystems (PM domains, device types, device +classes and bus types) don't provide all power management methods. Bus drivers implement these methods as appropriate for the hardware and the drivers using it; PCI works differently from USB, and so on. Not many people @@ -268,32 +270,35 @@ various phases always run after tasks have been frozen and before they are unfrozen. Furthermore, the *_noirq phases run at a time when IRQ handlers have been disabled (except for those marked with the IRQF_NO_SUSPEND flag). -All phases use PM domain, bus, type, or class callbacks (that is, methods -defined in dev->pm_domain->ops, dev->bus->pm, dev->type->pm, or dev->class->pm). -These callbacks are regarded by the PM core as mutually exclusive. Moreover, -PM domain callbacks always take precedence over bus, type and class callbacks, -while type callbacks take precedence over bus and class callbacks, and class -callbacks take precedence over bus callbacks. To be precise, the following -rules are used to determine which callback to execute in the given phase: +All phases use PM domain, bus, type, class or driver callbacks (that is, methods +defined in dev->pm_domain->ops, dev->bus->pm, dev->type->pm, dev->class->pm or +dev->driver->pm). These callbacks are regarded by the PM core as mutually +exclusive. Moreover, PM domain callbacks always take precedence over all of the +other callbacks and, for example, type callbacks take precedence over bus, class +and driver callbacks. To be precise, the following rules are used to determine +which callback to execute in the given phase: - 1. If dev->pm_domain is present, the PM core will attempt to execute the - callback included in dev->pm_domain->ops. If that callback is not - present, no action will be carried out for the given device. + 1. If dev->pm_domain is present, the PM core will choose the callback + included in dev->pm_domain->ops for execution 2. Otherwise, if both dev->type and dev->type->pm are present, the callback - included in dev->type->pm will be executed. + included in dev->type->pm will be chosen for execution. 3. Otherwise, if both dev->class and dev->class->pm are present, the - callback included in dev->class->pm will be executed. + callback included in dev->class->pm will be chosen for execution. 4. Otherwise, if both dev->bus and dev->bus->pm are present, the callback - included in dev->bus->pm will be executed. + included in dev->bus->pm will be chosen for execution. This allows PM domains and device types to override callbacks provided by bus types or device classes if necessary. -These callbacks may in turn invoke device- or driver-specific methods stored in -dev->driver->pm, but they don't have to. +The PM domain, type, class and bus callbacks may in turn invoke device- or +driver-specific methods stored in dev->driver->pm, but they don't have to do +that. + +If the subsystem callback chosen for execution is not present, the PM core will +execute the corresponding method from dev->driver->pm instead if there is one. Entering System Suspend diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index c2ae8bf..4abe83e 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -57,6 +57,10 @@ the following: 4. Bus type of the device, if both dev->bus and dev->bus->pm are present. +If the subsystem chosen by applying the above rules doesn't provide the relevant +callback, the PM core will invoke the corresponding driver callback stored in +dev->driver->pm directly (if present). + The PM core always checks which callback to use in the order given above, so the priority order of callbacks from high to low is: PM domain, device type, class and bus type. Moreover, the high-priority one will always take precedence over @@ -64,86 +68,88 @@ a low-priority one. The PM domain, bus type, device type and class callbacks are referred to as subsystem-level callbacks in what follows. By default, the callbacks are always invoked in process context with interrupts -enabled. However, subsystems can use the pm_runtime_irq_safe() helper function -to tell the PM core that their ->runtime_suspend(), ->runtime_resume() and -->runtime_idle() callbacks may be invoked in atomic context with interrupts -disabled for a given device. This implies that the callback routines in -question must not block or sleep, but it also means that the synchronous helper -functions listed at the end of Section 4 may be used for that device within an -interrupt handler or generally in an atomic context. - -The subsystem-level suspend callback is _entirely_ _responsible_ for handling -the suspend of the device as appropriate, which may, but need not include -executing the device driver's own ->runtime_suspend() callback (from the +enabled. However, the pm_runtime_irq_safe() helper function can be used to tell +the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume() +and ->runtime_idle() callbacks for the given device in atomic context with +interrupts disabled. This implies that the callback routines in question must +not block or sleep, but it also means that the synchronous helper functions +listed at the end of Section 4 may be used for that device within an interrupt +handler or generally in an atomic context. + +The subsystem-level suspend callback, if present, is _entirely_ _responsible_ +for handling the suspend of the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_suspend() callback (from the PM core's point of view it is not necessary to implement a ->runtime_suspend() callback in a device driver as long as the subsystem-level suspend callback knows what to do to handle the device). - * Once the subsystem-level suspend callback has completed successfully - for given device, the PM core regards the device as suspended, which need - not mean that the device has been put into a low power state. It is - supposed to mean, however, that the device will not process data and will - not communicate with the CPU(s) and RAM until the subsystem-level resume - callback is executed for it. The runtime PM status of a device after - successful execution of the subsystem-level suspend callback is 'suspended'. - - * If the subsystem-level suspend callback returns -EBUSY or -EAGAIN, - the device's runtime PM status is 'active', which means that the device - _must_ be fully operational afterwards. - - * If the subsystem-level suspend callback returns an error code different - from -EBUSY or -EAGAIN, the PM core regards this as a fatal error and will - refuse to run the helper functions described in Section 4 for the device, - until the status of it is directly set either to 'active', or to 'suspended' - (the PM core provides special helper functions for this purpose). - -In particular, if the driver requires remote wake-up capability (i.e. hardware + * Once the subsystem-level suspend callback (or the driver suspend callback, + if invoked directly) has completed successfully for the given device, the PM + core regards the device as suspended, which need not mean that it has been + put into a low power state. It is supposed to mean, however, that the + device will not process data and will not communicate with the CPU(s) and + RAM until the appropriate resume callback is executed for it. The runtime + PM status of a device after successful execution of the suspend callback is + 'suspended'. + + * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM + status remains 'active', which means that the device _must_ be fully + operational afterwards. + + * If the suspend callback returns an error code different from -EBUSY and + -EAGAIN, the PM core regards this as a fatal error and will refuse to run + the helper functions described in Section 4 for the device until its status + is directly set to either'active', or 'suspended' (the PM core provides + special helper functions for this purpose). + +In particular, if the driver requires remote wakeup capability (i.e. hardware mechanism allowing the device to request a change of its power state, such as PCI PME) for proper functioning and device_run_wake() returns 'false' for the device, then ->runtime_suspend() should return -EBUSY. On the other hand, if -device_run_wake() returns 'true' for the device and the device is put into a low -power state during the execution of the subsystem-level suspend callback, it is -expected that remote wake-up will be enabled for the device. Generally, remote -wake-up should be enabled for all input devices put into a low power state at -run time. - -The subsystem-level resume callback is _entirely_ _responsible_ for handling the -resume of the device as appropriate, which may, but need not include executing -the device driver's own ->runtime_resume() callback (from the PM core's point of -view it is not necessary to implement a ->runtime_resume() callback in a device -driver as long as the subsystem-level resume callback knows what to do to handle -the device). - - * Once the subsystem-level resume callback has completed successfully, the PM - core regards the device as fully operational, which means that the device - _must_ be able to complete I/O operations as needed. The runtime PM status - of the device is then 'active'. - - * If the subsystem-level resume callback returns an error code, the PM core - regards this as a fatal error and will refuse to run the helper functions - described in Section 4 for the device, until its status is directly set - either to 'active' or to 'suspended' (the PM core provides special helper - functions for this purpose). - -The subsystem-level idle callback is executed by the PM core whenever the device -appears to be idle, which is indicated to the PM core by two counters, the -device's usage counter and the counter of 'active' children of the device. +device_run_wake() returns 'true' for the device and the device is put into a +low-power state during the execution of the suspend callback, it is expected +that remote wakeup will be enabled for the device. Generally, remote wakeup +should be enabled for all input devices put into low-power states at run time. + +The subsystem-level resume callback, if present, is _entirely_ _responsible_ for +handling the resume of the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_resume() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_resume() +callback in a device driver as long as the subsystem-level resume callback knows +what to do to handle the device). + + * Once the subsystem-level resume callback (or the driver resume callback, if + invoked directly) has completed successfully, the PM core regards the device + as fully operational, which means that the device _must_ be able to complete + I/O operations as needed. The runtime PM status of the device is then + 'active'. + + * If the resume callback returns an error code, the PM core regards this as a + fatal error and will refuse to run the helper functions described in Section + 4 for the device, until its status is directly set to either 'active', or + 'suspended' (by means of special helper functions provided by the PM core + for this purpose). + +The idle callback (a subsystem-level one, if present, or the driver one) is +executed by the PM core whenever the device appears to be idle, which is +indicated to the PM core by two counters, the device's usage counter and the +counter of 'active' children of the device. * If any of these counters is decreased using a helper function provided by the PM core and it turns out to be equal to zero, the other counter is checked. If that counter also is equal to zero, the PM core executes the - subsystem-level idle callback with the device as an argument. + idle callback with the device as its argument. -The action performed by a subsystem-level idle callback is totally dependent on -the subsystem in question, but the expected and recommended action is to check +The action performed by the idle callback is totally dependent on the subsystem +(or driver) in question, but the expected and recommended action is to check if the device can be suspended (i.e. if all of the conditions necessary for suspending the device are satisfied) and to queue up a suspend request for the device in that case. The value returned by this callback is ignored by the PM core. The helper functions provided by the PM core, described in Section 4, guarantee -that the following constraints are met with respect to the bus type's runtime -PM callbacks: +that the following constraints are met with respect to runtime PM callbacks for +one device: (1) The callbacks are mutually exclusive (e.g. it is forbidden to execute ->runtime_suspend() in parallel with ->runtime_resume() or with another diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index b5cef7e..e2cc3d2 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -383,10 +383,15 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) info = "EARLY class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "EARLY "; + info = "EARLY bus "; callback = pm_noirq_op(dev->bus->pm, state); } + if (!callback && dev->driver && dev->driver->pm) { + info = "EARLY driver "; + callback = pm_noirq_op(dev->driver->pm, state); + } + error = dpm_run_callback(callback, dev, state, info); TRACE_RESUME(error); @@ -464,20 +469,20 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); - goto End; + goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); - goto End; + goto Driver; } if (dev->class) { if (dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); - goto End; + goto Driver; } else if (dev->class->resume) { info = "legacy class "; callback = dev->class->resume; @@ -487,14 +492,21 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) if (dev->bus) { if (dev->bus->pm) { - info = ""; + info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { - info = "legacy "; + info = "legacy bus "; callback = dev->bus->resume; + goto End; } } + Driver: + if (!callback && dev->driver && dev->driver->pm) { + info = "driver "; + callback = pm_op(dev->driver->pm, state); + } + End: error = dpm_run_callback(callback, dev, state, info); dev->power.is_suspended = false; @@ -588,24 +600,33 @@ void dpm_resume(pm_message_t state) */ static void device_complete(struct device *dev, pm_message_t state) { + void (*callback)(struct device *) = NULL; + char *info = NULL; + device_lock(dev); if (dev->pm_domain) { - pm_dev_dbg(dev, state, "completing power domain "); - if (dev->pm_domain->ops.complete) - dev->pm_domain->ops.complete(dev); + info = "completing power domain "; + callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { - pm_dev_dbg(dev, state, "completing type "); - if (dev->type->pm->complete) - dev->type->pm->complete(dev); + info = "completing type "; + callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { - pm_dev_dbg(dev, state, "completing class "); - if (dev->class->pm->complete) - dev->class->pm->complete(dev); + info = "completing class "; + callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { - pm_dev_dbg(dev, state, "completing "); - if (dev->bus->pm->complete) - dev->bus->pm->complete(dev); + info = "completing bus "; + callback = dev->bus->pm->complete; + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "completing driver "; + callback = dev->driver->pm->complete; + } + + if (callback) { + pm_dev_dbg(dev, state, info); + callback(dev); } device_unlock(dev); @@ -704,10 +725,15 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) info = "LATE class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "LATE "; + info = "LATE bus "; callback = pm_noirq_op(dev->bus->pm, state); } + if (!callback && dev->driver && dev->driver->pm) { + info = "LATE driver "; + callback = pm_noirq_op(dev->driver->pm, state); + } + return dpm_run_callback(callback, dev, state, info); } @@ -832,16 +858,21 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (dev->bus) { if (dev->bus->pm) { - info = ""; + info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { - pm_dev_dbg(dev, state, "legacy "); + pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend); goto End; } } Run: + if (!callback && dev->driver && dev->driver->pm) { + info = "driver "; + callback = pm_op(dev->driver->pm, state); + } + error = dpm_run_callback(callback, dev, state, info); End: @@ -949,6 +980,8 @@ int dpm_suspend(pm_message_t state) */ static int device_prepare(struct device *dev, pm_message_t state) { + int (*callback)(struct device *) = NULL; + char *info = NULL; int error = 0; device_lock(dev); @@ -956,25 +989,27 @@ static int device_prepare(struct device *dev, pm_message_t state) dev->power.wakeup_path = device_may_wakeup(dev); if (dev->pm_domain) { - pm_dev_dbg(dev, state, "preparing power domain "); - if (dev->pm_domain->ops.prepare) - error = dev->pm_domain->ops.prepare(dev); - suspend_report_result(dev->pm_domain->ops.prepare, error); + info = "preparing power domain "; + callback = dev->pm_domain->ops.prepare; } else if (dev->type && dev->type->pm) { - pm_dev_dbg(dev, state, "preparing type "); - if (dev->type->pm->prepare) - error = dev->type->pm->prepare(dev); - suspend_report_result(dev->type->pm->prepare, error); + info = "preparing type "; + callback = dev->type->pm->prepare; } else if (dev->class && dev->class->pm) { - pm_dev_dbg(dev, state, "preparing class "); - if (dev->class->pm->prepare) - error = dev->class->pm->prepare(dev); - suspend_report_result(dev->class->pm->prepare, error); + info = "preparing class "; + callback = dev->class->pm->prepare; } else if (dev->bus && dev->bus->pm) { - pm_dev_dbg(dev, state, "preparing "); - if (dev->bus->pm->prepare) - error = dev->bus->pm->prepare(dev); - suspend_report_result(dev->bus->pm->prepare, error); + info = "preparing bus "; + callback = dev->bus->pm->prepare; + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "preparing driver "; + callback = dev->driver->pm->prepare; + } + + if (callback) { + error = callback(dev); + suspend_report_result(callback, error); } device_unlock(dev); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 8c78443..c56efd7 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -250,6 +250,9 @@ static int rpm_idle(struct device *dev, int rpmflags) else callback = NULL; + if (!callback && dev->driver && dev->driver->pm) + callback = dev->driver->pm->runtime_idle; + if (callback) __rpm_callback(callback, dev); @@ -413,6 +416,9 @@ static int rpm_suspend(struct device *dev, int rpmflags) else callback = NULL; + if (!callback && dev->driver && dev->driver->pm) + callback = dev->driver->pm->runtime_suspend; + retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_ACTIVE); @@ -633,6 +639,9 @@ static int rpm_resume(struct device *dev, int rpmflags) else callback = NULL; + if (!callback && dev->driver && dev->driver->pm) + callback = dev->driver->pm->runtime_resume; + retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); -- cgit v0.10.2 From 9b39e73d0c2b265a7f8748b0e9a9f09be84079a8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 18 Dec 2011 00:34:24 +0100 Subject: PM / Sleep: Remove forward-only callbacks from platform bus type The forward-only PM callbacks provided by the platform bus type are not necessary any more, because the PM core executes driver callbacks when the corresponding subsystem callbacks are not present, so drop them. Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 7a24895..7d912d5 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -700,25 +700,6 @@ static int platform_legacy_resume(struct device *dev) return ret; } -int platform_pm_prepare(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (drv && drv->pm && drv->pm->prepare) - ret = drv->pm->prepare(dev); - - return ret; -} - -void platform_pm_complete(struct device *dev) -{ - struct device_driver *drv = dev->driver; - - if (drv && drv->pm && drv->pm->complete) - drv->pm->complete(dev); -} - #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_SUSPEND @@ -741,22 +722,6 @@ int platform_pm_suspend(struct device *dev) return ret; } -int platform_pm_suspend_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->suspend_noirq) - ret = drv->pm->suspend_noirq(dev); - } - - return ret; -} - int platform_pm_resume(struct device *dev) { struct device_driver *drv = dev->driver; @@ -775,22 +740,6 @@ int platform_pm_resume(struct device *dev) return ret; } -int platform_pm_resume_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->resume_noirq) - ret = drv->pm->resume_noirq(dev); - } - - return ret; -} - #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS @@ -813,22 +762,6 @@ int platform_pm_freeze(struct device *dev) return ret; } -int platform_pm_freeze_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->freeze_noirq) - ret = drv->pm->freeze_noirq(dev); - } - - return ret; -} - int platform_pm_thaw(struct device *dev) { struct device_driver *drv = dev->driver; @@ -847,22 +780,6 @@ int platform_pm_thaw(struct device *dev) return ret; } -int platform_pm_thaw_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->thaw_noirq) - ret = drv->pm->thaw_noirq(dev); - } - - return ret; -} - int platform_pm_poweroff(struct device *dev) { struct device_driver *drv = dev->driver; @@ -881,22 +798,6 @@ int platform_pm_poweroff(struct device *dev) return ret; } -int platform_pm_poweroff_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->poweroff_noirq) - ret = drv->pm->poweroff_noirq(dev); - } - - return ret; -} - int platform_pm_restore(struct device *dev) { struct device_driver *drv = dev->driver; @@ -915,22 +816,6 @@ int platform_pm_restore(struct device *dev) return ret; } -int platform_pm_restore_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->restore_noirq) - ret = drv->pm->restore_noirq(dev); - } - - return ret; -} - #endif /* CONFIG_HIBERNATE_CALLBACKS */ static const struct dev_pm_ops platform_dev_pm_ops = { diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 2a23f7d..b5267c9 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -264,62 +264,34 @@ static inline char *early_platform_driver_setup_func(void) \ } #endif /* MODULE */ -#ifdef CONFIG_PM_SLEEP -extern int platform_pm_prepare(struct device *dev); -extern void platform_pm_complete(struct device *dev); -#else -#define platform_pm_prepare NULL -#define platform_pm_complete NULL -#endif - #ifdef CONFIG_SUSPEND extern int platform_pm_suspend(struct device *dev); -extern int platform_pm_suspend_noirq(struct device *dev); extern int platform_pm_resume(struct device *dev); -extern int platform_pm_resume_noirq(struct device *dev); #else #define platform_pm_suspend NULL #define platform_pm_resume NULL -#define platform_pm_suspend_noirq NULL -#define platform_pm_resume_noirq NULL #endif #ifdef CONFIG_HIBERNATE_CALLBACKS extern int platform_pm_freeze(struct device *dev); -extern int platform_pm_freeze_noirq(struct device *dev); extern int platform_pm_thaw(struct device *dev); -extern int platform_pm_thaw_noirq(struct device *dev); extern int platform_pm_poweroff(struct device *dev); -extern int platform_pm_poweroff_noirq(struct device *dev); extern int platform_pm_restore(struct device *dev); -extern int platform_pm_restore_noirq(struct device *dev); #else #define platform_pm_freeze NULL #define platform_pm_thaw NULL #define platform_pm_poweroff NULL #define platform_pm_restore NULL -#define platform_pm_freeze_noirq NULL -#define platform_pm_thaw_noirq NULL -#define platform_pm_poweroff_noirq NULL -#define platform_pm_restore_noirq NULL #endif #ifdef CONFIG_PM_SLEEP #define USE_PLATFORM_PM_SLEEP_OPS \ - .prepare = platform_pm_prepare, \ - .complete = platform_pm_complete, \ .suspend = platform_pm_suspend, \ .resume = platform_pm_resume, \ .freeze = platform_pm_freeze, \ .thaw = platform_pm_thaw, \ .poweroff = platform_pm_poweroff, \ - .restore = platform_pm_restore, \ - .suspend_noirq = platform_pm_suspend_noirq, \ - .resume_noirq = platform_pm_resume_noirq, \ - .freeze_noirq = platform_pm_freeze_noirq, \ - .thaw_noirq = platform_pm_thaw_noirq, \ - .poweroff_noirq = platform_pm_poweroff_noirq, \ - .restore_noirq = platform_pm_restore_noirq, + .restore = platform_pm_restore, #else #define USE_PLATFORM_PM_SLEEP_OPS #endif -- cgit v0.10.2 From 8114ab763b2d297c8af49bf380a093d76e929692 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 18 Dec 2011 00:34:31 +0100 Subject: PM / Sleep: Remove forward-only callbacks from AMBA bus type The forward-only PM callbacks provided by the AMBA bus type are not necessary any more, because the PM core executes driver callbacks when the corresponding subsystem callbacks are not present, so drop them. Signed-off-by: Rafael J. Wysocki diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index bd230e8..0304b3f 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -109,31 +109,7 @@ static int amba_legacy_resume(struct device *dev) return ret; } -static int amba_pm_prepare(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (drv && drv->pm && drv->pm->prepare) - ret = drv->pm->prepare(dev); - - return ret; -} - -static void amba_pm_complete(struct device *dev) -{ - struct device_driver *drv = dev->driver; - - if (drv && drv->pm && drv->pm->complete) - drv->pm->complete(dev); -} - -#else /* !CONFIG_PM_SLEEP */ - -#define amba_pm_prepare NULL -#define amba_pm_complete NULL - -#endif /* !CONFIG_PM_SLEEP */ +#endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_SUSPEND @@ -155,22 +131,6 @@ static int amba_pm_suspend(struct device *dev) return ret; } -static int amba_pm_suspend_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->suspend_noirq) - ret = drv->pm->suspend_noirq(dev); - } - - return ret; -} - static int amba_pm_resume(struct device *dev) { struct device_driver *drv = dev->driver; @@ -189,28 +149,10 @@ static int amba_pm_resume(struct device *dev) return ret; } -static int amba_pm_resume_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->resume_noirq) - ret = drv->pm->resume_noirq(dev); - } - - return ret; -} - #else /* !CONFIG_SUSPEND */ #define amba_pm_suspend NULL #define amba_pm_resume NULL -#define amba_pm_suspend_noirq NULL -#define amba_pm_resume_noirq NULL #endif /* !CONFIG_SUSPEND */ @@ -234,22 +176,6 @@ static int amba_pm_freeze(struct device *dev) return ret; } -static int amba_pm_freeze_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->freeze_noirq) - ret = drv->pm->freeze_noirq(dev); - } - - return ret; -} - static int amba_pm_thaw(struct device *dev) { struct device_driver *drv = dev->driver; @@ -268,22 +194,6 @@ static int amba_pm_thaw(struct device *dev) return ret; } -static int amba_pm_thaw_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->thaw_noirq) - ret = drv->pm->thaw_noirq(dev); - } - - return ret; -} - static int amba_pm_poweroff(struct device *dev) { struct device_driver *drv = dev->driver; @@ -302,22 +212,6 @@ static int amba_pm_poweroff(struct device *dev) return ret; } -static int amba_pm_poweroff_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->poweroff_noirq) - ret = drv->pm->poweroff_noirq(dev); - } - - return ret; -} - static int amba_pm_restore(struct device *dev) { struct device_driver *drv = dev->driver; @@ -336,32 +230,12 @@ static int amba_pm_restore(struct device *dev) return ret; } -static int amba_pm_restore_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int ret = 0; - - if (!drv) - return 0; - - if (drv->pm) { - if (drv->pm->restore_noirq) - ret = drv->pm->restore_noirq(dev); - } - - return ret; -} - #else /* !CONFIG_HIBERNATE_CALLBACKS */ #define amba_pm_freeze NULL #define amba_pm_thaw NULL #define amba_pm_poweroff NULL #define amba_pm_restore NULL -#define amba_pm_freeze_noirq NULL -#define amba_pm_thaw_noirq NULL -#define amba_pm_poweroff_noirq NULL -#define amba_pm_restore_noirq NULL #endif /* !CONFIG_HIBERNATE_CALLBACKS */ @@ -402,20 +276,12 @@ static int amba_pm_runtime_resume(struct device *dev) #ifdef CONFIG_PM static const struct dev_pm_ops amba_pm = { - .prepare = amba_pm_prepare, - .complete = amba_pm_complete, .suspend = amba_pm_suspend, .resume = amba_pm_resume, .freeze = amba_pm_freeze, .thaw = amba_pm_thaw, .poweroff = amba_pm_poweroff, .restore = amba_pm_restore, - .suspend_noirq = amba_pm_suspend_noirq, - .resume_noirq = amba_pm_resume_noirq, - .freeze_noirq = amba_pm_freeze_noirq, - .thaw_noirq = amba_pm_thaw_noirq, - .poweroff_noirq = amba_pm_poweroff_noirq, - .restore_noirq = amba_pm_restore_noirq, SET_RUNTIME_PM_OPS( amba_pm_runtime_suspend, amba_pm_runtime_resume, -- cgit v0.10.2 From 90363ddf0a1a4dccfbb8d0c10b8f488bc7fa69f8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 18 Dec 2011 00:34:42 +0100 Subject: PM: Drop generic_subsys_pm_ops Since the PM core is now going to execute driver callbacks directly if the corresponding subsystem callbacks are not present, forward-only subsystem callbacks (i.e. such that only execute the corresponding driver callbacks) are not necessary any more. Thus it is possible to remove generic_subsys_pm_ops, because the only callback in there that is not forward-only, .runtime_idle, is not really used by the only user of generic_subsys_pm_ops, which is vio_bus_type. However, the generic callback routines themselves cannot be removed from generic_ops.c, because they are used individually by a number of subsystems. Signed-off-by: Rafael J. Wysocki diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index f65af61..8b08629 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1406,7 +1406,6 @@ static struct bus_type vio_bus_type = { .match = vio_bus_match, .probe = vio_bus_probe, .remove = vio_bus_remove, - .pm = GENERIC_SUBSYS_PM_OPS, }; /** diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 5a5b154..10bdd79 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -276,28 +276,3 @@ void pm_generic_complete(struct device *dev) pm_runtime_idle(dev); } #endif /* CONFIG_PM_SLEEP */ - -struct dev_pm_ops generic_subsys_pm_ops = { -#ifdef CONFIG_PM_SLEEP - .prepare = pm_generic_prepare, - .suspend = pm_generic_suspend, - .suspend_noirq = pm_generic_suspend_noirq, - .resume = pm_generic_resume, - .resume_noirq = pm_generic_resume_noirq, - .freeze = pm_generic_freeze, - .freeze_noirq = pm_generic_freeze_noirq, - .thaw = pm_generic_thaw, - .thaw_noirq = pm_generic_thaw_noirq, - .poweroff = pm_generic_poweroff, - .poweroff_noirq = pm_generic_poweroff_noirq, - .restore = pm_generic_restore, - .restore_noirq = pm_generic_restore_noirq, - .complete = pm_generic_complete, -#endif -#ifdef CONFIG_PM_RUNTIME - .runtime_suspend = pm_generic_runtime_suspend, - .runtime_resume = pm_generic_runtime_resume, - .runtime_idle = pm_generic_runtime_idle, -#endif -}; -EXPORT_SYMBOL_GPL(generic_subsys_pm_ops); diff --git a/include/linux/pm.h b/include/linux/pm.h index 3f3ed83..21e04dd 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -300,19 +300,6 @@ const struct dev_pm_ops name = { \ SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } -/* - * Use this for subsystems (bus types, device types, device classes) that don't - * need any special suspend/resume handling in addition to invoking the PM - * callbacks provided by device drivers supporting both the system sleep PM and - * runtime PM, make the pm member point to generic_subsys_pm_ops. - */ -#ifdef CONFIG_PM -extern struct dev_pm_ops generic_subsys_pm_ops; -#define GENERIC_SUBSYS_PM_OPS (&generic_subsys_pm_ops) -#else -#define GENERIC_SUBSYS_PM_OPS NULL -#endif - /** * PM_EVENT_ messages * -- cgit v0.10.2