summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt4
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/configs.c15
-rw-r--r--kernel/cpu.c66
-rw-r--r--kernel/cpuset.c52
-rw-r--r--kernel/die_notifier.c38
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/fork.c90
-rw-r--r--kernel/futex.c1036
-rw-r--r--kernel/futex_compat.c22
-rw-r--r--kernel/hrtimer.c3
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/manage.c10
-rw-r--r--kernel/irq/proc.c15
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/itimer.c60
-rw-r--r--kernel/kallsyms.c81
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kmod.c13
-rw-r--r--kernel/kprobes.c293
-rw-r--r--kernel/kthread.c113
-rw-r--r--kernel/lockdep.c51
-rw-r--r--kernel/module.c89
-rw-r--r--kernel/mutex.c8
-rw-r--r--kernel/nsproxy.c139
-rw-r--r--kernel/params.c2
-rw-r--r--kernel/pid.c11
-rw-r--r--kernel/posix-cpu-timers.c14
-rw-r--r--kernel/posix-timers.c1
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/disk.c195
-rw-r--r--kernel/power/main.c42
-rw-r--r--kernel/power/power.h7
-rw-r--r--kernel/power/process.c6
-rw-r--r--kernel/power/snapshot.c15
-rw-r--r--kernel/power/swap.c1
-rw-r--r--kernel/power/user.c13
-rw-r--r--kernel/printk.c27
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/rcupdate.c2
-rw-r--r--kernel/rcutorture.c45
-rw-r--r--kernel/relay.c37
-rw-r--r--kernel/rtmutex.c41
-rw-r--r--kernel/rtmutex_common.h34
-rw-r--r--kernel/rwsem.c2
-rw-r--r--kernel/sched.c369
-rw-r--r--kernel/signal.c141
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/softlockup.c52
-rw-r--r--kernel/stop_machine.c8
-rw-r--r--kernel/sys.c117
-rw-r--r--kernel/sysctl.c23
-rw-r--r--kernel/time.c61
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clocksource.c51
-rw-r--r--kernel/time/tick-common.c8
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-sched.c51
-rw-r--r--kernel/time/timekeeping.c476
-rw-r--r--kernel/time/timer_list.c40
-rw-r--r--kernel/time/timer_stats.c14
-rw-r--r--kernel/timer.c532
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/utsname.c41
-rw-r--r--kernel/wait.c2
-rw-r--r--kernel/workqueue.c783
67 files changed, 3342 insertions, 2178 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0b46a5d..c64ce9c 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY
"explicit preemption points" to the kernel code. These new
preemption points have been selected to reduce the maximum
latency of rescheduling, providing faster application reactions,
- at the cost of slighly lower throughput.
+ at the cost of slightly lower throughput.
This allows reaction to interactive events by allowing a
low priority process to voluntarily preempt itself even if it
@@ -43,7 +43,7 @@ config PREEMPT
even if it is in kernel mode executing a system call and would
otherwise not be about to reach a natural preemption point.
This allows applications to run more 'smoothly' even when the
- system is under load, at the cost of slighly lower throughput
+ system is under load, at the cost of slightly lower throughput
and a slight runtime overhead to kernel code.
Select this if you are building a kernel for a desktop or
diff --git a/kernel/Makefile b/kernel/Makefile
index ac6b27a..642d427 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
diff --git a/kernel/audit.c b/kernel/audit.c
index 4e9d208..d13276d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
err = -EPERM;
break;
case AUDIT_USER:
- case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
@@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid, sid);
break;
case AUDIT_USER:
- case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
diff --git a/kernel/configs.c b/kernel/configs.c
index 8fa1fb2..e84d3f9 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -61,18 +61,9 @@ static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
- loff_t pos = *offset;
- ssize_t count;
-
- if (pos >= kernel_config_data_size)
- return 0;
-
- count = min(len, (size_t)(kernel_config_data_size - pos));
- if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
- return -EFAULT;
-
- *offset += count;
- return count;
+ return simple_read_from_buffer(buf, len, offset,
+ kernel_config_data + MAGIC_SIZE,
+ kernel_config_data_size);
}
static const struct file_operations ikconfig_file_ops = {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e7084..208cf34 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu)
(!cputime_eq(p->utime, cputime_zero) ||
!cputime_eq(p->stime, cputime_zero)))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
- (state = %ld, flags = %lx) \n",
+ (state = %ld, flags = %x) \n",
p->comm, p->pid, cpu, p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
@@ -120,11 +120,13 @@ static int take_cpu_down(void *unused)
}
/* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu)
+static int _cpu_down(unsigned int cpu, int tasks_frozen)
{
- int err;
+ int err, nr_calls = 0;
struct task_struct *p;
cpumask_t old_allowed, tmp;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
if (num_online_cpus() == 1)
return -EBUSY;
@@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu)
if (!cpu_online(cpu))
return -EINVAL;
- err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
- (void *)(long)cpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+ err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
+ hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
+ __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu, nr_calls, NULL);
printk("%s: attempt to take down CPU %u failed\n",
__FUNCTION__, cpu);
- return -EINVAL;
+ err = -EINVAL;
+ goto out_release;
}
/* Ensure that we are not runnable on dying cpu */
@@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu)
if (IS_ERR(p) || cpu_online(cpu)) {
/* CPU didn't die: tell everyone. Can't complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
- (void *)(long)cpu) == NOTIFY_BAD)
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu) == NOTIFY_BAD)
BUG();
if (IS_ERR(p)) {
@@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu)
/* This actually kills the CPU. */
__cpu_die(cpu);
- /* Move it here so it can run. */
- kthread_bind(p, get_cpu());
- put_cpu();
-
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
- (void *)(long)cpu) == NOTIFY_BAD)
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
+ hcpu) == NOTIFY_BAD)
BUG();
check_for_tasks(cpu);
@@ -185,6 +187,8 @@ out_thread:
err = kthread_stop(p);
out_allowed:
set_cpus_allowed(current, old_allowed);
+out_release:
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
return err;
}
@@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu)
if (cpu_hotplug_disabled)
err = -EBUSY;
else
- err = _cpu_down(cpu);
+ err = _cpu_down(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
return err;
@@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu)
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu)
+static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
{
- int ret;
+ int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
- ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+ ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
+ -1, &nr_calls);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
__FUNCTION__, cpu);
@@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu)
BUG_ON(!cpu_online(cpu));
/* Now call notifier in preparation. */
- raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
- raw_notifier_call_chain(&cpu_chain,
- CPU_UP_CANCELED, hcpu);
+ __raw_notifier_call_chain(&cpu_chain,
+ CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
return ret;
}
@@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu)
if (cpu_hotplug_disabled)
err = -EBUSY;
else
- err = _cpu_up(cpu);
+ err = _cpu_up(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
return err;
}
#ifdef CONFIG_SUSPEND_SMP
-/* Needed to prevent the microcode driver from requesting firmware in its CPU
- * hotplug notifier during the suspend/resume.
- */
-int suspend_cpu_hotplug;
-EXPORT_SYMBOL(suspend_cpu_hotplug);
-
static cpumask_t frozen_cpus;
int disable_nonboot_cpus(void)
@@ -267,7 +269,6 @@ int disable_nonboot_cpus(void)
int cpu, first_cpu, error = 0;
mutex_lock(&cpu_add_remove_lock);
- suspend_cpu_hotplug = 1;
first_cpu = first_cpu(cpu_online_map);
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -277,7 +278,7 @@ int disable_nonboot_cpus(void)
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
- error = _cpu_down(cpu);
+ error = _cpu_down(cpu, 1);
if (!error) {
cpu_set(cpu, frozen_cpus);
printk("CPU%d is down\n", cpu);
@@ -294,7 +295,6 @@ int disable_nonboot_cpus(void)
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
- suspend_cpu_hotplug = 0;
mutex_unlock(&cpu_add_remove_lock);
return error;
}
@@ -309,10 +309,9 @@ void enable_nonboot_cpus(void)
if (cpus_empty(frozen_cpus))
goto out;
- suspend_cpu_hotplug = 1;
printk("Enabling non-boot CPUs ...\n");
for_each_cpu_mask(cpu, frozen_cpus) {
- error = _cpu_up(cpu);
+ error = _cpu_up(cpu, 1);
if (!error) {
printk("CPU%d is up\n", cpu);
continue;
@@ -320,7 +319,6 @@ void enable_nonboot_cpus(void)
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
}
cpus_clear(frozen_cpus);
- suspend_cpu_hotplug = 0;
out:
mutex_unlock(&cpu_add_remove_lock);
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d240349..f57854b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -42,7 +42,6 @@
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
@@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf)
return -EACCES;
trialcs = *cs;
- retval = cpulist_parse(buf, trialcs.cpus_allowed);
- if (retval < 0)
- return retval;
+
+ /*
+ * We allow a cpuset's cpus_allowed to be empty; if it has attached
+ * tasks, we'll catch it later when we validate the change and return
+ * -ENOSPC.
+ */
+ if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+ cpus_clear(trialcs.cpus_allowed);
+ } else {
+ retval = cpulist_parse(buf, trialcs.cpus_allowed);
+ if (retval < 0)
+ return retval;
+ }
cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
- if (cpus_empty(trialcs.cpus_allowed))
+ /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
+ if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
return -ENOSPC;
retval = validate_change(cs, &trialcs);
if (retval < 0)
@@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf)
return -EACCES;
trialcs = *cs;
- retval = nodelist_parse(buf, trialcs.mems_allowed);
- if (retval < 0)
- goto done;
+
+ /*
+ * We allow a cpuset's mems_allowed to be empty; if it has attached
+ * tasks, we'll catch it later when we validate the change and return
+ * -ENOSPC.
+ */
+ if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+ nodes_clear(trialcs.mems_allowed);
+ } else {
+ retval = nodelist_parse(buf, trialcs.mems_allowed);
+ if (retval < 0)
+ goto done;
+ }
nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
oldmem = cs->mems_allowed;
if (nodes_equal(oldmem, trialcs.mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
- if (nodes_empty(trialcs.mems_allowed)) {
+ /* mems_allowed cannot be empty for a cpuset with attached tasks. */
+ if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
retval = -ENOSPC;
goto done;
}
@@ -1751,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
{
struct ctr_struct *ctr = file->private_data;
- if (*ppos + nbytes > ctr->bufsz)
- nbytes = ctr->bufsz - *ppos;
- if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
- return -EFAULT;
- *ppos += nbytes;
- return nbytes;
+ return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
}
static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
@@ -2200,10 +2216,6 @@ void cpuset_fork(struct task_struct *child)
* it is holding that mutex while calling check_for_release(),
* which calls kmalloc(), so can't be called holding callback_mutex().
*
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
* the_top_cpuset_hack:
*
* Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2242,8 +2254,10 @@ void cpuset_exit(struct task_struct *tsk)
{
struct cpuset *cs;
+ task_lock(current);
cs = tsk->cpuset;
tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
+ task_unlock(current);
if (notify_on_release(cs)) {
char *pathbuf = NULL;
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
new file mode 100644
index 0000000..0d98827
--- /dev/null
+++ b/kernel/die_notifier.c
@@ -0,0 +1,38 @@
+
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/vmalloc.h>
+#include <linux/kdebug.h>
+
+
+static ATOMIC_NOTIFIER_HEAD(die_chain);
+
+int notify_die(enum die_val val, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+
+ return atomic_notifier_call_chain(&die_chain, val, &args);
+}
+
+int register_die_notifier(struct notifier_block *nb)
+{
+ vmalloc_sync_all();
+ return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_die_notifier);
+
+
diff --git a/kernel/exit.c b/kernel/exit.c
index 9236924..b0c6f0c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -7,7 +7,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -27,6 +26,7 @@
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
+#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
@@ -255,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp)
}
/**
- * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to.
+ * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
*
* If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to init so that
- * it is correctly cleaned up on exit.
+ * it ever exits, it should generally reparent itself to kthreadd so it
+ * isn't in the way of other processes and is correctly cleaned up on exit.
*
* The various task state such as scheduling policy and priority may have
* been inherited from a user process, so we reset them to sane values here.
*
- * NOTE that reparent_to_init() gives the caller full capabilities.
+ * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
*/
-static void reparent_to_init(void)
+static void reparent_to_kthreadd(void)
{
write_lock_irq(&tasklist_lock);
ptrace_unlink(current);
/* Reparent to init */
remove_parent(current);
- current->parent = child_reaper(current);
- current->real_parent = child_reaper(current);
+ current->real_parent = current->parent = kthreadd_task;
add_parent(current);
/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -348,7 +347,7 @@ int disallow_signal(int sig)
return -EINVAL;
spin_lock_irq(&current->sighand->siglock);
- sigaddset(&current->blocked, sig);
+ current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
return 0;
@@ -401,7 +400,7 @@ void daemonize(const char *name, ...)
current->files = init_task.files;
atomic_inc(&current->files->count);
- reparent_to_init();
+ reparent_to_kthreadd();
}
EXPORT_SYMBOL(daemonize);
diff --git a/kernel/fork.c b/kernel/fork.c
index b7d169d..5dd3979 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -14,7 +14,6 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
-#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
@@ -106,7 +105,7 @@ static struct kmem_cache *mm_cachep;
void free_task(struct task_struct *tsk)
{
- free_thread_info(tsk->thread_info);
+ free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
free_task_struct(tsk);
}
@@ -176,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
}
*tsk = *orig;
- tsk->thread_info = ti;
+ tsk->stack = ti;
setup_thread_stack(tsk, orig);
#ifdef CONFIG_CC_STACKPROTECTOR
@@ -1516,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unshare the mnt_namespace structure if it is being shared
- */
-static int unshare_mnt_namespace(unsigned long unshare_flags,
- struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
-{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
-
- if ((unshare_flags & CLONE_NEWNS) && ns) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
- if (!*new_nsp)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*
* Unsharing of sighand is not supported yet
*/
static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
@@ -1593,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
return 0;
}
-#ifndef CONFIG_IPC_NS
-static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
-{
- if (flags & CLONE_NEWIPC)
- return -EINVAL;
-
- return 0;
-}
-#endif
-
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
@@ -1615,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
{
int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct mnt_namespace *ns, *new_ns = NULL;
struct sighand_struct *new_sigh = NULL;
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
- struct uts_namespace *uts, *new_uts = NULL;
- struct ipc_namespace *ipc, *new_ipc = NULL;
check_unshare_flags(&unshare_flags);
@@ -1637,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
goto bad_unshare_cleanup_thread;
- if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
- goto bad_unshare_cleanup_fs;
if ((err = unshare_sighand(unshare_flags, &new_sigh)))
- goto bad_unshare_cleanup_ns;
+ goto bad_unshare_cleanup_fs;
if ((err = unshare_vm(unshare_flags, &new_mm)))
goto bad_unshare_cleanup_sigh;
if ((err = unshare_fd(unshare_flags, &new_fd)))
goto bad_unshare_cleanup_vm;
if ((err = unshare_semundo(unshare_flags, &new_ulist)))
goto bad_unshare_cleanup_fd;
- if ((err = unshare_utsname(unshare_flags, &new_uts)))
+ if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_fs)))
goto bad_unshare_cleanup_semundo;
- if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
- goto bad_unshare_cleanup_uts;
-
- if (new_ns || new_uts || new_ipc) {
- old_nsproxy = current->nsproxy;
- new_nsproxy = dup_namespaces(old_nsproxy);
- if (!new_nsproxy) {
- err = -ENOMEM;
- goto bad_unshare_cleanup_ipc;
- }
- }
- if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
- new_uts || new_ipc) {
+ if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
task_lock(current);
if (new_nsproxy) {
+ old_nsproxy = current->nsproxy;
current->nsproxy = new_nsproxy;
new_nsproxy = old_nsproxy;
}
@@ -1677,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
new_fs = fs;
}
- if (new_ns) {
- ns = current->nsproxy->mnt_ns;
- current->nsproxy->mnt_ns = new_ns;
- new_ns = ns;
- }
-
if (new_mm) {
mm = current->mm;
active_mm = current->active_mm;
@@ -1698,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
new_fd = fd;
}
- if (new_uts) {
- uts = current->nsproxy->uts_ns;
- current->nsproxy->uts_ns = new_uts;
- new_uts = uts;
- }
-
- if (new_ipc) {
- ipc = current->nsproxy->ipc_ns;
- current->nsproxy->ipc_ns = new_ipc;
- new_ipc = ipc;
- }
-
task_unlock(current);
}
if (new_nsproxy)
put_nsproxy(new_nsproxy);
-bad_unshare_cleanup_ipc:
- if (new_ipc)
- put_ipc_ns(new_ipc);
-
-bad_unshare_cleanup_uts:
- if (new_uts)
- put_uts_ns(new_uts);
-
bad_unshare_cleanup_semundo:
bad_unshare_cleanup_fd:
if (new_fd)
@@ -1738,10 +1666,6 @@ bad_unshare_cleanup_sigh:
if (atomic_dec_and_test(&new_sigh->count))
kmem_cache_free(sighand_cachep, new_sigh);
-bad_unshare_cleanup_ns:
- if (new_ns)
- put_mnt_ns(new_ns);
-
bad_unshare_cleanup_fs:
if (new_fs)
put_fs_struct(new_fs);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a270b5..b7ce15c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -48,37 +51,18 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
+#include <linux/module.h>
#include <asm/futex.h>
#include "rtmutex_common.h"
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
-/*
- * Futexes are matched on equal values of this key.
- * The key type depends on whether it's a shared or private mapping.
- * Don't rearrange members without looking at hash_futex().
- *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
-union futex_key {
- struct {
- unsigned long pgoff;
- struct inode *inode;
- int offset;
- } shared;
- struct {
- unsigned long address;
- struct mm_struct *mm;
- int offset;
- } private;
- struct {
- unsigned long word;
- void *ptr;
- int offset;
- } both;
-};
+#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
* Priority Inheritance state:
@@ -106,12 +90,12 @@ struct futex_pi_state {
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
* wake up q->waiters, then make the second condition true.
*/
struct futex_q {
- struct list_head list;
+ struct plist_node list;
wait_queue_head_t waiters;
/* Which hash list lock to use: */
@@ -127,14 +111,20 @@ struct futex_q {
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
+
+ /*
+ * This waiter is used in case of requeue from a
+ * normal futex to a PI-futex
+ */
+ struct rt_mutex_waiter waiter;
};
/*
* Split the global futex_lock into every hash list lock.
*/
struct futex_hash_bucket {
- spinlock_t lock;
- struct list_head chain;
+ spinlock_t lock;
+ struct plist_head chain;
};
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -163,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
&& key1->both.offset == key2->both.offset);
}
-/*
- * Get parameters which are the keys for a futex.
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ * &current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
*
* For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
- * Returns: 0, or negative error code.
- * The key words are stored in *key on success.
- *
- * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ * fshared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to &current->mm->mmap_sem and
+ * caller must have taken the reader lock. but NOT any spinlocks.
*/
-static int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+ union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -187,11 +184,25 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((key->both.offset % sizeof(u32)) != 0))
+ if (unlikely((address % sizeof(u32)) != 0))
return -EINVAL;
address -= key->both.offset;
/*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ return -EFAULT;
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+ /*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
@@ -205,6 +216,9 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+ /* Save the user address in the ley */
+ key->uaddr = uaddr;
+
/*
* Private mappings are handled in a simple way.
*
@@ -215,6 +229,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
* mappings of _writable_ handles.
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+ key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
key->private.mm = mm;
key->private.address = address;
return 0;
@@ -224,7 +239,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
* Linear file mappings are also simple.
*/
key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
@@ -246,37 +261,46 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
}
return err;
}
+EXPORT_SYMBOL_GPL(get_futex_key);
/*
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
-static inline void get_key_refs(union futex_key *key)
+inline void get_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
atomic_inc(&key->shared.inode->i_count);
- else
+ break;
+ case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
+ break;
}
}
+EXPORT_SYMBOL_GPL(get_futex_key_refs);
/*
* Drop a reference to the resource addressed by a key.
* The hash bucket spinlock must not be held.
*/
-static void drop_key_refs(union futex_key *key)
+void drop_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
iput(key->shared.inode);
- else
+ break;
+ case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
+ break;
}
}
+EXPORT_SYMBOL_GPL(drop_futex_key_refs);
static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
@@ -290,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
}
/*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * if fshared is non NULL, current->mm->mmap_sem is already held
*/
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address,
+ struct rw_semaphore *fshared, int attempt)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
+ int ret = -EFAULT;
- if (attempt > 2 || !(vma = find_vma(mm, address)) ||
- vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
- return -EFAULT;
+ if (attempt > 2)
+ return ret;
- switch (handle_mm_fault(mm, vma, address, 1)) {
- case VM_FAULT_MINOR:
- current->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- current->maj_flt++;
- break;
- default:
- return -EFAULT;
+ if (!fshared)
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ if (vma && address >= vma->vm_start &&
+ (vma->vm_flags & VM_WRITE)) {
+ switch (handle_mm_fault(mm, vma, address, 1)) {
+ case VM_FAULT_MINOR:
+ ret = 0;
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ ret = 0;
+ current->maj_flt++;
+ break;
+ }
}
- return 0;
+ if (!fshared)
+ up_read(&mm->mmap_sem);
+ return ret;
}
/*
@@ -461,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)
}
static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
struct task_struct *p;
pid_t pid;
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
- if (match_futex(&this->key, &me->key)) {
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex(&this->key, key)) {
/*
* Another waiter already exists - bump up
* the refcount and return its pi_state:
@@ -487,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
WARN_ON(!atomic_read(&pi_state->refcount));
atomic_inc(&pi_state->refcount);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -514,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
/* Store the key for possible exit cleanups: */
- pi_state->key = me->key;
+ pi_state->key = *key;
spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
@@ -524,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
put_task_struct(p);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -535,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
*/
static void wake_futex(struct futex_q *q)
{
- list_del_init(&q->list);
+ plist_del(&q->list, &q->list.plist);
if (q->filp)
send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
- * list_del_init() and also before assigning to q->lock_ptr.
+ * plist_del() and also before assigning to q->lock_ptr.
*/
wake_up_all(&q->waiters);
/*
@@ -584,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
*/
if (!(uval & FUTEX_OWNER_DIED)) {
newval = FUTEX_WAITERS | new_owner->pid;
+ /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -651,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int nr_wake)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret;
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -669,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
spin_lock(&hb->lock);
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state) {
ret = -EINVAL;
@@ -683,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
spin_unlock(&hb->lock);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
+ return ret;
+}
+
+/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exist
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **pi_state)
+{
+ u32 curval, uval, newval;
+
+retry:
+ /*
+ * We can't handle a fault cleanly because we can't
+ * release the locks here. Simply return the fault.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ return -EFAULT;
+
+ /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+ if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+ != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+ /*
+ * No waiters yet, we prepare the futex to have some waiters.
+ */
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ pagefault_enable();
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+ }
+
+ if (!(curval & FUTEX_TID_MASK)
+ || lookup_pi_state(curval, hb, key, pi_state)) {
+ /* the futex has no owner (yet) or the lookup failed:
+ allocate one pi_state without owner */
+
+ *pi_state = alloc_pi_state();
+
+ /* Already stores the key: */
+ (*pi_state)->key = *key;
+
+ /* init the mutex without owner */
+ __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+ }
+
+ return 0;
+}
+
+/*
+ * Keep the first nr_wake waiter from futex1, wake up one,
+ * and requeue the next nr_requeue waiters following hashed on
+ * one physical page to another physical page (PI-futex uaddr2)
+ */
+static int futex_requeue_pi(u32 __user *uaddr1,
+ struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head1;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state2 = NULL;
+ struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+ struct rt_mutex *lock2 = NULL;
+ int ret, drop_count = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+retry:
+ /*
+ * First take all the futex related locks:
+ */
+ if (fshared)
+ down_read(fshared);
+
+ ret = get_futex_key(uaddr1, fshared, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, fshared, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = get_futex_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ if (fshared)
+ up_read(fshared);
+
+ ret = get_user(curval, uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &hb1->chain;
+ plist_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ /*
+ * FIRST: get and set the pi_state
+ */
+ if (!pi_state2) {
+ int s;
+ /* do this only the first time we requeue someone */
+ s = lookup_pi_state_for_requeue(uaddr2, hb2,
+ &key2, &pi_state2);
+ if (s) {
+ ret = s;
+ goto out_unlock;
+ }
+
+ lock2 = &pi_state2->pi_mutex;
+ spin_lock(&lock2->wait_lock);
+
+ /* Save the top waiter of the wait_list */
+ if (rt_mutex_has_waiters(lock2))
+ top_waiter = rt_mutex_top_waiter(lock2);
+ } else
+ atomic_inc(&pi_state2->refcount);
+
+
+ this->pi_state = pi_state2;
+
+ /*
+ * SECOND: requeue futex_q to the correct hashbucket
+ */
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
+ this->key = key2;
+ get_futex_key_refs(&key2);
+ drop_count++;
+
+
+ /*
+ * THIRD: queue it to lock2
+ */
+ spin_lock_irq(&this->task->pi_lock);
+ waiter = &this->waiter;
+ waiter->task = this->task;
+ waiter->lock = lock2;
+ plist_node_init(&waiter->list_entry, this->task->prio);
+ plist_node_init(&waiter->pi_list_entry, this->task->prio);
+ plist_add(&waiter->list_entry, &lock2->wait_list);
+ this->task->pi_blocked_on = waiter;
+ spin_unlock_irq(&this->task->pi_lock);
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ }
+ }
+
+ /* If we've requeued some tasks and the top_waiter of the rt_mutex
+ has changed, we must adjust the priority of the owner, if any */
+ if (drop_count) {
+ struct task_struct *owner = rt_mutex_owner(lock2);
+ if (owner &&
+ (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+ int chain_walk = 0;
+
+ spin_lock_irq(&owner->pi_lock);
+ if (top_waiter)
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ else
+ /*
+ * There was no waiters before the requeue,
+ * the flag must be updated
+ */
+ mark_rt_mutex_waiters(lock2);
+
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on) {
+ chain_walk = 1;
+ get_task_struct(owner);
+ }
+
+ spin_unlock_irq(&owner->pi_lock);
+ spin_unlock(&lock2->wait_lock);
+
+ if (chain_walk)
+ rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+ current);
+ } else {
+ /* No owner or the top_waiter does not change */
+ mark_rt_mutex_waiters(lock2);
+ spin_unlock(&lock2->wait_lock);
+ }
+ }
+
+out_unlock:
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_futex_key_refs(&key1);
+
+out:
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -692,22 +985,24 @@ out:
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head;
+ struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
retryfull:
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
@@ -747,11 +1042,10 @@ retry:
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr2,
- attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr2,
+ fshared, attempt);
+ if (ret)
goto out;
- }
goto retry;
}
@@ -759,7 +1053,8 @@ retry:
* If we would have faulted, release mmap_sem,
* fault it in and start all over again.
*/
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(dummy, uaddr2);
if (ret)
@@ -770,7 +1065,7 @@ retry:
head = &hb1->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
wake_futex(this);
if (++ret >= nr_wake)
@@ -782,7 +1077,7 @@ retry:
head = &hb2->chain;
op_ret = 0;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key2)) {
wake_futex(this);
if (++op_ret >= nr_wake2)
@@ -796,7 +1091,8 @@ retry:
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -804,22 +1100,24 @@ out:
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head1;
+ struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
retry:
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
@@ -842,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
* If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(curval, uaddr1);
@@ -858,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
}
head1 = &hb1->chain;
- list_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
@@ -869,11 +1168,15 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
* requeue.
*/
if (likely(head1 != &hb2->chain)) {
- list_move_tail(&this->list, &hb2->chain);
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
this->lock_ptr = &hb2->lock;
- }
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
this->key = key2;
- get_key_refs(&key2);
+ get_futex_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue)
@@ -886,12 +1189,13 @@ out_unlock:
if (hb1 != hb2)
spin_unlock(&hb2->lock);
- /* drop_key_refs() must be called outside the spinlocks. */
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
- drop_key_refs(&key1);
+ drop_futex_key_refs(&key1);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -906,7 +1210,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
init_waitqueue_head(&q->waiters);
- get_key_refs(&q->key);
+ get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
@@ -916,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
- list_add_tail(&q->list, &hb->chain);
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+ plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
}
@@ -925,7 +1245,7 @@ static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}
/*
@@ -971,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(q->pi_state);
@@ -980,29 +1300,94 @@ static int unqueue_me(struct futex_q *q)
ret = 1;
}
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
return ret;
}
/*
* PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
*/
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
{
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
q->pi_state = NULL;
- spin_unlock(&hb->lock);
+ spin_unlock(q->lock_ptr);
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
+ struct futex_q *q,
+ struct futex_hash_bucket *hb,
+ struct task_struct *curr)
+{
+ u32 newtid = curr->pid | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ u32 uval, curval, newval;
+ int ret;
+
+ /* Owner died? */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ pi_state->owner = curr;
+
+ spin_lock_irq(&curr->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &curr->pi_state_list);
+ spin_unlock_irq(&curr->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(q);
+ if (fshared)
+ up_read(fshared);
+ /*
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
+}
+
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'arg3' shared capability
+ */
+#define ARG3_SHARED 1
+
+static long futex_wait_restart(struct restart_block *restart);
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ u32 val, ktime_t *abs_time)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
@@ -1010,12 +1395,15 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
struct futex_q q;
u32 uval;
int ret;
+ struct hrtimer_sleeper t, *to = NULL;
+ int rem = 0;
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
@@ -1038,8 +1426,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
*
- * We hold the mmap semaphore, so the mapping cannot have changed
- * since we looked it up in get_futex_key.
+ * for shared futexes, we hold the mmap semaphore, so the mapping
+ * cannot have changed since we looked it up in get_futex_key.
*/
ret = get_futex_value_locked(&uval, uaddr);
@@ -1050,7 +1438,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
@@ -1062,6 +1451,14 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
if (uval != val)
goto out_unlock_release_sem;
+ /*
+ * This rt_mutex_waiter structure is prepared here and will
+ * be used only if this task is requeued from a normal futex to
+ * a PI-futex with futex_requeue_pi.
+ */
+ debug_rt_mutex_init_waiter(&q.waiter);
+ q.waiter.task = NULL;
+
/* Only actually queue if *uaddr contained val. */
__queue_me(&q, hb);
@@ -1069,7 +1466,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
/*
* There might have been scheduling since the queue_me(), as we
@@ -1084,11 +1482,34 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
/*
- * !list_empty() is safe here without any lock.
+ * !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (likely(!list_empty(&q.list)))
- time = schedule_timeout(time);
+ if (likely(!plist_node_empty(&q.list))) {
+ if (!abs_time)
+ schedule();
+ else {
+ to = &t;
+ hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(&t, current);
+ t.timer.expires = *abs_time;
+
+ hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
+
+ /*
+ * the timer could have already expired, in which
+ * case current would be flagged for rescheduling.
+ * Don't bother calling schedule.
+ */
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+
+ /* Flag if a timeout occured */
+ rem = (t.task == NULL);
+ }
+ }
__set_current_state(TASK_RUNNING);
/*
@@ -1096,62 +1517,203 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* we are the only user of it.
*/
+ if (q.pi_state) {
+ /*
+ * We were woken but have been requeued on a PI-futex.
+ * We have to complete the lock acquisition by taking
+ * the rtmutex.
+ */
+
+ struct rt_mutex *lock = &q.pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ if (unlikely(q.waiter.task)) {
+ remove_waiter(lock, &q.waiter);
+ }
+ spin_unlock(&lock->wait_lock);
+
+ if (rem)
+ ret = -ETIMEDOUT;
+ else
+ ret = rt_mutex_timed_lock(lock, to, 1);
+
+ if (fshared)
+ down_read(fshared);
+ spin_lock(q.lock_ptr);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ /*
+ * We MUST play with the futex we were requeued on,
+ * NOT the current futex.
+ * We can retrieve it from the key of the pi_state
+ */
+ uaddr = q.pi_state->key.uaddr;
+
+ /* mmap_sem and hash_bucket lock are unlocked at
+ return of this function */
+ ret = fixup_pi_state_owner(uaddr, fshared,
+ &q, hb, curr);
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+ if (fshared)
+ up_read(fshared);
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
+ return ret;
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
- if (time == 0)
+ if (rem)
return -ETIMEDOUT;
+
/*
* We expect signal_pending(current), but another thread may
* have handled it for us already.
*/
- return -EINTR;
+ if (!abs_time)
+ return -ERESTARTSYS;
+ else {
+ struct restart_block *restart;
+ restart = &current_thread_info()->restart_block;
+ restart->fn = futex_wait_restart;
+ restart->arg0 = (unsigned long)uaddr;
+ restart->arg1 = (unsigned long)val;
+ restart->arg2 = (unsigned long)abs_time;
+ restart->arg3 = 0;
+ if (fshared)
+ restart->arg3 |= ARG3_SHARED;
+ return -ERESTART_RESTARTBLOCK;
+ }
out_unlock_release_sem:
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = (u32 __user *)restart->arg0;
+ u32 val = (u32)restart->arg1;
+ ktime_t *abs_time = (ktime_t *)restart->arg2;
+ struct rw_semaphore *fshared = NULL;
+
+ restart->fn = do_no_restart_syscall;
+ if (restart->arg3 & ARG3_SHARED)
+ fshared = &current->mm->mmap_sem;
+ return (long)futex_wait(uaddr, fshared, val, abs_time);
+}
+
+
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+ union futex_key *key, struct task_struct *p)
+{
+ struct plist_head *head;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex *lock;
+
+ /* Search a waiter that should already exists */
+
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, key)) {
+ pi_state = this->pi_state;
+ break;
+ }
+ }
+
+ BUG_ON(!pi_state);
+
+ /* set p as pi_state's owner */
+ lock = &pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ spin_lock_irq(&p->pi_lock);
+
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+
+
+ /* set p as pi_mutex's owner */
+ debug_rt_mutex_proxy_lock(lock, p);
+ WARN_ON(rt_mutex_owner(lock));
+ rt_mutex_set_owner(lock, p, 0);
+ rt_mutex_deadlock_account_lock(lock, p);
+
+ plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+ &p->pi_waiters);
+ __rt_mutex_adjust_prio(p);
+
+ spin_unlock_irq(&p->pi_lock);
+ spin_unlock(&lock->wait_lock);
+}
+
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
- long nsec, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
- int ret, attempt = 0;
+ int ret, lock_held, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
- if (sec != MAX_SCHEDULE_TIMEOUT) {
+ if (time) {
to = &timeout;
hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
- to->timer.expires = ktime_set(sec, nsec);
+ to->timer.expires = *time;
}
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
hb = queue_lock(&q, -1, NULL);
retry_locked:
+ lock_held = 0;
+
/*
* To avoid races, we attempt to take the lock here again
* (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1170,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
if (!detect && 0)
force_sig(SIGKILL, current);
- ret = -EDEADLK;
+ /*
+ * Normally, this check is done in user space.
+ * In case of requeue, the owner may attempt to lock this futex,
+ * even if the ownership has already been given by the previous
+ * waker.
+ * In the usual case, this is a case of deadlock, but not in case
+ * of REQUEUE_PI.
+ */
+ if (!(curval & FUTEX_WAITER_REQUEUED))
+ ret = -EDEADLK;
goto out_unlock_release_sem;
}
@@ -1182,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
goto out_unlock_release_sem;
uval = curval;
- newval = uval | FUTEX_WAITERS;
+ /*
+ * In case of a requeue, check if there already is an owner
+ * If not, just take the futex.
+ */
+ if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+ /* set current as futex owner */
+ newval = curval | current->pid;
+ lock_held = 1;
+ } else
+ /* Set the WAITERS flag, so the owner will know it has someone
+ to wake at next unlock */
+ newval = curval | FUTEX_WAITERS;
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1193,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
if (unlikely(curval != uval))
goto retry_locked;
+ if (lock_held) {
+ set_pi_futex_owner(hb, &q.key, curr);
+ goto out_unlock_release_sem;
+ }
+
/*
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, &q);
+ ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
if (unlikely(ret)) {
/*
@@ -1239,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
WARN_ON(!q.pi_state);
/*
@@ -1253,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
ret = ret ? 0 : -EWOULDBLOCK;
}
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
spin_lock(q.lock_ptr);
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case.
*/
- if (!ret && q.pi_state->owner != curr) {
- u32 newtid = current->pid | FUTEX_WAITERS;
-
- /* Owner died? */
- if (q.pi_state->owner != NULL) {
- spin_lock_irq(&q.pi_state->owner->pi_lock);
- WARN_ON(list_empty(&q.pi_state->list));
- list_del_init(&q.pi_state->list);
- spin_unlock_irq(&q.pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
-
- q.pi_state->owner = current;
-
- spin_lock_irq(&current->pi_lock);
- WARN_ON(!list_empty(&q.pi_state->list));
- list_add(&q.pi_state->list, &current->pi_state_list);
- spin_unlock_irq(&current->pi_lock);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
- /*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
- */
- ret = get_user(uval, uaddr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- } else {
+ if (!ret && q.pi_state->owner != curr)
+ /* mmap_sem is unlocked at return of this function */
+ ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
+ else {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
@@ -1309,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
ret = 0;
}
/* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ unqueue_me_pi(&q);
+ if (fshared)
+ up_read(fshared);
}
if (!detect && ret == -EDEADLK && 0)
@@ -1322,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
uaddr_faulted:
@@ -1333,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock_release_sem;
- }
goto retry_locked;
}
queue_unlock(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
@@ -1355,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
u32 uval;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret, attempt = 0;
@@ -1375,9 +1932,10 @@ retry:
/*
* First take all the futex related locks:
*/
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -1411,7 +1969,7 @@ retry_locked:
*/
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -1436,7 +1994,8 @@ retry_locked:
out_unlock:
spin_unlock(&hb->lock);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
@@ -1448,15 +2007,16 @@ pi_faulted:
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock;
- }
goto retry_locked;
}
spin_unlock(&hb->lock);
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
@@ -1485,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,
poll_wait(filp, &q->waiters, wait);
/*
- * list_empty() is safe here without any lock.
+ * plist_node_empty() is safe here without any lock.
* q->lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (list_empty(&q->list))
+ if (plist_node_empty(&q->list))
ret = POLLIN | POLLRDNORM;
return ret;
@@ -1508,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
struct futex_q *q;
struct file *filp;
int ret, err;
+ struct rw_semaphore *fshared;
static unsigned long printk_interval;
if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1549,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
}
q->pi_state = NULL;
- down_read(&current->mm->mmap_sem);
- err = get_futex_key(uaddr, &q->key);
+ fshared = &current->mm->mmap_sem;
+ down_read(fshared);
+ err = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(err != 0)) {
- up_read(&current->mm->mmap_sem);
+ up_read(fshared);
kfree(q);
goto error;
}
@@ -1565,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
filp->private_data = q;
queue_me(q, ret, filp);
- up_read(&current->mm->mmap_sem);
+ up_read(fshared);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
@@ -1678,6 +2240,8 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
+ mval |= (uval & FUTEX_WAITER_REQUEUED);
nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
if (nval == -EFAULT)
@@ -1692,7 +2256,7 @@ retry:
*/
if (!pi) {
if (uval & FUTEX_WAITERS)
- futex_wake(uaddr, 1);
+ futex_wake(uaddr, &curr->mm->mmap_sem, 1);
}
}
return 0;
@@ -1748,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
return;
if (pending)
- handle_futex_death((void __user *)pending + futex_offset, curr, pip);
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip);
while (entry != &head->list) {
/*
@@ -1774,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)
}
}
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
int ret;
+ int cmd = op & FUTEX_CMD_MASK;
+ struct rw_semaphore *fshared = NULL;
- switch (op) {
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ fshared = &current->mm->mmap_sem;
+
+ switch (cmd) {
case FUTEX_WAIT:
- ret = futex_wait(uaddr, val, timeout);
+ ret = futex_wait(uaddr, fshared, val, timeout);
break;
case FUTEX_WAKE:
- ret = futex_wake(uaddr, val);
+ ret = futex_wake(uaddr, fshared, val);
break;
case FUTEX_FD:
/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
ret = futex_fd(uaddr, val);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
- ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+ ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
- ret = futex_unlock_pi(uaddr);
+ ret = futex_unlock_pi(uaddr, fshared);
break;
case FUTEX_TRYLOCK_PI:
- ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+ ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
break;
default:
ret = -ENOSYS;
@@ -1819,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
struct timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
u32 val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
- if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
- if (copy_from_user(&t, utime, sizeof(t)) != 0)
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
+ if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
- if (!timespec_valid(&t))
+ if (!timespec_valid(&ts))
return -EINVAL;
- if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add(ktime_get(), t);
+ tp = &t;
}
/*
- * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+ * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
+ || cmd == FUTEX_CMP_REQUEUE_PI)
val2 = (u32) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1871,7 +2445,7 @@ static int __init init(void)
}
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- INIT_LIST_HEAD(&futex_queues[i].chain);
+ plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
spin_lock_init(&futex_queues[i].lock);
}
return 0;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24ee..338a9b4 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
struct compat_timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
int val2 = 0;
if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
- if (get_compat_timespec(&t, utime))
+ if (get_compat_timespec(&ts, utime))
return -EFAULT;
- if (!timespec_valid(&t))
+ if (!timespec_valid(&ts))
return -EINVAL;
+
+ t = timespec_to_ktime(ts);
if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
+ t = ktime_add(ktime_get(), t);
+ tp = &t;
}
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (int) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1b30331..23c03f4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
return orun;
}
+EXPORT_SYMBOL_GPL(hrtimer_forward);
/*
* enqueue_hrtimer - internal function to (re)start a timer
@@ -1410,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
init_hrtimers_cpu(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
migrate_hrtimers(cpu);
break;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index aff1f0f..e391cbb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
* handle_bad_irq - handle spurious and unhandled irqs
* @irq: the interrupt number
* @desc: description of the interrupt
- * @regs: pointer to a register structure
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
@@ -48,7 +47,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
*
* Controller mappings for all interrupt sources:
*/
-struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
.status = IRQ_DISABLED,
.chip = &no_irq_chip,
@@ -180,6 +179,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
if (desc->chip->ack)
desc->chip->ack(irq);
action_ret = handle_IRQ_event(irq, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
desc->chip->end(irq);
return 1;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5597c15..203a518 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
}
*p = new;
-#if defined(CONFIG_IRQ_PER_CPU)
- if (new->flags & IRQF_PERCPU)
- desc->status |= IRQ_PER_CPU;
-#endif
+
/* Exclude IRQ from balancing */
if (new->flags & IRQF_NOBALANCING)
desc->status |= IRQ_NO_BALANCING;
@@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new)
if (!shared) {
irq_chip_set_defaults(desc->chip);
+#if defined(CONFIG_IRQ_PER_CPU)
+ if (new->flags & IRQF_PERCPU)
+ desc->status |= IRQ_PER_CPU;
+#endif
+
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
if (desc->chip && desc->chip->set_type)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2db91eb..ddde0ef 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *action;
+ unsigned long flags;
+ int ret = 1;
- for (action = desc->action ; action; action = action->next)
+ spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action ; action; action = action->next) {
if ((action != new_action) && action->name &&
- !strcmp(new_action->name, action->name))
- return 0;
- return 1;
+ !strcmp(new_action->name, action->name)) {
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 9d8c79b..b0d81aa 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
if (unlikely(irqfixup)) {
/* Don't punish working computers */
- if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
+ if ((irqfixup == 2 && ((irq == 0) ||
+ (desc->action->flags & IRQF_IRQPOLL))) ||
+ action_ret == IRQ_NONE) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 307c6a6..3205e8e 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -7,7 +7,6 @@
/* These are all the functions necessary to implement itimers */
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/time.h>
@@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
}
/*
- * We do not care about correctness. We just sanitize the values so
- * the ktime_t operations which expect normalized values do not
- * break. This converts negative values to long timeouts similar to
- * the code in kernel versions < 2.6.16
- *
- * Print a limited number of warning messages when an invalid timeval
- * is detected.
- */
-static void fixup_timeval(struct timeval *tv, int interval)
-{
- static int warnlimit = 10;
- unsigned long tmp;
-
- if (warnlimit > 0) {
- warnlimit--;
- printk(KERN_WARNING
- "setitimer: %s (pid = %d) provided "
- "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
- current->comm, current->pid,
- interval ? "it_interval" : "it_value",
- tv->tv_sec, (long) tv->tv_usec);
- }
-
- tmp = tv->tv_usec;
- if (tmp >= USEC_PER_SEC) {
- tv->tv_usec = tmp % USEC_PER_SEC;
- tv->tv_sec += tmp / USEC_PER_SEC;
- }
-
- tmp = tv->tv_sec;
- if (tmp > LONG_MAX)
- tv->tv_sec = LONG_MAX;
-}
-
-/*
* Returns true if the timeval is in canonical form
*/
#define timeval_valid(t) \
(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
-/*
- * Check for invalid timevals, sanitize them and print a limited
- * number of warnings.
- */
-static void check_itimerval(struct itimerval *value) {
-
- if (unlikely(!timeval_valid(&value->it_value)))
- fixup_timeval(&value->it_value, 0);
-
- if (unlikely(!timeval_valid(&value->it_interval)))
- fixup_timeval(&value->it_interval, 1);
-}
-
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
struct task_struct *tsk = current;
@@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
/*
* Validate the timevals in value.
- *
- * Note: Although the spec requires that invalid values shall
- * return -EINVAL, we just fixup the value and print a limited
- * number of warnings in order not to break users of this
- * historical misfeature.
- *
- * Scheduled for replacement in March 2007
*/
- check_itimerval(value);
+ if (!timeval_valid(&value->it_value) ||
+ !timeval_valid(&value->it_interval))
+ return -EINVAL;
switch (which) {
case ITIMER_REAL:
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5a0de84..f1bda23 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
symbol_end = (unsigned long)_etext;
}
- *symbolsize = symbol_end - symbol_start;
- *offset = addr - symbol_start;
+ if (symbolsize)
+ *symbolsize = symbol_end - symbol_start;
+ if (offset)
+ *offset = addr - symbol_start;
return low;
}
@@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr,
return NULL;
}
+int lookup_symbol_name(unsigned long addr, char *symname)
+{
+ symname[0] = '\0';
+ symname[KSYM_NAME_LEN] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, NULL, NULL);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_name(addr, symname);
+}
+
+int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ name[0] = '\0';
+ name[KSYM_NAME_LEN] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, size, offset);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ modname[0] = '\0';
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+}
+
/* Look up a kernel symbol and return it in a text buffer. */
int sprint_symbol(char *buffer, unsigned long address)
{
@@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address)
struct kallsym_iter
{
loff_t pos;
- struct module *owner;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols */
char type;
char name[KSYM_NAME_LEN+1];
+ char module_name[MODULE_NAME_LEN + 1];
+ int exported;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value, &iter->type,
- iter->name, sizeof(iter->name));
- if (iter->owner == NULL)
+ if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
+ &iter->type, iter->name, iter->module_name,
+ &iter->exported) < 0)
return 0;
-
- /* Label it "global" if it is exported, "local" if not exported. */
- iter->type = is_exported(iter->name, iter->owner)
- ? toupper(iter->type) : tolower(iter->type);
-
return 1;
}
@@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
unsigned off = iter->nameoff;
- iter->owner = NULL;
+ iter->module_name[0] = '\0';
iter->value = kallsyms_addresses[iter->pos];
iter->type = kallsyms_get_symbol_type(off);
@@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p)
if (!iter->name[0])
return 0;
- if (iter->owner)
+ if (iter->module_name[0]) {
+ char type;
+
+ /* Label it "global" if it is exported,
+ * "local" if not exported. */
+ type = iter->exported ? toupper(iter->type) :
+ tolower(iter->type);
seq_printf(m, "%0*lx %c %s\t[%s]\n",
(int)(2*sizeof(void*)),
- iter->value, iter->type, iter->name,
- module_name(iter->owner));
- else
+ iter->value, type, iter->name, iter->module_name);
+ } else
seq_printf(m, "%0*lx %c %s\n",
(int)(2*sizeof(void*)),
iter->value, iter->type, iter->name);
@@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return ret;
}
-static int kallsyms_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = (struct seq_file *)file->private_data;
- kfree(m->private);
- return seq_release(inode, file);
-}
-
static const struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = kallsyms_release,
+ .release = seq_release_private,
};
static int __init kallsyms_init(void)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a59c8a..25db14b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
memset(&prstatus, 0, sizeof(prstatus));
prstatus.pr_pid = current->pid;
elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
- sizeof(prstatus));
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
final_note(buf);
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7962761..4d32eb0 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -23,7 +23,6 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/mnt_namespace.h>
#include <linux/completion.h>
@@ -136,7 +135,6 @@ static int ____call_usermodehelper(void *data)
/* Unblock all signals and set the session keyring. */
new_session = key_get(sub_info->ring);
- flush_signals(current);
spin_lock_irq(&current->sighand->siglock);
old_session = __install_session_keyring(current, new_session);
flush_signal_handlers(current, 1);
@@ -166,6 +164,12 @@ static int ____call_usermodehelper(void *data)
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed(current, CPU_MASK_ALL);
+ /*
+ * Our parent is keventd, which runs with elevated scheduling priority.
+ * Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
retval = -EPERM;
if (current->fs->root)
retval = kernel_execve(sub_info->path,
@@ -181,14 +185,9 @@ static int wait_for_helper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
- struct k_sigaction sa;
/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
* populate the status, but will return -ECHILD. */
- sa.sa.sa_handler = SIG_IGN;
- sa.sa.sa_flags = 0;
- siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
- do_sigaction(SIGCHLD, &sa, NULL);
allow_signal(SIGCHLD);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d25a9ad..9e47d8c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,16 +35,19 @@
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/stddef.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/kdebug.h>
+
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
-#include <asm/kdebug.h>
+#include <asm/uaccess.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
@@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
static atomic_t kprobe_count;
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobe_enabled;
+
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
struct kprobe_insn_page *kip;
struct hlist_node *pos;
- retry:
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ retry:
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
if (kip->nused < INSNS_PER_PAGE) {
int i;
for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
}
/* All out of space. Need to allocate a new page. Use slot 0. */
kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
- if (!kip) {
+ if (!kip)
return NULL;
- }
/*
* Use module_alloc so this page is within +/- 2GB of where the
@@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void)
if (check_safety() != 0)
return -EAGAIN;
- hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+ hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
int i;
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
@@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
struct kprobe_insn_page *kip;
struct hlist_node *pos;
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
if (kip->insns <= slot &&
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
int i = (slot - kip->insns) / MAX_INSN_SIZE;
@@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
break;
}
}
- if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+
+ if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
collect_garbage_slots();
- }
}
#endif
@@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
reset_kprobe_instance();
}
}
- return;
}
static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
@@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
}
/* Called with kretprobe_lock held */
-struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
-{
- struct hlist_node *node;
- struct kretprobe_instance *ri;
- hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
- return ri;
- return NULL;
-}
-
-/* Called with kretprobe_lock held */
-static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
- *rp)
-{
- struct hlist_node *node;
- struct kretprobe_instance *ri;
- hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
- return ri;
- return NULL;
-}
-
-/* Called with kretprobe_lock held */
-void __kprobes add_rp_inst(struct kretprobe_instance *ri)
-{
- /*
- * Remove rp inst off the free list -
- * Add it back when probed function returns
- */
- hlist_del(&ri->uflist);
-
- /* Add rp inst onto table */
- INIT_HLIST_NODE(&ri->hlist);
- hlist_add_head(&ri->hlist,
- &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
-
- /* Also add this rp inst to the used list. */
- INIT_HLIST_NODE(&ri->uflist);
- hlist_add_head(&ri->uflist, &ri->rp->used_instances);
-}
-
-/* Called with kretprobe_lock held */
void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
struct hlist_head *head)
{
@@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- while ((ri = get_free_rp_inst(rp)) != NULL) {
+ struct hlist_node *pos, *next;
+
+ hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
hlist_del(&ri->uflist);
kfree(ri);
}
@@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
static int __kprobes in_kprobes_functions(unsigned long addr)
{
- if (addr >= (unsigned long)__kprobes_text_start
- && addr < (unsigned long)__kprobes_text_end)
+ if (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end)
return -EINVAL;
return 0;
}
@@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p,
return -EINVAL;
p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
- if ((!kernel_text_address((unsigned long) p->addr)) ||
- in_kprobes_functions((unsigned long) p->addr))
+ if (!kernel_text_address((unsigned long) p->addr) ||
+ in_kprobes_functions((unsigned long) p->addr))
return -EINVAL;
p->mod_refcounted = 0;
- /* Check are we probing a module */
- if ((probed_mod = module_text_address((unsigned long) p->addr))) {
+
+ /*
+ * Check if are we probing a module.
+ */
+ probed_mod = module_text_address((unsigned long) p->addr);
+ if (probed_mod) {
struct module *calling_mod = module_text_address(called_from);
- /* We must allow modules to probe themself and
- * in this case avoid incrementing the module refcount,
- * so as to allow unloading of self probing modules.
+ /*
+ * We must allow modules to probe themself and in this case
+ * avoid incrementing the module refcount, so as to allow
+ * unloading of self probing modules.
*/
- if (calling_mod && (calling_mod != probed_mod)) {
+ if (calling_mod && calling_mod != probed_mod) {
if (unlikely(!try_module_get(probed_mod)))
return -EINVAL;
p->mod_refcounted = 1;
@@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
goto out;
}
- if ((ret = arch_prepare_kprobe(p)) != 0)
+ ret = arch_prepare_kprobe(p);
+ if (ret)
goto out;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
- if (atomic_add_return(1, &kprobe_count) == \
+ if (kprobe_enabled) {
+ if (atomic_add_return(1, &kprobe_count) == \
(ARCH_INACTIVE_KPROBE_COUNT + 1))
- register_page_fault_notifier(&kprobe_page_fault_nb);
-
- arch_arm_kprobe(p);
+ register_page_fault_notifier(&kprobe_page_fault_nb);
+ arch_arm_kprobe(p);
+ }
out:
mutex_unlock(&kprobe_mutex);
@@ -616,8 +586,7 @@ out:
int __kprobes register_kprobe(struct kprobe *p)
{
- return __register_kprobe(p,
- (unsigned long)__builtin_return_address(0));
+ return __register_kprobe(p, (unsigned long)__builtin_return_address(0));
}
void __kprobes unregister_kprobe(struct kprobe *p)
@@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
return;
}
valid_p:
- if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
- (p->list.next == &old_p->list) &&
- (p->list.prev == &old_p->list))) {
- /* Only probe on the hash list */
- arch_disarm_kprobe(p);
+ if (old_p == p ||
+ (old_p->pre_handler == aggr_pre_handler &&
+ p->list.next == &old_p->list && p->list.prev == &old_p->list)) {
+ /*
+ * Only probe on the hash list. Disarm only if kprobes are
+ * enabled - otherwise, the breakpoint would already have
+ * been removed. We save on flushing icache.
+ */
+ if (kprobe_enabled)
+ arch_disarm_kprobe(p);
hlist_del_rcu(&old_p->hlist);
cleanup_p = 1;
} else {
@@ -656,9 +630,11 @@ valid_p:
mutex_unlock(&kprobe_mutex);
synchronize_sched();
- if (p->mod_refcounted &&
- (mod = module_text_address((unsigned long)p->addr)))
- module_put(mod);
+ if (p->mod_refcounted) {
+ mod = module_text_address((unsigned long)p->addr);
+ if (mod)
+ module_put(mod);
+ }
if (cleanup_p) {
if (p != old_p) {
@@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
/*TODO: consider to only swap the RA after the last pre_handler fired */
spin_lock_irqsave(&kretprobe_lock, flags);
- arch_prepare_kretprobe(rp, regs);
+ if (!hlist_empty(&rp->free_instances)) {
+ struct kretprobe_instance *ri;
+
+ ri = hlist_entry(rp->free_instances.first,
+ struct kretprobe_instance, uflist);
+ ri->rp = rp;
+ ri->task = current;
+ arch_prepare_kretprobe(ri, regs);
+
+ /* XXX(hch): why is there no hlist_move_head? */
+ hlist_del(&ri->uflist);
+ hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+ hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
+ } else
+ rp->nmissed++;
spin_unlock_irqrestore(&kretprobe_lock, flags);
return 0;
}
@@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
unsigned long flags;
struct kretprobe_instance *ri;
+ struct hlist_node *pos, *next;
unregister_kprobe(&rp->kp);
+
/* No race here */
spin_lock_irqsave(&kretprobe_lock, flags);
- while ((ri = get_used_rp_inst(rp)) != NULL) {
+ hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
ri->rp = NULL;
hlist_del(&ri->uflist);
}
@@ -816,6 +808,9 @@ static int __init init_kprobes(void)
}
atomic_set(&kprobe_count, 0);
+ /* By default, kprobes are enabled */
+ kprobe_enabled = true;
+
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
@@ -825,7 +820,7 @@ static int __init init_kprobes(void)
#ifdef CONFIG_DEBUG_FS
static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
- const char *sym, int offset,char *modname)
+ const char *sym, int offset,char *modname)
{
char *kprobe_type;
@@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
- unsigned long size, offset = 0;
+ unsigned long offset = 0;
char *modname, namebuf[128];
head = &kprobe_table[i];
preempt_disable();
hlist_for_each_entry_rcu(p, node, head, hlist) {
- sym = kallsyms_lookup((unsigned long)p->addr, &size,
+ sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (p->pre_handler == aggr_pre_handler) {
list_for_each_entry_rcu(kp, &p->list, list)
@@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
+static void __kprobes enable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already enabled, just return */
+ if (kprobe_enabled)
+ goto already_enabled;
+
+ /*
+ * Re-register the page fault notifier only if there are any
+ * active probes at the time of enabling kprobes globally
+ */
+ if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT)
+ register_page_fault_notifier(&kprobe_page_fault_nb);
+
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist)
+ arch_arm_kprobe(p);
+ }
+
+ kprobe_enabled = true;
+ printk(KERN_INFO "Kprobes globally enabled\n");
+
+already_enabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+static void __kprobes disable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already disabled, just return */
+ if (!kprobe_enabled)
+ goto already_disabled;
+
+ kprobe_enabled = false;
+ printk(KERN_INFO "Kprobes globally disabled\n");
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist) {
+ if (!arch_trampoline_kprobe(p))
+ arch_disarm_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&kprobe_mutex);
+ /* Allow all currently running kprobes to complete */
+ synchronize_sched();
+
+ mutex_lock(&kprobe_mutex);
+ /* Unconditionally unregister the page_fault notifier */
+ unregister_page_fault_notifier(&kprobe_page_fault_nb);
+
+already_disabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+/*
+ * XXX: The debugfs bool file interface doesn't allow for callbacks
+ * when the bool state is switched. We can reuse that facility when
+ * available
+ */
+static ssize_t read_enabled_file_bool(struct file *file,
+ char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[3];
+
+ if (kprobe_enabled)
+ buf[0] = '1';
+ else
+ buf[0] = '0';
+ buf[1] = '\n';
+ buf[2] = 0x00;
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t write_enabled_file_bool(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ int buf_size;
+
+ buf_size = min(count, (sizeof(buf)-1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ switch (buf[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ enable_all_kprobes();
+ break;
+ case 'n':
+ case 'N':
+ case '0':
+ disable_all_kprobes();
+ break;
+ }
+
+ return count;
+}
+
+static struct file_operations fops_kp = {
+ .read = read_enabled_file_bool,
+ .write = write_enabled_file_bool,
+};
+
static int __kprobes debugfs_kprobe_init(void)
{
struct dentry *dir, *file;
+ unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
if (!dir)
return -ENOMEM;
- file = debugfs_create_file("list", 0444, dir , 0 ,
+ file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
if (!file) {
debugfs_remove(dir);
return -ENOMEM;
}
+ file = debugfs_create_file("enabled", 0600, dir,
+ &value, &fops_kp);
+ if (!file) {
+ debugfs_remove(dir);
+ return -ENOMEM;
+ }
+
return 0;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50cc..df8a8e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
- * Creation is done via keventd, so that we get a clean environment
+ * Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
@@ -15,24 +15,22 @@
#include <linux/mutex.h>
#include <asm/semaphore.h>
-/*
- * We dont want to execute off keventd since it might
- * hold a semaphore our callers hold too:
- */
-static struct workqueue_struct *helper_wq;
+static DEFINE_SPINLOCK(kthread_create_lock);
+static LIST_HEAD(kthread_create_list);
+struct task_struct *kthreadd_task;
struct kthread_create_info
{
- /* Information passed to kthread() from keventd. */
+ /* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
struct completion started;
- /* Result passed back to kthread_create() from keventd. */
+ /* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
struct completion done;
- struct work_struct work;
+ struct list_head list;
};
struct kthread_stop_info
@@ -60,42 +58,17 @@ int kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
-static void kthread_exit_files(void)
-{
- struct fs_struct *fs;
- struct task_struct *tsk = current;
-
- exit_fs(tsk); /* current->fs->count--; */
- fs = init_task.fs;
- tsk->fs = fs;
- atomic_inc(&fs->count);
- exit_files(tsk);
- current->files = init_task.files;
- atomic_inc(&tsk->files->count);
-}
-
static int kthread(void *_create)
{
struct kthread_create_info *create = _create;
int (*threadfn)(void *data);
void *data;
- sigset_t blocked;
int ret = -EINTR;
- kthread_exit_files();
-
- /* Copy data: it's on keventd's stack */
+ /* Copy data: it's on kthread's stack */
threadfn = create->threadfn;
data = create->data;
- /* Block and flush all signals (in case we're not from keventd). */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* By default we can run anywhere, unlike keventd. */
- set_cpus_allowed(current, CPU_MASK_ALL);
-
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_INTERRUPTIBLE);
complete(&create->started);
@@ -112,11 +85,8 @@ static int kthread(void *_create)
return 0;
}
-/* We are keventd: create a thread. */
-static void keventd_create_kthread(struct work_struct *work)
+static void create_kthread(struct kthread_create_info *create)
{
- struct kthread_create_info *create =
- container_of(work, struct kthread_create_info, work);
int pid;
/* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
create.data = data;
init_completion(&create.started);
init_completion(&create.done);
- INIT_WORK(&create.work, keventd_create_kthread);
-
- /*
- * The workqueue needs to start up first:
- */
- if (!helper_wq)
- create.work.func(&create.work);
- else {
- queue_work(helper_wq, &create.work);
- wait_for_completion(&create.done);
- }
+
+ spin_lock(&kthread_create_lock);
+ list_add_tail(&create.list, &kthread_create_list);
+ wake_up_process(kthreadd_task);
+ spin_unlock(&kthread_create_lock);
+
+ wait_for_completion(&create.done);
+
if (!IS_ERR(create.result)) {
va_list args;
va_start(args, namefmt);
@@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
namefmt, args);
va_end(args);
}
-
return create.result;
}
EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k)
}
EXPORT_SYMBOL(kthread_stop);
-static __init int helper_init(void)
+
+static __init void kthreadd_setup(void)
{
- helper_wq = create_singlethread_workqueue("kthread");
- BUG_ON(!helper_wq);
+ struct task_struct *tsk = current;
- return 0;
+ set_task_comm(tsk, "kthreadd");
+
+ ignore_signals(tsk);
+
+ set_user_nice(tsk, -5);
+ set_cpus_allowed(tsk, CPU_MASK_ALL);
}
-core_initcall(helper_init);
+int kthreadd(void *unused)
+{
+ /* Setup a clean context for our children to inherit. */
+ kthreadd_setup();
+
+ current->flags |= PF_NOFREEZE;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&kthread_create_list))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ spin_lock(&kthread_create_lock);
+ while (!list_empty(&kthread_create_list)) {
+ struct kthread_create_info *create;
+
+ create = list_entry(kthread_create_list.next,
+ struct kthread_create_info, list);
+ list_del_init(&create->list);
+ spin_unlock(&kthread_create_lock);
+
+ create_kthread(create);
+
+ spin_lock(&kthread_create_lock);
+ }
+ spin_unlock(&kthread_create_lock);
+ }
+
+ return 0;
+}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7065a68..1a5ff22 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace)
trace->entries = stack_trace + nr_stack_trace_entries;
trace->skip = 3;
- trace->all_contexts = 0;
- save_stack_trace(trace, NULL);
+ save_stack_trace(trace);
trace->max_entries = trace->nr_entries;
@@ -341,10 +340,7 @@ static const char *usage_str[] =
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
- unsigned long offs, size;
- char *modname;
-
- return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
+ return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
void
@@ -1313,8 +1309,9 @@ out_unlock_set:
/*
* Look up a dependency chain. If the key is not present yet then
- * add it and return 0 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 1.
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
*/
static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
{
@@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
* Mark a lock with a usage bit, and validate the state transition:
*/
static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit, unsigned long ip)
+ enum lock_usage_bit new_bit)
{
unsigned int new_mask = 1 << new_bit, ret = 1;
@@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
this->class->usage_mask |= new_mask;
-#ifdef CONFIG_TRACE_IRQFLAGS
- if (new_bit == LOCK_ENABLED_HARDIRQS ||
- new_bit == LOCK_ENABLED_HARDIRQS_READ)
- ip = curr->hardirq_enable_ip;
- else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
- new_bit == LOCK_ENABLED_SOFTIRQS_READ)
- ip = curr->softirq_enable_ip;
-#endif
if (!save_trace(this->class->usage_traces + new_bit))
return 0;
@@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
+mark_held_locks(struct task_struct *curr, int hardirq)
{
enum lock_usage_bit usage_bit;
struct held_lock *hlock;
@@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
else
usage_bit = LOCK_ENABLED_SOFTIRQS;
}
- if (!mark_lock(curr, hlock, usage_bit, ip))
+ if (!mark_lock(curr, hlock, usage_bit))
return 0;
}
@@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, 1, ip))
+ if (!mark_held_locks(curr, 1))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, 0, ip))
+ if (!mark_held_locks(curr, 0))
return;
curr->hardirq_enable_ip = ip;
@@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
- mark_held_locks(curr, 0, ip);
+ mark_held_locks(curr, 0);
}
/*
@@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (read) {
if (curr->hardirq_context)
if (!mark_lock(curr, hlock,
- LOCK_USED_IN_HARDIRQ_READ, ip))
+ LOCK_USED_IN_HARDIRQ_READ))
return 0;
if (curr->softirq_context)
if (!mark_lock(curr, hlock,
- LOCK_USED_IN_SOFTIRQ_READ, ip))
+ LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
if (curr->hardirq_context)
- if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
- if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
return 0;
}
}
if (!hardirqs_off) {
if (read) {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS_READ, ip))
+ LOCK_ENABLED_HARDIRQS_READ))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS_READ, ip))
+ LOCK_ENABLED_SOFTIRQS_READ))
return 0;
} else {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS, ip))
+ LOCK_ENABLED_HARDIRQS))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS, ip))
+ LOCK_ENABLED_SOFTIRQS))
return 0;
}
}
#endif
/* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED))
return 0;
out_calc_hash:
/*
diff --git a/kernel/module.c b/kernel/module.c
index 1eb8ca5..9bd93de 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/init.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -95,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag)
mod->taints |= flag;
}
-/* A thread that wants to hold a reference to a module only while it
- * is running can call ths to safely exit.
- * nfsd and lockd use this.
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit. nfsd and lockd use this.
*/
void __module_put_and_exit(struct module *mod, long code)
{
@@ -310,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size)
{
/* Reallocation required? */
if (pcpu_num_used + 1 > pcpu_num_allocated) {
- int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
- GFP_KERNEL);
+ int *new;
+
+ new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
+ GFP_KERNEL);
if (!new)
return 0;
- memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
pcpu_num_allocated *= 2;
- kfree(pcpu_size);
pcpu_size = new;
}
@@ -1198,7 +1199,7 @@ static int __unlink_module(void *_mod)
return 0;
}
-/* Free a module, remove from lists, etc (must hold module mutex). */
+/* Free a module, remove from lists, etc (must hold module_mutex). */
static void free_module(struct module *mod)
{
/* Delete from various lists */
@@ -1245,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);
/*
* Ensure that an exported symbol [global namespace] does not already exist
- * in the Kernel or in some other modules exported symbol table.
+ * in the kernel or in some other module's exported symbol table.
*/
static int verify_export_symbols(struct module *mod)
{
@@ -1471,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
}
#ifdef CONFIG_KALLSYMS
-int is_exported(const char *name, const struct module *mod)
+static int is_exported(const char *name, const struct module *mod)
{
if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
return 1;
@@ -2097,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod,
if (!best)
return NULL;
- *size = nextval - mod->symtab[best].st_value;
- *offset = addr - mod->symtab[best].st_value;
+ if (size)
+ *size = nextval - mod->symtab[best].st_value;
+ if (offset)
+ *offset = addr - mod->symtab[best].st_value;
return mod->strtab + mod->symtab[best].st_name;
}
@@ -2123,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr,
return NULL;
}
-struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name, size_t namelen)
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+ strlcpy(symname, sym, KSYM_NAME_LEN + 1);
+ mutex_unlock(&module_mutex);
+ return 0;
+ }
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, size, offset);
+ if (!sym)
+ goto out;
+ if (modname)
+ strlcpy(modname, mod->name, MODULE_NAME_LEN + 1);
+ if (name)
+ strlcpy(name, sym, KSYM_NAME_LEN + 1);
+ mutex_unlock(&module_mutex);
+ return 0;
+ }
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
{
struct module *mod;
@@ -2134,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
- namelen);
+ KSYM_NAME_LEN + 1);
+ strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1);
+ *exported = is_exported(name, mod);
mutex_unlock(&module_mutex);
- return mod;
+ return 0;
}
symnum -= mod->num_symtab;
}
mutex_unlock(&module_mutex);
- return NULL;
+ return -ERANGE;
}
static unsigned long mod_find_symname(struct module *mod, const char *name)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e7cbbb8..303eab1 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
debug_mutex_lock_common(lock, &waiter);
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- debug_mutex_add_waiter(lock, &waiter, task->thread_info);
+ debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
*/
if (unlikely(state == TASK_INTERRUPTIBLE &&
signal_pending(task))) {
- mutex_remove_waiter(lock, &waiter, task->thread_info);
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
mutex_release(&lock->dep_map, 1, _RET_IP_);
spin_unlock_mutex(&lock->wait_lock, flags);
@@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
}
/* got the lock - rejoice! */
- mutex_remove_waiter(lock, &waiter, task->thread_info);
- debug_mutex_set_owner(lock, task->thread_info);
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ debug_mutex_set_owner(lock, task_thread_info(task));
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5b9ee6..1bc4b55 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk)
/*
* creates a copy of "orig" with refcount 1.
- * This does not grab references to the contained namespaces,
- * so that needs to be done by dup_namespaces.
*/
-static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
+static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
{
struct nsproxy *ns;
@@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
}
/*
- * copies the nsproxy, setting refcount to 1, and grabbing a
- * reference to all contained namespaces. Called from
- * sys_unshare()
+ * Create new nsproxy and all of its the associated namespaces.
+ * Return the newly created nsproxy. Do not attach this to the task,
+ * leave it to the caller to do proper locking and attach it to task.
*/
-struct nsproxy *dup_namespaces(struct nsproxy *orig)
+static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
+ struct fs_struct *new_fs)
{
- struct nsproxy *ns = clone_namespaces(orig);
+ struct nsproxy *new_nsp;
- if (ns) {
- if (ns->mnt_ns)
- get_mnt_ns(ns->mnt_ns);
- if (ns->uts_ns)
- get_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- get_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns)
- get_pid_ns(ns->pid_ns);
- }
+ new_nsp = clone_nsproxy(tsk->nsproxy);
+ if (!new_nsp)
+ return ERR_PTR(-ENOMEM);
- return ns;
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+ if (IS_ERR(new_nsp->mnt_ns))
+ goto out_ns;
+
+ new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+ if (IS_ERR(new_nsp->uts_ns))
+ goto out_uts;
+
+ new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+ if (IS_ERR(new_nsp->ipc_ns))
+ goto out_ipc;
+
+ new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
+ if (IS_ERR(new_nsp->pid_ns))
+ goto out_pid;
+
+ return new_nsp;
+
+out_pid:
+ if (new_nsp->ipc_ns)
+ put_ipc_ns(new_nsp->ipc_ns);
+out_ipc:
+ if (new_nsp->uts_ns)
+ put_uts_ns(new_nsp->uts_ns);
+out_uts:
+ if (new_nsp->mnt_ns)
+ put_mnt_ns(new_nsp->mnt_ns);
+out_ns:
+ kfree(new_nsp);
+ return ERR_PTR(-ENOMEM);
}
/*
@@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk)
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
return 0;
- new_ns = clone_namespaces(old_ns);
- if (!new_ns) {
- err = -ENOMEM;
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
goto out;
}
- tsk->nsproxy = new_ns;
-
- err = copy_mnt_ns(flags, tsk);
- if (err)
- goto out_ns;
-
- err = copy_utsname(flags, tsk);
- if (err)
- goto out_uts;
-
- err = copy_ipcs(flags, tsk);
- if (err)
- goto out_ipc;
-
- err = copy_pid_ns(flags, tsk);
- if (err)
- goto out_pid;
+ new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+ if (IS_ERR(new_ns)) {
+ err = PTR_ERR(new_ns);
+ goto out;
+ }
+ tsk->nsproxy = new_ns;
out:
put_nsproxy(old_ns);
return err;
-
-out_pid:
- if (new_ns->ipc_ns)
- put_ipc_ns(new_ns->ipc_ns);
-out_ipc:
- if (new_ns->uts_ns)
- put_uts_ns(new_ns->uts_ns);
-out_uts:
- if (new_ns->mnt_ns)
- put_mnt_ns(new_ns->mnt_ns);
-out_ns:
- tsk->nsproxy = old_ns;
- kfree(new_ns);
- goto out;
}
void free_nsproxy(struct nsproxy *ns)
@@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns)
put_pid_ns(ns->pid_ns);
kfree(ns);
}
+
+/*
+ * Called from unshare. Unshare all the namespaces part of nsproxy.
+ * On sucess, returns the new nsproxy and a reference to old nsproxy
+ * to make sure it stays around.
+ */
+int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+ struct nsproxy **new_nsp, struct fs_struct *new_fs)
+{
+ struct nsproxy *old_ns = current->nsproxy;
+ int err = 0;
+
+ if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ return 0;
+
+#ifndef CONFIG_IPC_NS
+ if (unshare_flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+
+#ifndef CONFIG_UTS_NS
+ if (unshare_flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ get_nsproxy(old_ns);
+
+ *new_nsp = create_new_namespaces(unshare_flags, current,
+ new_fs ? new_fs : current->fs);
+ if (IS_ERR(*new_nsp)) {
+ err = PTR_ERR(*new_nsp);
+ put_nsproxy(old_ns);
+ }
+ return err;
+}
diff --git a/kernel/params.c b/kernel/params.c
index 3121723..e61c46c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
return param_get_bool(buffer, &dummy);
}
-/* We cheat here and temporarily mangle the string. */
+/* We break the rule and mangle the string. */
static int param_array(const char *name,
const char *val,
unsigned int min, unsigned int max,
diff --git a/kernel/pid.c b/kernel/pid.c
index 9c80bc2..d3ad724 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr)
}
EXPORT_SYMBOL_GPL(find_get_pid);
-int copy_pid_ns(int flags, struct task_struct *tsk)
+struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns)
{
- struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
- int err = 0;
-
- if (!old_ns)
- return 0;
-
+ BUG_ON(!old_ns);
get_pid_ns(old_ns);
- return err;
+ return old_ns;
}
void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 657f776..1de710e1 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_prof_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
@@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_virt_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
@@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_sched_expires = 0;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || tsk->sched_time < t->expires.sched) {
@@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
prof_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
@@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
virt_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
@@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
sched_expires = 0;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || sched_time < t->expires.sched) {
@@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
*/
head = &tsk->signal->cpu_timers[clock_idx];
if (list_empty(head) ||
- cputime_ge(list_entry(head->next,
+ cputime_ge(list_first_entry(head,
struct cpu_timer_list, entry)->expires.cpu,
*newval)) {
/*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 44318ca..588c99d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -31,7 +31,6 @@
* POSIX clocks & timers
*/
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8777217..495b7d4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -79,7 +79,7 @@ config PM_SYSFS_DEPRECATED
config SOFTWARE_SUSPEND
bool "Software Suspend (Hibernation)"
- depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
+ depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
@@ -139,7 +139,7 @@ config PM_STD_PARTITION
config SUSPEND_SMP
bool
- depends on HOTPLUG_CPU && X86 && PM
+ depends on HOTPLUG_CPU && (X86 || PPC64) && PM
default y
config APM_EMULATION
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0633137..b5f0543 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
+enum {
+ HIBERNATION_INVALID,
+ HIBERNATION_PLATFORM,
+ HIBERNATION_TEST,
+ HIBERNATION_TESTPROC,
+ HIBERNATION_SHUTDOWN,
+ HIBERNATION_REBOOT,
+ /* keep last */
+ __HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+struct hibernation_ops *hibernation_ops;
+
+/**
+ * hibernation_set_ops - set the global hibernate operations
+ * @ops: the hibernation operations to use in subsequent hibernation transitions
+ */
+
+void hibernation_set_ops(struct hibernation_ops *ops)
+{
+ if (ops && !(ops->prepare && ops->enter && ops->finish)) {
+ WARN_ON(1);
+ return;
+ }
+ mutex_lock(&pm_mutex);
+ hibernation_ops = ops;
+ if (ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else if (hibernation_mode == HIBERNATION_PLATFORM)
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+
+ mutex_unlock(&pm_mutex);
+}
+
+
/**
* platform_prepare - prepare the machine for hibernation using the
* platform driver if so configured and return an error code if it fails
*/
-static inline int platform_prepare(void)
+static int platform_prepare(void)
{
- int error = 0;
+ return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
+ hibernation_ops->prepare() : 0;
+}
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- break;
- default:
- if (pm_ops && pm_ops->prepare)
- error = pm_ops->prepare(PM_SUSPEND_DISK);
- }
- return error;
+/**
+ * platform_finish - switch the machine to the normal mode of operation
+ * using the platform driver (must be called after platform_prepare())
+ */
+
+static void platform_finish(void)
+{
+ if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
+ hibernation_ops->finish();
}
/**
- * power_down - Shut machine down for hibernate.
+ * power_down - Shut the machine down for hibernation.
*
* Use the platform driver, if configured so; otherwise try
* to power off or reboot.
@@ -61,20 +100,20 @@ static inline int platform_prepare(void)
static void power_down(void)
{
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
+ switch (hibernation_mode) {
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
break;
- case PM_DISK_SHUTDOWN:
+ case HIBERNATION_SHUTDOWN:
kernel_power_off();
break;
- case PM_DISK_REBOOT:
+ case HIBERNATION_REBOOT:
kernel_restart(NULL);
break;
- default:
- if (pm_ops && pm_ops->enter) {
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops) {
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- pm_ops->enter(PM_SUSPEND_DISK);
+ hibernation_ops->enter();
break;
}
}
@@ -87,20 +126,6 @@ static void power_down(void)
while(1);
}
-static inline void platform_finish(void)
-{
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- break;
- default:
- if (pm_ops && pm_ops->finish)
- pm_ops->finish(PM_SUSPEND_DISK);
- }
-}
-
static void unprepare_processes(void)
{
thaw_processes();
@@ -120,13 +145,10 @@ static int prepare_processes(void)
}
/**
- * pm_suspend_disk - The granpappy of hibernation power management.
- *
- * If not, then call swsusp to do its thing, then figure out how
- * to power down the system.
+ * hibernate - The granpappy of the built-in hibernation management
*/
-int pm_suspend_disk(void)
+int hibernate(void)
{
int error;
@@ -143,7 +165,8 @@ int pm_suspend_disk(void)
if (error)
goto Finish;
- if (pm_disk_mode == PM_DISK_TESTPROC) {
+ mutex_lock(&pm_mutex);
+ if (hibernation_mode == HIBERNATION_TESTPROC) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Thaw;
@@ -168,7 +191,7 @@ int pm_suspend_disk(void)
if (error)
goto Enable_cpus;
- if (pm_disk_mode == PM_DISK_TEST) {
+ if (hibernation_mode == HIBERNATION_TEST) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Enable_cpus;
@@ -205,6 +228,7 @@ int pm_suspend_disk(void)
device_resume();
resume_console();
Thaw:
+ mutex_unlock(&pm_mutex);
unprepare_processes();
Finish:
free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@ int pm_suspend_disk(void)
* Called as a late_initcall (so all devices are discovered and
* initialized), we call swsusp to see if we have a saved image or not.
* If so, we quiesce devices, the restore the saved image. We will
- * return above (in pm_suspend_disk() ) if everything goes well.
+ * return above (in hibernate() ) if everything goes well.
* Otherwise, we fail gracefully and return to the normally
* scheduled program.
*
@@ -315,25 +339,26 @@ static int software_resume(void)
late_initcall(software_resume);
-static const char * const pm_disk_modes[] = {
- [PM_DISK_PLATFORM] = "platform",
- [PM_DISK_SHUTDOWN] = "shutdown",
- [PM_DISK_REBOOT] = "reboot",
- [PM_DISK_TEST] = "test",
- [PM_DISK_TESTPROC] = "testproc",
+static const char * const hibernation_modes[] = {
+ [HIBERNATION_PLATFORM] = "platform",
+ [HIBERNATION_SHUTDOWN] = "shutdown",
+ [HIBERNATION_REBOOT] = "reboot",
+ [HIBERNATION_TEST] = "test",
+ [HIBERNATION_TESTPROC] = "testproc",
};
/**
- * disk - Control suspend-to-disk mode
+ * disk - Control hibernation mode
*
* Suspend-to-disk can be handled in several ways. We have a few options
* for putting the system to sleep - using the platform driver (e.g. ACPI
- * or other pm_ops), powering off the system or rebooting the system
- * (for testing) as well as the two test modes.
+ * or other hibernation_ops), powering off the system or rebooting the
+ * system (for testing) as well as the two test modes.
*
* The system can support 'platform', and that is known a priori (and
- * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot'
- * as alternatives, as well as the test modes 'test' and 'testproc'.
+ * encoded by the presence of hibernation_ops). However, the user may
+ * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
+ * test modes, 'test' or 'testproc'.
*
* show() will display what the mode is currently set to.
* store() will accept one of
@@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {
* 'testproc'
*
* It will only change to 'platform' if the system
- * supports it (as determined from pm_ops->pm_disk_mode).
+ * supports it (as determined by having hibernation_ops).
*/
static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)
int i;
char *start = buf;
- for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
- if (!pm_disk_modes[i])
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!hibernation_modes[i])
continue;
switch (i) {
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
break;
- default:
- if (pm_ops && pm_ops->enter &&
- (i == pm_ops->pm_disk_mode))
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
break;
/* not a valid mode, continue with loop */
continue;
}
- if (i == pm_disk_mode)
- buf += sprintf(buf, "[%s]", pm_disk_modes[i]);
+ if (i == hibernation_mode)
+ buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
else
- buf += sprintf(buf, "%s", pm_disk_modes[i]);
- if (i+1 != PM_DISK_MAX)
- buf += sprintf(buf, " ");
+ buf += sprintf(buf, "%s ", hibernation_modes[i]);
}
buf += sprintf(buf, "\n");
return buf-start;
@@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
int i;
int len;
char *p;
- suspend_disk_method_t mode = 0;
+ int mode = HIBERNATION_INVALID;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
mutex_lock(&pm_mutex);
- for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
- if (!strncmp(buf, pm_disk_modes[i], len)) {
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!strncmp(buf, hibernation_modes[i], len)) {
mode = i;
break;
}
}
- if (mode) {
+ if (mode != HIBERNATION_INVALID) {
switch (mode) {
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- pm_disk_mode = mode;
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
+ hibernation_mode = mode;
break;
- default:
- if (pm_ops && pm_ops->enter &&
- (mode == pm_ops->pm_disk_mode))
- pm_disk_mode = mode;
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
+ hibernation_mode = mode;
else
error = -EINVAL;
}
- } else {
+ } else
error = -EINVAL;
- }
- pr_debug("PM: suspend-to-disk mode set to '%s'\n",
- pm_disk_modes[mode]);
+ if (!error)
+ pr_debug("PM: suspend-to-disk mode set to '%s'\n",
+ hibernation_modes[mode]);
mutex_unlock(&pm_mutex);
return error ? error : n;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda68..40d56a3 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
DEFINE_MUTEX(pm_mutex);
struct pm_ops *pm_ops;
-suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
/**
* pm_set_ops - Set the global power method table.
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)
{
mutex_lock(&pm_mutex);
pm_ops = ops;
- if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
- pm_disk_mode = ops->pm_disk_mode;
- } else
- pm_disk_mode = PM_DISK_SHUTDOWN;
mutex_unlock(&pm_mutex);
}
@@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)
static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
- [PM_SUSPEND_DISK] = "disk",
};
static inline int valid_state(suspend_state_t state)
{
- /* Suspend-to-disk does not really need low-level support.
- * It can work with shutdown/reboot if needed. If it isn't
- * configured, then it cannot be supported.
- */
- if (state == PM_SUSPEND_DISK)
-#ifdef CONFIG_SOFTWARE_SUSPEND
- return 1;
-#else
- return 0;
-#endif
-
- /* all other states need lowlevel support and need to be
- * valid to the lowlevel implementation, no valid callback
+ /* All states need lowlevel support and need to be valid
+ * to the lowlevel implementation, no valid callback
* implies that none are valid. */
if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
return 0;
@@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_DISK) {
- error = pm_suspend_disk();
- goto Unlock;
- }
-
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
if ((error = suspend_prepare(state)))
goto Unlock;
@@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)
/**
* pm_suspend - Externally visible function for suspending system.
- * @state: Enumarted value of state to enter.
+ * @state: Enumerated value of state to enter.
*
* Determine whether or not value is within range, get state
* structure, and enter (above).
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
if (pm_states[i] && valid_state(i))
s += sprintf(s,"%s ", pm_states[i]);
}
- s += sprintf(s,"\n");
+#ifdef CONFIG_SOFTWARE_SUSPEND
+ s += sprintf(s, "%s\n", "disk");
+#else
+ if (s != buf)
+ /* convert the last space to a newline */
+ *(s-1) = '\n';
+#endif
return (s - buf);
}
@@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
+ /* First, check if we are requested to hibernate */
+ if (!strncmp(buf, "disk", len)) {
+ error = hibernate();
+ return error ? error : n;
+ }
+
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
if (*s && !strncmp(buf, *s, len))
break;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b4354..5138148 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@ struct swsusp_info {
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
-extern int pm_suspend_disk(void);
-#else
-static inline int pm_suspend_disk(void)
-{
- return -EPERM;
-}
+extern struct hibernation_ops *hibernation_ops;
#endif
extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0eb5c42..0884193 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -8,7 +8,6 @@
#undef DEBUG
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
@@ -25,10 +24,9 @@
static inline int freezeable(struct task_struct * p)
{
- if ((p == current) ||
+ if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
- (p->exit_state == EXIT_ZOMBIE) ||
- (p->exit_state == EXIT_DEAD))
+ (p->exit_state != 0))
return 0;
return 1;
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 128da11..a3b7854 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -14,7 +14,6 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/suspend.h>
-#include <linux/smp_lock.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
@@ -608,7 +607,8 @@ static LIST_HEAD(nosave_regions);
*/
void __init
-register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
+__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
+ int use_kmalloc)
{
struct nosave_region *region;
@@ -624,8 +624,13 @@ register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
goto Report;
}
}
- /* This allocation cannot fail */
- region = alloc_bootmem_low(sizeof(struct nosave_region));
+ if (use_kmalloc) {
+ /* during init, this shouldn't fail */
+ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
+ BUG_ON(!region);
+ } else
+ /* This allocation cannot fail */
+ region = alloc_bootmem_low(sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
@@ -1228,7 +1233,7 @@ asmlinkage int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+ printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);
return 0;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e83ed99..b8b235c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,7 +12,6 @@
*/
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/version.h>
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d..24d7d78 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@ static inline int platform_prepare(void)
{
int error = 0;
- if (pm_ops && pm_ops->prepare)
- error = pm_ops->prepare(PM_SUSPEND_DISK);
+ if (hibernation_ops)
+ error = hibernation_ops->prepare();
return error;
}
static inline void platform_finish(void)
{
- if (pm_ops && pm_ops->finish)
- pm_ops->finish(PM_SUSPEND_DISK);
+ if (hibernation_ops)
+ hibernation_ops->finish();
}
static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
switch (arg) {
case PMOPS_PREPARE:
- if (pm_ops && pm_ops->enter) {
+ if (hibernation_ops) {
data->platform_suspend = 1;
error = 0;
} else {
@@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
case PMOPS_ENTER:
if (data->platform_suspend) {
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- error = pm_ops->enter(PM_SUSPEND_DISK);
- error = 0;
+ error = hibernation_ops->enter();
}
break;
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b47e59..0bbdeac 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -20,7 +20,6 @@
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
-#include <linux/smp_lock.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -931,8 +930,16 @@ void register_console(struct console *console)
{
int i;
unsigned long flags;
+ struct console *bootconsole = NULL;
- if (preferred_console < 0)
+ if (console_drivers) {
+ if (console->flags & CON_BOOT)
+ return;
+ if (console_drivers->flags & CON_BOOT)
+ bootconsole = console_drivers;
+ }
+
+ if (preferred_console < 0 || bootconsole || !console_drivers)
preferred_console = selected_console;
/*
@@ -978,8 +985,11 @@ void register_console(struct console *console)
if (!(console->flags & CON_ENABLED))
return;
- if (console_drivers && (console_drivers->flags & CON_BOOT)) {
- unregister_console(console_drivers);
+ if (bootconsole) {
+ printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
+ bootconsole->name, bootconsole->index,
+ console->name, console->index);
+ unregister_console(bootconsole);
console->flags &= ~CON_PRINTBUFFER;
}
@@ -1030,16 +1040,11 @@ int unregister_console(struct console *console)
}
}
- /* If last console is removed, we re-enable picking the first
- * one that gets registered. Without that, pmac early boot console
- * would prevent fbcon from taking over.
- *
+ /*
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
*/
- if (console_drivers == NULL)
- preferred_console = selected_console;
- else if (console->flags & CON_CONSDEV)
+ if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
release_console_sem();
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb2..cc91b9b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
__free_page(page);
return NOTIFY_BAD;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cpu_set(cpu, prof_cpu_mask);
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
cpu_clear(cpu, prof_cpu_mask);
if (per_cpu(cpu_profile_hits, cpu)[0]) {
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76..2c2dd84 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index bcd14e8..55ba82a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-static struct rcu_torture_ops *torture_ops[] =
- { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
- &sched_ops, NULL };
-
/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
@@ -534,7 +530,7 @@ rcu_torture_writer(void *arg)
rp->rtort_mbtest = 1;
rcu_assign_pointer(rcu_torture_current, rp);
smp_wmb();
- if (old_rp != NULL) {
+ if (old_rp) {
i = old_rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
@@ -685,7 +681,7 @@ rcu_torture_printk(char *page)
atomic_read(&rcu_torture_wcount[i]));
}
cnt += sprintf(&page[cnt], "\n");
- if (cur_ops->stats != NULL)
+ if (cur_ops->stats)
cnt += cur_ops->stats(&page[cnt]);
return cnt;
}
@@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void)
set_cpus_allowed(current, tmp_mask);
- if (reader_tasks != NULL) {
+ if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
if (reader_tasks[i])
set_cpus_allowed(reader_tasks[i], tmp_mask);
}
- if (fakewriter_tasks != NULL) {
+ if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed(fakewriter_tasks[i], tmp_mask);
@@ -808,21 +804,21 @@ rcu_torture_cleanup(void)
int i;
fullstop = 1;
- if (shuffler_task != NULL) {
+ if (shuffler_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
kthread_stop(shuffler_task);
}
shuffler_task = NULL;
- if (writer_task != NULL) {
+ if (writer_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
kthread_stop(writer_task);
}
writer_task = NULL;
- if (reader_tasks != NULL) {
+ if (reader_tasks) {
for (i = 0; i < nrealreaders; i++) {
- if (reader_tasks[i] != NULL) {
+ if (reader_tasks[i]) {
VERBOSE_PRINTK_STRING(
"Stopping rcu_torture_reader task");
kthread_stop(reader_tasks[i]);
@@ -834,9 +830,9 @@ rcu_torture_cleanup(void)
}
rcu_torture_current = NULL;
- if (fakewriter_tasks != NULL) {
+ if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
- if (fakewriter_tasks[i] != NULL) {
+ if (fakewriter_tasks[i]) {
VERBOSE_PRINTK_STRING(
"Stopping rcu_torture_fakewriter task");
kthread_stop(fakewriter_tasks[i]);
@@ -847,7 +843,7 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
- if (stats_task != NULL) {
+ if (stats_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
kthread_stop(stats_task);
}
@@ -858,7 +854,7 @@ rcu_torture_cleanup(void)
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
- if (cur_ops->cleanup != NULL)
+ if (cur_ops->cleanup)
cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
rcu_torture_print_module_parms("End of test: FAILURE");
@@ -866,27 +862,28 @@ rcu_torture_cleanup(void)
rcu_torture_print_module_parms("End of test: SUCCESS");
}
-static int
+static int __init
rcu_torture_init(void)
{
int i;
int cpu;
int firsterr = 0;
+ static struct rcu_torture_ops *torture_ops[] =
+ { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+ &srcu_ops, &sched_ops, };
/* Process args and tell the world that the torturer is on the job. */
-
- for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
cur_ops = torture_ops[i];
- if (strcmp(torture_type, cur_ops->name) == 0) {
+ if (strcmp(torture_type, cur_ops->name) == 0)
break;
- }
}
- if (cur_ops == NULL) {
+ if (i == ARRAY_SIZE(torture_ops)) {
printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
torture_type);
return (-EINVAL);
}
- if (cur_ops->init != NULL)
+ if (cur_ops->init)
cur_ops->init(); /* no "goto unwind" prior to this point!!! */
if (nreaders >= 0)
@@ -899,7 +896,7 @@ rcu_torture_init(void)
/* Set up the freelist. */
INIT_LIST_HEAD(&rcu_torture_freelist);
- for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
rcu_tortures[i].rtort_mbtest = 0;
list_add_tail(&rcu_tortures[i].rtort_free,
&rcu_torture_freelist);
diff --git a/kernel/relay.c b/kernel/relay.c
index 577f251..4311101 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {
/**
* wakeup_readers - wake up readers waiting on a channel
- * @work: work struct that contains the the channel buffer
+ * @data: contains the channel buffer
*
- * This is the work function used to defer reader waking. The
- * reason waking is deferred is that calling directly from write
- * causes problems if you're writing from say the scheduler.
+ * This is the timer function used to defer reader waking.
*/
-static void wakeup_readers(struct work_struct *work)
+static void wakeup_readers(unsigned long data)
{
- struct rchan_buf *buf =
- container_of(work, struct rchan_buf, wake_readers.work);
+ struct rchan_buf *buf = (struct rchan_buf *)data;
wake_up_interruptible(&buf->read_wait);
}
@@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
- INIT_DELAYED_WORK(&buf->wake_readers, NULL);
- } else {
- cancel_delayed_work(&buf->wake_readers);
- flush_scheduled_work();
- }
+ setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ } else
+ del_timer_sync(&buf->timer);
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@ end:
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
- cancel_delayed_work(&buf->wake_readers);
- flush_scheduled_work();
+ del_timer_sync(&buf->timer);
kref_put(&buf->kref, relay_remove_buf);
}
@@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
switch(action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
if (chan->buf[hotcpu])
@@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
mutex_unlock(&relay_channels_mutex);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/* No need to flush the cpu : will be flushed upon
* final relay_flush() call. */
break;
@@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
- if (waitqueue_active(&buf->read_wait)) {
- PREPARE_DELAYED_WORK(&buf->wake_readers,
- wakeup_readers);
- schedule_delayed_work(&buf->wake_readers, 1);
- }
+ if (waitqueue_active(&buf->read_wait))
+ /*
+ * Calling wake_up_interruptible() from here
+ * will deadlock if we happen to be logging
+ * from the scheduler (trying to re-grab
+ * rq->lock), so defer it.
+ */
+ __mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978c..12879f6 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
* state.
*/
-static void
+void
rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
unsigned long mask)
{
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
- */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
-
- do {
- owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
-}
-#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- lock->owner = (struct task_struct *)
- ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
-}
-#endif
-
-/*
* Calculate task priority from the waiter list priority
*
* Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+void __rt_mutex_adjust_prio(struct task_struct *task)
{
int prio = rt_mutex_getprio(task);
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
* Decreases task's usage by one - may thus free the task.
* Returns 0 or -EDEADLK.
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
- struct rt_mutex *orig_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
*
* Must be called with lock->wait_lock held
*/
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856..242ec7e 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
}
/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+
+extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask);
+extern void __rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task);
+extern void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
#endif
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 291ded5..9a87886 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem)
int ret = __down_write_trylock(sem);
if (ret == 1)
- rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
return ret;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 0227f16..799d23b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
+#include <asm/tlb.h>
#include <asm/unistd.h>
/*
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
@@ -223,6 +245,10 @@ struct rq {
unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
unsigned long cpu_load[3];
+ unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+ unsigned char in_nohz_recently;
+#endif
#endif
unsigned long long nr_switches;
@@ -278,7 +304,8 @@ struct rq {
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline int cpu_of(struct rq *rq)
{
@@ -1049,6 +1076,17 @@ static void resched_task(struct task_struct *p)
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
+
+static void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
#else
static inline void resched_task(struct task_struct *p)
{
@@ -1241,7 +1279,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
@@ -1368,7 +1407,16 @@ static int wake_idle(int cpu, struct task_struct *p)
struct sched_domain *sd;
int i;
- if (idle_cpu(cpu))
+ /*
+ * If it is idle, then it is the best cpu to run this task.
+ *
+ * This cpu is also the best, if it has more than one task already.
+ * Siblings must be also busy(in most cases) as they didn't already
+ * pickup the extra load from this cpu and hence we need not check
+ * sibling runqueue info. This will avoid the checks and cache miss
+ * penalities associated with that.
+ */
+ if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
@@ -2352,12 +2400,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
}
total_load += avg_load;
- total_pwr += group->cpu_power;
+ total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
- group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
@@ -2468,8 +2517,8 @@ group_next:
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->cpu_power,
- (avg_load - this_load) * this->cpu_power)
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -2503,28 +2552,29 @@ small_imbalance:
* moving them.
*/
- pwr_now += busiest->cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->__cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- busiest->cpu_power;
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
- pwr_move += busiest->cpu_power *
+ pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->cpu_power <
+ if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = max_load * busiest->cpu_power / this->cpu_power;
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
else
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- this->cpu_power;
- pwr_move += this->cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
@@ -2657,6 +2707,12 @@ redo:
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
+ /*
+ * some other cpu did the load balance for us.
+ */
+ if (nr_moved && this_cpu != smp_processor_id())
+ resched_cpu(this_cpu);
+
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
@@ -2927,32 +2983,98 @@ static void update_load(struct rq *this_rq)
}
}
+#ifdef CONFIG_NO_HZ
+static struct {
+ atomic_t load_balancer;
+ cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+ .load_balancer = ATOMIC_INIT(-1),
+ .cpu_mask = CPU_MASK_NONE,
+};
+
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
*
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask..
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick()
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
+
+ if (stop_tick) {
+ cpu_set(cpu, nohz.cpu_mask);
+ cpu_rq(cpu)->in_nohz_recently = 1;
+
+ /*
+ * If we are going offline and still the leader, give up!
+ */
+ if (cpu_is_offline(cpu) &&
+ atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ return 0;
+ }
+
+ /* time for ilb owner also to sleep */
+ if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ atomic_set(&nohz.load_balancer, -1);
+ return 0;
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+ return 1;
+ } else if (atomic_read(&nohz.load_balancer) == cpu)
+ return 1;
+ } else {
+ if (!cpu_isset(cpu, nohz.cpu_mask))
+ return 0;
+
+ cpu_clear(cpu, nohz.cpu_mask);
+
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ }
+ return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
{
- int this_cpu = smp_processor_id(), balance = 1;
- struct rq *this_rq = cpu_rq(this_cpu);
+ int balance = 1;
+ struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
- /*
- * We are idle if there are no processes running. This
- * is valid even if we are the idle process (SMT).
- */
- enum idle_type idle = !this_rq->nr_running ?
- SCHED_IDLE : NOT_IDLE;
- /* Earliest time when we have to call run_rebalance_domains again */
+ /* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
- for_each_domain(this_cpu, sd) {
+ for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -2971,7 +3093,7 @@ static void run_rebalance_domains(struct softirq_action *h)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -2995,7 +3117,114 @@ out:
if (!balance)
break;
}
- this_rq->next_balance = next_balance;
+ rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int local_cpu = smp_processor_id();
+ struct rq *local_rq = cpu_rq(local_cpu);
+ enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+ rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * If this cpu is the owner for idle load balancing, then do the
+ * balancing on behalf of the other idle cpus whose ticks are
+ * stopped.
+ */
+ if (local_rq->idle_at_tick &&
+ atomic_read(&nohz.load_balancer) == local_cpu) {
+ cpumask_t cpus = nohz.cpu_mask;
+ struct rq *rq;
+ int balance_cpu;
+
+ cpu_clear(local_cpu, cpus);
+ for_each_cpu_mask(balance_cpu, cpus) {
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched())
+ break;
+
+ rebalance_domains(balance_cpu, SCHED_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(local_rq->next_balance, rq->next_balance))
+ local_rq->next_balance = rq->next_balance;
+ }
+ }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+ /*
+ * If we were in the nohz mode recently and busy at the current
+ * scheduler tick, then check if we need to nominate new idle
+ * load balancer.
+ */
+ if (rq->in_nohz_recently && !rq->idle_at_tick) {
+ rq->in_nohz_recently = 0;
+
+ if (atomic_read(&nohz.load_balancer) == cpu) {
+ cpu_clear(cpu, nohz.cpu_mask);
+ atomic_set(&nohz.load_balancer, -1);
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /*
+ * simple selection for now: Nominate the
+ * first cpu in the nohz list to be the next
+ * ilb owner.
+ *
+ * TBD: Traverse the sched domains and nominate
+ * the nearest cpu in the nohz.cpu_mask.
+ */
+ int ilb = first_cpu(nohz.cpu_mask);
+
+ if (ilb != NR_CPUS)
+ resched_cpu(ilb);
+ }
+ }
+
+ /*
+ * If this cpu is idle and doing idle load balancing for all the
+ * cpus with ticks stopped, is it time for that to stop?
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+ cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ resched_cpu(cpu);
+ return;
+ }
+
+ /*
+ * If this cpu is idle and the idle load balancing is done by
+ * someone else, then no need raise the SCHED_SOFTIRQ
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+ cpu_isset(cpu, nohz.cpu_mask))
+ return;
+#endif
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
}
#else
/*
@@ -3218,16 +3447,17 @@ void scheduler_tick(void)
unsigned long long now = sched_clock();
struct task_struct *p = current;
int cpu = smp_processor_id();
+ int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
update_cpu_clock(p, rq, now);
- if (p != rq->idle)
+ if (!idle_at_tick)
task_running_tick(rq, p);
#ifdef CONFIG_SMP
update_load(rq);
- if (time_after_eq(jiffies, rq->next_balance))
- raise_softirq(SCHED_SOFTIRQ);
+ rq->idle_at_tick = idle_at_tick;
+ trigger_load_balance(cpu);
#endif
}
@@ -4291,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (!p) {
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return -ESRCH;
}
@@ -4324,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
out_unlock:
put_task_struct(p);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return retval;
}
@@ -4381,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
retval = -ESRCH;
@@ -4397,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
out_unlock:
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
if (retval)
return retval;
@@ -4750,6 +4980,8 @@ void show_state_filter(unsigned long state_filter)
show_task(p);
} while_each_thread(g, p);
+ touch_all_softlockup_watchdogs();
+
read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
@@ -5157,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
struct rq *rq;
switch (action) {
+ case CPU_LOCK_ACQUIRE:
+ mutex_lock(&sched_hotcpu_mutex);
+ break;
+
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
@@ -5171,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
/* Strictly unneccessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!cpu_rq(cpu)->migration_thread)
break;
/* Unbind it from offline cpu so it can run. Fall thru. */
@@ -5187,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
@@ -5202,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
BUG_ON(rq->nr_running != 0);
/* No need to migrate the tasks: it was best-effort if
- * they didn't do lock_cpu_hotplug(). Just wake up
+ * they didn't take sched_hotcpu_mutex. Just wake up
* the requestors. */
spin_lock_irq(&rq->lock);
while (!list_empty(&rq->migration_queue)) {
@@ -5216,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_unlock_irq(&rq->lock);
break;
#endif
+ case CPU_LOCK_RELEASE:
+ mutex_unlock(&sched_hotcpu_mutex);
+ break;
}
return NOTIFY_OK;
}
@@ -5304,7 +5547,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
- if (!group->cpu_power) {
+ if (!group->__cpu_power) {
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5481,7 +5724,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
continue;
sg->cpumask = CPU_MASK_NONE;
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
for_each_cpu_mask(j, span) {
if (group_fn(j, cpu_map, NULL) != group)
@@ -6170,7 +6413,7 @@ next_sg:
continue;
}
- sg->cpu_power += sd->groups->cpu_power;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
@@ -6245,6 +6488,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
child = sd->child;
+ sd->groups->__cpu_power = 0;
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
@@ -6255,18 +6500,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
return;
}
- sd->groups->cpu_power = 0;
-
/*
* add cpu_power of each child group to this groups cpu_power
*/
group = child->groups;
do {
- sd->groups->cpu_power += group->cpu_power;
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
group = group->next;
} while (group != child->groups);
}
@@ -6426,7 +6669,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = nodemask;
sg->next = sg;
cpus_or(covered, covered, nodemask);
@@ -6454,7 +6697,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
"Can not alloc domain group for node %d\n", j);
goto error;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = tmp;
sg->next = prev->next;
cpus_or(covered, covered, tmp);
@@ -6591,10 +6834,10 @@ int arch_reinit_sched_domains(void)
{
int err;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
detach_destroy_domains(&cpu_online_map);
err = arch_init_sched_domains(&cpu_online_map);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return err;
}
@@ -6673,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
{
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/*
* Fall through and re-initialise the domains.
*/
@@ -6699,12 +6948,12 @@ void __init sched_init_smp(void)
{
cpumask_t non_isolated_cpus;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
arch_init_sched_domains(&cpu_online_map);
cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
if (cpus_empty(non_isolated_cpus))
cpu_set(smp_processor_id(), non_isolated_cpus);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
diff --git a/kernel/signal.c b/kernel/signal.c
index 2b4087d..2ac3a66 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -12,7 +12,6 @@
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
@@ -39,125 +38,6 @@
static struct kmem_cache *sigqueue_cachep;
-/*
- * In POSIX a signal is sent either to a specific thread (Linux task)
- * or to the process as a whole (Linux thread group). How the signal
- * is sent determines whether it's to one thread or the whole group,
- * which determines which signal mask(s) are involved in blocking it
- * from being delivered until later. When the signal is delivered,
- * either it's caught or ignored by a user handler or it has a default
- * effect that applies to the whole thread group (POSIX process).
- *
- * The possible effects an unblocked signal set to SIG_DFL can have are:
- * ignore - Nothing Happens
- * terminate - kill the process, i.e. all threads in the group,
- * similar to exit_group. The group leader (only) reports
- * WIFSIGNALED status to its parent.
- * coredump - write a core dump file describing all threads using
- * the same mm and then kill all those threads
- * stop - stop all the threads in the group, i.e. TASK_STOPPED state
- *
- * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
- * Other signals when not blocked and set to SIG_DFL behaves as follows.
- * The job control signals also have other special effects.
- *
- * +--------------------+------------------+
- * | POSIX signal | default action |
- * +--------------------+------------------+
- * | SIGHUP | terminate |
- * | SIGINT | terminate |
- * | SIGQUIT | coredump |
- * | SIGILL | coredump |
- * | SIGTRAP | coredump |
- * | SIGABRT/SIGIOT | coredump |
- * | SIGBUS | coredump |
- * | SIGFPE | coredump |
- * | SIGKILL | terminate(+) |
- * | SIGUSR1 | terminate |
- * | SIGSEGV | coredump |
- * | SIGUSR2 | terminate |
- * | SIGPIPE | terminate |
- * | SIGALRM | terminate |
- * | SIGTERM | terminate |
- * | SIGCHLD | ignore |
- * | SIGCONT | ignore(*) |
- * | SIGSTOP | stop(*)(+) |
- * | SIGTSTP | stop(*) |
- * | SIGTTIN | stop(*) |
- * | SIGTTOU | stop(*) |
- * | SIGURG | ignore |
- * | SIGXCPU | coredump |
- * | SIGXFSZ | coredump |
- * | SIGVTALRM | terminate |
- * | SIGPROF | terminate |
- * | SIGPOLL/SIGIO | terminate |
- * | SIGSYS/SIGUNUSED | coredump |
- * | SIGSTKFLT | terminate |
- * | SIGWINCH | ignore |
- * | SIGPWR | terminate |
- * | SIGRTMIN-SIGRTMAX | terminate |
- * +--------------------+------------------+
- * | non-POSIX signal | default action |
- * +--------------------+------------------+
- * | SIGEMT | coredump |
- * +--------------------+------------------+
- *
- * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
- * (*) Special job control effects:
- * When SIGCONT is sent, it resumes the process (all threads in the group)
- * from TASK_STOPPED state and also clears any pending/queued stop signals
- * (any of those marked with "stop(*)"). This happens regardless of blocking,
- * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
- * any pending/queued SIGCONT signals; this happens regardless of blocking,
- * catching, or ignored the stop signal, though (except for SIGSTOP) the
- * default action of stopping the process may happen later or never.
- */
-
-#ifdef SIGEMT
-#define M_SIGEMT M(SIGEMT)
-#else
-#define M_SIGEMT 0
-#endif
-
-#if SIGRTMIN > BITS_PER_LONG
-#define M(sig) (1ULL << ((sig)-1))
-#else
-#define M(sig) (1UL << ((sig)-1))
-#endif
-#define T(sig, mask) (M(sig) & (mask))
-
-#define SIG_KERNEL_ONLY_MASK (\
- M(SIGKILL) | M(SIGSTOP) )
-
-#define SIG_KERNEL_STOP_MASK (\
- M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
-
-#define SIG_KERNEL_COREDUMP_MASK (\
- M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
- M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
- M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
-
-#define SIG_KERNEL_IGNORE_MASK (\
- M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
-
-#define sig_kernel_only(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
-#define sig_kernel_coredump(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
-#define sig_kernel_ignore(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
-#define sig_kernel_stop(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
-
-#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
-
-#define sig_user_defined(t, signr) \
- (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
- ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
-
-#define sig_fatal(t, signr) \
- (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
- (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
static int sig_ignored(struct task_struct *t, int sig)
{
@@ -329,6 +209,16 @@ void flush_signals(struct task_struct *t)
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
+void ignore_signals(struct task_struct *t)
+{
+ int i;
+
+ for (i = 0; i < _NSIG; ++i)
+ t->sighand->action[i].sa.sa_handler = SIG_IGN;
+
+ flush_signals(t);
+}
+
/*
* Flush all handlers for a task.
*/
@@ -1033,17 +923,6 @@ void zap_other_threads(struct task_struct *p)
if (t->exit_state)
continue;
- /*
- * We don't want to notify the parent, since we are
- * killed as part of a thread group due to another
- * thread doing an execve() or similar. So set the
- * exit signal to -1 to allow immediate reaping of
- * the process. But don't detach the thread group
- * leader.
- */
- if (t != p->group_leader)
- t->exit_signal = -1;
-
/* SIGKILL will be handled before any pending SIGSTOP */
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008..0b9886a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
per_cpu(ksoftirqd, hotcpu) = p;
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
wake_up_process(per_cpu(ksoftirqd, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(ksoftirqd, hotcpu))
break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(ksoftirqd, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
p = per_cpu(ksoftirqd, hotcpu);
per_cpu(ksoftirqd, hotcpu) = NULL;
kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 50afeb8..0131e29 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -34,12 +34,32 @@ static struct notifier_block panic_block = {
.notifier_call = softlock_panic,
};
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+ return sched_clock() >> 30; /* 2^30 ~= 10^9 */
+}
+
void touch_softlockup_watchdog(void)
{
- __raw_get_cpu_var(touch_timestamp) = jiffies;
+ __raw_get_cpu_var(touch_timestamp) = get_timestamp();
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
+void touch_all_softlockup_watchdogs(void)
+{
+ int cpu;
+
+ /* Cause each CPU to re-update its timestamp rather than complain */
+ for_each_online_cpu(cpu)
+ per_cpu(touch_timestamp, cpu) = 0;
+}
+EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+
/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
@@ -48,9 +68,18 @@ void softlockup_tick(void)
{
int this_cpu = smp_processor_id();
unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
+ unsigned long print_timestamp;
+ unsigned long now;
+
+ if (touch_timestamp == 0) {
+ touch_softlockup_watchdog();
+ return;
+ }
+
+ print_timestamp = per_cpu(print_timestamp, this_cpu);
- /* prevent double reports: */
- if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
+ /* report at most once a second */
+ if (print_timestamp < (touch_timestamp + 1) ||
did_panic ||
!per_cpu(watchdog_task, this_cpu))
return;
@@ -61,12 +90,14 @@ void softlockup_tick(void)
return;
}
+ now = get_timestamp();
+
/* Wake up the high-prio watchdog task every second: */
- if (time_after(jiffies, touch_timestamp + HZ))
+ if (now > (touch_timestamp + 1))
wake_up_process(per_cpu(watchdog_task, this_cpu));
/* Warn about unreasonable 10+ seconds delays: */
- if (time_after(jiffies, touch_timestamp + 10*HZ)) {
+ if (now > (touch_timestamp + 10)) {
per_cpu(print_timestamp, this_cpu) = touch_timestamp;
spin_lock(&print_lock);
@@ -82,11 +113,14 @@ void softlockup_tick(void)
*/
static int watchdog(void * __bind_cpu)
{
- struct sched_param param = { .sched_priority = 99 };
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
sched_setscheduler(current, SCHED_FIFO, &param);
current->flags |= PF_NOFREEZE;
+ /* initialize timestamp */
+ touch_softlockup_watchdog();
+
/*
* Run briefly once per second to reset the softlockup timestamp.
* If this gets delayed for more than 10 seconds then the
@@ -112,27 +146,31 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
BUG_ON(per_cpu(watchdog_task, hotcpu));
p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
if (IS_ERR(p)) {
printk("watchdog for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
- per_cpu(touch_timestamp, hotcpu) = jiffies;
+ per_cpu(touch_timestamp, hotcpu) = 0;
per_cpu(watchdog_task, hotcpu) = p;
kthread_bind(p, hotcpu);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
p = per_cpu(watchdog_task, hotcpu);
per_cpu(watchdog_task, hotcpu) = NULL;
kthread_stop(p);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1245804..daabb74 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,11 +1,12 @@
/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
* GPL v2 and any later version.
*/
-#include <linux/stop_machine.h>
-#include <linux/kthread.h>
-#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/stop_machine.h>
#include <linux/syscalls.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
@@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
return ret;
}
+EXPORT_SYMBOL_GPL(stop_machine_run);
diff --git a/kernel/sys.c b/kernel/sys.c
index fe1f3ab..cdb7e94 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,
return -ENOENT;
}
+/**
+ * notifier_call_chain - Informs the registered notifiers about an event.
+ * @nl: Pointer to head of the blocking notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: Number of notifier functions to be called. Don't care
+ * value of this parameter is -1.
+ * @nr_calls: Records the number of notifications sent. Don't care
+ * value of this field is NULL.
+ * @returns: notifier_call_chain returns the value returned by the
+ * last notifier function called.
+ */
+
static int __kprobes notifier_call_chain(struct notifier_block **nl,
- unsigned long val, void *v)
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
nb = rcu_dereference(*nl);
- while (nb) {
+
+ while (nb && nr_to_call) {
next_nb = rcu_dereference(nb->next);
ret = nb->notifier_call(nb, val, v);
+
+ if (nr_calls)
+ (*nr_calls)++;
+
if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
break;
nb = next_nb;
+ nr_to_call--;
}
return ret;
}
@@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
- * atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See the comment for notifier_call_chain.
+ * @nr_calls: See the comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* of the last notifier function called.
*/
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
rcu_read_lock();
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
@@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
/**
- * blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
* of the last notifier function called.
*/
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v)
+int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
@@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
*/
if (rcu_dereference(nh->head)) {
down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
+ nr_calls);
up_read(&nh->rwsem);
}
return ret;
}
+EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
+}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
@@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
/**
- * raw_notifier_call_chain - Call functions in a raw notifier chain
+ * __raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
@@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
* of the last notifier function called.
*/
+int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+}
+
+EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
+
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
- return notifier_call_chain(&nh->head, val, v);
+ return __raw_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
- * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
* of the last notifier function called.
*/
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v)
+int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
+EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
+}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
@@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
#ifdef CONFIG_SOFTWARE_SUSPEND
case LINUX_REBOOT_CMD_SW_SUSPEND:
{
- int ret = pm_suspend(PM_SUSPEND_DISK);
+ int ret = hibernate();
unlock_kernel();
return ret;
}
@@ -1292,7 +1352,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
}
/*
- * Samma på svenska..
+ * Samma på svenska..
*/
asmlinkage long sys_setfsgid(gid_t gid)
{
@@ -1923,6 +1983,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
if (retval)
return retval;
+ if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ new_rlim.rlim_cur = 1;
+ }
+
task_lock(current->group_leader);
*old_rlim = new_rlim;
task_unlock(current->group_leader);
@@ -1944,15 +2014,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
unsigned long rlim_cur = new_rlim.rlim_cur;
cputime_t cputime;
- if (rlim_cur == 0) {
- /*
- * The caller is asking for an immediate RLIMIT_CPU
- * expiry. But we use the zero value to mean "it was
- * never set". So let's cheat and make it one second
- * instead
- */
- rlim_cur = 1;
- }
cputime = secs_to_cputime(rlim_cur);
read_lock(&tasklist_lock);
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c904748..4073353 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,8 @@ extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
+extern int maps_protect;
+extern int sysctl_stat_interval;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -603,6 +605,16 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#ifdef CONFIG_PROC_FS
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "maps_protect",
+ .data = &maps_protect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{ .ctl_name = 0 }
};
@@ -846,6 +858,17 @@ static ctl_table vm_table[] = {
.extra2 = &one_hundred,
},
#endif
+#ifdef CONFIG_SMP
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+#endif
#if defined(CONFIG_X86_32) || \
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
diff --git a/kernel/time.c b/kernel/time.c
index ba18ec4..f04791f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -31,7 +31,6 @@
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/errno.h>
-#include <linux/smp_lock.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
@@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb)
}
EXPORT_SYMBOL(current_fs_time);
+/*
+ * Convert jiffies to milliseconds and back.
+ *
+ * Avoid unnecessary multiplications/divisions in the
+ * two most common HZ cases:
+ */
+unsigned int inline jiffies_to_msecs(const unsigned long j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
+#else
+ return (j * MSEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_msecs);
+
+unsigned int inline jiffies_to_usecs(const unsigned long j)
+{
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (USEC_PER_SEC / HZ) * j;
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
+#else
+ return (j * USEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_usecs);
+
/**
* timespec_trunc - Truncate timespec to a granularity
* @t: Timespec
@@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec)
EXPORT_SYMBOL(ns_to_timeval);
/*
- * Convert jiffies to milliseconds and back.
- *
- * Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases:
- */
-unsigned int jiffies_to_msecs(const unsigned long j)
-{
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (MSEC_PER_SEC / HZ) * j;
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
- return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
-#else
- return (j * MSEC_PER_SEC) / HZ;
-#endif
-}
-EXPORT_SYMBOL(jiffies_to_msecs);
-
-unsigned int jiffies_to_usecs(const unsigned long j)
-{
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
- return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
-#else
- return (j * USEC_PER_SEC) / HZ;
-#endif
-}
-EXPORT_SYMBOL(jiffies_to_usecs);
-
-/*
* When we convert to jiffies then we interpret incoming values
* the following way:
*
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 93bccba..99b6034 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
-obj-y += ntp.o clocksource.o jiffies.o timer_list.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fe5c7db..3db5c3c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,15 +74,17 @@ static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
static DEFINE_SPINLOCK(watchdog_lock);
static cycle_t watchdog_last;
+static int watchdog_resumed;
+
/*
- * Interval: 0.5sec Treshold: 0.0625s
+ * Interval: 0.5sec Threshold: 0.0625s
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
{
- if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
+ if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
return;
printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
@@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)
struct clocksource *cs, *tmp;
cycle_t csnow, wdnow;
int64_t wd_nsec, cs_nsec;
+ int resumed;
spin_lock(&watchdog_lock);
+ resumed = watchdog_resumed;
+ if (unlikely(resumed))
+ watchdog_resumed = 0;
+
wdnow = watchdog->read();
wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
watchdog_last = wdnow;
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
csnow = cs->read();
+
+ if (unlikely(resumed)) {
+ cs->wd_last = csnow;
+ continue;
+ }
+
/* Initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)
}
spin_unlock(&watchdog_lock);
}
+static void clocksource_resume_watchdog(void)
+{
+ spin_lock(&watchdog_lock);
+ watchdog_resumed = 1;
+ spin_unlock(&watchdog_lock);
+}
+
static void clocksource_check_watchdog(struct clocksource *cs)
{
struct clocksource *cse;
@@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+
+static inline void clocksource_resume_watchdog(void) { }
#endif
/**
+ * clocksource_resume - resume the clocksource(s)
+ */
+void clocksource_resume(void)
+{
+ struct list_head *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *cs;
+
+ cs = list_entry(tmp, struct clocksource, list);
+ if (cs->resume)
+ cs->resume();
+ }
+
+ clocksource_resume_watchdog();
+
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
+/**
* clocksource_get_next - Returns the selected clocksource
*
*/
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bfda3f7..a96ec9a 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
-static int tick_do_timer_cpu = -1;
+int tick_do_timer_cpu __read_mostly = -1;
DEFINE_SPINLOCK(tick_device_lock);
/*
@@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup)
clockevents_exchange_device(dev, NULL);
td->evtdev = NULL;
}
+ /* Transfer the do_timer job away from this cpu */
+ if (*cpup == tick_do_timer_cpu) {
+ int cpu = first_cpu(cpu_online_map);
+
+ tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
+ }
spin_unlock_irqrestore(&tick_device_lock, flags);
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c9d203b..bb13f27 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern spinlock_t tick_device_lock;
extern ktime_t tick_next_period;
extern ktime_t tick_period;
+extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 51556b9..3483e6c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
+ if (select_nohz_load_balancer(1)) {
+ /*
+ * sched tick not stopped!
+ */
+ cpu_clear(cpu, nohz_cpu_mask);
+ goto out;
+ }
+
ts->idle_tick = ts->sched_timer.expires;
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
}
+
+ /*
+ * If this cpu is the one which updates jiffies, then
+ * give up the assignment and let it be taken by the
+ * cpu which runs the tick timer next, which might be
+ * this cpu as well. If we don't drop this here the
+ * jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = -1;
+
/*
* calculate the expiry time for the next timer wheel
* timer
@@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
now = ktime_get();
local_irq_disable();
+ select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
cpu_clear(cpu, nohz_cpu_mask);
@@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
+ int cpu = smp_processor_id();
ktime_t now = ktime_get();
dev->next_event.tv64 = KTIME_MAX;
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themself to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == -1))
+ tick_do_timer_cpu = cpu;
+
/* Check, if the jiffies need an update */
- tick_do_update_jiffies64(now);
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
/*
* When we are idle and the tick is stopped, we have to touch
@@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
struct hrtimer_cpu_base *base = timer->base->cpu_base;
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
+ int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themself to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == -1))
+ tick_do_timer_cpu = cpu;
+#endif
/* Check, if the jiffies need an update */
- tick_do_update_jiffies64(now);
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
/*
* Do not call, when we are not in irq context and have
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 0000000..f9217bf
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,476 @@
+/*
+ * linux/kernel/time/timekeeping.c
+ *
+ * Kernel timekeeping code and accessor functions
+ *
+ * This code was moved from linux/kernel/timer.c.
+ * Please see that file for copyright and history logs.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sysdev.h>
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/tick.h>
+
+
+/*
+ * This read-write spinlock protects us from races in SMP while
+ * playing with xtime and avenrun.
+ */
+__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+
+EXPORT_SYMBOL(xtime_lock);
+
+
+/*
+ * The current time
+ * wall_to_monotonic is what we need to add to xtime (or xtime corrected
+ * for sub jiffie times) to get to monotonic time. Monotonic is pegged
+ * at zero at system boot time, so wall_to_monotonic will be negative,
+ * however, we will ALWAYS keep the tv_nsec part positive so we can use
+ * the usual normalization.
+ */
+struct timespec xtime __attribute__ ((aligned (16)));
+struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+
+EXPORT_SYMBOL(xtime);
+
+
+static struct clocksource *clock; /* pointer to current clocksource */
+
+
+#ifdef CONFIG_GENERIC_TIME
+/**
+ * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ *
+ * private function, must hold xtime_lock lock when being
+ * called. Returns the number of nanoseconds since the
+ * last call to update_wall_time() (adjusted by NTP scaling)
+ */
+static inline s64 __get_nsec_offset(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ s64 ns_offset;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyc2ns(clock, cycle_delta);
+
+ return ns_offset;
+}
+
+/**
+ * __get_realtime_clock_ts - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec. Used by
+ * do_gettimeofday() and get_realtime_clock_ts().
+ */
+static inline void __get_realtime_clock_ts(struct timespec *ts)
+{
+ unsigned long seq;
+ s64 nsecs;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ *ts = xtime;
+ nsecs = __get_nsec_offset();
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts, nsecs);
+}
+
+/**
+ * getnstimeofday - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ __get_realtime_clock_ts(ts);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv: pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using get_realtime_clock_ts()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+ struct timespec now;
+
+ __get_realtime_clock_ts(&now);
+ tv->tv_sec = now.tv_sec;
+ tv->tv_usec = now.tv_nsec/1000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv: pointer to the timespec variable containing the new time
+ *
+ * Sets the time of day to the new time and update NTP and notify hrtimers
+ */
+int do_settimeofday(struct timespec *tv)
+{
+ unsigned long flags;
+ time_t wtm_sec, sec = tv->tv_sec;
+ long wtm_nsec, nsec = tv->tv_nsec;
+
+ if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ nsec -= __get_nsec_offset();
+
+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+ set_normalized_timespec(&xtime, sec, nsec);
+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+ clock->error = 0;
+ ntp_clear();
+
+ update_vsyscall(&xtime, clock);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+static void change_clocksource(void)
+{
+ struct clocksource *new;
+ cycle_t now;
+ u64 nsec;
+
+ new = clocksource_get_next();
+
+ if (clock == new)
+ return;
+
+ now = clocksource_read(new);
+ nsec = __get_nsec_offset();
+ timespec_add_ns(&xtime, nsec);
+
+ clock = new;
+ clock->cycle_last = now;
+
+ clock->error = 0;
+ clock->xtime_nsec = 0;
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+
+ tick_clock_notify();
+
+ printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ clock->name);
+}
+#else
+static inline void change_clocksource(void) { }
+#endif
+
+/**
+ * timekeeping_is_continuous - check to see if timekeeping is free running
+ */
+int timekeeping_is_continuous(void)
+{
+ unsigned long seq;
+ int ret;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return ret;
+}
+
+/**
+ * read_persistent_clock - Return time in seconds from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Returns seconds from epoch using the battery backed persistent clock.
+ * Returns zero if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+unsigned long __attribute__((weak)) read_persistent_clock(void)
+{
+ return 0;
+}
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ */
+void __init timekeeping_init(void)
+{
+ unsigned long flags;
+ unsigned long sec = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ ntp_clear();
+
+ clock = clocksource_get_next();
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+ clock->cycle_last = clocksource_read(clock);
+
+ xtime.tv_sec = sec;
+ xtime.tv_nsec = 0;
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+/* flag for if timekeeping is suspended */
+static int timekeeping_suspended;
+/* time in seconds when suspend began */
+static unsigned long timekeeping_suspend_time;
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ * @dev: unused
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static int timekeeping_resume(struct sys_device *dev)
+{
+ unsigned long flags;
+ unsigned long now = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ if (now && (now > timekeeping_suspend_time)) {
+ unsigned long sleep_length = now - timekeeping_suspend_time;
+
+ xtime.tv_sec += sleep_length;
+ wall_to_monotonic.tv_sec -= sleep_length;
+ }
+ /* re-base the last cycle value */
+ clock->cycle_last = clocksource_read(clock);
+ clock->error = 0;
+ timekeeping_suspended = 0;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ touch_softlockup_watchdog();
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+
+ /* Resume hrtimers */
+ hres_timers_resume();
+
+ return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ timekeeping_suspended = 1;
+ timekeeping_suspend_time = read_persistent_clock();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+
+ return 0;
+}
+
+/* sysfs resume/suspend bits for timekeeping */
+static struct sysdev_class timekeeping_sysclass = {
+ .resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
+ set_kset_name("timekeeping"),
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timekeeping_sysclass,
+};
+
+static int __init timekeeping_init_device(void)
+{
+ int error = sysdev_class_register(&timekeeping_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(timekeeping_init_device);
+
+/*
+ * If the error is already larger, we look ahead even further
+ * to compensate for late or lost adjustments.
+ */
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
+{
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
+
+ /*
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+ */
+ error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
+
+ /*
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
+ */
+ tick_error = current_tick_length() >>
+ (TICK_LENGTH_SHIFT - clock->shift + 1);
+ tick_error -= clock->xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) {
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
+ }
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
+
+ *interval <<= adj;
+ *offset <<= adj;
+ return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ */
+static void clocksource_adjust(struct clocksource *clock, s64 offset)
+{
+ s64 error, interval = clock->cycle_interval;
+ int adj;
+
+ error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
+ if (error > interval) {
+ error >>= 2;
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else if (error < -interval) {
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else
+ return;
+
+ clock->mult += adj;
+ clock->xtime_interval += interval;
+ clock->xtime_nsec -= offset;
+ clock->error -= (interval - offset) <<
+ (TICK_LENGTH_SHIFT - clock->shift);
+}
+
+/**
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ * Called from the timer interrupt, must hold a write on xtime_lock.
+ */
+void update_wall_time(void)
+{
+ cycle_t offset;
+
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ return;
+
+#ifdef CONFIG_GENERIC_TIME
+ offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+#else
+ offset = clock->cycle_interval;
+#endif
+ clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+
+ /* normally this loop will run just once, however in the
+ * case of lost or late ticks, it will accumulate correctly.
+ */
+ while (offset >= clock->cycle_interval) {
+ /* accumulate one interval */
+ clock->xtime_nsec += clock->xtime_interval;
+ clock->cycle_last += clock->cycle_interval;
+ offset -= clock->cycle_interval;
+
+ if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
+ clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ /* interpolator bits */
+ time_interpolator_update(clock->xtime_interval
+ >> clock->shift);
+
+ /* accumulate error between NTP and clock interval */
+ clock->error += current_tick_length();
+ clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
+ }
+
+ /* correct the clock when NTP error is too big */
+ clocksource_adjust(clock, offset);
+
+ /* store full nanoseconds into xtime */
+ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+ clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+
+ /* check to see if there is a new clocksource to use */
+ change_clocksource();
+ update_vsyscall(&xtime, clock);
+}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 59df5e8..8bbcfb7 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
static void print_name_offset(struct seq_file *m, void *sym)
{
- unsigned long addr = (unsigned long)sym;
- char namebuf[KSYM_NAME_LEN+1];
- unsigned long size, offset;
- const char *sym_name;
- char *modname;
-
- sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
- if (sym_name)
- SEQ_printf(m, "%s", sym_name);
- else
+ char symname[KSYM_NAME_LEN+1];
+
+ if (lookup_symbol_name((unsigned long)sym, symname) < 0)
SEQ_printf(m, "<%p>", sym);
+ else
+ SEQ_printf(m, "%s", symname);
}
static void
@@ -70,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
#endif
SEQ_printf(m, "\n");
- SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
+ SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
(unsigned long long)ktime_to_ns(timer->expires),
(unsigned long long)(ktime_to_ns(timer->expires) - now));
}
@@ -116,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
SEQ_printf(m, " .index: %d\n",
base->index);
- SEQ_printf(m, " .resolution: %Ld nsecs\n",
+ SEQ_printf(m, " .resolution: %Lu nsecs\n",
(unsigned long long)ktime_to_ns(base->resolution));
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
SEQ_printf(m, "\n");
#ifdef CONFIG_HIGH_RES_TIMERS
- SEQ_printf(m, " .offset: %Ld nsecs\n",
- ktime_to_ns(base->offset));
+ SEQ_printf(m, " .offset: %Lu nsecs\n",
+ (unsigned long long) ktime_to_ns(base->offset));
#endif
SEQ_printf(m, "active timers:\n");
print_active_timers(m, base, now);
@@ -140,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
print_base(m, cpu_base->clock_base + i, now);
}
#define P(x) \
- SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(cpu_base->x))
#define P_ns(x) \
- SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
- (u64)(ktime_to_ns(cpu_base->x)))
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(cpu_base->x)))
#ifdef CONFIG_HIGH_RES_TIMERS
P_ns(expires_next);
@@ -155,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
#ifdef CONFIG_TICK_ONESHOT
# define P(x) \
- SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(ts->x))
# define P_ns(x) \
- SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
- (u64)(ktime_to_ns(ts->x)))
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(ts->x)))
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
P(nohz_mode);
@@ -172,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P(last_jiffies);
P(next_jiffies);
P_ns(idle_expires);
- SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
+ SEQ_printf(m, "jiffies: %Lu\n",
+ (unsigned long long)jiffies);
}
#endif
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1bc4882..868f1bc 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
static void print_name_offset(struct seq_file *m, unsigned long addr)
{
- char namebuf[KSYM_NAME_LEN+1];
- unsigned long size, offset;
- const char *sym_name;
- char *modname;
-
- sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
- if (sym_name)
- seq_printf(m, "%s", sym_name);
- else
+ char symname[KSYM_NAME_LEN+1];
+
+ if (lookup_symbol_name(addr, symname) < 0)
seq_printf(m, "<%p>", (void *)addr);
+ else
+ seq_printf(m, "%s", symname);
}
static int tstats_show(struct seq_file *m, void *v)
diff --git a/kernel/timer.c b/kernel/timer.c
index b22bd39..59a28b1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
/*
* linux/kernel/timer.c
*
- * Kernel internal timers, kernel timekeeping, basic process system calls
+ * Kernel internal timers, basic process system calls
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
@@ -74,7 +74,7 @@ struct tvec_t_base_s {
tvec_t tv3;
tvec_t tv4;
tvec_t tv5;
-} ____cacheline_aligned_in_smp;
+} ____cacheline_aligned;
typedef struct tvec_t_base_s tvec_base_t;
@@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
+/*
+ * Note that all tvec_bases is 2 byte aligned and lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB for
+ * the new flag to indicate whether the timer is deferrable
+ */
+#define TBASE_DEFERRABLE_FLAG (0x1)
+
+/* Functions below help us manage 'deferrable' flag */
+static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
+{
+ return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);
+}
+
+static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
+{
+ return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
+}
+
+static inline void timer_set_deferrable(struct timer_list *timer)
+{
+ timer->base = (tvec_base_t *)((unsigned long)timer->base |
+ TBASE_DEFERRABLE_FLAG);
+}
+
+static inline void
+timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
+{
+ timer->base = (tvec_base_t *)((unsigned long)new_base |
+ tbase_get_deferrable(timer->base));
+}
+
/**
* __round_jiffies - function to round jiffies to a full second
* @j: the time in (absolute) jiffies that should be rounded
@@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(init_timer);
+void fastcall init_timer_deferrable(struct timer_list *timer)
+{
+ init_timer(timer);
+ timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL(init_timer_deferrable);
+
static inline void detach_timer(struct timer_list *timer,
int clear_pending)
{
@@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
tvec_base_t *base;
for (;;) {
- base = timer->base;
+ tvec_base_t *prelock_base = timer->base;
+ base = tbase_get_base(prelock_base);
if (likely(base != NULL)) {
spin_lock_irqsave(&base->lock, *flags);
- if (likely(base == timer->base))
+ if (likely(prelock_base == timer->base))
return base;
/* The timer has migrated to another CPU */
spin_unlock_irqrestore(&base->lock, *flags);
@@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
- timer->base = NULL;
+ timer_set_base(timer, NULL);
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
- timer->base = base;
+ timer_set_base(timer, base);
}
}
@@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
- timer->base = base;
+ timer_set_base(timer, base);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
}
@@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
* don't have to detach them individually.
*/
list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- BUG_ON(timer->base != base);
+ BUG_ON(tbase_get_base(timer->base) != base);
internal_add_timer(base, timer);
}
@@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base)
void (*fn)(unsigned long);
unsigned long data;
- timer = list_entry(head->next,struct timer_list,entry);
+ timer = list_first_entry(head, struct timer_list,entry);
fn = timer->function;
data = timer->data;
@@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
index = slot = timer_jiffies & TVR_MASK;
do {
list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
found = 1;
expires = nte->expires;
/* Look at the cascade bucket(s)? */
@@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void)
#endif
-/******************************************************************/
-
-/*
- * The current time
- * wall_to_monotonic is what we need to add to xtime (or xtime corrected
- * for sub jiffie times) to get to monotonic time. Monotonic is pegged
- * at zero at system boot time, so wall_to_monotonic will be negative,
- * however, we will ALWAYS keep the tv_nsec part positive so we can use
- * the usual normalization.
- */
-struct timespec xtime __attribute__ ((aligned (16)));
-struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-
-EXPORT_SYMBOL(xtime);
-
-
-/* XXX - all of this timekeeping code should be later moved to time.c */
-#include <linux/clocksource.h>
-static struct clocksource *clock; /* pointer to current clocksource */
-
-#ifdef CONFIG_GENERIC_TIME
-/**
- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
- *
- * private function, must hold xtime_lock lock when being
- * called. Returns the number of nanoseconds since the
- * last call to update_wall_time() (adjusted by NTP scaling)
- */
-static inline s64 __get_nsec_offset(void)
-{
- cycle_t cycle_now, cycle_delta;
- s64 ns_offset;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
- ns_offset = cyc2ns(clock, cycle_delta);
-
- return ns_offset;
-}
-
-/**
- * __get_realtime_clock_ts - Returns the time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec. Used by
- * do_gettimeofday() and get_realtime_clock_ts().
- */
-static inline void __get_realtime_clock_ts(struct timespec *ts)
-{
- unsigned long seq;
- s64 nsecs;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- *ts = xtime;
- nsecs = __get_nsec_offset();
-
- } while (read_seqretry(&xtime_lock, seq));
-
- timespec_add_ns(ts, nsecs);
-}
-
-/**
- * getnstimeofday - Returns the time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
- */
-void getnstimeofday(struct timespec *ts)
-{
- __get_realtime_clock_ts(ts);
-}
-
-EXPORT_SYMBOL(getnstimeofday);
-
-/**
- * do_gettimeofday - Returns the time of day in a timeval
- * @tv: pointer to the timeval to be set
- *
- * NOTE: Users should be converted to using get_realtime_clock_ts()
- */
-void do_gettimeofday(struct timeval *tv)
-{
- struct timespec now;
-
- __get_realtime_clock_ts(&now);
- tv->tv_sec = now.tv_sec;
- tv->tv_usec = now.tv_nsec/1000;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-/**
- * do_settimeofday - Sets the time of day
- * @tv: pointer to the timespec variable containing the new time
- *
- * Sets the time of day to the new time and update NTP and notify hrtimers
- */
-int do_settimeofday(struct timespec *tv)
-{
- unsigned long flags;
- time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
-
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
- return -EINVAL;
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- nsec -= __get_nsec_offset();
-
- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
- set_normalized_timespec(&xtime, sec, nsec);
- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
- clock->error = 0;
- ntp_clear();
-
- update_vsyscall(&xtime, clock);
-
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- /* signal hrtimers about time change */
- clock_was_set();
-
- return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
-/**
- * change_clocksource - Swaps clocksources if a new one is available
- *
- * Accumulates current time interval and initializes new clocksource
- */
-static void change_clocksource(void)
-{
- struct clocksource *new;
- cycle_t now;
- u64 nsec;
-
- new = clocksource_get_next();
-
- if (clock == new)
- return;
-
- now = clocksource_read(new);
- nsec = __get_nsec_offset();
- timespec_add_ns(&xtime, nsec);
-
- clock = new;
- clock->cycle_last = now;
-
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-
- tick_clock_notify();
-
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
-}
-#else
-static inline void change_clocksource(void) { }
-#endif
-
-/**
- * timekeeping_is_continuous - check to see if timekeeping is free running
- */
-int timekeeping_is_continuous(void)
-{
- unsigned long seq;
- int ret;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
-
- } while (read_seqretry(&xtime_lock, seq));
-
- return ret;
-}
-
-/**
- * read_persistent_clock - Return time in seconds from the persistent clock.
- *
- * Weak dummy function for arches that do not yet support it.
- * Returns seconds from epoch using the battery backed persistent clock.
- * Returns zero if unsupported.
- *
- * XXX - Do be sure to remove it once all arches implement it.
- */
-unsigned long __attribute__((weak)) read_persistent_clock(void)
-{
- return 0;
-}
-
-/*
- * timekeeping_init - Initializes the clocksource and common timekeeping values
- */
-void __init timekeeping_init(void)
-{
- unsigned long flags;
- unsigned long sec = read_persistent_clock();
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- ntp_clear();
-
- clock = clocksource_get_next();
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clocksource_read(clock);
-
- xtime.tv_sec = sec;
- xtime.tv_nsec = 0;
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
-
- write_sequnlock_irqrestore(&xtime_lock, flags);
-}
-
-/* flag for if timekeeping is suspended */
-static int timekeeping_suspended;
-/* time in seconds when suspend began */
-static unsigned long timekeeping_suspend_time;
-
-/**
- * timekeeping_resume - Resumes the generic timekeeping subsystem.
- * @dev: unused
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
- */
-static int timekeeping_resume(struct sys_device *dev)
-{
- unsigned long flags;
- unsigned long now = read_persistent_clock();
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- if (now && (now > timekeeping_suspend_time)) {
- unsigned long sleep_length = now - timekeeping_suspend_time;
-
- xtime.tv_sec += sleep_length;
- wall_to_monotonic.tv_sec -= sleep_length;
- }
- /* re-base the last cycle value */
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
- timekeeping_suspended = 0;
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- touch_softlockup_watchdog();
-
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
- /* Resume hrtimers */
- hres_timers_resume();
-
- return 0;
-}
-
-static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&xtime_lock, flags);
- timekeeping_suspended = 1;
- timekeeping_suspend_time = read_persistent_clock();
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
-
- return 0;
-}
-
-/* sysfs resume/suspend bits for timekeeping */
-static struct sysdev_class timekeeping_sysclass = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
- set_kset_name("timekeeping"),
-};
-
-static struct sys_device device_timer = {
- .id = 0,
- .cls = &timekeeping_sysclass,
-};
-
-static int __init timekeeping_init_device(void)
-{
- int error = sysdev_class_register(&timekeeping_sysclass);
- if (!error)
- error = sysdev_register(&device_timer);
- return error;
-}
-
-device_initcall(timekeeping_init_device);
-
-/*
- * If the error is already larger, we look ahead even further
- * to compensate for late or lost adjustments.
- */
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
- s64 *offset)
-{
- s64 tick_error, i;
- u32 look_ahead, adj;
- s32 error2, mult;
-
- /*
- * Use the current error value to determine how much to look ahead.
- * The larger the error the slower we adjust for it to avoid problems
- * with losing too many ticks, otherwise we would overadjust and
- * produce an even larger error. The smaller the adjustment the
- * faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adusted
- * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
- */
- error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
- error2 = abs(error2);
- for (look_ahead = 0; error2 > 0; look_ahead++)
- error2 >>= 2;
-
- /*
- * Now calculate the error in (1 << look_ahead) ticks, but first
- * remove the single look ahead already included in the error.
- */
- tick_error = current_tick_length() >>
- (TICK_LENGTH_SHIFT - clock->shift + 1);
- tick_error -= clock->xtime_interval >> 1;
- error = ((error - tick_error) >> look_ahead) + tick_error;
-
- /* Finally calculate the adjustment shift value. */
- i = *interval;
- mult = 1;
- if (error < 0) {
- error = -error;
- *interval = -*interval;
- *offset = -*offset;
- mult = -1;
- }
- for (adj = 0; error > i; adj++)
- error >>= 1;
-
- *interval <<= adj;
- *offset <<= adj;
- return mult << adj;
-}
-
-/*
- * Adjust the multiplier to reduce the error value,
- * this is optimized for the most common adjustments of -1,0,1,
- * for other values we can do a bit more work.
- */
-static void clocksource_adjust(struct clocksource *clock, s64 offset)
-{
- s64 error, interval = clock->cycle_interval;
- int adj;
-
- error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
- if (error > interval) {
- error >>= 2;
- if (likely(error <= interval))
- adj = 1;
- else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else if (error < -interval) {
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else
- return;
-
- clock->mult += adj;
- clock->xtime_interval += interval;
- clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) <<
- (TICK_LENGTH_SHIFT - clock->shift);
-}
-
-/**
- * update_wall_time - Uses the current clocksource to increment the wall time
- *
- * Called from the timer interrupt, must hold a write on xtime_lock.
- */
-static void update_wall_time(void)
-{
- cycle_t offset;
-
- /* Make sure we're fully resumed: */
- if (unlikely(timekeeping_suspended))
- return;
-
-#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
-#else
- offset = clock->cycle_interval;
-#endif
- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
-
- /* normally this loop will run just once, however in the
- * case of lost or late ticks, it will accumulate correctly.
- */
- while (offset >= clock->cycle_interval) {
- /* accumulate one interval */
- clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
-
- if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
- clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
- xtime.tv_sec++;
- second_overflow();
- }
-
- /* interpolator bits */
- time_interpolator_update(clock->xtime_interval
- >> clock->shift);
-
- /* accumulate error between NTP and clock interval */
- clock->error += current_tick_length();
- clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
- }
-
- /* correct the clock when NTP error is too big */
- clocksource_adjust(clock, offset);
-
- /* store full nanoseconds into xtime */
- xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
- clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
-
- /* check to see if there is a new clocksource to use */
- change_clocksource();
- update_vsyscall(&xtime, clock);
-}
-
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks)
}
/*
- * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
- */
-__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
-
-EXPORT_SYMBOL(xtime_lock);
-
-/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
@@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu)
cpu_to_node(cpu));
if (!base)
return -ENOMEM;
+
+ /* Make sure that tvec_base is 2 byte aligned */
+ if (tbase_get_deferrable(base)) {
+ WARN_ON(1);
+ kfree(base);
+ return -ENOMEM;
+ }
memset(base, 0, sizeof(*base));
per_cpu(tvec_bases, cpu) = base;
} else {
@@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
struct timer_list *timer;
while (!list_empty(head)) {
- timer = list_entry(head->next, struct timer_list, entry);
+ timer = list_first_entry(head, struct timer_list, entry);
detach_timer(timer, 0);
- timer->base = new_base;
+ timer_set_base(timer, new_base);
internal_add_timer(new_base, timer);
}
}
@@ -1701,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
long cpu = (long)hcpu;
switch(action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
if (init_timers_cpu(cpu) < 0)
return NOTIFY_BAD;
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
migrate_timers(cpu);
break;
#endif
@@ -1905,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)
prev = &curr->next;
}
+ clocksource_resume();
+
write_seqlock_irqsave(&xtime_lock, flags);
if (ti == time_interpolator) {
/* we lost the best time-interpolator: */
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 187e2a4..dd308ba 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -6,7 +6,6 @@
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
-#include <linux/smp_lock.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index c859164..160c8c5 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
}
/*
- * unshare the current process' utsname namespace.
- * called only in sys_unshare()
- */
-int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
-{
- if (unshare_flags & CLONE_NEWUTS) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
- if (!*new_uts)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*
* Copy task tsk's utsname namespace, or clone it if flags
* specifies CLONE_NEWUTS. In latter case, changes to the
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-int copy_utsname(int flags, struct task_struct *tsk)
+struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns)
{
- struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
- int err = 0;
-
- if (!old_ns)
- return 0;
+ BUG_ON(!old_ns);
get_uts_ns(old_ns);
if (!(flags & CLONE_NEWUTS))
- return 0;
-
- if (!capable(CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out;
- }
+ return old_ns;
new_ns = clone_uts_ns(old_ns);
- if (!new_ns) {
- err = -ENOMEM;
- goto out;
- }
- tsk->nsproxy->uts_ns = new_ns;
-out:
put_uts_ns(old_ns);
- return err;
+ return new_ns;
}
void free_uts_ns(struct kref *kref)
diff --git a/kernel/wait.c b/kernel/wait.c
index 59a82f6..444ddbf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* The spin_unlock() itself is semi-permeable and only protects
* one way (it only protects stuff inside the critical region and
* stops them from bleeding out - it would still allow subsequent
- * loads to move into the the critical region).
+ * loads to move into the critical region).
*/
void fastcall
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6fa5e6..fb56fed 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,30 +36,20 @@
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
- *
- * The sequence counters are for flush_scheduled_work(). It wants to wait
- * until all currently-scheduled works are completed, but it doesn't
- * want to be livelocked by new, incoming ones. So it waits until
- * remove_sequence is >= the insert_sequence which pertained when
- * flush_scheduled_work() was called.
*/
struct cpu_workqueue_struct {
spinlock_t lock;
- long remove_sequence; /* Least-recently added (next to run) */
- long insert_sequence; /* Next to add */
-
struct list_head worklist;
wait_queue_head_t more_work;
- wait_queue_head_t work_done;
+ struct work_struct *current_work;
struct workqueue_struct *wq;
struct task_struct *thread;
+ int should_stop;
int run_depth; /* Detect run_workqueue() recursion depth */
-
- int freezeable; /* Freeze the thread during suspend */
} ____cacheline_aligned;
/*
@@ -68,8 +58,10 @@ struct cpu_workqueue_struct {
*/
struct workqueue_struct {
struct cpu_workqueue_struct *cpu_wq;
+ struct list_head list;
const char *name;
- struct list_head list; /* Empty if single thread */
+ int singlethread;
+ int freezeable; /* Freeze threads during suspend */
};
/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -77,106 +69,68 @@ struct workqueue_struct {
static DEFINE_MUTEX(workqueue_mutex);
static LIST_HEAD(workqueues);
-static int singlethread_cpu;
+static int singlethread_cpu __read_mostly;
+static cpumask_t cpu_singlethread_map __read_mostly;
+/* optimization, we could use cpu_possible_map */
+static cpumask_t cpu_populated_map __read_mostly;
/* If it's single threaded, it isn't in the list of workqueues. */
static inline int is_single_threaded(struct workqueue_struct *wq)
{
- return list_empty(&wq->list);
+ return wq->singlethread;
+}
+
+static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
+{
+ return is_single_threaded(wq)
+ ? &cpu_singlethread_map : &cpu_populated_map;
+}
+
+static
+struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
+{
+ if (unlikely(is_single_threaded(wq)))
+ cpu = singlethread_cpu;
+ return per_cpu_ptr(wq->cpu_wq, cpu);
}
/*
* Set the workqueue on which a work item is to be run
* - Must *only* be called if the pending flag is set
*/
-static inline void set_wq_data(struct work_struct *work, void *wq)
+static inline void set_wq_data(struct work_struct *work,
+ struct cpu_workqueue_struct *cwq)
{
unsigned long new;
BUG_ON(!work_pending(work));
- new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
+ new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
atomic_long_set(&work->data, new);
}
-static inline void *get_wq_data(struct work_struct *work)
+static inline
+struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
{
return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
}
-static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
+static void insert_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work, int tail)
{
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&cwq->lock, flags);
+ set_wq_data(work, cwq);
/*
- * We need to re-validate the work info after we've gotten
- * the cpu_workqueue lock. We can run the work now iff:
- *
- * - the wq_data still matches the cpu_workqueue_struct
- * - AND the work is still marked pending
- * - AND the work is still on a list (which will be this
- * workqueue_struct list)
- *
- * All these conditions are important, because we
- * need to protect against the work being run right
- * now on another CPU (all but the last one might be
- * true if it's currently running and has not been
- * released yet, for example).
+ * Ensure that we get the right work->data if we see the
+ * result of list_add() below, see try_to_grab_pending().
*/
- if (get_wq_data(work) == cwq
- && work_pending(work)
- && !list_empty(&work->entry)) {
- work_func_t f = work->func;
- list_del_init(&work->entry);
- spin_unlock_irqrestore(&cwq->lock, flags);
-
- if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
- work_release(work);
- f(work);
-
- spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
- ret = 1;
- }
- spin_unlock_irqrestore(&cwq->lock, flags);
- return ret;
-}
-
-/**
- * run_scheduled_work - run scheduled work synchronously
- * @work: work to run
- *
- * This checks if the work was pending, and runs it
- * synchronously if so. It returns a boolean to indicate
- * whether it had any scheduled work to run or not.
- *
- * NOTE! This _only_ works for normal work_structs. You
- * CANNOT use this for delayed work, because the wq data
- * for delayed work will not point properly to the per-
- * CPU workqueue struct, but will change!
- */
-int fastcall run_scheduled_work(struct work_struct *work)
-{
- for (;;) {
- struct cpu_workqueue_struct *cwq;
-
- if (!work_pending(work))
- return 0;
- if (list_empty(&work->entry))
- return 0;
- /* NOTE! This depends intimately on __queue_work! */
- cwq = get_wq_data(work);
- if (!cwq)
- return 0;
- if (__run_work(cwq, work))
- return 1;
- }
+ smp_wmb();
+ if (tail)
+ list_add_tail(&work->entry, &cwq->worklist);
+ else
+ list_add(&work->entry, &cwq->worklist);
+ wake_up(&cwq->more_work);
}
-EXPORT_SYMBOL(run_scheduled_work);
/* Preempt must be disabled. */
static void __queue_work(struct cpu_workqueue_struct *cwq,
@@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
- set_wq_data(work, cwq);
- list_add_tail(&work->entry, &cwq->worklist);
- cwq->insert_sequence++;
- wake_up(&cwq->more_work);
+ insert_work(cwq, work, 1);
spin_unlock_irqrestore(&cwq->lock, flags);
}
@@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
*/
int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
- int ret = 0, cpu = get_cpu();
+ int ret = 0;
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- if (unlikely(is_single_threaded(wq)))
- cpu = singlethread_cpu;
BUG_ON(!list_empty(&work->entry));
- __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ __queue_work(wq_per_cpu(wq, get_cpu()), work);
+ put_cpu();
ret = 1;
}
- put_cpu();
return ret;
}
EXPORT_SYMBOL_GPL(queue_work);
@@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work);
void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct workqueue_struct *wq = get_wq_data(&dwork->work);
- int cpu = smp_processor_id();
+ struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
+ struct workqueue_struct *wq = cwq->wq;
- if (unlikely(is_single_threaded(wq)))
- cpu = singlethread_cpu;
-
- __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
+ __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
}
/**
@@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data)
int fastcall queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
- int ret = 0;
- struct timer_list *timer = &dwork->timer;
- struct work_struct *work = &dwork->work;
-
- timer_stats_timer_set_start_info(timer);
+ timer_stats_timer_set_start_info(&dwork->timer);
if (delay == 0)
- return queue_work(wq, work);
-
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
+ return queue_work(wq, &dwork->work);
- /* This stores wq for the moment, for the timer_fn */
- set_wq_data(work, wq);
- timer->expires = jiffies + delay;
- timer->data = (unsigned long)dwork;
- timer->function = delayed_work_timer_fn;
- add_timer(timer);
- ret = 1;
- }
- return ret;
+ return queue_delayed_work_on(-1, wq, dwork, delay);
}
EXPORT_SYMBOL_GPL(queue_delayed_work);
@@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
- /* This stores wq for the moment, for the timer_fn */
- set_wq_data(work, wq);
+ /* This stores cwq for the moment, for the timer_fn */
+ set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
timer->expires = jiffies + delay;
timer->data = (unsigned long)dwork;
timer->function = delayed_work_timer_fn;
- add_timer_on(timer, cpu);
+
+ if (unlikely(cpu >= 0))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
ret = 1;
}
return ret;
@@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
- unsigned long flags;
-
- /*
- * Keep taking off work from the queue until
- * done.
- */
- spin_lock_irqsave(&cwq->lock, flags);
+ spin_lock_irq(&cwq->lock);
cwq->run_depth++;
if (cwq->run_depth > 3) {
/* morton gets to eat his hat */
@@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
struct work_struct, entry);
work_func_t f = work->func;
+ cwq->current_work = work;
list_del_init(cwq->worklist.next);
- spin_unlock_irqrestore(&cwq->lock, flags);
+ spin_unlock_irq(&cwq->lock);
BUG_ON(get_wq_data(work) != cwq);
- if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
- work_release(work);
+ work_clear_pending(work);
f(work);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
dump_stack();
}
- spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
+ spin_lock_irq(&cwq->lock);
+ cwq->current_work = NULL;
}
cwq->run_depth--;
- spin_unlock_irqrestore(&cwq->lock, flags);
+ spin_unlock_irq(&cwq->lock);
+}
+
+/*
+ * NOTE: the caller must not touch *cwq if this func returns true
+ */
+static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
+{
+ int should_stop = cwq->should_stop;
+
+ if (unlikely(should_stop)) {
+ spin_lock_irq(&cwq->lock);
+ should_stop = cwq->should_stop && list_empty(&cwq->worklist);
+ if (should_stop)
+ cwq->thread = NULL;
+ spin_unlock_irq(&cwq->lock);
+ }
+
+ return should_stop;
}
static int worker_thread(void *__cwq)
{
struct cpu_workqueue_struct *cwq = __cwq;
- DECLARE_WAITQUEUE(wait, current);
- struct k_sigaction sa;
- sigset_t blocked;
+ DEFINE_WAIT(wait);
- if (!cwq->freezeable)
+ if (!cwq->wq->freezeable)
current->flags |= PF_NOFREEZE;
set_user_nice(current, -5);
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /*
- * We inherited MPOL_INTERLEAVE from the booting kernel.
- * Set MPOL_DEFAULT to insure node local allocations.
- */
- numa_default_policy();
-
- /* SIG_IGN makes children autoreap: see do_notify_parent(). */
- sa.sa.sa_handler = SIG_IGN;
- sa.sa.sa_flags = 0;
- siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
- do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+ for (;;) {
+ prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
+ if (!freezing(current) && !cwq->should_stop
+ && list_empty(&cwq->worklist))
+ schedule();
+ finish_wait(&cwq->more_work, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- if (cwq->freezeable)
- try_to_freeze();
+ try_to_freeze();
- add_wait_queue(&cwq->more_work, &wait);
- if (list_empty(&cwq->worklist))
- schedule();
- else
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&cwq->more_work, &wait);
+ if (cwq_should_stop(cwq))
+ break;
- if (!list_empty(&cwq->worklist))
- run_workqueue(cwq);
- set_current_state(TASK_INTERRUPTIBLE);
+ run_workqueue(cwq);
}
- __set_current_state(TASK_RUNNING);
+
return 0;
}
+struct wq_barrier {
+ struct work_struct work;
+ struct completion done;
+};
+
+static void wq_barrier_func(struct work_struct *work)
+{
+ struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+ complete(&barr->done);
+}
+
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+ struct wq_barrier *barr, int tail)
+{
+ INIT_WORK(&barr->work, wq_barrier_func);
+ __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
+
+ init_completion(&barr->done);
+
+ insert_work(cwq, &barr->work, tail);
+}
+
static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
if (cwq->thread == current) {
@@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
*/
run_workqueue(cwq);
} else {
- DEFINE_WAIT(wait);
- long sequence_needed;
+ struct wq_barrier barr;
+ int active = 0;
spin_lock_irq(&cwq->lock);
- sequence_needed = cwq->insert_sequence;
-
- while (sequence_needed - cwq->remove_sequence > 0) {
- prepare_to_wait(&cwq->work_done, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&cwq->lock);
- schedule();
- spin_lock_irq(&cwq->lock);
+ if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ active = 1;
}
- finish_wait(&cwq->work_done, &wait);
spin_unlock_irq(&cwq->lock);
+
+ if (active)
+ wait_for_completion(&barr.done);
}
}
@@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
*
- * This function will sample each workqueue's current insert_sequence number and
- * will sleep until the head sequence is greater than or equal to that. This
- * means that we sleep until all works which were queued on entry have been
- * handled, but we are not livelocked by new incoming ones.
+ * We sleep until all works which were queued on entry have been handled,
+ * but we are not livelocked by new incoming ones.
*
* This function used to run the workqueues itself. Now we just wait for the
* helper threads to do it.
*/
void fastcall flush_workqueue(struct workqueue_struct *wq)
{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ int cpu;
+
might_sleep();
+ for_each_cpu_mask(cpu, *cpu_map)
+ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+}
+EXPORT_SYMBOL_GPL(flush_workqueue);
- if (is_single_threaded(wq)) {
- /* Always use first cpu's area. */
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
- } else {
- int cpu;
+/*
+ * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
+ * so this work can't be re-armed in any way.
+ */
+static int try_to_grab_pending(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ int ret = 0;
- mutex_lock(&workqueue_mutex);
- for_each_online_cpu(cpu)
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
- mutex_unlock(&workqueue_mutex);
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+ return 1;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return ret;
+
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * This work is queued, but perhaps we locked the wrong cwq.
+ * In that case we must see the new value after rmb(), see
+ * insert_work()->wmb().
+ */
+ smp_rmb();
+ if (cwq == get_wq_data(work)) {
+ list_del_init(&work->entry);
+ ret = 1;
+ }
}
+ spin_unlock_irq(&cwq->lock);
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(flush_workqueue);
-static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
- int cpu, int freezeable)
+static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- struct task_struct *p;
+ struct wq_barrier barr;
+ int running = 0;
- spin_lock_init(&cwq->lock);
- cwq->wq = wq;
- cwq->thread = NULL;
- cwq->insert_sequence = 0;
- cwq->remove_sequence = 0;
- cwq->freezeable = freezeable;
- INIT_LIST_HEAD(&cwq->worklist);
- init_waitqueue_head(&cwq->more_work);
- init_waitqueue_head(&cwq->work_done);
+ spin_lock_irq(&cwq->lock);
+ if (unlikely(cwq->current_work == work)) {
+ insert_wq_barrier(cwq, &barr, 0);
+ running = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
- if (is_single_threaded(wq))
- p = kthread_create(worker_thread, cwq, "%s", wq->name);
- else
- p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
- if (IS_ERR(p))
- return NULL;
- cwq->thread = p;
- return p;
+ if (unlikely(running))
+ wait_for_completion(&barr.done);
}
-struct workqueue_struct *__create_workqueue(const char *name,
- int singlethread, int freezeable)
+static void wait_on_work(struct work_struct *work)
{
- int cpu, destroy = 0;
+ struct cpu_workqueue_struct *cwq;
struct workqueue_struct *wq;
- struct task_struct *p;
+ const cpumask_t *cpu_map;
+ int cpu;
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
- if (!wq)
- return NULL;
+ might_sleep();
- wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
- if (!wq->cpu_wq) {
- kfree(wq);
- return NULL;
- }
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return;
- wq->name = name;
- mutex_lock(&workqueue_mutex);
- if (singlethread) {
- INIT_LIST_HEAD(&wq->list);
- p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
- if (!p)
- destroy = 1;
- else
- wake_up_process(p);
- } else {
- list_add(&wq->list, &workqueues);
- for_each_online_cpu(cpu) {
- p = create_workqueue_thread(wq, cpu, freezeable);
- if (p) {
- kthread_bind(p, cpu);
- wake_up_process(p);
- } else
- destroy = 1;
- }
- }
- mutex_unlock(&workqueue_mutex);
+ wq = cwq->wq;
+ cpu_map = wq_cpu_map(wq);
- /*
- * Was there any error during startup? If yes then clean up:
- */
- if (destroy) {
- destroy_workqueue(wq);
- wq = NULL;
- }
- return wq;
+ for_each_cpu_mask(cpu, *cpu_map)
+ wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
}
-EXPORT_SYMBOL_GPL(__create_workqueue);
-static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
+/**
+ * cancel_work_sync - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * cancel_work_sync() will cancel the work if it is queued. If the work's
+ * callback appears to be running, cancel_work_sync() will block until it
+ * has completed.
+ *
+ * It is possible to use this function if the work re-queues itself. It can
+ * cancel the work even if it migrates to another workqueue, however in that
+ * case it only guarantees that work->func() has completed on the last queued
+ * workqueue.
+ *
+ * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
+ * pending, otherwise it goes into a busy-wait loop until the timer expires.
+ *
+ * The caller must ensure that workqueue_struct on which this work was last
+ * queued can't be destroyed before this function returns.
+ */
+void cancel_work_sync(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq;
- unsigned long flags;
- struct task_struct *p;
-
- cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- spin_lock_irqsave(&cwq->lock, flags);
- p = cwq->thread;
- cwq->thread = NULL;
- spin_unlock_irqrestore(&cwq->lock, flags);
- if (p)
- kthread_stop(p);
+ while (!try_to_grab_pending(work))
+ cpu_relax();
+ wait_on_work(work);
+ work_clear_pending(work);
}
+EXPORT_SYMBOL_GPL(cancel_work_sync);
/**
- * destroy_workqueue - safely terminate a workqueue
- * @wq: target workqueue
+ * cancel_rearming_delayed_work - reliably kill off a delayed work.
+ * @dwork: the delayed work struct
*
- * Safely destroy a workqueue. All work currently pending will be done first.
+ * It is possible to use this function if @dwork rearms itself via queue_work()
+ * or queue_delayed_work(). See also the comment for cancel_work_sync().
*/
-void destroy_workqueue(struct workqueue_struct *wq)
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
{
- int cpu;
-
- flush_workqueue(wq);
-
- /* We don't need the distraction of CPUs appearing and vanishing. */
- mutex_lock(&workqueue_mutex);
- if (is_single_threaded(wq))
- cleanup_workqueue_thread(wq, singlethread_cpu);
- else {
- for_each_online_cpu(cpu)
- cleanup_workqueue_thread(wq, cpu);
- list_del(&wq->list);
- }
- mutex_unlock(&workqueue_mutex);
- free_percpu(wq->cpu_wq);
- kfree(wq);
+ while (!del_timer(&dwork->timer) &&
+ !try_to_grab_pending(&dwork->work))
+ cpu_relax();
+ wait_on_work(&dwork->work);
+ work_clear_pending(&dwork->work);
}
-EXPORT_SYMBOL_GPL(destroy_workqueue);
+EXPORT_SYMBOL(cancel_rearming_delayed_work);
-static struct workqueue_struct *keventd_wq;
+static struct workqueue_struct *keventd_wq __read_mostly;
/**
* schedule_work - put work task in global workqueue
@@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func)
if (!works)
return -ENOMEM;
- mutex_lock(&workqueue_mutex);
+ preempt_disable(); /* CPU hotplug */
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
@@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func)
set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
}
- mutex_unlock(&workqueue_mutex);
+ preempt_enable();
flush_workqueue(keventd_wq);
free_percpu(works);
return 0;
@@ -659,29 +596,6 @@ void flush_scheduled_work(void)
EXPORT_SYMBOL(flush_scheduled_work);
/**
- * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
- * @wq: the controlling workqueue structure
- * @dwork: the delayed work struct
- */
-void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
- struct delayed_work *dwork)
-{
- while (!cancel_delayed_work(dwork))
- flush_workqueue(wq);
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
-
-/**
- * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
- * @dwork: the delayed work struct
- */
-void cancel_rearming_delayed_work(struct delayed_work *dwork)
-{
- cancel_rearming_delayed_workqueue(keventd_wq, dwork);
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_work);
-
-/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
* @ew: guaranteed storage for the execute work structure (must
@@ -728,94 +642,209 @@ int current_is_keventd(void)
}
-/* Take the work from this (downed) CPU. */
-static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
+static struct cpu_workqueue_struct *
+init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
{
struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- struct list_head list;
- struct work_struct *work;
- spin_lock_irq(&cwq->lock);
- list_replace_init(&cwq->worklist, &list);
+ cwq->wq = wq;
+ spin_lock_init(&cwq->lock);
+ INIT_LIST_HEAD(&cwq->worklist);
+ init_waitqueue_head(&cwq->more_work);
+
+ return cwq;
+}
+
+static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct workqueue_struct *wq = cwq->wq;
+ const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
+ struct task_struct *p;
+
+ p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
+ /*
+ * Nobody can add the work_struct to this cwq,
+ * if (caller is __create_workqueue)
+ * nobody should see this wq
+ * else // caller is CPU_UP_PREPARE
+ * cpu is not on cpu_online_map
+ * so we can abort safely.
+ */
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ cwq->thread = p;
+ cwq->should_stop = 0;
+
+ return 0;
+}
+
+static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct task_struct *p = cwq->thread;
- while (!list_empty(&list)) {
- printk("Taking work for %s\n", wq->name);
- work = list_entry(list.next,struct work_struct,entry);
- list_del(&work->entry);
- __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
+ if (p != NULL) {
+ if (cpu >= 0)
+ kthread_bind(p, cpu);
+ wake_up_process(p);
}
- spin_unlock_irq(&cwq->lock);
}
-/* We're holding the cpucontrol mutex here */
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+struct workqueue_struct *__create_workqueue(const char *name,
+ int singlethread, int freezeable)
{
- unsigned int hotcpu = (unsigned long)hcpu;
struct workqueue_struct *wq;
+ struct cpu_workqueue_struct *cwq;
+ int err = 0, cpu;
- switch (action) {
- case CPU_UP_PREPARE:
- mutex_lock(&workqueue_mutex);
- /* Create a new workqueue thread for it. */
- list_for_each_entry(wq, &workqueues, list) {
- if (!create_workqueue_thread(wq, hotcpu, 0)) {
- printk("workqueue for %i failed\n", hotcpu);
- return NOTIFY_BAD;
- }
- }
- break;
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq)
+ return NULL;
- case CPU_ONLINE:
- /* Kick off worker threads. */
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq;
+ wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
+ if (!wq->cpu_wq) {
+ kfree(wq);
+ return NULL;
+ }
- cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
- kthread_bind(cwq->thread, hotcpu);
- wake_up_process(cwq->thread);
- }
- mutex_unlock(&workqueue_mutex);
- break;
+ wq->name = name;
+ wq->singlethread = singlethread;
+ wq->freezeable = freezeable;
+ INIT_LIST_HEAD(&wq->list);
- case CPU_UP_CANCELED:
- list_for_each_entry(wq, &workqueues, list) {
- if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
+ if (singlethread) {
+ cwq = init_cpu_workqueue(wq, singlethread_cpu);
+ err = create_workqueue_thread(cwq, singlethread_cpu);
+ start_workqueue_thread(cwq, -1);
+ } else {
+ mutex_lock(&workqueue_mutex);
+ list_add(&wq->list, &workqueues);
+
+ for_each_possible_cpu(cpu) {
+ cwq = init_cpu_workqueue(wq, cpu);
+ if (err || !cpu_online(cpu))
continue;
- /* Unbind so it can run. */
- kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
- any_online_cpu(cpu_online_map));
- cleanup_workqueue_thread(wq, hotcpu);
+ err = create_workqueue_thread(cwq, cpu);
+ start_workqueue_thread(cwq, cpu);
}
mutex_unlock(&workqueue_mutex);
- break;
+ }
+
+ if (err) {
+ destroy_workqueue(wq);
+ wq = NULL;
+ }
+ return wq;
+}
+EXPORT_SYMBOL_GPL(__create_workqueue);
+
+static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct wq_barrier barr;
+ int alive = 0;
+
+ spin_lock_irq(&cwq->lock);
+ if (cwq->thread != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ cwq->should_stop = 1;
+ alive = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ if (alive) {
+ wait_for_completion(&barr.done);
- case CPU_DOWN_PREPARE:
+ while (unlikely(cwq->thread != NULL))
+ cpu_relax();
+ /*
+ * Wait until cwq->thread unlocks cwq->lock,
+ * it won't touch *cwq after that.
+ */
+ smp_rmb();
+ spin_unlock_wait(&cwq->lock);
+ }
+}
+
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ struct cpu_workqueue_struct *cwq;
+ int cpu;
+
+ mutex_lock(&workqueue_mutex);
+ list_del(&wq->list);
+ mutex_unlock(&workqueue_mutex);
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+ cleanup_workqueue_thread(cwq, cpu);
+ }
+
+ free_percpu(wq->cpu_wq);
+ kfree(wq);
+}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
+
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct cpu_workqueue_struct *cwq;
+ struct workqueue_struct *wq;
+
+ action &= ~CPU_TASKS_FROZEN;
+
+ switch (action) {
+ case CPU_LOCK_ACQUIRE:
mutex_lock(&workqueue_mutex);
- break;
+ return NOTIFY_OK;
- case CPU_DOWN_FAILED:
+ case CPU_LOCK_RELEASE:
mutex_unlock(&workqueue_mutex);
- break;
+ return NOTIFY_OK;
- case CPU_DEAD:
- list_for_each_entry(wq, &workqueues, list)
- cleanup_workqueue_thread(wq, hotcpu);
- list_for_each_entry(wq, &workqueues, list)
- take_over_work(wq, hotcpu);
- mutex_unlock(&workqueue_mutex);
- break;
+ case CPU_UP_PREPARE:
+ cpu_set(cpu, cpu_populated_map);
+ }
+
+ list_for_each_entry(wq, &workqueues, list) {
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!create_workqueue_thread(cwq, cpu))
+ break;
+ printk(KERN_ERR "workqueue for %i failed\n", cpu);
+ return NOTIFY_BAD;
+
+ case CPU_ONLINE:
+ start_workqueue_thread(cwq, cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ start_workqueue_thread(cwq, -1);
+ case CPU_DEAD:
+ cleanup_workqueue_thread(cwq, cpu);
+ break;
+ }
}
return NOTIFY_OK;
}
-void init_workqueues(void)
+void __init init_workqueues(void)
{
+ cpu_populated_map = cpu_online_map;
singlethread_cpu = first_cpu(cpu_possible_map);
+ cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
}
-